kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +28 -116
- data/README.md +269 -629
- data/Rakefile +0 -9
- data/Steepfile +4 -8
- data/examples/async_patterns.rb +58 -1
- data/ext/kreuzberg_rb/extconf.rb +5 -35
- data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
- data/ext/kreuzberg_rb/native/build.rs +14 -12
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
- data/extconf.rb +6 -38
- data/kreuzberg.gemspec +20 -114
- data/lib/kreuzberg/api_proxy.rb +18 -2
- data/lib/kreuzberg/cache_api.rb +0 -22
- data/lib/kreuzberg/cli.rb +10 -2
- data/lib/kreuzberg/cli_proxy.rb +10 -0
- data/lib/kreuzberg/config.rb +22 -274
- data/lib/kreuzberg/errors.rb +7 -73
- data/lib/kreuzberg/extraction_api.rb +8 -237
- data/lib/kreuzberg/mcp_proxy.rb +11 -2
- data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
- data/lib/kreuzberg/post_processor_protocol.rb +71 -0
- data/lib/kreuzberg/result.rb +33 -151
- data/lib/kreuzberg/setup_lib_path.rb +2 -22
- data/lib/kreuzberg/validator_protocol.rb +73 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +13 -27
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +12 -105
- data/spec/binding/cache_spec.rb +22 -22
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/binding/config_spec.rb +0 -74
- data/spec/binding/config_validation_spec.rb +6 -100
- data/spec/binding/error_handling_spec.rb +97 -283
- data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
- data/spec/binding/plugins/postprocessor_spec.rb +11 -11
- data/spec/binding/plugins/validator_spec.rb +13 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +1 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/fixtures/invalid_config.toml +1 -0
- data/spec/smoke/package_spec.rb +3 -2
- data/spec/spec_helper.rb +3 -1
- data/vendor/kreuzberg/Cargo.toml +67 -192
- data/vendor/kreuzberg/README.md +9 -97
- data/vendor/kreuzberg/build.rs +194 -516
- data/vendor/kreuzberg/src/api/handlers.rs +9 -130
- data/vendor/kreuzberg/src/api/mod.rs +3 -18
- data/vendor/kreuzberg/src/api/server.rs +71 -236
- data/vendor/kreuzberg/src/api/types.rs +7 -43
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/config.rs +23 -905
- data/vendor/kreuzberg/src/core/extractor.rs +106 -403
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +3 -22
- data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
- data/vendor/kreuzberg/src/embeddings.rs +21 -169
- data/vendor/kreuzberg/src/error.rs +2 -2
- data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
- data/vendor/kreuzberg/src/extraction/email.rs +11 -12
- data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
- data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
- data/vendor/kreuzberg/src/extraction/image.rs +14 -138
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
- data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
- data/vendor/kreuzberg/src/extraction/table.rs +1 -2
- data/vendor/kreuzberg/src/extraction/text.rs +10 -18
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
- data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
- data/vendor/kreuzberg/src/extractors/email.rs +9 -37
- data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
- data/vendor/kreuzberg/src/extractors/html.rs +173 -182
- data/vendor/kreuzberg/src/extractors/image.rs +8 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
- data/vendor/kreuzberg/src/extractors/text.rs +7 -30
- data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
- data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
- data/vendor/kreuzberg/src/lib.rs +5 -17
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
- data/vendor/kreuzberg/src/mcp/server.rs +21 -145
- data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
- data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
- data/vendor/kreuzberg/src/pdf/error.rs +1 -93
- data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
- data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
- data/vendor/kreuzberg/src/pdf/table.rs +64 -61
- data/vendor/kreuzberg/src/pdf/text.rs +24 -416
- data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
- data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
- data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/mod.rs +0 -8
- data/vendor/kreuzberg/src/text/quality.rs +15 -28
- data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
- data/vendor/kreuzberg/src/types.rs +67 -907
- data/vendor/kreuzberg/src/utils/mod.rs +0 -14
- data/vendor/kreuzberg/src/utils/quality.rs +3 -12
- data/vendor/kreuzberg/tests/api_tests.rs +0 -506
- data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
- data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
- data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
- data/vendor/kreuzberg/tests/config_features.rs +1 -33
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
- data/vendor/kreuzberg/tests/core_integration.rs +9 -35
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/email_integration.rs +1 -3
- data/vendor/kreuzberg/tests/error_handling.rs +34 -43
- data/vendor/kreuzberg/tests/format_integration.rs +1 -7
- data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
- data/vendor/kreuzberg/tests/image_integration.rs +0 -2
- data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
- data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
- data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
- data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
- data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
- data/vendor/kreuzberg/tests/security_validation.rs +1 -13
- data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
- metadata +25 -171
- data/.rubocop.yml +0 -543
- data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
- data/lib/kreuzberg/error_context.rb +0 -136
- data/lib/kreuzberg/types.rb +0 -170
- data/lib/libpdfium.so +0 -0
- data/spec/binding/async_operations_spec.rb +0 -473
- data/spec/binding/batch_operations_spec.rb +0 -595
- data/spec/binding/batch_spec.rb +0 -359
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/embeddings_spec.rb +0 -816
- data/spec/binding/error_recovery_spec.rb +0 -488
- data/spec/binding/font_config_spec.rb +0 -220
- data/spec/binding/images_spec.rb +0 -738
- data/spec/binding/keywords_extraction_spec.rb +0 -600
- data/spec/binding/metadata_types_spec.rb +0 -1228
- data/spec/binding/pages_extraction_spec.rb +0 -471
- data/spec/binding/tables_spec.rb +0 -641
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -438
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
- data/test/metadata_types_test.rb +0 -959
- data/vendor/Cargo.toml +0 -61
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/formats.rs +0 -235
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
- data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
- data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
- data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
- data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
- data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
- data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
- data/vendor/kreuzberg/src/utils/pool.rs +0 -503
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
- data/vendor/kreuzberg/tests/api_embed.rs +0 -360
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/page_markers.rs +0 -297
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
- data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
- data/vendor/kreuzberg-ffi/build.rs +0 -168
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
- data/vendor/kreuzberg-ffi/src/error.rs +0 -901
- data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
- data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
- data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
- data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
- data/vendor/kreuzberg-ffi/src/result.rs +0 -510
- data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
- data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
- data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
- data/vendor/kreuzberg-ffi/src/types.rs +0 -363
- data/vendor/kreuzberg-ffi/src/util.rs +0 -210
- data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
- data/vendor/kreuzberg-tesseract/LICENSE +0 -22
- data/vendor/kreuzberg-tesseract/README.md +0 -399
- data/vendor/kreuzberg-tesseract/build.rs +0 -1127
- data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
# Configuration validation tests
|
|
4
|
+
|
|
3
5
|
RSpec.describe 'Configuration Validation' do
|
|
4
6
|
describe Kreuzberg::Config::Extraction do
|
|
5
7
|
it 'accepts all valid parameters' do
|
|
@@ -234,102 +236,6 @@ RSpec.describe 'Configuration Validation' do
|
|
|
234
236
|
end
|
|
235
237
|
end
|
|
236
238
|
|
|
237
|
-
describe Kreuzberg::Config::ImagePreprocessing do
|
|
238
|
-
it 'has sensible defaults' do
|
|
239
|
-
config = described_class.new
|
|
240
|
-
expect(config.target_dpi).to eq(300)
|
|
241
|
-
expect(config.auto_rotate).to be true
|
|
242
|
-
expect(config.deskew).to be true
|
|
243
|
-
expect(config.denoise).to be false
|
|
244
|
-
expect(config.contrast_enhance).to be true
|
|
245
|
-
expect(config.binarization_method).to eq('otsu')
|
|
246
|
-
expect(config.invert_colors).to be false
|
|
247
|
-
end
|
|
248
|
-
|
|
249
|
-
it 'accepts custom values' do
|
|
250
|
-
config = described_class.new(
|
|
251
|
-
target_dpi: 600,
|
|
252
|
-
auto_rotate: false,
|
|
253
|
-
deskew: false,
|
|
254
|
-
denoise: true,
|
|
255
|
-
contrast_enhance: false,
|
|
256
|
-
binarization_method: 'sauvola',
|
|
257
|
-
invert_colors: true
|
|
258
|
-
)
|
|
259
|
-
expect(config.target_dpi).to eq(600)
|
|
260
|
-
expect(config.auto_rotate).to be false
|
|
261
|
-
expect(config.deskew).to be false
|
|
262
|
-
expect(config.denoise).to be true
|
|
263
|
-
expect(config.contrast_enhance).to be false
|
|
264
|
-
expect(config.binarization_method).to eq('sauvola')
|
|
265
|
-
expect(config.invert_colors).to be true
|
|
266
|
-
end
|
|
267
|
-
|
|
268
|
-
it 'validates binarization method via FFI' do
|
|
269
|
-
expect { described_class.new(binarization_method: 'otsu') }.not_to raise_error
|
|
270
|
-
expect { described_class.new(binarization_method: 'adaptive') }.not_to raise_error
|
|
271
|
-
expect { described_class.new(binarization_method: 'sauvola') }.not_to raise_error
|
|
272
|
-
end
|
|
273
|
-
|
|
274
|
-
it 'rejects invalid binarization methods' do
|
|
275
|
-
expect do
|
|
276
|
-
described_class.new(binarization_method: 'invalid_method')
|
|
277
|
-
end.to raise_error(ArgumentError, /Invalid binarization_method/)
|
|
278
|
-
end
|
|
279
|
-
|
|
280
|
-
it 'converts to hash correctly' do
|
|
281
|
-
config = described_class.new(
|
|
282
|
-
target_dpi: 500,
|
|
283
|
-
binarization_method: 'adaptive'
|
|
284
|
-
)
|
|
285
|
-
hash = config.to_h
|
|
286
|
-
expect(hash[:target_dpi]).to eq(500)
|
|
287
|
-
expect(hash[:binarization_method]).to eq('adaptive')
|
|
288
|
-
expect(hash[:auto_rotate]).to be true
|
|
289
|
-
end
|
|
290
|
-
end
|
|
291
|
-
|
|
292
|
-
describe Kreuzberg::Config::TokenReduction do
|
|
293
|
-
it 'has sensible defaults' do
|
|
294
|
-
config = described_class.new
|
|
295
|
-
expect(config.mode).to eq('off')
|
|
296
|
-
expect(config.preserve_important_words).to be true
|
|
297
|
-
end
|
|
298
|
-
|
|
299
|
-
it 'accepts custom values' do
|
|
300
|
-
config = described_class.new(
|
|
301
|
-
mode: 'aggressive',
|
|
302
|
-
preserve_important_words: false
|
|
303
|
-
)
|
|
304
|
-
expect(config.mode).to eq('aggressive')
|
|
305
|
-
expect(config.preserve_important_words).to be false
|
|
306
|
-
end
|
|
307
|
-
|
|
308
|
-
it 'validates token reduction levels via FFI' do
|
|
309
|
-
expect { described_class.new(mode: 'off') }.not_to raise_error
|
|
310
|
-
expect { described_class.new(mode: 'light') }.not_to raise_error
|
|
311
|
-
expect { described_class.new(mode: 'moderate') }.not_to raise_error
|
|
312
|
-
expect { described_class.new(mode: 'aggressive') }.not_to raise_error
|
|
313
|
-
expect { described_class.new(mode: 'maximum') }.not_to raise_error
|
|
314
|
-
end
|
|
315
|
-
|
|
316
|
-
it 'rejects invalid token reduction modes' do
|
|
317
|
-
expect do
|
|
318
|
-
described_class.new(mode: 'extreme')
|
|
319
|
-
end.to raise_error(ArgumentError, /Invalid token reduction mode/)
|
|
320
|
-
end
|
|
321
|
-
|
|
322
|
-
it 'converts to hash correctly' do
|
|
323
|
-
config = described_class.new(
|
|
324
|
-
mode: 'light',
|
|
325
|
-
preserve_important_words: true
|
|
326
|
-
)
|
|
327
|
-
hash = config.to_h
|
|
328
|
-
expect(hash[:mode]).to eq('light')
|
|
329
|
-
expect(hash[:preserve_important_words]).to be true
|
|
330
|
-
end
|
|
331
|
-
end
|
|
332
|
-
|
|
333
239
|
describe 'config usage in extraction' do
|
|
334
240
|
it 'works with OCR config' do
|
|
335
241
|
path = create_test_file('OCR config test')
|
|
@@ -337,7 +243,7 @@ RSpec.describe 'Configuration Validation' do
|
|
|
337
243
|
ocr: Kreuzberg::Config::OCR.new(backend: 'tesseract', language: 'eng')
|
|
338
244
|
)
|
|
339
245
|
|
|
340
|
-
result = Kreuzberg.extract_file_sync(path
|
|
246
|
+
result = Kreuzberg.extract_file_sync(path, config: config)
|
|
341
247
|
expect(result).to be_a(Kreuzberg::Result)
|
|
342
248
|
end
|
|
343
249
|
|
|
@@ -347,7 +253,7 @@ RSpec.describe 'Configuration Validation' do
|
|
|
347
253
|
chunking: Kreuzberg::Config::Chunking.new(max_chars: 50)
|
|
348
254
|
)
|
|
349
255
|
|
|
350
|
-
result = Kreuzberg.extract_file_sync(path
|
|
256
|
+
result = Kreuzberg.extract_file_sync(path, config: config)
|
|
351
257
|
expect(result).to be_a(Kreuzberg::Result)
|
|
352
258
|
end
|
|
353
259
|
|
|
@@ -357,7 +263,7 @@ RSpec.describe 'Configuration Validation' do
|
|
|
357
263
|
language_detection: Kreuzberg::Config::LanguageDetection.new(enabled: true)
|
|
358
264
|
)
|
|
359
265
|
|
|
360
|
-
result = Kreuzberg.extract_file_sync(path
|
|
266
|
+
result = Kreuzberg.extract_file_sync(path, config: config)
|
|
361
267
|
expect(result).to be_a(Kreuzberg::Result)
|
|
362
268
|
end
|
|
363
269
|
|
|
@@ -370,7 +276,7 @@ RSpec.describe 'Configuration Validation' do
|
|
|
370
276
|
language_detection: { enabled: false }
|
|
371
277
|
)
|
|
372
278
|
|
|
373
|
-
result = Kreuzberg.extract_file_sync(path
|
|
279
|
+
result = Kreuzberg.extract_file_sync(path, config: config)
|
|
374
280
|
expect(result).to be_a(Kreuzberg::Result)
|
|
375
281
|
end
|
|
376
282
|
end
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
# Error handling and exception mapping tests
|
|
4
|
+
|
|
3
5
|
RSpec.describe 'Error Handling' do
|
|
4
6
|
let(:nested_ocr_result) do
|
|
5
7
|
{
|
|
@@ -32,362 +34,163 @@ RSpec.describe 'Error Handling' do
|
|
|
32
34
|
}
|
|
33
35
|
end
|
|
34
36
|
|
|
35
|
-
describe '
|
|
36
|
-
it 'raises error for
|
|
37
|
-
# rubocop:disable Style/MultilineBlockChain
|
|
38
|
-
expect do
|
|
39
|
-
Kreuzberg::Config::Extraction.new(
|
|
40
|
-
chunking: Kreuzberg::Config::Chunking.new(max_chars: -100)
|
|
41
|
-
)
|
|
42
|
-
end.to raise_error do |error|
|
|
43
|
-
expect(error).to be_a(StandardError)
|
|
44
|
-
expect(error.message.downcase).to match(/negative|invalid|positive|max_chars/)
|
|
45
|
-
end
|
|
46
|
-
# rubocop:enable Style/MultilineBlockChain
|
|
47
|
-
end
|
|
48
|
-
|
|
49
|
-
it 'raises error for negative max_overlap in chunking' do
|
|
50
|
-
# rubocop:disable Style/MultilineBlockChain
|
|
51
|
-
expect do
|
|
52
|
-
Kreuzberg::Config::Chunking.new(max_overlap: -50)
|
|
53
|
-
end.to raise_error do |error|
|
|
54
|
-
expect(error).to be_a(StandardError)
|
|
55
|
-
expect(error.message.downcase).to match(/negative|invalid|overlap/)
|
|
56
|
-
end
|
|
57
|
-
# rubocop:enable Style/MultilineBlockChain
|
|
58
|
-
end
|
|
59
|
-
|
|
60
|
-
it 'raises ArgumentError for invalid OCR config type' do
|
|
61
|
-
# rubocop:disable Style/MultilineBlockChain
|
|
62
|
-
expect do
|
|
63
|
-
Kreuzberg::Config::Extraction.new(ocr: 'invalid_string')
|
|
64
|
-
end.to raise_error(ArgumentError) do |error|
|
|
65
|
-
expect(error.message).to include('Expected')
|
|
66
|
-
expect(error.message).to include('OCR')
|
|
67
|
-
end
|
|
68
|
-
# rubocop:enable Style/MultilineBlockChain
|
|
69
|
-
end
|
|
70
|
-
|
|
71
|
-
it 'raises ArgumentError for invalid chunking config type' do
|
|
37
|
+
describe 'file not found errors' do
|
|
38
|
+
it 'raises error for non-existent file' do
|
|
72
39
|
expect do
|
|
73
|
-
Kreuzberg
|
|
74
|
-
end.to raise_error(
|
|
75
|
-
end
|
|
76
|
-
|
|
77
|
-
it 'raises ArgumentError for invalid language_detection config' do
|
|
78
|
-
expect do
|
|
79
|
-
Kreuzberg::Config::Extraction.new(language_detection: [])
|
|
80
|
-
end.to raise_error(ArgumentError)
|
|
81
|
-
end
|
|
82
|
-
|
|
83
|
-
it 'raises ArgumentError for invalid pdf_options config' do
|
|
84
|
-
expect do
|
|
85
|
-
Kreuzberg::Config::Extraction.new(pdf_options: 'invalid_string')
|
|
86
|
-
end.to raise_error(ArgumentError)
|
|
87
|
-
end
|
|
88
|
-
|
|
89
|
-
it 'provides descriptive error messages for config validation' do
|
|
90
|
-
error = nil
|
|
91
|
-
begin
|
|
92
|
-
Kreuzberg::Config::Extraction.new(ocr: 12_345)
|
|
93
|
-
rescue ArgumentError => e
|
|
94
|
-
error = e
|
|
95
|
-
end
|
|
96
|
-
|
|
97
|
-
expect(error).not_to be_nil
|
|
98
|
-
expect(error.message).to be_a(String)
|
|
99
|
-
expect(error.message).not_to be_empty
|
|
100
|
-
end
|
|
101
|
-
end
|
|
102
|
-
|
|
103
|
-
describe 'file not found and corrupted files' do
|
|
104
|
-
it 'raises error for non-existent file with meaningful message' do
|
|
105
|
-
# rubocop:disable Style/MultilineBlockChain
|
|
106
|
-
expect do
|
|
107
|
-
Kreuzberg.extract_file_sync(path: '/nonexistent/path/file.txt')
|
|
108
|
-
end.to raise_error do |error|
|
|
109
|
-
expect(error).to be_a(StandardError)
|
|
110
|
-
expect(error.message).not_to be_empty
|
|
111
|
-
end
|
|
112
|
-
# rubocop:enable Style/MultilineBlockChain
|
|
40
|
+
Kreuzberg.extract_file_sync('/nonexistent/path/file.txt')
|
|
41
|
+
end.to raise_error(StandardError)
|
|
113
42
|
end
|
|
114
43
|
|
|
115
|
-
it 'raises error for empty
|
|
44
|
+
it 'raises error for empty path' do
|
|
116
45
|
expect do
|
|
117
|
-
Kreuzberg.extract_file_sync(
|
|
46
|
+
Kreuzberg.extract_file_sync('')
|
|
118
47
|
end.to raise_error(StandardError)
|
|
119
48
|
end
|
|
120
49
|
|
|
121
|
-
it 'raises error for nil
|
|
50
|
+
it 'raises error for nil path' do
|
|
122
51
|
expect do
|
|
123
|
-
Kreuzberg.extract_file_sync(
|
|
52
|
+
Kreuzberg.extract_file_sync(nil)
|
|
124
53
|
end.to raise_error(StandardError)
|
|
125
54
|
end
|
|
55
|
+
end
|
|
126
56
|
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
57
|
+
describe 'invalid MIME type handling' do
|
|
58
|
+
it 'handles unknown MIME types' do
|
|
59
|
+
path = create_test_file('Unknown MIME')
|
|
130
60
|
|
|
61
|
+
# Implementation may either handle gracefully or raise error for unknown MIME types
|
|
131
62
|
begin
|
|
132
|
-
result = Kreuzberg.extract_file_sync(path
|
|
133
|
-
# May succeed with empty content or raise error - both acceptable
|
|
63
|
+
result = Kreuzberg.extract_file_sync(path, mime_type: 'application/x-unknown-type')
|
|
134
64
|
expect(result).to be_a(Kreuzberg::Result)
|
|
135
|
-
rescue Kreuzberg::Errors::ParsingError => e
|
|
136
|
-
expect(e).to be_a(Kreuzberg::Errors::ParsingError)
|
|
137
|
-
expect(e.message).not_to be_empty
|
|
138
65
|
rescue StandardError => e
|
|
139
66
|
expect(e).to be_a(StandardError)
|
|
140
67
|
end
|
|
141
68
|
end
|
|
142
69
|
end
|
|
143
70
|
|
|
144
|
-
describe 'invalid
|
|
145
|
-
it '
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
begin
|
|
150
|
-
result_or_error = Kreuzberg.extract_file_sync(path, mime_type: 'application/x-custom-unknown-format')
|
|
151
|
-
rescue Kreuzberg::Errors::UnsupportedFormatError, StandardError => e
|
|
152
|
-
result_or_error = e
|
|
153
|
-
end
|
|
154
|
-
|
|
155
|
-
if result_or_error.is_a?(Kreuzberg::Result)
|
|
156
|
-
expect(result_or_error).to be_a(Kreuzberg::Result)
|
|
157
|
-
else
|
|
158
|
-
expect(result_or_error).to be_a(StandardError)
|
|
159
|
-
expect(result_or_error.message).not_to be_empty
|
|
160
|
-
end
|
|
71
|
+
describe 'invalid configuration' do
|
|
72
|
+
it 'raises error for invalid ocr config' do
|
|
73
|
+
expect do
|
|
74
|
+
Kreuzberg::Config::Extraction.new(ocr: 'invalid')
|
|
75
|
+
end.to raise_error(ArgumentError)
|
|
161
76
|
end
|
|
162
77
|
|
|
163
|
-
it '
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
result_or_error = nil
|
|
168
|
-
begin
|
|
169
|
-
result_or_error = Kreuzberg.extract_file_sync(path, mime_type: '///invalid@@@')
|
|
170
|
-
rescue StandardError => e
|
|
171
|
-
result_or_error = e
|
|
172
|
-
end
|
|
173
|
-
|
|
174
|
-
expect([Kreuzberg::Result, StandardError].any? { |klass| result_or_error.is_a?(klass) }).to be_truthy
|
|
78
|
+
it 'raises error for invalid chunking config' do
|
|
79
|
+
expect do
|
|
80
|
+
Kreuzberg::Config::Extraction.new(chunking: 123)
|
|
81
|
+
end.to raise_error(ArgumentError)
|
|
175
82
|
end
|
|
176
83
|
|
|
177
|
-
it '
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
result_or_error = nil
|
|
182
|
-
begin
|
|
183
|
-
Kreuzberg.extract_file_sync(path, mime_type: '')
|
|
184
|
-
rescue StandardError => e
|
|
185
|
-
result_or_error = e
|
|
186
|
-
end
|
|
187
|
-
|
|
188
|
-
expect(result_or_error).to be_a(StandardError) if result_or_error
|
|
84
|
+
it 'raises error for invalid language_detection config' do
|
|
85
|
+
expect do
|
|
86
|
+
Kreuzberg::Config::Extraction.new(language_detection: [])
|
|
87
|
+
end.to raise_error(ArgumentError)
|
|
189
88
|
end
|
|
190
|
-
end
|
|
191
|
-
|
|
192
|
-
describe 'permission and I/O errors' do
|
|
193
|
-
it 'raises IOError or subclass for permission denied scenario' do
|
|
194
|
-
# This is environment-dependent, so we test gracefully
|
|
195
|
-
|
|
196
|
-
# Try to write to a file we cannot read from (if setup permits)
|
|
197
|
-
test_file = create_test_file('test content')
|
|
198
|
-
File.chmod(0o000, test_file)
|
|
199
89
|
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
end
|
|
205
|
-
rescue Kreuzberg::Errors::IOError => e
|
|
206
|
-
expect(e).to be_a(Kreuzberg::Errors::IOError)
|
|
207
|
-
rescue Errno::EACCES
|
|
208
|
-
# Platform-specific permission error is acceptable
|
|
209
|
-
expect(true).to be_truthy
|
|
210
|
-
rescue StandardError => e
|
|
211
|
-
# Other IO errors are acceptable
|
|
212
|
-
expect(e).to be_a(StandardError)
|
|
90
|
+
it 'raises error for invalid pdf_options config' do
|
|
91
|
+
expect do
|
|
92
|
+
Kreuzberg::Config::Extraction.new(pdf_options: 'invalid')
|
|
93
|
+
end.to raise_error(ArgumentError)
|
|
213
94
|
end
|
|
214
95
|
end
|
|
215
96
|
|
|
216
|
-
describe '
|
|
217
|
-
it '
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
)
|
|
223
|
-
|
|
224
|
-
expect(result.content).to eq('Test content')
|
|
225
|
-
expect(result.metadata).to eq({})
|
|
226
|
-
expect(result.metadata).to be_a(Hash)
|
|
227
|
-
end
|
|
228
|
-
|
|
229
|
-
it 'handles empty metadata JSON' do
|
|
230
|
-
result = Kreuzberg::Result.new(
|
|
231
|
-
content: 'Test',
|
|
232
|
-
mime_type: 'text/plain',
|
|
233
|
-
metadata_json: ''
|
|
234
|
-
)
|
|
235
|
-
|
|
236
|
-
expect(result.metadata).to eq({})
|
|
237
|
-
expect(result.content).to eq('Test')
|
|
238
|
-
end
|
|
239
|
-
|
|
240
|
-
it 'handles nil metadata JSON' do
|
|
241
|
-
result = Kreuzberg::Result.new(
|
|
242
|
-
content: 'Test',
|
|
243
|
-
mime_type: 'text/plain',
|
|
244
|
-
metadata_json: nil
|
|
245
|
-
)
|
|
246
|
-
|
|
247
|
-
expect(result.metadata).to eq({})
|
|
248
|
-
end
|
|
249
|
-
|
|
250
|
-
it 'handles malformed result object gracefully' do
|
|
251
|
-
result = Kreuzberg::Result.new({})
|
|
252
|
-
|
|
253
|
-
expect(result.content).to eq('')
|
|
254
|
-
expect(result.mime_type).to eq('')
|
|
255
|
-
expect(result.metadata).to eq({})
|
|
256
|
-
expect(result.tables).to eq([])
|
|
257
|
-
expect(result.detected_languages).to be_nil
|
|
258
|
-
expect(result.chunks).to eq([])
|
|
259
|
-
expect(result.images).to be_nil
|
|
260
|
-
end
|
|
261
|
-
|
|
262
|
-
it 'handles partial result data without errors' do
|
|
263
|
-
result = Kreuzberg::Result.new(
|
|
264
|
-
content: 'Partial content',
|
|
265
|
-
mime_type: 'text/plain'
|
|
266
|
-
)
|
|
267
|
-
|
|
268
|
-
expect(result.content).to eq('Partial content')
|
|
269
|
-
expect(result.mime_type).to eq('text/plain')
|
|
270
|
-
expect(result.tables).to eq([])
|
|
271
|
-
expect(result.metadata).to eq({})
|
|
97
|
+
describe 'error context' do
|
|
98
|
+
it 'provides meaningful error messages' do
|
|
99
|
+
Kreuzberg.extract_file_sync('/nonexistent/file.pdf')
|
|
100
|
+
raise 'Expected an error to be raised'
|
|
101
|
+
rescue StandardError => e
|
|
102
|
+
expect(e.message).not_to be_empty
|
|
272
103
|
end
|
|
273
104
|
end
|
|
274
105
|
|
|
275
|
-
describe 'batch extraction
|
|
276
|
-
it 'handles mixed valid and invalid files
|
|
106
|
+
describe 'batch extraction errors' do
|
|
107
|
+
it 'handles mixed valid and invalid files' do
|
|
277
108
|
files = [
|
|
278
|
-
create_test_file('Valid
|
|
109
|
+
create_test_file('Valid'),
|
|
279
110
|
'/definitely/nonexistent/file.txt'
|
|
280
111
|
]
|
|
281
112
|
|
|
113
|
+
# Implementation may either raise error or handle gracefully
|
|
282
114
|
begin
|
|
283
115
|
result = Kreuzberg.batch_extract_files_sync(files)
|
|
284
116
|
expect(result).to be_an(Array)
|
|
285
117
|
rescue StandardError => e
|
|
286
118
|
expect(e).to be_a(StandardError)
|
|
287
|
-
expect(e.message).not_to be_empty
|
|
288
119
|
end
|
|
289
120
|
end
|
|
290
121
|
|
|
291
|
-
it 'handles all invalid files
|
|
122
|
+
it 'handles all invalid files' do
|
|
292
123
|
files = [
|
|
293
124
|
'/nonexistent1.txt',
|
|
294
125
|
'/nonexistent2.txt',
|
|
295
126
|
'/nonexistent3.txt'
|
|
296
127
|
]
|
|
297
128
|
|
|
129
|
+
# Batch operations may either fail fast or return partial results
|
|
298
130
|
begin
|
|
299
131
|
result = Kreuzberg.batch_extract_files_sync(files)
|
|
132
|
+
# If no error is raised, result should be an array (possibly empty or with errors)
|
|
300
133
|
expect(result).to be_an(Array)
|
|
301
134
|
rescue StandardError => e
|
|
302
|
-
|
|
303
|
-
end
|
|
304
|
-
end
|
|
305
|
-
|
|
306
|
-
it 'provides error context in batch results' do
|
|
307
|
-
files = [
|
|
308
|
-
create_test_file('First file'),
|
|
309
|
-
'/nonexistent/second.txt'
|
|
310
|
-
]
|
|
311
|
-
|
|
312
|
-
begin
|
|
313
|
-
results = Kreuzberg.batch_extract_files_sync(files)
|
|
314
|
-
expect(results).to be_an(Array)
|
|
315
|
-
rescue StandardError => e
|
|
135
|
+
# If error is raised, it should be a StandardError
|
|
316
136
|
expect(e).to be_a(StandardError)
|
|
317
137
|
end
|
|
318
138
|
end
|
|
319
139
|
end
|
|
320
140
|
|
|
321
|
-
describe '
|
|
322
|
-
it '
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
Kreuzberg.extract_file_sync(path: "/nonexistent#{i}.pdf")
|
|
327
|
-
rescue StandardError => e
|
|
328
|
-
errors << e
|
|
329
|
-
end
|
|
330
|
-
|
|
331
|
-
expect(errors.length).to eq(3)
|
|
332
|
-
expect(errors).to all(be_a(StandardError))
|
|
141
|
+
describe 'async error handling' do
|
|
142
|
+
it 'propagates errors in async extraction' do
|
|
143
|
+
expect do
|
|
144
|
+
Kreuzberg.extract_file('/nonexistent/async/file.txt')
|
|
145
|
+
end.to raise_error(StandardError)
|
|
333
146
|
end
|
|
334
|
-
end
|
|
335
|
-
|
|
336
|
-
describe 'error recovery and graceful degradation' do
|
|
337
|
-
it 'recovers gracefully after file not found error' do
|
|
338
|
-
# First operation: try to extract from nonexistent file
|
|
339
|
-
error_caught = false
|
|
340
|
-
begin
|
|
341
|
-
Kreuzberg.extract_file_sync(path: '/nonexistent/does_not_exist.txt')
|
|
342
|
-
rescue StandardError
|
|
343
|
-
error_caught = true
|
|
344
|
-
end
|
|
345
|
-
|
|
346
|
-
expect(error_caught).to be_truthy
|
|
347
147
|
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
result = Kreuzberg.extract_file_sync(path: valid_file)
|
|
148
|
+
it 'propagates errors in async bytes extraction' do
|
|
149
|
+
# Implementation may either handle invalid MIME types or raise error
|
|
351
150
|
|
|
151
|
+
result = Kreuzberg.extract_bytes('data', 'invalid/mime/type/that/causes/error')
|
|
352
152
|
expect(result).to be_a(Kreuzberg::Result)
|
|
153
|
+
rescue StandardError => e
|
|
154
|
+
expect(e).to be_a(StandardError)
|
|
353
155
|
end
|
|
156
|
+
end
|
|
354
157
|
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
begin
|
|
360
|
-
Kreuzberg.extract_file_sync(path: '/nonexistent/file1.txt')
|
|
361
|
-
rescue StandardError
|
|
362
|
-
results << :error1
|
|
363
|
-
end
|
|
158
|
+
describe 'result parsing errors' do
|
|
159
|
+
it 'handles malformed result gracefully' do
|
|
160
|
+
# This tests the Result class constructor with edge cases
|
|
161
|
+
result = Kreuzberg::Result.new({})
|
|
364
162
|
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
163
|
+
expect(result.content).to eq('')
|
|
164
|
+
expect(result.mime_type).to eq('')
|
|
165
|
+
expect(result.metadata).to eq({})
|
|
166
|
+
expect(result.tables).to eq([])
|
|
167
|
+
expect(result.detected_languages).to be_nil
|
|
168
|
+
expect(result.chunks).to be_nil
|
|
169
|
+
expect(result.images).to be_nil
|
|
170
|
+
end
|
|
369
171
|
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
end
|
|
172
|
+
it 'handles partial result data' do
|
|
173
|
+
result = Kreuzberg::Result.new(
|
|
174
|
+
content: 'Test',
|
|
175
|
+
mime_type: 'text/plain'
|
|
176
|
+
)
|
|
376
177
|
|
|
377
|
-
expect(
|
|
178
|
+
expect(result.content).to eq('Test')
|
|
179
|
+
expect(result.mime_type).to eq('text/plain')
|
|
180
|
+
expect(result.tables).to eq([])
|
|
378
181
|
end
|
|
379
|
-
end
|
|
380
182
|
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
183
|
+
it 'parses invalid metadata JSON' do
|
|
184
|
+
result = Kreuzberg::Result.new(
|
|
185
|
+
content: 'Test',
|
|
186
|
+
mime_type: 'text/plain',
|
|
187
|
+
metadata_json: 'invalid json{'
|
|
188
|
+
)
|
|
385
189
|
|
|
386
|
-
expect(result.
|
|
387
|
-
expect(result.mime_type).to be_a(String)
|
|
190
|
+
expect(result.metadata).to eq({})
|
|
388
191
|
end
|
|
389
192
|
|
|
390
|
-
it '
|
|
193
|
+
it 'parses extracted images' do
|
|
391
194
|
result = Kreuzberg::Result.new(image_result_payload)
|
|
392
195
|
image = result.images&.first
|
|
393
196
|
|
|
@@ -396,4 +199,15 @@ RSpec.describe 'Error Handling' do
|
|
|
396
199
|
expect(image&.ocr_result).to be_a(Kreuzberg::Result)
|
|
397
200
|
end
|
|
398
201
|
end
|
|
202
|
+
|
|
203
|
+
describe 'type conversion errors' do
|
|
204
|
+
it 'handles non-string content gracefully' do
|
|
205
|
+
# Test that the wrapper handles type coercion
|
|
206
|
+
path = create_test_file('Type test')
|
|
207
|
+
result = Kreuzberg.extract_file_sync(path)
|
|
208
|
+
|
|
209
|
+
expect(result.content).to be_a(String)
|
|
210
|
+
expect(result.mime_type).to be_a(String)
|
|
211
|
+
end
|
|
212
|
+
end
|
|
399
213
|
end
|
|
@@ -36,7 +36,7 @@ RSpec.describe 'OCR Backend Plugin System' do
|
|
|
36
36
|
ocr: Kreuzberg::Config::Ocr.new(backend: 'mock-ocr')
|
|
37
37
|
)
|
|
38
38
|
|
|
39
|
-
result = Kreuzberg.extract_file_sync(
|
|
39
|
+
result = Kreuzberg.extract_file_sync(test_image, config: config)
|
|
40
40
|
|
|
41
41
|
expect(backend.process_called).to be true
|
|
42
42
|
expect(result.content).to include('Mocked OCR text')
|
|
@@ -69,7 +69,7 @@ RSpec.describe 'OCR Backend Plugin System' do
|
|
|
69
69
|
)
|
|
70
70
|
)
|
|
71
71
|
|
|
72
|
-
Kreuzberg.extract_file_sync(
|
|
72
|
+
Kreuzberg.extract_file_sync(test_image, config: config)
|
|
73
73
|
|
|
74
74
|
expect(backend.received_config).to be_a(Hash)
|
|
75
75
|
expect(backend.received_config['backend']).to eq('config-capture')
|
|
@@ -102,7 +102,7 @@ RSpec.describe 'OCR Backend Plugin System' do
|
|
|
102
102
|
ocr: Kreuzberg::Config::Ocr.new(backend: 'bytes-capture')
|
|
103
103
|
)
|
|
104
104
|
|
|
105
|
-
Kreuzberg.extract_file_sync(
|
|
105
|
+
Kreuzberg.extract_file_sync(test_image, config: config)
|
|
106
106
|
|
|
107
107
|
received_bytes = BytesCapturingBackend.instance_variable_get(:@received_bytes)
|
|
108
108
|
expect(received_bytes).to be_a(String)
|
|
@@ -131,7 +131,7 @@ RSpec.describe 'OCR Backend Plugin System' do
|
|
|
131
131
|
ocr: Kreuzberg::Config::Ocr.new(backend: 'simple-ocr')
|
|
132
132
|
)
|
|
133
133
|
|
|
134
|
-
result = Kreuzberg.extract_file_sync(
|
|
134
|
+
result = Kreuzberg.extract_file_sync(test_image, config: config)
|
|
135
135
|
|
|
136
136
|
expect(result.content).to include('Invoice Total')
|
|
137
137
|
expect(result.content).to include('1,234.56')
|
|
@@ -167,8 +167,8 @@ RSpec.describe 'OCR Backend Plugin System' do
|
|
|
167
167
|
ocr: Kreuzberg::Config::Ocr.new(backend: 'stateful-ocr')
|
|
168
168
|
)
|
|
169
169
|
|
|
170
|
-
Kreuzberg.extract_file_sync(
|
|
171
|
-
Kreuzberg.extract_file_sync(
|
|
170
|
+
Kreuzberg.extract_file_sync(test_image, config: config)
|
|
171
|
+
Kreuzberg.extract_file_sync(test_image, config: config)
|
|
172
172
|
|
|
173
173
|
expect(backend.call_count).to be >= 1
|
|
174
174
|
end
|
|
@@ -197,7 +197,7 @@ RSpec.describe 'OCR Backend Plugin System' do
|
|
|
197
197
|
)
|
|
198
198
|
|
|
199
199
|
expect do
|
|
200
|
-
Kreuzberg.extract_file_sync(
|
|
200
|
+
Kreuzberg.extract_file_sync(test_image, config: config)
|
|
201
201
|
end.to raise_error(StandardError, /OCR processing failed/)
|
|
202
202
|
end
|
|
203
203
|
|
|
@@ -208,7 +208,7 @@ RSpec.describe 'OCR Backend Plugin System' do
|
|
|
208
208
|
)
|
|
209
209
|
|
|
210
210
|
expect do
|
|
211
|
-
Kreuzberg.extract_file_sync(
|
|
211
|
+
Kreuzberg.extract_file_sync(test_image, config: config)
|
|
212
212
|
end.to raise_error
|
|
213
213
|
end
|
|
214
214
|
end
|