kreuzberg 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +8 -0
- data/.rspec +3 -0
- data/.rubocop.yaml +534 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +157 -0
- data/README.md +421 -0
- data/Rakefile +25 -0
- data/Steepfile +47 -0
- data/examples/async_patterns.rb +340 -0
- data/ext/kreuzberg_rb/extconf.rb +35 -0
- data/ext/kreuzberg_rb/native/Cargo.toml +36 -0
- data/ext/kreuzberg_rb/native/README.md +425 -0
- data/ext/kreuzberg_rb/native/build.rs +17 -0
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
- data/ext/kreuzberg_rb/native/include/strings.h +20 -0
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +2939 -0
- data/extconf.rb +28 -0
- data/kreuzberg.gemspec +105 -0
- data/lib/kreuzberg/api_proxy.rb +142 -0
- data/lib/kreuzberg/cache_api.rb +45 -0
- data/lib/kreuzberg/cli.rb +55 -0
- data/lib/kreuzberg/cli_proxy.rb +127 -0
- data/lib/kreuzberg/config.rb +684 -0
- data/lib/kreuzberg/errors.rb +50 -0
- data/lib/kreuzberg/extraction_api.rb +84 -0
- data/lib/kreuzberg/mcp_proxy.rb +186 -0
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
- data/lib/kreuzberg/post_processor_protocol.rb +86 -0
- data/lib/kreuzberg/result.rb +216 -0
- data/lib/kreuzberg/setup_lib_path.rb +79 -0
- data/lib/kreuzberg/validator_protocol.rb +89 -0
- data/lib/kreuzberg/version.rb +5 -0
- data/lib/kreuzberg.rb +82 -0
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg/internal.rbs +184 -0
- data/sig/kreuzberg.rbs +468 -0
- data/spec/binding/cache_spec.rb +227 -0
- data/spec/binding/cli_proxy_spec.rb +87 -0
- data/spec/binding/cli_spec.rb +54 -0
- data/spec/binding/config_spec.rb +345 -0
- data/spec/binding/config_validation_spec.rb +283 -0
- data/spec/binding/error_handling_spec.rb +213 -0
- data/spec/binding/errors_spec.rb +66 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
- data/spec/binding/plugins/postprocessor_spec.rb +269 -0
- data/spec/binding/plugins/validator_spec.rb +274 -0
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +39 -0
- data/spec/fixtures/config.yaml +42 -0
- data/spec/fixtures/invalid_config.toml +4 -0
- data/spec/smoke/package_spec.rb +178 -0
- data/spec/spec_helper.rb +42 -0
- data/vendor/kreuzberg/Cargo.toml +134 -0
- data/vendor/kreuzberg/README.md +175 -0
- data/vendor/kreuzberg/build.rs +460 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -0
- data/vendor/kreuzberg/src/api/handlers.rs +199 -0
- data/vendor/kreuzberg/src/api/mod.rs +79 -0
- data/vendor/kreuzberg/src/api/server.rs +353 -0
- data/vendor/kreuzberg/src/api/types.rs +170 -0
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +1143 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +35 -0
- data/vendor/kreuzberg/src/core/config.rs +1032 -0
- data/vendor/kreuzberg/src/core/extractor.rs +903 -0
- data/vendor/kreuzberg/src/core/io.rs +327 -0
- data/vendor/kreuzberg/src/core/mime.rs +615 -0
- data/vendor/kreuzberg/src/core/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/pipeline.rs +906 -0
- data/vendor/kreuzberg/src/embeddings.rs +323 -0
- data/vendor/kreuzberg/src/error.rs +431 -0
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
- data/vendor/kreuzberg/src/extraction/email.rs +854 -0
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
- data/vendor/kreuzberg/src/extraction/html.rs +553 -0
- data/vendor/kreuzberg/src/extraction/image.rs +368 -0
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +564 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +77 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +128 -0
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
- data/vendor/kreuzberg/src/extraction/table.rs +328 -0
- data/vendor/kreuzberg/src/extraction/text.rs +269 -0
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +425 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +479 -0
- data/vendor/kreuzberg/src/extractors/email.rs +129 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +344 -0
- data/vendor/kreuzberg/src/extractors/html.rs +410 -0
- data/vendor/kreuzberg/src/extractors/image.rs +195 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +268 -0
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +496 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +234 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +126 -0
- data/vendor/kreuzberg/src/extractors/text.rs +242 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +128 -0
- data/vendor/kreuzberg/src/image/dpi.rs +164 -0
- data/vendor/kreuzberg/src/image/mod.rs +6 -0
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
- data/vendor/kreuzberg/src/image/resize.rs +89 -0
- data/vendor/kreuzberg/src/keywords/config.rs +154 -0
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
- data/vendor/kreuzberg/src/keywords/rake.rs +294 -0
- data/vendor/kreuzberg/src/keywords/types.rs +68 -0
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
- data/vendor/kreuzberg/src/lib.rs +102 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
- data/vendor/kreuzberg/src/mcp/server.rs +1966 -0
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
- data/vendor/kreuzberg/src/ocr/error.rs +37 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
- data/vendor/kreuzberg/src/ocr/processor.rs +847 -0
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
- data/vendor/kreuzberg/src/ocr/types.rs +393 -0
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
- data/vendor/kreuzberg/src/pdf/error.rs +122 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
- data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
- data/vendor/kreuzberg/src/pdf/table.rs +420 -0
- data/vendor/kreuzberg/src/pdf/text.rs +161 -0
- data/vendor/kreuzberg/src/plugins/extractor.rs +1010 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +629 -0
- data/vendor/kreuzberg/src/plugins/processor.rs +641 -0
- data/vendor/kreuzberg/src/plugins/registry.rs +1324 -0
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
- data/vendor/kreuzberg/src/plugins/validator.rs +955 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
- data/vendor/kreuzberg/src/text/mod.rs +19 -0
- data/vendor/kreuzberg/src/text/quality.rs +697 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
- data/vendor/kreuzberg/src/types.rs +873 -0
- data/vendor/kreuzberg/src/utils/mod.rs +17 -0
- data/vendor/kreuzberg/src/utils/quality.rs +959 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
- data/vendor/kreuzberg/tests/api_tests.rs +966 -0
- data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
- data/vendor/kreuzberg/tests/batch_orchestration.rs +542 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +304 -0
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +509 -0
- data/vendor/kreuzberg/tests/config_features.rs +580 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +439 -0
- data/vendor/kreuzberg/tests/core_integration.rs +493 -0
- data/vendor/kreuzberg/tests/csv_integration.rs +424 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +124 -0
- data/vendor/kreuzberg/tests/email_integration.rs +325 -0
- data/vendor/kreuzberg/tests/error_handling.rs +393 -0
- data/vendor/kreuzberg/tests/format_integration.rs +159 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
- data/vendor/kreuzberg/tests/image_integration.rs +253 -0
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
- data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1412 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +561 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +607 -0
- data/vendor/kreuzberg/tests/security_validation.rs +404 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
- data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
- metadata +471 -0
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Configuration validation tests
|
|
4
|
+
|
|
5
|
+
RSpec.describe 'Configuration Validation' do
|
|
6
|
+
# Specs for the top-level extraction config: construction from config objects
# and plain hashes, type validation of nested configs, and hash serialization.
describe Kreuzberg::Config::Extraction do
  it 'accepts all valid parameters' do
    config = described_class.new(
      use_cache: true,
      enable_quality_processing: false,
      force_ocr: false,
      ocr: Kreuzberg::Config::OCR.new,
      chunking: Kreuzberg::Config::Chunking.new,
      language_detection: Kreuzberg::Config::LanguageDetection.new,
      pdf_options: Kreuzberg::Config::PDF.new
    )

    expect(config.use_cache).to be true
    expect(config.enable_quality_processing).to be false
    expect(config.force_ocr).to be false
    expect(config.ocr).to be_a(Kreuzberg::Config::OCR)
    expect(config.chunking).to be_a(Kreuzberg::Config::Chunking)
    expect(config.language_detection).to be_a(Kreuzberg::Config::LanguageDetection)
    expect(config.pdf_options).to be_a(Kreuzberg::Config::PDF)
  end

  it 'accepts hashes for nested configs' do
    config = described_class.new(
      ocr: { backend: 'tesseract', language: 'eng' },
      chunking: { max_chars: 500 }
    )

    expect(config.ocr).to be_a(Kreuzberg::Config::OCR)
    expect(config.ocr.backend).to eq('tesseract')
    expect(config.chunking).to be_a(Kreuzberg::Config::Chunking)
    expect(config.chunking.max_chars).to eq(500)
  end

  it 'validates ocr config type' do
    expect { described_class.new(ocr: 'invalid') }
      .to raise_error(ArgumentError, /Expected.*OCR/)
  end

  it 'validates chunking config type' do
    expect { described_class.new(chunking: 'invalid') }
      .to raise_error(ArgumentError, /Expected.*Chunking/)
  end

  it 'converts to hash correctly' do
    hash = described_class.new(use_cache: false, force_ocr: true).to_h

    expect(hash).to be_a(Hash)
    expect(hash[:use_cache]).to be false
    expect(hash[:force_ocr]).to be true
  end

  it 'omits nil values from hash' do
    hash = described_class.new.to_h

    # Asserting `hash[:ocr]` is nil would also pass when the key is present
    # with a nil value; checking key absence is what "omits" actually means.
    expect(hash).not_to have_key(:ocr)
    expect(hash).not_to have_key(:chunking)
  end

  it 'accepts html options hashes' do
    config = described_class.new(html_options: { heading_style: :atx, wrap: true })

    expect(config.html_options).to be_a(Kreuzberg::Config::HtmlOptions)
    expect(config.html_options.to_h[:heading_style]).to eq(:atx)
  end

  it 'accepts keyword configurations' do
    keywords = Kreuzberg::Config::Keywords.new(algorithm: :yake, max_keywords: 5)
    config = described_class.new(keywords: keywords, max_concurrent_extractions: 4)

    expect(config.keywords).to be_a(Kreuzberg::Config::Keywords)
    expect(config.max_concurrent_extractions).to eq(4)
  end
end
|
|
84
|
+
|
|
85
|
+
# Covers OCR config defaults, custom values, string coercion of inputs and
# nested Tesseract settings passed as a hash.
describe Kreuzberg::Config::OCR do
  it 'has sensible defaults' do
    defaults = described_class.new

    expect(defaults.backend).to eq('tesseract')
    expect(defaults.language).to eq('eng')
    expect(defaults.tesseract_config).to be_nil
  end

  it 'accepts custom values' do
    custom = described_class.new(backend: 'easyocr', language: 'deu')

    expect(custom.backend).to eq('easyocr')
    expect(custom.language).to eq('deu')
  end

  it 'coerces types correctly' do
    # A Symbol backend and an Integer language are both expected back as Strings.
    coerced = described_class.new(backend: :tesseract, language: 123)

    expect(coerced.backend).to eq('tesseract')
    expect(coerced.language).to eq('123')
  end

  it 'accepts tesseract config hashes' do
    tesseract_settings = { psm: 6, enable_table_detection: true }
    ocr = described_class.new(tesseract_config: tesseract_settings)

    expect(ocr.tesseract_config).to be_a(Kreuzberg::Config::Tesseract)
    expect(ocr.tesseract_config.to_h[:psm]).to eq(6)
  end
end
|
|
126
|
+
|
|
127
|
+
# Covers chunking defaults, explicit sizes, the preset option and nested
# embedding settings passed as a hash.
describe Kreuzberg::Config::Chunking do
  it 'has sensible defaults' do
    defaults = described_class.new

    expect(defaults.max_chars).to eq(1000)
    expect(defaults.max_overlap).to eq(200)
    expect(defaults.preset).to be_nil
  end

  it 'accepts custom chunk sizes' do
    sized = described_class.new(max_chars: 500, max_overlap: 100)

    expect(sized.max_chars).to eq(500)
    expect(sized.max_overlap).to eq(100)
  end

  it 'supports different strategies' do
    with_preset = described_class.new(preset: 'fast')

    expect(with_preset.preset).to eq('fast')
  end

  it 'accepts embedding configs' do
    model_spec = { type: :preset, name: 'quality' }
    chunking = described_class.new(embedding: { model: model_spec, normalize: false })

    expect(chunking.embedding).to be_a(Kreuzberg::Config::Embedding)
    expect(chunking.embedding.to_h[:model]).to include(type: :preset, name: 'quality')
  end
end
|
|
158
|
+
|
|
159
|
+
# Covers language-detection defaults, custom thresholds, float coercion of
# the confidence value and the detect_multiple flag.
describe Kreuzberg::Config::LanguageDetection do
  it 'has sensible defaults' do
    defaults = described_class.new

    expect(defaults.enabled).to be false
    expect(defaults.min_confidence).to eq(0.5)
  end

  it 'accepts custom confidence thresholds' do
    strict = described_class.new(enabled: true, min_confidence: 0.9)

    expect(strict.enabled).to be true
    expect(strict.min_confidence).to eq(0.9)
  end

  it 'coerces confidence to float' do
    # The threshold is supplied as a String and read back as a Float.
    from_string = described_class.new(min_confidence: '0.75')

    expect(from_string.min_confidence).to eq(0.75)
  end

  it 'supports detect_multiple flag' do
    multi = described_class.new(detect_multiple: true)

    expect(multi.detect_multiple).to be true
    expect(multi.to_h[:detect_multiple]).to be true
  end
end
|
|
188
|
+
|
|
189
|
+
# Covers PDF option defaults, custom values and normalization of a scalar
# password into an array of strings.
describe Kreuzberg::Config::PDF do
  it 'has sensible defaults' do
    defaults = described_class.new

    expect(defaults.extract_images).to be false
    expect(defaults.passwords).to be_nil
    expect(defaults.extract_metadata).to be true
  end

  it 'accepts custom values' do
    pdf = described_class.new(extract_images: true, passwords: ['secret123'])

    expect(pdf.extract_images).to be true
    expect(pdf.passwords).to eq(['secret123'])
  end

  it 'converts password to string' do
    # A single Integer password is expected back as a one-element String array.
    numeric_password = 12_345
    pdf = described_class.new(passwords: numeric_password)

    expect(pdf.passwords).to eq(['12345'])
  end
end
|
|
213
|
+
|
|
214
|
+
# Checks that HTML options round-trip the heading style and the nested
# preprocessing settings through #to_h.
describe Kreuzberg::Config::HtmlOptions do
  it 'normalizes preprocessing settings' do
    preprocessing = { enabled: true, preset: :standard }
    serialized = described_class.new(heading_style: :atx_closed, preprocessing: preprocessing).to_h

    expect(serialized[:heading_style]).to eq(:atx_closed)
    expect(serialized[:preprocessing]).to include(preset: :standard)
  end
end
|
|
225
|
+
|
|
226
|
+
# Checks that keyword-extraction settings are accepted as keyword arguments
# and serialized by #to_h (the algorithm comes back as a String).
describe Kreuzberg::Config::Keywords do
  it 'accepts hash arguments' do
    settings = {
      algorithm: :yake,
      max_keywords: 10,
      ngram_range: [1, 3],
      yake_params: { window_size: 4 }
    }
    keywords = described_class.new(**settings)

    expect(keywords.to_h[:algorithm]).to eq('yake')
    expect(keywords.to_h[:yake_params]).to eq(window_size: 4)
  end
end
|
|
238
|
+
|
|
239
|
+
# Smoke tests: each example builds an extraction config, runs a synchronous
# extraction on a generated text file and only checks that a Result comes back.
describe 'config usage in extraction' do
  it 'works with OCR config' do
    sample = create_test_file('OCR config test')
    ocr = Kreuzberg::Config::OCR.new(backend: 'tesseract', language: 'eng')
    extraction_config = Kreuzberg::Config::Extraction.new(ocr: ocr)

    expect(Kreuzberg.extract_file_sync(sample, config: extraction_config)).to be_a(Kreuzberg::Result)
  end

  it 'works with chunking config' do
    # The content is repeated so it is much longer than the 50-char chunk limit.
    sample = create_test_file('Chunking config test' * 50)
    chunking = Kreuzberg::Config::Chunking.new(max_chars: 50)
    extraction_config = Kreuzberg::Config::Extraction.new(chunking: chunking)

    expect(Kreuzberg.extract_file_sync(sample, config: extraction_config)).to be_a(Kreuzberg::Result)
  end

  it 'works with language detection config' do
    sample = create_test_file('Language detection test')
    detection = Kreuzberg::Config::LanguageDetection.new(enabled: true)
    extraction_config = Kreuzberg::Config::Extraction.new(language_detection: detection)

    expect(Kreuzberg.extract_file_sync(sample, config: extraction_config)).to be_a(Kreuzberg::Result)
  end

  it 'works with combined configs' do
    sample = create_test_file('Combined config test')
    extraction_config = Kreuzberg::Config::Extraction.new(
      use_cache: false,
      force_ocr: false,
      ocr: { backend: 'tesseract', language: 'eng' },
      language_detection: { enabled: false }
    )

    expect(Kreuzberg.extract_file_sync(sample, config: extraction_config)).to be_a(Kreuzberg::Result)
  end
end
|
|
283
|
+
end
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Error handling and exception mapping tests
|
|
4
|
+
|
|
5
|
+
RSpec.describe 'Error Handling' do
|
|
6
|
+
# String-keyed result payload used as the nested OCR result of an image.
let(:nested_ocr_result) do
  payload = {}
  payload['content'] = 'ocr text'
  payload['mime_type'] = 'text/plain'
  payload['metadata_json'] = '{}'
  payload['tables'] = []
  payload
end
|
|
14
|
+
|
|
15
|
+
# Result payload with a single extracted image; the image data contains a NUL
# byte and the image carries its own nested OCR result.
let(:image_result_payload) do
  inline_image = {
    'data' => "binary\0data",
    'format' => 'png',
    'image_index' => 0,
    'page_number' => 1,
    'width' => 100,
    'height' => 200,
    'colorspace' => 'RGB',
    'bits_per_component' => 8,
    'is_mask' => false,
    'description' => 'inline image',
    'ocr_result' => nested_ocr_result
  }

  { content: 'Test', mime_type: 'text/plain', images: [inline_image] }
end
|
|
36
|
+
|
|
37
|
+
# Synchronous extraction must raise for paths that cannot be read.
describe 'file not found errors' do
  it 'raises error for non-existent file' do
    missing_path = '/nonexistent/path/file.txt'

    expect { Kreuzberg.extract_file_sync(missing_path) }.to raise_error(StandardError)
  end

  it 'raises error for empty path' do
    expect { Kreuzberg.extract_file_sync('') }.to raise_error(StandardError)
  end

  it 'raises error for nil path' do
    expect { Kreuzberg.extract_file_sync(nil) }.to raise_error(StandardError)
  end
end
|
|
56
|
+
|
|
57
|
+
# An unrecognized MIME type override may either be tolerated or rejected.
describe 'invalid MIME type handling' do
  it 'handles unknown MIME types' do
    sample = create_test_file('Unknown MIME')

    # Both outcomes are accepted: a Result on success, or any StandardError.
    begin
      extraction = Kreuzberg.extract_file_sync(sample, mime_type: 'application/x-unknown-type')
      expect(extraction).to be_a(Kreuzberg::Result)
    rescue StandardError => error
      # NOTE(review): always true inside `rescue StandardError`; this branch
      # only documents that raising is an allowed outcome.
      expect(error).to be_a(StandardError)
    end
  end
end
|
|
70
|
+
|
|
71
|
+
# Each nested config option must reject a value of the wrong type with an
# ArgumentError. One example is generated per option/bad-value pair.
describe 'invalid configuration' do
  {
    ocr: 'invalid',
    chunking: 123,
    language_detection: [],
    pdf_options: 'invalid'
  }.each do |option, bad_value|
    it "raises error for invalid #{option} config" do
      expect { Kreuzberg::Config::Extraction.new(**{ option => bad_value }) }
        .to raise_error(ArgumentError)
    end
  end
end
|
|
96
|
+
|
|
97
|
+
# Errors raised for unreadable files must carry a non-empty message.
describe 'error context' do
  it 'provides meaningful error messages' do
    # The previous form raised a sentinel error after the call and rescued
    # StandardError around the whole example, so the sentinel itself was
    # rescued and the example passed even when extraction raised nothing.
    # `raise_error` fails the example when no error is raised.
    expect do
      Kreuzberg.extract_file_sync('/nonexistent/file.pdf')
    end.to raise_error(StandardError) { |error| expect(error.message).not_to be_empty }
  end
end
|
|
105
|
+
|
|
106
|
+
# Batch extraction with unreadable inputs may either fail fast or return an
# array; both outcomes are accepted by these examples.
describe 'batch extraction errors' do
  it 'handles mixed valid and invalid files' do
    readable = create_test_file('Valid')
    inputs = [readable, '/definitely/nonexistent/file.txt']

    begin
      outcome = Kreuzberg.batch_extract_files_sync(inputs)
      expect(outcome).to be_an(Array)
    rescue StandardError => error
      # NOTE(review): always true inside `rescue StandardError`.
      expect(error).to be_a(StandardError)
    end
  end

  it 'handles all invalid files' do
    inputs = %w[/nonexistent1.txt /nonexistent2.txt /nonexistent3.txt]

    begin
      outcome = Kreuzberg.batch_extract_files_sync(inputs)
      # No error raised: the outcome must still be an array (possibly empty
      # or holding error entries).
      expect(outcome).to be_an(Array)
    rescue StandardError => error
      # NOTE(review): always true inside `rescue StandardError`.
      expect(error).to be_a(StandardError)
    end
  end
end
|
|
140
|
+
|
|
141
|
+
# Error behavior of the non-`_sync` extraction entry points.
describe 'async error handling' do
  it 'propagates errors in async extraction' do
    missing_path = '/nonexistent/async/file.txt'

    expect { Kreuzberg.extract_file(missing_path) }.to raise_error(StandardError)
  end

  it 'propagates errors in async bytes extraction' do
    # An invalid MIME type may be tolerated (Result) or rejected (error).
    begin
      extraction = Kreuzberg.extract_bytes('data', 'invalid/mime/type/that/causes/error')
      expect(extraction).to be_a(Kreuzberg::Result)
    rescue StandardError => error
      # NOTE(review): always true inside `rescue StandardError`.
      expect(error).to be_a(StandardError)
    end
  end
end
|
|
157
|
+
|
|
158
|
+
# Exercises the Result constructor with empty, partial and malformed payloads,
# plus a payload that carries an extracted image.
describe 'result parsing errors' do
  it 'handles malformed result gracefully' do
    # An empty payload should fall back to empty/nil values for every field.
    empty_result = Kreuzberg::Result.new({})

    expect(empty_result.content).to eq('')
    expect(empty_result.mime_type).to eq('')
    expect(empty_result.metadata).to eq({})
    expect(empty_result.tables).to eq([])
    expect(empty_result.detected_languages).to be_nil
    expect(empty_result.chunks).to be_nil
    expect(empty_result.images).to be_nil
  end

  it 'handles partial result data' do
    partial = Kreuzberg::Result.new(content: 'Test', mime_type: 'text/plain')

    expect(partial.content).to eq('Test')
    expect(partial.mime_type).to eq('text/plain')
    expect(partial.tables).to eq([])
  end

  it 'parses invalid metadata JSON' do
    # Unparseable metadata JSON is expected to yield an empty metadata hash.
    broken = Kreuzberg::Result.new(
      content: 'Test',
      mime_type: 'text/plain',
      metadata_json: 'invalid json{'
    )

    expect(broken.metadata).to eq({})
  end

  it 'parses extracted images' do
    parsed = Kreuzberg::Result.new(image_result_payload)
    first_image = parsed.images&.first

    expect(first_image&.format).to eq('png')
    expect(first_image&.data&.encoding).to eq(Encoding::BINARY)
    expect(first_image&.ocr_result).to be_a(Kreuzberg::Result)
  end
end
|
|
202
|
+
|
|
203
|
+
# The result fields exposed by the wrapper should be Ruby Strings.
describe 'type conversion errors' do
  it 'handles non-string content gracefully' do
    sample = create_test_file('Type test')
    extraction = Kreuzberg.extract_file_sync(sample)

    expect(extraction.content).to be_a(String)
    expect(extraction.mime_type).to be_a(String)
  end
end
|
|
213
|
+
end
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
RSpec.describe Kreuzberg::Errors do
|
|
4
|
+
# Base error class: must descend from StandardError and carry a message.
describe Kreuzberg::Errors::Error do
  it 'is a StandardError subclass' do
    expect(described_class < StandardError).to be(true)
  end

  it 'can be raised with a message' do
    expect { raise described_class, 'Test error' }
      .to raise_error(described_class, 'Test error')
  end
end
|
|
15
|
+
|
|
16
|
+
# ValidationError must sit below the library's base Error class.
describe Kreuzberg::Errors::ValidationError do
  it 'is an Error subclass' do
    expect(described_class < Kreuzberg::Errors::Error).to be(true)
  end
end
|
|
21
|
+
|
|
22
|
+
# ParsingError: hierarchy check plus the `context:` keyword round-trip.
describe Kreuzberg::Errors::ParsingError do
  it 'is an Error subclass' do
    expect(described_class < Kreuzberg::Errors::Error).to be(true)
  end

  it 'stores context' do
    parsing_error = described_class.new('Parsing failed', context: { file: 'test.pdf' })

    expect(parsing_error.context).to eq({ file: 'test.pdf' })
  end
end
|
|
32
|
+
|
|
33
|
+
# OCRError: hierarchy check plus the `context:` keyword round-trip.
describe Kreuzberg::Errors::OCRError do
  it 'is an Error subclass' do
    expect(described_class < Kreuzberg::Errors::Error).to be(true)
  end

  it 'stores context' do
    ocr_error = described_class.new('OCR failed', context: { page: 1 })

    expect(ocr_error.context).to eq({ page: 1 })
  end
end
|
|
43
|
+
|
|
44
|
+
# MissingDependencyError: hierarchy check plus the `dependency:` keyword.
describe Kreuzberg::Errors::MissingDependencyError do
  it 'is an Error subclass' do
    expect(described_class < Kreuzberg::Errors::Error).to be(true)
  end

  it 'stores dependency name' do
    missing = described_class.new('Tesseract not found', dependency: 'tesseract')

    expect(missing.dependency).to eq('tesseract')
  end
end
|
|
54
|
+
|
|
55
|
+
# The library's own IOError must sit below its base Error class.
describe Kreuzberg::Errors::IOError do
  it 'is an Error subclass' do
    expect(described_class < Kreuzberg::Errors::Error).to be(true)
  end
end
|
|
60
|
+
|
|
61
|
+
# PluginError must sit below the library's base Error class.
describe Kreuzberg::Errors::PluginError do
  it 'is an Error subclass' do
    expect(described_class < Kreuzberg::Errors::Error).to be(true)
  end
end
|
|
66
|
+
end
|