kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +28 -116
- data/README.md +269 -629
- data/Rakefile +0 -9
- data/Steepfile +4 -8
- data/examples/async_patterns.rb +58 -1
- data/ext/kreuzberg_rb/extconf.rb +5 -35
- data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
- data/ext/kreuzberg_rb/native/build.rs +14 -12
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
- data/extconf.rb +6 -38
- data/kreuzberg.gemspec +20 -114
- data/lib/kreuzberg/api_proxy.rb +18 -2
- data/lib/kreuzberg/cache_api.rb +0 -22
- data/lib/kreuzberg/cli.rb +10 -2
- data/lib/kreuzberg/cli_proxy.rb +10 -0
- data/lib/kreuzberg/config.rb +22 -274
- data/lib/kreuzberg/errors.rb +7 -73
- data/lib/kreuzberg/extraction_api.rb +8 -237
- data/lib/kreuzberg/mcp_proxy.rb +11 -2
- data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
- data/lib/kreuzberg/post_processor_protocol.rb +71 -0
- data/lib/kreuzberg/result.rb +33 -151
- data/lib/kreuzberg/setup_lib_path.rb +2 -22
- data/lib/kreuzberg/validator_protocol.rb +73 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +13 -27
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +12 -105
- data/spec/binding/cache_spec.rb +22 -22
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/binding/config_spec.rb +0 -74
- data/spec/binding/config_validation_spec.rb +6 -100
- data/spec/binding/error_handling_spec.rb +97 -283
- data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
- data/spec/binding/plugins/postprocessor_spec.rb +11 -11
- data/spec/binding/plugins/validator_spec.rb +13 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +1 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/fixtures/invalid_config.toml +1 -0
- data/spec/smoke/package_spec.rb +3 -2
- data/spec/spec_helper.rb +3 -1
- data/vendor/kreuzberg/Cargo.toml +67 -192
- data/vendor/kreuzberg/README.md +9 -97
- data/vendor/kreuzberg/build.rs +194 -516
- data/vendor/kreuzberg/src/api/handlers.rs +9 -130
- data/vendor/kreuzberg/src/api/mod.rs +3 -18
- data/vendor/kreuzberg/src/api/server.rs +71 -236
- data/vendor/kreuzberg/src/api/types.rs +7 -43
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/config.rs +23 -905
- data/vendor/kreuzberg/src/core/extractor.rs +106 -403
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +3 -22
- data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
- data/vendor/kreuzberg/src/embeddings.rs +21 -169
- data/vendor/kreuzberg/src/error.rs +2 -2
- data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
- data/vendor/kreuzberg/src/extraction/email.rs +11 -12
- data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
- data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
- data/vendor/kreuzberg/src/extraction/image.rs +14 -138
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
- data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
- data/vendor/kreuzberg/src/extraction/table.rs +1 -2
- data/vendor/kreuzberg/src/extraction/text.rs +10 -18
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
- data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
- data/vendor/kreuzberg/src/extractors/email.rs +9 -37
- data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
- data/vendor/kreuzberg/src/extractors/html.rs +173 -182
- data/vendor/kreuzberg/src/extractors/image.rs +8 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
- data/vendor/kreuzberg/src/extractors/text.rs +7 -30
- data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
- data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
- data/vendor/kreuzberg/src/lib.rs +5 -17
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
- data/vendor/kreuzberg/src/mcp/server.rs +21 -145
- data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
- data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
- data/vendor/kreuzberg/src/pdf/error.rs +1 -93
- data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
- data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
- data/vendor/kreuzberg/src/pdf/table.rs +64 -61
- data/vendor/kreuzberg/src/pdf/text.rs +24 -416
- data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
- data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
- data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/mod.rs +0 -8
- data/vendor/kreuzberg/src/text/quality.rs +15 -28
- data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
- data/vendor/kreuzberg/src/types.rs +67 -907
- data/vendor/kreuzberg/src/utils/mod.rs +0 -14
- data/vendor/kreuzberg/src/utils/quality.rs +3 -12
- data/vendor/kreuzberg/tests/api_tests.rs +0 -506
- data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
- data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
- data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
- data/vendor/kreuzberg/tests/config_features.rs +1 -33
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
- data/vendor/kreuzberg/tests/core_integration.rs +9 -35
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/email_integration.rs +1 -3
- data/vendor/kreuzberg/tests/error_handling.rs +34 -43
- data/vendor/kreuzberg/tests/format_integration.rs +1 -7
- data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
- data/vendor/kreuzberg/tests/image_integration.rs +0 -2
- data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
- data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
- data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
- data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
- data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
- data/vendor/kreuzberg/tests/security_validation.rs +1 -13
- data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
- metadata +25 -171
- data/.rubocop.yml +0 -543
- data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
- data/lib/kreuzberg/error_context.rb +0 -136
- data/lib/kreuzberg/types.rb +0 -170
- data/lib/libpdfium.so +0 -0
- data/spec/binding/async_operations_spec.rb +0 -473
- data/spec/binding/batch_operations_spec.rb +0 -595
- data/spec/binding/batch_spec.rb +0 -359
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/embeddings_spec.rb +0 -816
- data/spec/binding/error_recovery_spec.rb +0 -488
- data/spec/binding/font_config_spec.rb +0 -220
- data/spec/binding/images_spec.rb +0 -738
- data/spec/binding/keywords_extraction_spec.rb +0 -600
- data/spec/binding/metadata_types_spec.rb +0 -1228
- data/spec/binding/pages_extraction_spec.rb +0 -471
- data/spec/binding/tables_spec.rb +0 -641
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -438
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
- data/test/metadata_types_test.rb +0 -959
- data/vendor/Cargo.toml +0 -61
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/formats.rs +0 -235
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
- data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
- data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
- data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
- data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
- data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
- data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
- data/vendor/kreuzberg/src/utils/pool.rs +0 -503
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
- data/vendor/kreuzberg/tests/api_embed.rs +0 -360
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/page_markers.rs +0 -297
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
- data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
- data/vendor/kreuzberg-ffi/build.rs +0 -168
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
- data/vendor/kreuzberg-ffi/src/error.rs +0 -901
- data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
- data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
- data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
- data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
- data/vendor/kreuzberg-ffi/src/result.rs +0 -510
- data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
- data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
- data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
- data/vendor/kreuzberg-ffi/src/types.rs +0 -363
- data/vendor/kreuzberg-ffi/src/util.rs +0 -210
- data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
- data/vendor/kreuzberg-tesseract/LICENSE +0 -22
- data/vendor/kreuzberg-tesseract/README.md +0 -399
- data/vendor/kreuzberg-tesseract/build.rs +0 -1127
- data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
|
@@ -1,258 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
RSpec.describe Kreuzberg::Config::LanguageDetection do
|
|
4
|
-
describe '#initialize' do
|
|
5
|
-
it 'creates config with default values' do
|
|
6
|
-
config = described_class.new
|
|
7
|
-
|
|
8
|
-
expect(config.enabled).to be false
|
|
9
|
-
expect(config.min_confidence).to eq 0.5
|
|
10
|
-
expect(config.detect_multiple).to be false
|
|
11
|
-
end
|
|
12
|
-
|
|
13
|
-
it 'creates config with custom values' do
|
|
14
|
-
config = described_class.new(
|
|
15
|
-
enabled: true,
|
|
16
|
-
min_confidence: 0.9,
|
|
17
|
-
detect_multiple: true
|
|
18
|
-
)
|
|
19
|
-
|
|
20
|
-
expect(config.enabled).to be true
|
|
21
|
-
expect(config.min_confidence).to eq 0.9
|
|
22
|
-
expect(config.detect_multiple).to be true
|
|
23
|
-
end
|
|
24
|
-
|
|
25
|
-
it 'converts enabled to boolean' do
|
|
26
|
-
config = described_class.new(enabled: 1)
|
|
27
|
-
|
|
28
|
-
expect(config.enabled).to be true
|
|
29
|
-
expect(config.enabled).to be_a TrueClass
|
|
30
|
-
end
|
|
31
|
-
|
|
32
|
-
it 'converts min_confidence to float' do
|
|
33
|
-
config = described_class.new(min_confidence: '0.75')
|
|
34
|
-
|
|
35
|
-
expect(config.min_confidence).to eq 0.75
|
|
36
|
-
expect(config.min_confidence).to be_a Float
|
|
37
|
-
end
|
|
38
|
-
|
|
39
|
-
it 'converts detect_multiple to boolean' do
|
|
40
|
-
config = described_class.new(detect_multiple: 'yes')
|
|
41
|
-
|
|
42
|
-
expect(config.detect_multiple).to be true
|
|
43
|
-
end
|
|
44
|
-
end
|
|
45
|
-
|
|
46
|
-
describe '#to_h' do
|
|
47
|
-
it 'serializes to hash with all values' do
|
|
48
|
-
config = described_class.new(
|
|
49
|
-
enabled: true,
|
|
50
|
-
min_confidence: 0.8,
|
|
51
|
-
detect_multiple: true
|
|
52
|
-
)
|
|
53
|
-
hash = config.to_h
|
|
54
|
-
|
|
55
|
-
expect(hash).to be_a Hash
|
|
56
|
-
expect(hash[:enabled]).to be true
|
|
57
|
-
expect(hash[:min_confidence]).to eq 0.8
|
|
58
|
-
expect(hash[:detect_multiple]).to be true
|
|
59
|
-
end
|
|
60
|
-
|
|
61
|
-
it 'always includes all keys in hash' do
|
|
62
|
-
config = described_class.new
|
|
63
|
-
hash = config.to_h
|
|
64
|
-
|
|
65
|
-
expect(hash.keys).to contain_exactly(
|
|
66
|
-
:enabled,
|
|
67
|
-
:min_confidence,
|
|
68
|
-
:detect_multiple
|
|
69
|
-
)
|
|
70
|
-
end
|
|
71
|
-
end
|
|
72
|
-
|
|
73
|
-
describe 'validation' do
|
|
74
|
-
it 'accepts confidence value of 0.5' do
|
|
75
|
-
expect do
|
|
76
|
-
described_class.new(min_confidence: 0.5)
|
|
77
|
-
end.not_to raise_error
|
|
78
|
-
end
|
|
79
|
-
|
|
80
|
-
it 'accepts confidence value of 0.0' do
|
|
81
|
-
expect do
|
|
82
|
-
described_class.new(min_confidence: 0.0)
|
|
83
|
-
end.not_to raise_error
|
|
84
|
-
end
|
|
85
|
-
|
|
86
|
-
it 'accepts confidence value of 1.0' do
|
|
87
|
-
expect do
|
|
88
|
-
described_class.new(min_confidence: 1.0)
|
|
89
|
-
end.not_to raise_error
|
|
90
|
-
end
|
|
91
|
-
|
|
92
|
-
it 'accepts boolean enabled' do
|
|
93
|
-
expect do
|
|
94
|
-
described_class.new(enabled: true)
|
|
95
|
-
end.not_to raise_error
|
|
96
|
-
end
|
|
97
|
-
end
|
|
98
|
-
|
|
99
|
-
describe 'keyword arguments' do
|
|
100
|
-
it 'accepts all keyword arguments' do
|
|
101
|
-
config = described_class.new(
|
|
102
|
-
enabled: true,
|
|
103
|
-
min_confidence: 0.85,
|
|
104
|
-
detect_multiple: true
|
|
105
|
-
)
|
|
106
|
-
|
|
107
|
-
expect(config.enabled).to be true
|
|
108
|
-
expect(config.min_confidence).to eq 0.85
|
|
109
|
-
expect(config.detect_multiple).to be true
|
|
110
|
-
end
|
|
111
|
-
end
|
|
112
|
-
|
|
113
|
-
describe 'equality' do
|
|
114
|
-
it 'compares configs by value' do
|
|
115
|
-
config1 = described_class.new(
|
|
116
|
-
enabled: true,
|
|
117
|
-
min_confidence: 0.8
|
|
118
|
-
)
|
|
119
|
-
config2 = described_class.new(
|
|
120
|
-
enabled: true,
|
|
121
|
-
min_confidence: 0.8
|
|
122
|
-
)
|
|
123
|
-
|
|
124
|
-
expect(config1.enabled).to eq config2.enabled
|
|
125
|
-
expect(config1.min_confidence).to eq config2.min_confidence
|
|
126
|
-
end
|
|
127
|
-
|
|
128
|
-
it 'detects differences in enabled' do
|
|
129
|
-
config1 = described_class.new(enabled: true)
|
|
130
|
-
config2 = described_class.new(enabled: false)
|
|
131
|
-
|
|
132
|
-
expect(config1.enabled).not_to eq config2.enabled
|
|
133
|
-
end
|
|
134
|
-
|
|
135
|
-
it 'detects differences in min_confidence' do
|
|
136
|
-
config1 = described_class.new(min_confidence: 0.5)
|
|
137
|
-
config2 = described_class.new(min_confidence: 0.9)
|
|
138
|
-
|
|
139
|
-
expect(config1.min_confidence).not_to eq config2.min_confidence
|
|
140
|
-
end
|
|
141
|
-
|
|
142
|
-
it 'detects differences in detect_multiple' do
|
|
143
|
-
config1 = described_class.new(detect_multiple: true)
|
|
144
|
-
config2 = described_class.new(detect_multiple: false)
|
|
145
|
-
|
|
146
|
-
expect(config1.detect_multiple).not_to eq config2.detect_multiple
|
|
147
|
-
end
|
|
148
|
-
end
|
|
149
|
-
|
|
150
|
-
describe 'nested config integration' do
|
|
151
|
-
it 'can be nested in Extraction config' do
|
|
152
|
-
lang_detect = described_class.new(enabled: true, min_confidence: 0.9)
|
|
153
|
-
extraction = Kreuzberg::Config::Extraction.new(language_detection: lang_detect)
|
|
154
|
-
|
|
155
|
-
expect(extraction.language_detection).to be_a described_class
|
|
156
|
-
expect(extraction.language_detection.enabled).to be true
|
|
157
|
-
expect(extraction.language_detection.min_confidence).to eq 0.9
|
|
158
|
-
end
|
|
159
|
-
|
|
160
|
-
it 'accepts hash in Extraction config' do
|
|
161
|
-
extraction = Kreuzberg::Config::Extraction.new(
|
|
162
|
-
language_detection: { enabled: true, min_confidence: 0.75 }
|
|
163
|
-
)
|
|
164
|
-
|
|
165
|
-
expect(extraction.language_detection).to be_a described_class
|
|
166
|
-
expect(extraction.language_detection.enabled).to be true
|
|
167
|
-
expect(extraction.language_detection.min_confidence).to eq 0.75
|
|
168
|
-
end
|
|
169
|
-
end
|
|
170
|
-
|
|
171
|
-
describe 'symbol vs string key handling' do
|
|
172
|
-
it 'accepts symbol and string enabled values' do
|
|
173
|
-
config = described_class.new(enabled: true)
|
|
174
|
-
|
|
175
|
-
expect(config.enabled).to be true
|
|
176
|
-
end
|
|
177
|
-
|
|
178
|
-
it 'converts min_confidence string to float' do
|
|
179
|
-
config = described_class.new(min_confidence: '0.95')
|
|
180
|
-
|
|
181
|
-
expect(config.min_confidence).to eq 0.95
|
|
182
|
-
expect(config.min_confidence).to be_a Float
|
|
183
|
-
end
|
|
184
|
-
end
|
|
185
|
-
|
|
186
|
-
describe 'boolean conversion' do
|
|
187
|
-
it 'converts truthy enabled to true' do
|
|
188
|
-
config = described_class.new(enabled: 1)
|
|
189
|
-
|
|
190
|
-
expect(config.enabled).to be true
|
|
191
|
-
end
|
|
192
|
-
|
|
193
|
-
it 'converts false enabled to false' do
|
|
194
|
-
config = described_class.new(enabled: false)
|
|
195
|
-
|
|
196
|
-
expect(config.enabled).to be false
|
|
197
|
-
end
|
|
198
|
-
|
|
199
|
-
it 'converts truthy detect_multiple to true' do
|
|
200
|
-
config = described_class.new(detect_multiple: 'yes')
|
|
201
|
-
|
|
202
|
-
expect(config.detect_multiple).to be true
|
|
203
|
-
end
|
|
204
|
-
|
|
205
|
-
it 'converts false detect_multiple to false' do
|
|
206
|
-
config = described_class.new(detect_multiple: false)
|
|
207
|
-
|
|
208
|
-
expect(config.detect_multiple).to be false
|
|
209
|
-
end
|
|
210
|
-
end
|
|
211
|
-
|
|
212
|
-
describe 'confidence range' do
|
|
213
|
-
it 'accepts minimum confidence value' do
|
|
214
|
-
config = described_class.new(min_confidence: 0.0)
|
|
215
|
-
|
|
216
|
-
expect(config.min_confidence).to eq 0.0
|
|
217
|
-
end
|
|
218
|
-
|
|
219
|
-
it 'accepts maximum confidence value' do
|
|
220
|
-
config = described_class.new(min_confidence: 1.0)
|
|
221
|
-
|
|
222
|
-
expect(config.min_confidence).to eq 1.0
|
|
223
|
-
end
|
|
224
|
-
|
|
225
|
-
it 'accepts mid-range confidence value' do
|
|
226
|
-
config = described_class.new(min_confidence: 0.6)
|
|
227
|
-
|
|
228
|
-
expect(config.min_confidence).to eq 0.6
|
|
229
|
-
end
|
|
230
|
-
|
|
231
|
-
it 'preserves high precision confidence values' do
|
|
232
|
-
config = described_class.new(min_confidence: 0.123456)
|
|
233
|
-
|
|
234
|
-
expect(config.min_confidence).to be_within(0.00001).of(0.123456)
|
|
235
|
-
end
|
|
236
|
-
end
|
|
237
|
-
|
|
238
|
-
describe 'multiple language detection' do
|
|
239
|
-
it 'allows enabling multiple language detection' do
|
|
240
|
-
config = described_class.new(detect_multiple: true)
|
|
241
|
-
|
|
242
|
-
expect(config.detect_multiple).to be true
|
|
243
|
-
end
|
|
244
|
-
|
|
245
|
-
it 'defaults to single language detection' do
|
|
246
|
-
config = described_class.new
|
|
247
|
-
|
|
248
|
-
expect(config.detect_multiple).to be false
|
|
249
|
-
end
|
|
250
|
-
|
|
251
|
-
it 'can be disabled when enabled is true' do
|
|
252
|
-
config = described_class.new(enabled: true, detect_multiple: false)
|
|
253
|
-
|
|
254
|
-
expect(config.enabled).to be true
|
|
255
|
-
expect(config.detect_multiple).to be false
|
|
256
|
-
end
|
|
257
|
-
end
|
|
258
|
-
end
|
|
@@ -1,171 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
RSpec.describe Kreuzberg::Config::OCR do
|
|
4
|
-
describe '#initialize' do
|
|
5
|
-
it 'creates config with default values' do
|
|
6
|
-
config = described_class.new
|
|
7
|
-
|
|
8
|
-
expect(config.backend).to eq 'tesseract'
|
|
9
|
-
expect(config.language).to eq 'eng'
|
|
10
|
-
expect(config.tesseract_config).to be_nil
|
|
11
|
-
end
|
|
12
|
-
|
|
13
|
-
it 'creates config with custom string values' do
|
|
14
|
-
config = described_class.new(
|
|
15
|
-
backend: 'easyocr',
|
|
16
|
-
language: 'fra'
|
|
17
|
-
)
|
|
18
|
-
|
|
19
|
-
expect(config.backend).to eq 'easyocr'
|
|
20
|
-
expect(config.language).to eq 'fra'
|
|
21
|
-
end
|
|
22
|
-
|
|
23
|
-
it 'converts symbol keys to strings' do
|
|
24
|
-
config = described_class.new(backend: :tesseract, language: :deu)
|
|
25
|
-
|
|
26
|
-
expect(config.backend).to eq 'tesseract'
|
|
27
|
-
expect(config.language).to eq 'deu'
|
|
28
|
-
end
|
|
29
|
-
|
|
30
|
-
it 'accepts tesseract_config as instance' do
|
|
31
|
-
tesseract = Kreuzberg::Config::Tesseract.new(options: 'value')
|
|
32
|
-
config = described_class.new(tesseract_config: tesseract)
|
|
33
|
-
|
|
34
|
-
expect(config.tesseract_config).to be_a Kreuzberg::Config::Tesseract
|
|
35
|
-
end
|
|
36
|
-
|
|
37
|
-
it 'converts tesseract_config hash to instance' do
|
|
38
|
-
config = described_class.new(tesseract_config: { option: 'value' })
|
|
39
|
-
|
|
40
|
-
expect(config.tesseract_config).to be_a Kreuzberg::Config::Tesseract
|
|
41
|
-
end
|
|
42
|
-
end
|
|
43
|
-
|
|
44
|
-
describe '#to_h' do
|
|
45
|
-
it 'serializes to hash with default values' do
|
|
46
|
-
config = described_class.new
|
|
47
|
-
hash = config.to_h
|
|
48
|
-
|
|
49
|
-
expect(hash).to be_a Hash
|
|
50
|
-
expect(hash[:backend]).to eq 'tesseract'
|
|
51
|
-
expect(hash[:language]).to eq 'eng'
|
|
52
|
-
expect(hash[:tesseract_config]).to be_nil
|
|
53
|
-
end
|
|
54
|
-
|
|
55
|
-
it 'includes tesseract_config in hash when present' do
|
|
56
|
-
config = described_class.new(
|
|
57
|
-
backend: 'tesseract',
|
|
58
|
-
tesseract_config: { dpi: 300 }
|
|
59
|
-
)
|
|
60
|
-
hash = config.to_h
|
|
61
|
-
|
|
62
|
-
expect(hash[:tesseract_config]).to be_a Hash
|
|
63
|
-
end
|
|
64
|
-
|
|
65
|
-
it 'compacts nil values from hash' do
|
|
66
|
-
config = described_class.new(backend: 'tesseract')
|
|
67
|
-
hash = config.to_h
|
|
68
|
-
|
|
69
|
-
expect(hash.key?(:tesseract_config)).to be false
|
|
70
|
-
end
|
|
71
|
-
end
|
|
72
|
-
|
|
73
|
-
describe 'validation' do
|
|
74
|
-
it 'accepts valid backends' do
|
|
75
|
-
expect do
|
|
76
|
-
described_class.new(backend: 'tesseract')
|
|
77
|
-
end.not_to raise_error
|
|
78
|
-
end
|
|
79
|
-
|
|
80
|
-
it 'accepts symbol language' do
|
|
81
|
-
expect do
|
|
82
|
-
described_class.new(language: :fra)
|
|
83
|
-
end.not_to raise_error
|
|
84
|
-
end
|
|
85
|
-
|
|
86
|
-
it 'raises error for invalid tesseract_config type' do
|
|
87
|
-
expect do
|
|
88
|
-
described_class.new(tesseract_config: 'invalid')
|
|
89
|
-
end.to raise_error ArgumentError, /Expected.*Tesseract.*Hash.*nil/
|
|
90
|
-
end
|
|
91
|
-
end
|
|
92
|
-
|
|
93
|
-
describe 'keyword arguments' do
|
|
94
|
-
it 'accepts keyword arguments only' do
|
|
95
|
-
config = described_class.new(backend: 'tesseract', language: 'eng')
|
|
96
|
-
|
|
97
|
-
expect(config.backend).to eq 'tesseract'
|
|
98
|
-
expect(config.language).to eq 'eng'
|
|
99
|
-
end
|
|
100
|
-
|
|
101
|
-
it 'ignores unknown keywords gracefully' do
|
|
102
|
-
# This test documents current behavior
|
|
103
|
-
# The initialize method doesn't explicitly reject unknown keys
|
|
104
|
-
config = described_class.new(backend: 'tesseract')
|
|
105
|
-
expect(config).to be_a described_class
|
|
106
|
-
end
|
|
107
|
-
end
|
|
108
|
-
|
|
109
|
-
describe 'equality' do
|
|
110
|
-
it 'compares configs by value' do
|
|
111
|
-
config1 = described_class.new(backend: 'tesseract', language: 'eng')
|
|
112
|
-
config2 = described_class.new(backend: 'tesseract', language: 'eng')
|
|
113
|
-
|
|
114
|
-
expect(config1.backend).to eq config2.backend
|
|
115
|
-
expect(config1.language).to eq config2.language
|
|
116
|
-
end
|
|
117
|
-
|
|
118
|
-
it 'detects differences in backend' do
|
|
119
|
-
config1 = described_class.new(backend: 'tesseract')
|
|
120
|
-
config2 = described_class.new(backend: 'easyocr')
|
|
121
|
-
|
|
122
|
-
expect(config1.backend).not_to eq config2.backend
|
|
123
|
-
end
|
|
124
|
-
|
|
125
|
-
it 'detects differences in language' do
|
|
126
|
-
config1 = described_class.new(language: 'eng')
|
|
127
|
-
config2 = described_class.new(language: 'fra')
|
|
128
|
-
|
|
129
|
-
expect(config1.language).not_to eq config2.language
|
|
130
|
-
end
|
|
131
|
-
end
|
|
132
|
-
|
|
133
|
-
describe 'nested config integration' do
|
|
134
|
-
it 'integrates with Extraction config' do
|
|
135
|
-
ocr_config = described_class.new(backend: 'tesseract', language: 'deu')
|
|
136
|
-
extraction = Kreuzberg::Config::Extraction.new(ocr: ocr_config)
|
|
137
|
-
|
|
138
|
-
expect(extraction.ocr).to be_a described_class
|
|
139
|
-
expect(extraction.ocr.backend).to eq 'tesseract'
|
|
140
|
-
expect(extraction.ocr.language).to eq 'deu'
|
|
141
|
-
end
|
|
142
|
-
|
|
143
|
-
it 'accepts hash in Extraction config and converts to instance' do
|
|
144
|
-
extraction = Kreuzberg::Config::Extraction.new(
|
|
145
|
-
ocr: { backend: 'easyocr', language: 'fra' }
|
|
146
|
-
)
|
|
147
|
-
|
|
148
|
-
expect(extraction.ocr).to be_a described_class
|
|
149
|
-
expect(extraction.ocr.backend).to eq 'easyocr'
|
|
150
|
-
end
|
|
151
|
-
end
|
|
152
|
-
|
|
153
|
-
describe 'symbol vs string key handling' do
|
|
154
|
-
it 'converts symbol keys to correct attributes' do
|
|
155
|
-
config = described_class.new(backend: :tesseract, language: :fra)
|
|
156
|
-
|
|
157
|
-
expect(config.backend).to eq 'tesseract'
|
|
158
|
-
expect(config.language).to eq 'fra'
|
|
159
|
-
end
|
|
160
|
-
|
|
161
|
-
it 'handles mixed symbol and string values' do
|
|
162
|
-
config = described_class.new(
|
|
163
|
-
backend: 'tesseract',
|
|
164
|
-
language: :eng
|
|
165
|
-
)
|
|
166
|
-
|
|
167
|
-
expect(config.backend).to eq 'tesseract'
|
|
168
|
-
expect(config.language).to eq 'eng'
|
|
169
|
-
end
|
|
170
|
-
end
|
|
171
|
-
end
|
|
@@ -1,221 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
RSpec.describe Kreuzberg::Config::PageConfig do
|
|
4
|
-
describe '#initialize' do
|
|
5
|
-
it 'creates config with default values' do
|
|
6
|
-
config = described_class.new
|
|
7
|
-
|
|
8
|
-
expect(config.extract_pages).to be false
|
|
9
|
-
expect(config.insert_page_markers).to be false
|
|
10
|
-
expect(config.marker_format).to eq "\n\n<!-- PAGE {page_num} -->\n\n"
|
|
11
|
-
end
|
|
12
|
-
|
|
13
|
-
it 'creates config with custom values' do
|
|
14
|
-
config = described_class.new(
|
|
15
|
-
extract_pages: true,
|
|
16
|
-
insert_page_markers: true,
|
|
17
|
-
marker_format: '--- PAGE {page_num} ---'
|
|
18
|
-
)
|
|
19
|
-
|
|
20
|
-
expect(config.extract_pages).to be true
|
|
21
|
-
expect(config.insert_page_markers).to be true
|
|
22
|
-
expect(config.marker_format).to eq '--- PAGE {page_num} ---'
|
|
23
|
-
end
|
|
24
|
-
|
|
25
|
-
it 'converts boolean values' do
|
|
26
|
-
config = described_class.new(
|
|
27
|
-
extract_pages: true,
|
|
28
|
-
insert_page_markers: false
|
|
29
|
-
)
|
|
30
|
-
|
|
31
|
-
expect(config.extract_pages).to be true
|
|
32
|
-
expect(config.insert_page_markers).to be false
|
|
33
|
-
end
|
|
34
|
-
|
|
35
|
-
it 'converts marker_format to string' do
|
|
36
|
-
config = described_class.new(marker_format: :default)
|
|
37
|
-
|
|
38
|
-
expect(config.marker_format).to be_a String
|
|
39
|
-
end
|
|
40
|
-
end
|
|
41
|
-
|
|
42
|
-
describe '#to_h' do
|
|
43
|
-
it 'serializes to hash with all values' do
|
|
44
|
-
config = described_class.new(extract_pages: true)
|
|
45
|
-
hash = config.to_h
|
|
46
|
-
|
|
47
|
-
expect(hash).to be_a Hash
|
|
48
|
-
expect(hash[:extract_pages]).to be true
|
|
49
|
-
expect(hash[:insert_page_markers]).to be false
|
|
50
|
-
expect(hash[:marker_format]).to eq "\n\n<!-- PAGE {page_num} -->\n\n"
|
|
51
|
-
end
|
|
52
|
-
|
|
53
|
-
it 'always includes all keys in hash' do
|
|
54
|
-
config = described_class.new
|
|
55
|
-
hash = config.to_h
|
|
56
|
-
|
|
57
|
-
expect(hash.keys).to contain_exactly(
|
|
58
|
-
:extract_pages,
|
|
59
|
-
:insert_page_markers,
|
|
60
|
-
:marker_format
|
|
61
|
-
)
|
|
62
|
-
end
|
|
63
|
-
end
|
|
64
|
-
|
|
65
|
-
describe 'validation' do
|
|
66
|
-
it 'accepts boolean extract_pages' do
|
|
67
|
-
expect do
|
|
68
|
-
described_class.new(extract_pages: true)
|
|
69
|
-
end.not_to raise_error
|
|
70
|
-
end
|
|
71
|
-
|
|
72
|
-
it 'accepts boolean insert_page_markers' do
|
|
73
|
-
expect do
|
|
74
|
-
described_class.new(insert_page_markers: true)
|
|
75
|
-
end.not_to raise_error
|
|
76
|
-
end
|
|
77
|
-
|
|
78
|
-
it 'accepts custom marker formats' do
|
|
79
|
-
expect do
|
|
80
|
-
described_class.new(marker_format: '===== PAGE {page_num} =====')
|
|
81
|
-
end.not_to raise_error
|
|
82
|
-
end
|
|
83
|
-
end
|
|
84
|
-
|
|
85
|
-
describe 'keyword arguments' do
|
|
86
|
-
it 'accepts all keyword arguments' do
|
|
87
|
-
config = described_class.new(
|
|
88
|
-
extract_pages: true,
|
|
89
|
-
insert_page_markers: true,
|
|
90
|
-
marker_format: 'Page: {page_num}'
|
|
91
|
-
)
|
|
92
|
-
|
|
93
|
-
expect(config.extract_pages).to be true
|
|
94
|
-
expect(config.insert_page_markers).to be true
|
|
95
|
-
expect(config.marker_format).to eq 'Page: {page_num}'
|
|
96
|
-
end
|
|
97
|
-
end
|
|
98
|
-
|
|
99
|
-
describe 'equality' do
|
|
100
|
-
it 'compares configs by value' do
|
|
101
|
-
config1 = described_class.new(
|
|
102
|
-
extract_pages: true,
|
|
103
|
-
insert_page_markers: true,
|
|
104
|
-
marker_format: '--- PAGE {page_num} ---'
|
|
105
|
-
)
|
|
106
|
-
config2 = described_class.new(
|
|
107
|
-
extract_pages: true,
|
|
108
|
-
insert_page_markers: true,
|
|
109
|
-
marker_format: '--- PAGE {page_num} ---'
|
|
110
|
-
)
|
|
111
|
-
|
|
112
|
-
expect(config1.extract_pages).to eq config2.extract_pages
|
|
113
|
-
expect(config1.insert_page_markers).to eq config2.insert_page_markers
|
|
114
|
-
expect(config1.marker_format).to eq config2.marker_format
|
|
115
|
-
end
|
|
116
|
-
|
|
117
|
-
it 'detects differences in extract_pages' do
|
|
118
|
-
config1 = described_class.new(extract_pages: true)
|
|
119
|
-
config2 = described_class.new(extract_pages: false)
|
|
120
|
-
|
|
121
|
-
expect(config1.extract_pages).not_to eq config2.extract_pages
|
|
122
|
-
end
|
|
123
|
-
|
|
124
|
-
it 'detects differences in marker_format' do
|
|
125
|
-
config1 = described_class.new(marker_format: 'Format A')
|
|
126
|
-
config2 = described_class.new(marker_format: 'Format B')
|
|
127
|
-
|
|
128
|
-
expect(config1.marker_format).not_to eq config2.marker_format
|
|
129
|
-
end
|
|
130
|
-
end
|
|
131
|
-
|
|
132
|
-
describe 'nested config integration' do
|
|
133
|
-
it 'can be nested in Extraction config' do
|
|
134
|
-
pages = described_class.new(extract_pages: true)
|
|
135
|
-
extraction = Kreuzberg::Config::Extraction.new(pages: pages)
|
|
136
|
-
|
|
137
|
-
expect(extraction.pages).to be_a described_class
|
|
138
|
-
expect(extraction.pages.extract_pages).to be true
|
|
139
|
-
end
|
|
140
|
-
|
|
141
|
-
it 'accepts hash in Extraction config' do
|
|
142
|
-
extraction = Kreuzberg::Config::Extraction.new(
|
|
143
|
-
pages: { extract_pages: true, insert_page_markers: true }
|
|
144
|
-
)
|
|
145
|
-
|
|
146
|
-
expect(extraction.pages).to be_a described_class
|
|
147
|
-
expect(extraction.pages.extract_pages).to be true
|
|
148
|
-
expect(extraction.pages.insert_page_markers).to be true
|
|
149
|
-
end
|
|
150
|
-
end
|
|
151
|
-
|
|
152
|
-
describe 'marker format' do
|
|
153
|
-
it 'preserves custom marker format' do
|
|
154
|
-
format = '=== PAGE {page_num} ==='
|
|
155
|
-
config = described_class.new(marker_format: format)
|
|
156
|
-
|
|
157
|
-
expect(config.marker_format).to eq format
|
|
158
|
-
end
|
|
159
|
-
|
|
160
|
-
it 'preserves default marker format' do
|
|
161
|
-
config = described_class.new
|
|
162
|
-
|
|
163
|
-
expect(config.marker_format).to include '{page_num}'
|
|
164
|
-
end
|
|
165
|
-
|
|
166
|
-
it 'allows empty marker format' do
|
|
167
|
-
config = described_class.new(marker_format: '')
|
|
168
|
-
|
|
169
|
-
expect(config.marker_format).to eq ''
|
|
170
|
-
end
|
|
171
|
-
|
|
172
|
-
it 'handles multiline marker formats' do
|
|
173
|
-
format = "\n--- PAGE {page_num} ---\n"
|
|
174
|
-
config = described_class.new(marker_format: format)
|
|
175
|
-
|
|
176
|
-
expect(config.marker_format).to eq format
|
|
177
|
-
end
|
|
178
|
-
end
|
|
179
|
-
|
|
180
|
-
describe 'symbol vs string key handling' do
|
|
181
|
-
it 'converts symbol values to strings' do
|
|
182
|
-
config = described_class.new(marker_format: :default_format)
|
|
183
|
-
|
|
184
|
-
expect(config.marker_format).to be_a String
|
|
185
|
-
end
|
|
186
|
-
|
|
187
|
-
it 'preserves string marker format' do
|
|
188
|
-
format = 'Custom Format'
|
|
189
|
-
config = described_class.new(marker_format: format)
|
|
190
|
-
|
|
191
|
-
expect(config.marker_format).to eq format
|
|
192
|
-
expect(config.marker_format).to be_a String
|
|
193
|
-
end
|
|
194
|
-
end
|
|
195
|
-
|
|
196
|
-
describe 'boolean conversion' do
|
|
197
|
-
it 'converts truthy extract_pages to true' do
|
|
198
|
-
config = described_class.new(extract_pages: 1)
|
|
199
|
-
|
|
200
|
-
expect(config.extract_pages).to be true
|
|
201
|
-
end
|
|
202
|
-
|
|
203
|
-
it 'converts false extract_pages to false' do
|
|
204
|
-
config = described_class.new(extract_pages: false)
|
|
205
|
-
|
|
206
|
-
expect(config.extract_pages).to be false
|
|
207
|
-
end
|
|
208
|
-
|
|
209
|
-
it 'converts truthy insert_page_markers to true' do
|
|
210
|
-
config = described_class.new(insert_page_markers: 'yes')
|
|
211
|
-
|
|
212
|
-
expect(config.insert_page_markers).to be true
|
|
213
|
-
end
|
|
214
|
-
|
|
215
|
-
it 'converts false insert_page_markers to false' do
|
|
216
|
-
config = described_class.new(insert_page_markers: false)
|
|
217
|
-
|
|
218
|
-
expect(config.insert_page_markers).to be false
|
|
219
|
-
end
|
|
220
|
-
end
|
|
221
|
-
end
|