kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +28 -116
- data/README.md +269 -629
- data/Rakefile +0 -9
- data/Steepfile +4 -8
- data/examples/async_patterns.rb +58 -1
- data/ext/kreuzberg_rb/extconf.rb +5 -35
- data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
- data/ext/kreuzberg_rb/native/build.rs +14 -12
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
- data/extconf.rb +6 -38
- data/kreuzberg.gemspec +20 -114
- data/lib/kreuzberg/api_proxy.rb +18 -2
- data/lib/kreuzberg/cache_api.rb +0 -22
- data/lib/kreuzberg/cli.rb +10 -2
- data/lib/kreuzberg/cli_proxy.rb +10 -0
- data/lib/kreuzberg/config.rb +22 -274
- data/lib/kreuzberg/errors.rb +7 -73
- data/lib/kreuzberg/extraction_api.rb +8 -237
- data/lib/kreuzberg/mcp_proxy.rb +11 -2
- data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
- data/lib/kreuzberg/post_processor_protocol.rb +71 -0
- data/lib/kreuzberg/result.rb +33 -151
- data/lib/kreuzberg/setup_lib_path.rb +2 -22
- data/lib/kreuzberg/validator_protocol.rb +73 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +13 -27
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +12 -105
- data/spec/binding/cache_spec.rb +22 -22
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/binding/config_spec.rb +0 -74
- data/spec/binding/config_validation_spec.rb +6 -100
- data/spec/binding/error_handling_spec.rb +97 -283
- data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
- data/spec/binding/plugins/postprocessor_spec.rb +11 -11
- data/spec/binding/plugins/validator_spec.rb +13 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +1 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/fixtures/invalid_config.toml +1 -0
- data/spec/smoke/package_spec.rb +3 -2
- data/spec/spec_helper.rb +3 -1
- data/vendor/kreuzberg/Cargo.toml +67 -192
- data/vendor/kreuzberg/README.md +9 -97
- data/vendor/kreuzberg/build.rs +194 -516
- data/vendor/kreuzberg/src/api/handlers.rs +9 -130
- data/vendor/kreuzberg/src/api/mod.rs +3 -18
- data/vendor/kreuzberg/src/api/server.rs +71 -236
- data/vendor/kreuzberg/src/api/types.rs +7 -43
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/config.rs +23 -905
- data/vendor/kreuzberg/src/core/extractor.rs +106 -403
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +3 -22
- data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
- data/vendor/kreuzberg/src/embeddings.rs +21 -169
- data/vendor/kreuzberg/src/error.rs +2 -2
- data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
- data/vendor/kreuzberg/src/extraction/email.rs +11 -12
- data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
- data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
- data/vendor/kreuzberg/src/extraction/image.rs +14 -138
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
- data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
- data/vendor/kreuzberg/src/extraction/table.rs +1 -2
- data/vendor/kreuzberg/src/extraction/text.rs +10 -18
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
- data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
- data/vendor/kreuzberg/src/extractors/email.rs +9 -37
- data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
- data/vendor/kreuzberg/src/extractors/html.rs +173 -182
- data/vendor/kreuzberg/src/extractors/image.rs +8 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
- data/vendor/kreuzberg/src/extractors/text.rs +7 -30
- data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
- data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
- data/vendor/kreuzberg/src/lib.rs +5 -17
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
- data/vendor/kreuzberg/src/mcp/server.rs +21 -145
- data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
- data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
- data/vendor/kreuzberg/src/pdf/error.rs +1 -93
- data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
- data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
- data/vendor/kreuzberg/src/pdf/table.rs +64 -61
- data/vendor/kreuzberg/src/pdf/text.rs +24 -416
- data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
- data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
- data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/mod.rs +0 -8
- data/vendor/kreuzberg/src/text/quality.rs +15 -28
- data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
- data/vendor/kreuzberg/src/types.rs +67 -907
- data/vendor/kreuzberg/src/utils/mod.rs +0 -14
- data/vendor/kreuzberg/src/utils/quality.rs +3 -12
- data/vendor/kreuzberg/tests/api_tests.rs +0 -506
- data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
- data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
- data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
- data/vendor/kreuzberg/tests/config_features.rs +1 -33
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
- data/vendor/kreuzberg/tests/core_integration.rs +9 -35
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/email_integration.rs +1 -3
- data/vendor/kreuzberg/tests/error_handling.rs +34 -43
- data/vendor/kreuzberg/tests/format_integration.rs +1 -7
- data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
- data/vendor/kreuzberg/tests/image_integration.rs +0 -2
- data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
- data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
- data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
- data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
- data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
- data/vendor/kreuzberg/tests/security_validation.rs +1 -13
- data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
- metadata +25 -171
- data/.rubocop.yml +0 -543
- data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
- data/lib/kreuzberg/error_context.rb +0 -136
- data/lib/kreuzberg/types.rb +0 -170
- data/lib/libpdfium.so +0 -0
- data/spec/binding/async_operations_spec.rb +0 -473
- data/spec/binding/batch_operations_spec.rb +0 -595
- data/spec/binding/batch_spec.rb +0 -359
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/embeddings_spec.rb +0 -816
- data/spec/binding/error_recovery_spec.rb +0 -488
- data/spec/binding/font_config_spec.rb +0 -220
- data/spec/binding/images_spec.rb +0 -738
- data/spec/binding/keywords_extraction_spec.rb +0 -600
- data/spec/binding/metadata_types_spec.rb +0 -1228
- data/spec/binding/pages_extraction_spec.rb +0 -471
- data/spec/binding/tables_spec.rb +0 -641
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -438
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
- data/test/metadata_types_test.rb +0 -959
- data/vendor/Cargo.toml +0 -61
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/formats.rs +0 -235
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
- data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
- data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
- data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
- data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
- data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
- data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
- data/vendor/kreuzberg/src/utils/pool.rs +0 -503
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
- data/vendor/kreuzberg/tests/api_embed.rs +0 -360
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/page_markers.rs +0 -297
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
- data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
- data/vendor/kreuzberg-ffi/build.rs +0 -168
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
- data/vendor/kreuzberg-ffi/src/error.rs +0 -901
- data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
- data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
- data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
- data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
- data/vendor/kreuzberg-ffi/src/result.rs +0 -510
- data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
- data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
- data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
- data/vendor/kreuzberg-ffi/src/types.rs +0 -363
- data/vendor/kreuzberg-ffi/src/util.rs +0 -210
- data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
- data/vendor/kreuzberg-tesseract/LICENSE +0 -22
- data/vendor/kreuzberg-tesseract/README.md +0 -399
- data/vendor/kreuzberg-tesseract/build.rs +0 -1127
- data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
data/sig/kreuzberg.rbs
CHANGED
|
@@ -3,16 +3,6 @@
|
|
|
3
3
|
module Kreuzberg
|
|
4
4
|
VERSION: String
|
|
5
5
|
|
|
6
|
-
# Error code constants
|
|
7
|
-
ERROR_CODE_SUCCESS: Integer
|
|
8
|
-
ERROR_CODE_GENERIC: Integer
|
|
9
|
-
ERROR_CODE_PANIC: Integer
|
|
10
|
-
ERROR_CODE_INVALID_ARGUMENT: Integer
|
|
11
|
-
ERROR_CODE_IO: Integer
|
|
12
|
-
ERROR_CODE_PARSING: Integer
|
|
13
|
-
ERROR_CODE_OCR: Integer
|
|
14
|
-
ERROR_CODE_MISSING_DEPENDENCY: Integer
|
|
15
|
-
|
|
16
6
|
# Config namespace (defined in lib/kreuzberg/config.rb)
|
|
17
7
|
module Config
|
|
18
8
|
class OCR
|
|
@@ -74,21 +64,12 @@ module Kreuzberg
|
|
|
74
64
|
def to_h: () -> Hash[Symbol, untyped]
|
|
75
65
|
end
|
|
76
66
|
|
|
77
|
-
class FontConfig
|
|
78
|
-
attr_accessor enabled: bool
|
|
79
|
-
attr_accessor custom_font_dirs: Array[String]?
|
|
80
|
-
|
|
81
|
-
def initialize: (?enabled: bool, ?custom_font_dirs: Array[String]?) -> void
|
|
82
|
-
def to_h: () -> Hash[Symbol, untyped]
|
|
83
|
-
end
|
|
84
|
-
|
|
85
67
|
class PDF
|
|
86
68
|
attr_reader extract_images: bool
|
|
87
69
|
attr_reader passwords: Array[String]?
|
|
88
70
|
attr_reader extract_metadata: bool
|
|
89
|
-
attr_reader font_config: FontConfig?
|
|
90
71
|
|
|
91
|
-
def initialize: (?extract_images: bool, ?passwords: (Array[String] | String)?, ?extract_metadata: bool
|
|
72
|
+
def initialize: (?extract_images: bool, ?passwords: (Array[String] | String)?, ?extract_metadata: bool) -> void
|
|
92
73
|
def to_h: () -> Hash[Symbol, untyped]
|
|
93
74
|
end
|
|
94
75
|
|
|
@@ -177,15 +158,6 @@ module Kreuzberg
|
|
|
177
158
|
def to_h: () -> Hash[Symbol, untyped]
|
|
178
159
|
end
|
|
179
160
|
|
|
180
|
-
class PageConfig
|
|
181
|
-
attr_reader extract_pages: bool
|
|
182
|
-
attr_reader insert_page_markers: bool
|
|
183
|
-
attr_reader marker_format: String
|
|
184
|
-
|
|
185
|
-
def initialize: (?extract_pages: bool, ?insert_page_markers: bool, ?marker_format: String) -> void
|
|
186
|
-
def to_h: () -> Hash[Symbol, untyped]
|
|
187
|
-
end
|
|
188
|
-
|
|
189
161
|
class Extraction
|
|
190
162
|
attr_reader use_cache: bool
|
|
191
163
|
attr_reader enable_quality_processing: bool
|
|
@@ -200,7 +172,6 @@ module Kreuzberg
|
|
|
200
172
|
attr_reader token_reduction: TokenReduction?
|
|
201
173
|
attr_reader keywords: Keywords?
|
|
202
174
|
attr_reader html_options: HtmlOptions?
|
|
203
|
-
attr_reader pages: PageConfig?
|
|
204
175
|
attr_reader max_concurrent_extractions: Integer?
|
|
205
176
|
|
|
206
177
|
def self.from_file: (String path) -> Extraction
|
|
@@ -218,7 +189,6 @@ module Kreuzberg
|
|
|
218
189
|
?token_reduction: (TokenReduction | Hash[Symbol, untyped])?,
|
|
219
190
|
?keywords: (Keywords | Hash[Symbol, untyped])?,
|
|
220
191
|
?html_options: (HtmlOptions | Hash[Symbol, untyped])?,
|
|
221
|
-
?pages: (PageConfig | Hash[Symbol, untyped])?,
|
|
222
192
|
?max_concurrent_extractions: Integer?
|
|
223
193
|
) -> void
|
|
224
194
|
def to_h: () -> Hash[Symbol, untyped]
|
|
@@ -228,20 +198,13 @@ module Kreuzberg
|
|
|
228
198
|
def normalize_config: [T] (T | Hash[Symbol, untyped] | nil value, Class klass) -> T?
|
|
229
199
|
end
|
|
230
200
|
|
|
201
|
+
# Backwards compatibility alias
|
|
202
|
+
Ocr: singleton(OCR)
|
|
231
203
|
end
|
|
232
204
|
|
|
233
205
|
# Alias for Config::Extraction (for API consistency with other language bindings)
|
|
234
206
|
ExtractionConfig: singleton(Config::Extraction)
|
|
235
207
|
|
|
236
|
-
# Alias for Config::PageConfig (for API consistency with other language bindings)
|
|
237
|
-
PageConfig: singleton(Config::PageConfig)
|
|
238
|
-
|
|
239
|
-
# Keyword algorithm constants
|
|
240
|
-
module KeywordAlgorithm
|
|
241
|
-
YAKE: Symbol
|
|
242
|
-
RAKE: Symbol
|
|
243
|
-
end
|
|
244
|
-
|
|
245
208
|
# Extraction result type
|
|
246
209
|
type extraction_result_hash = {
|
|
247
210
|
content: String,
|
|
@@ -261,13 +224,11 @@ module Kreuzberg
|
|
|
261
224
|
|
|
262
225
|
type chunk_hash = {
|
|
263
226
|
content: String,
|
|
264
|
-
|
|
265
|
-
|
|
227
|
+
char_start: Integer,
|
|
228
|
+
char_end: Integer,
|
|
266
229
|
token_count: Integer?,
|
|
267
230
|
chunk_index: Integer?,
|
|
268
231
|
total_chunks: Integer?,
|
|
269
|
-
first_page: Integer?,
|
|
270
|
-
last_page: Integer?,
|
|
271
232
|
embedding: Array[Float]?
|
|
272
233
|
}
|
|
273
234
|
|
|
@@ -307,24 +268,20 @@ module Kreuzberg
|
|
|
307
268
|
# Text chunk
|
|
308
269
|
class Chunk
|
|
309
270
|
attr_reader content: String
|
|
310
|
-
attr_reader
|
|
311
|
-
attr_reader
|
|
271
|
+
attr_reader char_start: Integer
|
|
272
|
+
attr_reader char_end: Integer
|
|
312
273
|
attr_reader token_count: Integer?
|
|
313
274
|
attr_reader chunk_index: Integer?
|
|
314
275
|
attr_reader total_chunks: Integer?
|
|
315
|
-
attr_reader first_page: Integer?
|
|
316
|
-
attr_reader last_page: Integer?
|
|
317
276
|
attr_reader embedding: Array[Float]?
|
|
318
277
|
|
|
319
278
|
def initialize: (
|
|
320
279
|
content: String,
|
|
321
|
-
|
|
322
|
-
|
|
280
|
+
char_start: Integer,
|
|
281
|
+
char_end: Integer,
|
|
323
282
|
token_count: Integer?,
|
|
324
283
|
chunk_index: Integer?,
|
|
325
284
|
total_chunks: Integer?,
|
|
326
|
-
first_page: Integer?,
|
|
327
|
-
last_page: Integer?,
|
|
328
285
|
embedding: Array[Float]?
|
|
329
286
|
) -> void
|
|
330
287
|
def to_h: () -> chunk_hash
|
|
@@ -434,14 +391,6 @@ module Kreuzberg
|
|
|
434
391
|
# Config loading (native method)
|
|
435
392
|
def self._config_from_file_native: (String path) -> Hash[Symbol, untyped]
|
|
436
393
|
|
|
437
|
-
# Error introspection (native methods)
|
|
438
|
-
def self._last_error_code_native: () -> Integer
|
|
439
|
-
def self._last_panic_context_json_native: () -> String?
|
|
440
|
-
def self._get_error_details_native: () -> Hash[Symbol, untyped]
|
|
441
|
-
def self._classify_error_native: (String message) -> Integer
|
|
442
|
-
def self._error_code_name_native: (Integer code) -> String
|
|
443
|
-
def self._error_code_description_native: (Integer code) -> String
|
|
444
|
-
|
|
445
394
|
# Plugin registration
|
|
446
395
|
def self.register_post_processor: (String name, _PostProcessor processor, ?stage: Symbol?) -> void
|
|
447
396
|
def self.unregister_post_processor: (String name) -> void
|
|
@@ -464,67 +413,25 @@ module Kreuzberg
|
|
|
464
413
|
def extract_text: (String file_path_or_bytes, Hash[Symbol, untyped] config) -> String
|
|
465
414
|
end
|
|
466
415
|
|
|
467
|
-
module ErrorContext
|
|
468
|
-
def self.last_error_code: () -> Integer
|
|
469
|
-
def self.last_panic_context: () -> Errors::PanicContext?
|
|
470
|
-
def self.last_panic_context_json: () -> String?
|
|
471
|
-
def self.error_details: () -> Hash[Symbol, untyped]
|
|
472
|
-
def self.classify_error: (String message) -> Integer
|
|
473
|
-
def self.error_code_name: (Integer code) -> String
|
|
474
|
-
def self.error_code_description: (Integer code) -> String
|
|
475
|
-
end
|
|
476
|
-
|
|
477
416
|
module Errors
|
|
478
|
-
# Panic context information from FFI error introspection
|
|
479
|
-
class PanicContext
|
|
480
|
-
attr_reader file: String
|
|
481
|
-
attr_reader line: Integer
|
|
482
|
-
attr_reader function: String
|
|
483
|
-
attr_reader message: String
|
|
484
|
-
attr_reader timestamp_secs: Integer
|
|
485
|
-
|
|
486
|
-
def initialize: (
|
|
487
|
-
file: String,
|
|
488
|
-
line: Integer,
|
|
489
|
-
function: String,
|
|
490
|
-
message: String,
|
|
491
|
-
timestamp_secs: Integer
|
|
492
|
-
) -> void
|
|
493
|
-
def to_s: () -> String
|
|
494
|
-
def to_h: () -> Hash[Symbol, String | Integer]
|
|
495
|
-
def self.from_json: (String) -> PanicContext?
|
|
496
|
-
|
|
497
|
-
private
|
|
498
|
-
|
|
499
|
-
def self.with_defaults: (Hash[Symbol, untyped] sliced) -> {file: String, line: Integer, function: String, message: String, timestamp_secs: Integer}
|
|
500
|
-
end
|
|
501
|
-
|
|
502
417
|
class Error < StandardError
|
|
503
|
-
attr_reader panic_context: PanicContext?
|
|
504
|
-
attr_reader error_code: Integer?
|
|
505
|
-
|
|
506
|
-
def initialize: (String message, ?panic_context: PanicContext?, ?error_code: Integer?) -> void
|
|
507
418
|
end
|
|
508
419
|
|
|
509
420
|
class ValidationError < Error
|
|
510
421
|
end
|
|
511
422
|
|
|
512
423
|
class ParsingError < Error
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
def initialize: (String message, ?context: Hash[untyped, untyped]?, ?panic_context: PanicContext?, ?error_code: Integer?) -> void
|
|
424
|
+
def initialize: (String message, ?context: Hash[untyped, untyped]?) -> void
|
|
516
425
|
end
|
|
517
426
|
|
|
518
427
|
class OCRError < Error
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
def initialize: (String message, ?context: Hash[untyped, untyped]?, ?panic_context: PanicContext?, ?error_code: Integer?) -> void
|
|
428
|
+
def initialize: (String message, ?context: Hash[untyped, untyped]?) -> void
|
|
522
429
|
end
|
|
523
430
|
|
|
524
431
|
class MissingDependencyError < Error
|
|
525
432
|
attr_reader dependency: String?
|
|
526
433
|
|
|
527
|
-
def initialize: (String message, ?dependency: String
|
|
434
|
+
def initialize: (String message, ?dependency: String?) -> void
|
|
528
435
|
end
|
|
529
436
|
|
|
530
437
|
class IOError < Error
|
data/spec/binding/cache_spec.rb
CHANGED
|
@@ -19,8 +19,8 @@ RSpec.describe 'Cache Management' do
|
|
|
19
19
|
|
|
20
20
|
describe 'clear_cache' do
|
|
21
21
|
it 'removes all cached results' do
|
|
22
|
-
Kreuzberg.extract_file_sync(
|
|
23
|
-
Kreuzberg.extract_file_sync(
|
|
22
|
+
Kreuzberg.extract_file_sync(test_pdf)
|
|
23
|
+
Kreuzberg.extract_file_sync(test_text)
|
|
24
24
|
|
|
25
25
|
stats_before = Kreuzberg.cache_stats
|
|
26
26
|
expect(stats_before['total_entries']).to be_positive
|
|
@@ -47,10 +47,10 @@ RSpec.describe 'Cache Management' do
|
|
|
47
47
|
end
|
|
48
48
|
|
|
49
49
|
it 'does not affect future extractions' do
|
|
50
|
-
Kreuzberg.extract_file_sync(
|
|
50
|
+
Kreuzberg.extract_file_sync(test_pdf)
|
|
51
51
|
Kreuzberg.clear_cache
|
|
52
52
|
|
|
53
|
-
result = Kreuzberg.extract_file_sync(
|
|
53
|
+
result = Kreuzberg.extract_file_sync(test_pdf)
|
|
54
54
|
|
|
55
55
|
expect(result).to be_a(Kreuzberg::Result)
|
|
56
56
|
expect(result.content).not_to be_empty
|
|
@@ -77,7 +77,7 @@ RSpec.describe 'Cache Management' do
|
|
|
77
77
|
it 'shows entries after extractions' do
|
|
78
78
|
Kreuzberg.clear_cache
|
|
79
79
|
|
|
80
|
-
Kreuzberg.extract_file_sync(
|
|
80
|
+
Kreuzberg.extract_file_sync(test_pdf)
|
|
81
81
|
stats = Kreuzberg.cache_stats
|
|
82
82
|
|
|
83
83
|
expect(stats['total_entries']).to be_positive
|
|
@@ -86,7 +86,7 @@ RSpec.describe 'Cache Management' do
|
|
|
86
86
|
it 'shows total size in bytes' do
|
|
87
87
|
Kreuzberg.clear_cache
|
|
88
88
|
|
|
89
|
-
Kreuzberg.extract_file_sync(
|
|
89
|
+
Kreuzberg.extract_file_sync(test_pdf)
|
|
90
90
|
stats = Kreuzberg.cache_stats
|
|
91
91
|
|
|
92
92
|
expect(stats['total_size_bytes']).to be_positive
|
|
@@ -95,10 +95,10 @@ RSpec.describe 'Cache Management' do
|
|
|
95
95
|
it 'increases stats with multiple extractions' do
|
|
96
96
|
Kreuzberg.clear_cache
|
|
97
97
|
|
|
98
|
-
Kreuzberg.extract_file_sync(
|
|
98
|
+
Kreuzberg.extract_file_sync(test_pdf)
|
|
99
99
|
stats_after_one = Kreuzberg.cache_stats
|
|
100
100
|
|
|
101
|
-
Kreuzberg.extract_file_sync(
|
|
101
|
+
Kreuzberg.extract_file_sync(test_text)
|
|
102
102
|
stats_after_two = Kreuzberg.cache_stats
|
|
103
103
|
|
|
104
104
|
expect(stats_after_two['total_entries']).to be >= stats_after_one['total_entries']
|
|
@@ -111,11 +111,11 @@ RSpec.describe 'Cache Management' do
|
|
|
111
111
|
stats_initial = Kreuzberg.cache_stats
|
|
112
112
|
expect(stats_initial['total_entries']).to eq(0)
|
|
113
113
|
|
|
114
|
-
result1 = Kreuzberg.extract_file_sync(
|
|
114
|
+
result1 = Kreuzberg.extract_file_sync(test_pdf)
|
|
115
115
|
stats_after_first = Kreuzberg.cache_stats
|
|
116
116
|
expect(stats_after_first['total_entries']).to be_positive
|
|
117
117
|
|
|
118
|
-
result2 = Kreuzberg.extract_file_sync(
|
|
118
|
+
result2 = Kreuzberg.extract_file_sync(test_pdf)
|
|
119
119
|
stats_after_second = Kreuzberg.cache_stats
|
|
120
120
|
|
|
121
121
|
expect(result1.content).to eq(result2.content)
|
|
@@ -125,10 +125,10 @@ RSpec.describe 'Cache Management' do
|
|
|
125
125
|
it 'tracks different files separately' do
|
|
126
126
|
Kreuzberg.clear_cache
|
|
127
127
|
|
|
128
|
-
Kreuzberg.extract_file_sync(
|
|
128
|
+
Kreuzberg.extract_file_sync(test_pdf)
|
|
129
129
|
stats_after_pdf = Kreuzberg.cache_stats
|
|
130
130
|
|
|
131
|
-
Kreuzberg.extract_file_sync(
|
|
131
|
+
Kreuzberg.extract_file_sync(test_text)
|
|
132
132
|
stats_after_text = Kreuzberg.cache_stats
|
|
133
133
|
|
|
134
134
|
expect(stats_after_text['total_entries']).to be >= stats_after_pdf['total_entries']
|
|
@@ -138,11 +138,11 @@ RSpec.describe 'Cache Management' do
|
|
|
138
138
|
Kreuzberg.clear_cache
|
|
139
139
|
|
|
140
140
|
Time.now
|
|
141
|
-
result1 = Kreuzberg.extract_file_sync(
|
|
141
|
+
result1 = Kreuzberg.extract_file_sync(test_pdf)
|
|
142
142
|
Time.now
|
|
143
143
|
|
|
144
144
|
Time.now
|
|
145
|
-
result2 = Kreuzberg.extract_file_sync(
|
|
145
|
+
result2 = Kreuzberg.extract_file_sync(test_pdf)
|
|
146
146
|
Time.now
|
|
147
147
|
|
|
148
148
|
expect(result1.content).to eq(result2.content)
|
|
@@ -150,11 +150,11 @@ RSpec.describe 'Cache Management' do
|
|
|
150
150
|
end
|
|
151
151
|
|
|
152
152
|
it 'clears cache between extractions when requested' do
|
|
153
|
-
result1 = Kreuzberg.extract_file_sync(
|
|
153
|
+
result1 = Kreuzberg.extract_file_sync(test_pdf)
|
|
154
154
|
|
|
155
155
|
Kreuzberg.clear_cache
|
|
156
156
|
|
|
157
|
-
result2 = Kreuzberg.extract_file_sync(
|
|
157
|
+
result2 = Kreuzberg.extract_file_sync(test_pdf)
|
|
158
158
|
|
|
159
159
|
expect(result1.content).to eq(result2.content)
|
|
160
160
|
end
|
|
@@ -167,10 +167,10 @@ RSpec.describe 'Cache Management' do
|
|
|
167
167
|
config1 = Kreuzberg::Config::Extraction.new(use_cache: true)
|
|
168
168
|
config2 = Kreuzberg::Config::Extraction.new(use_cache: false)
|
|
169
169
|
|
|
170
|
-
Kreuzberg.extract_file_sync(
|
|
170
|
+
Kreuzberg.extract_file_sync(test_pdf, config: config1)
|
|
171
171
|
stats_after_first = Kreuzberg.cache_stats
|
|
172
172
|
|
|
173
|
-
Kreuzberg.extract_file_sync(
|
|
173
|
+
Kreuzberg.extract_file_sync(test_pdf, config: config2)
|
|
174
174
|
stats_after_second = Kreuzberg.cache_stats
|
|
175
175
|
|
|
176
176
|
expect(stats_after_second['total_entries']).to eq(stats_after_first['total_entries'])
|
|
@@ -179,8 +179,8 @@ RSpec.describe 'Cache Management' do
|
|
|
179
179
|
|
|
180
180
|
describe 'cache stats consistency' do
|
|
181
181
|
it 'stats remain consistent after clear' do
|
|
182
|
-
Kreuzberg.extract_file_sync(
|
|
183
|
-
Kreuzberg.extract_file_sync(
|
|
182
|
+
Kreuzberg.extract_file_sync(test_pdf)
|
|
183
|
+
Kreuzberg.extract_file_sync(test_text)
|
|
184
184
|
|
|
185
185
|
Kreuzberg.clear_cache
|
|
186
186
|
stats = Kreuzberg.cache_stats
|
|
@@ -192,12 +192,12 @@ RSpec.describe 'Cache Management' do
|
|
|
192
192
|
it 'stats update correctly after new extractions' do
|
|
193
193
|
Kreuzberg.clear_cache
|
|
194
194
|
|
|
195
|
-
Kreuzberg.extract_file_sync(
|
|
195
|
+
Kreuzberg.extract_file_sync(test_pdf)
|
|
196
196
|
Kreuzberg.cache_stats
|
|
197
197
|
|
|
198
198
|
Kreuzberg.clear_cache
|
|
199
199
|
|
|
200
|
-
Kreuzberg.extract_file_sync(
|
|
200
|
+
Kreuzberg.extract_file_sync(test_text)
|
|
201
201
|
stats2 = Kreuzberg.cache_stats
|
|
202
202
|
|
|
203
203
|
expect(stats2['total_entries']).to be_positive
|
|
@@ -3,7 +3,8 @@
|
|
|
3
3
|
RSpec.describe Kreuzberg::CLIProxy do
|
|
4
4
|
describe '.find_cli_binary' do
|
|
5
5
|
context 'when binary exists' do
|
|
6
|
-
it 'finds the binary in search paths' do
|
|
6
|
+
it 'finds the binary in search paths', :skip do
|
|
7
|
+
# Skip in CI/test environments where binary might not be built
|
|
7
8
|
binary = described_class.find_cli_binary
|
|
8
9
|
expect(binary).to be_a(Pathname)
|
|
9
10
|
expect(binary.file?).to be true
|
|
@@ -24,8 +25,9 @@ RSpec.describe Kreuzberg::CLIProxy do
|
|
|
24
25
|
end
|
|
25
26
|
|
|
26
27
|
describe '.call' do
|
|
27
|
-
context 'when binary is available' do
|
|
28
|
+
context 'when binary is available', :skip do
|
|
28
29
|
it 'executes CLI command successfully' do
|
|
30
|
+
# Skip in environments without built binary
|
|
29
31
|
output = described_class.call(['--version'])
|
|
30
32
|
expect(output).to be_a(String)
|
|
31
33
|
expect(output).not_to be_empty
|
data/spec/binding/cli_spec.rb
CHANGED
|
@@ -1,35 +1,34 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
RSpec.describe Kreuzberg::CLI do
|
|
4
|
-
describe '.extract' do
|
|
4
|
+
describe '.extract', :skip do
|
|
5
5
|
it 'extracts content from a file' do
|
|
6
|
-
|
|
6
|
+
# Skip in environments without CLI binary
|
|
7
|
+
path = create_test_file('CLI test content')
|
|
7
8
|
output = described_class.extract(path)
|
|
8
9
|
|
|
9
10
|
expect(output).to be_a(String)
|
|
10
|
-
expect(output).
|
|
11
|
+
expect(output).to include('CLI test content')
|
|
11
12
|
end
|
|
12
13
|
|
|
13
14
|
it 'accepts output format option' do
|
|
14
|
-
path =
|
|
15
|
+
path = create_test_file('JSON output test')
|
|
15
16
|
output = described_class.extract(path, output: 'json')
|
|
16
17
|
|
|
17
18
|
expect(output).to be_a(String)
|
|
18
|
-
expect(output).not_to be_empty
|
|
19
19
|
end
|
|
20
20
|
|
|
21
21
|
it 'accepts OCR option' do
|
|
22
|
-
path =
|
|
23
|
-
output = described_class.extract(path, ocr:
|
|
22
|
+
path = create_test_file('OCR test')
|
|
23
|
+
output = described_class.extract(path, ocr: true)
|
|
24
24
|
|
|
25
25
|
expect(output).to be_a(String)
|
|
26
|
-
expect(output).not_to be_empty
|
|
27
26
|
end
|
|
28
27
|
end
|
|
29
28
|
|
|
30
|
-
describe '.detect' do
|
|
29
|
+
describe '.detect', :skip do
|
|
31
30
|
it 'detects MIME type' do
|
|
32
|
-
path =
|
|
31
|
+
path = create_test_file('MIME detection test')
|
|
33
32
|
mime_type = described_class.detect(path)
|
|
34
33
|
|
|
35
34
|
expect(mime_type).to be_a(String)
|
|
@@ -37,7 +36,7 @@ RSpec.describe Kreuzberg::CLI do
|
|
|
37
36
|
end
|
|
38
37
|
end
|
|
39
38
|
|
|
40
|
-
describe '.version' do
|
|
39
|
+
describe '.version', :skip do
|
|
41
40
|
it 'returns version string' do
|
|
42
41
|
version = described_class.version
|
|
43
42
|
expect(version).to be_a(String)
|
|
@@ -45,7 +44,7 @@ RSpec.describe Kreuzberg::CLI do
|
|
|
45
44
|
end
|
|
46
45
|
end
|
|
47
46
|
|
|
48
|
-
describe '.help' do
|
|
47
|
+
describe '.help', :skip do
|
|
49
48
|
it 'returns help text' do
|
|
50
49
|
help_text = described_class.help
|
|
51
50
|
expect(help_text).to be_a(String)
|
data/spec/binding/config_spec.rb
CHANGED
|
@@ -86,47 +86,6 @@ RSpec.describe Kreuzberg::Config do
|
|
|
86
86
|
end
|
|
87
87
|
end
|
|
88
88
|
|
|
89
|
-
describe Kreuzberg::Config::FontConfig do
|
|
90
|
-
it 'creates with default values' do
|
|
91
|
-
font_config = described_class.new
|
|
92
|
-
|
|
93
|
-
expect(font_config.enabled).to be true
|
|
94
|
-
expect(font_config.custom_font_dirs).to be_nil
|
|
95
|
-
end
|
|
96
|
-
|
|
97
|
-
it 'creates with custom values' do
|
|
98
|
-
dirs = ['/usr/share/fonts', '/home/user/.fonts']
|
|
99
|
-
font_config = described_class.new(
|
|
100
|
-
enabled: false,
|
|
101
|
-
custom_font_dirs: dirs
|
|
102
|
-
)
|
|
103
|
-
|
|
104
|
-
expect(font_config.enabled).to be false
|
|
105
|
-
expect(font_config.custom_font_dirs).to eq(dirs)
|
|
106
|
-
end
|
|
107
|
-
|
|
108
|
-
it 'converts to hash' do
|
|
109
|
-
dirs = ['/usr/share/fonts']
|
|
110
|
-
font_config = described_class.new(
|
|
111
|
-
enabled: true,
|
|
112
|
-
custom_font_dirs: dirs
|
|
113
|
-
)
|
|
114
|
-
hash = font_config.to_h
|
|
115
|
-
|
|
116
|
-
expect(hash).to be_a(Hash)
|
|
117
|
-
expect(hash[:enabled]).to be true
|
|
118
|
-
expect(hash[:custom_font_dirs]).to eq(dirs)
|
|
119
|
-
end
|
|
120
|
-
|
|
121
|
-
it 'compacts nil values in hash' do
|
|
122
|
-
font_config = described_class.new(enabled: true)
|
|
123
|
-
hash = font_config.to_h
|
|
124
|
-
|
|
125
|
-
expect(hash).to be_a(Hash)
|
|
126
|
-
expect(hash.key?(:custom_font_dirs)).to be false
|
|
127
|
-
end
|
|
128
|
-
end
|
|
129
|
-
|
|
130
89
|
describe Kreuzberg::Config::PDF do
|
|
131
90
|
it 'creates with default values' do
|
|
132
91
|
pdf = described_class.new
|
|
@@ -134,7 +93,6 @@ RSpec.describe Kreuzberg::Config do
|
|
|
134
93
|
expect(pdf.extract_images).to be false
|
|
135
94
|
expect(pdf.passwords).to be_nil
|
|
136
95
|
expect(pdf.extract_metadata).to be true
|
|
137
|
-
expect(pdf.font_config).to be_nil
|
|
138
96
|
end
|
|
139
97
|
|
|
140
98
|
it 'creates with custom values' do
|
|
@@ -147,23 +105,6 @@ RSpec.describe Kreuzberg::Config do
|
|
|
147
105
|
expect(pdf.passwords).to eq(%w[secret backup])
|
|
148
106
|
end
|
|
149
107
|
|
|
150
|
-
it 'creates with font_config as instance' do
|
|
151
|
-
font_config = Kreuzberg::Config::FontConfig.new(enabled: true)
|
|
152
|
-
pdf = described_class.new(font_config: font_config)
|
|
153
|
-
|
|
154
|
-
expect(pdf.font_config).to be_a(Kreuzberg::Config::FontConfig)
|
|
155
|
-
expect(pdf.font_config.enabled).to be true
|
|
156
|
-
end
|
|
157
|
-
|
|
158
|
-
it 'creates with font_config as hash' do
|
|
159
|
-
font_config_hash = { enabled: false, custom_font_dirs: ['/fonts'] }
|
|
160
|
-
pdf = described_class.new(font_config: font_config_hash)
|
|
161
|
-
|
|
162
|
-
expect(pdf.font_config).to be_a(Kreuzberg::Config::FontConfig)
|
|
163
|
-
expect(pdf.font_config.enabled).to be false
|
|
164
|
-
expect(pdf.font_config.custom_font_dirs).to eq(['/fonts'])
|
|
165
|
-
end
|
|
166
|
-
|
|
167
108
|
it 'converts to hash' do
|
|
168
109
|
pdf = described_class.new(extract_images: true, passwords: ['test'])
|
|
169
110
|
hash = pdf.to_h
|
|
@@ -172,21 +113,6 @@ RSpec.describe Kreuzberg::Config do
|
|
|
172
113
|
expect(hash[:extract_images]).to be true
|
|
173
114
|
expect(hash[:passwords]).to eq(['test'])
|
|
174
115
|
end
|
|
175
|
-
|
|
176
|
-
it 'includes font_config in hash when present' do
|
|
177
|
-
font_config = Kreuzberg::Config::FontConfig.new(enabled: true)
|
|
178
|
-
pdf = described_class.new(font_config: font_config)
|
|
179
|
-
hash = pdf.to_h
|
|
180
|
-
|
|
181
|
-
expect(hash[:font_config]).to be_a(Hash)
|
|
182
|
-
expect(hash[:font_config][:enabled]).to be true
|
|
183
|
-
end
|
|
184
|
-
|
|
185
|
-
it 'raises error with invalid font_config type' do
|
|
186
|
-
expect do
|
|
187
|
-
described_class.new(font_config: 'invalid')
|
|
188
|
-
end.to raise_error(ArgumentError)
|
|
189
|
-
end
|
|
190
116
|
end
|
|
191
117
|
|
|
192
118
|
describe Kreuzberg::Config::Extraction do
|