kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +28 -116
- data/README.md +269 -629
- data/Rakefile +0 -9
- data/Steepfile +4 -8
- data/examples/async_patterns.rb +58 -1
- data/ext/kreuzberg_rb/extconf.rb +5 -35
- data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
- data/ext/kreuzberg_rb/native/build.rs +14 -12
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
- data/extconf.rb +6 -38
- data/kreuzberg.gemspec +20 -114
- data/lib/kreuzberg/api_proxy.rb +18 -2
- data/lib/kreuzberg/cache_api.rb +0 -22
- data/lib/kreuzberg/cli.rb +10 -2
- data/lib/kreuzberg/cli_proxy.rb +10 -0
- data/lib/kreuzberg/config.rb +22 -274
- data/lib/kreuzberg/errors.rb +7 -73
- data/lib/kreuzberg/extraction_api.rb +8 -237
- data/lib/kreuzberg/mcp_proxy.rb +11 -2
- data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
- data/lib/kreuzberg/post_processor_protocol.rb +71 -0
- data/lib/kreuzberg/result.rb +33 -151
- data/lib/kreuzberg/setup_lib_path.rb +2 -22
- data/lib/kreuzberg/validator_protocol.rb +73 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +13 -27
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +12 -105
- data/spec/binding/cache_spec.rb +22 -22
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/binding/config_spec.rb +0 -74
- data/spec/binding/config_validation_spec.rb +6 -100
- data/spec/binding/error_handling_spec.rb +97 -283
- data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
- data/spec/binding/plugins/postprocessor_spec.rb +11 -11
- data/spec/binding/plugins/validator_spec.rb +13 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +1 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/fixtures/invalid_config.toml +1 -0
- data/spec/smoke/package_spec.rb +3 -2
- data/spec/spec_helper.rb +3 -1
- data/vendor/kreuzberg/Cargo.toml +67 -192
- data/vendor/kreuzberg/README.md +9 -97
- data/vendor/kreuzberg/build.rs +194 -516
- data/vendor/kreuzberg/src/api/handlers.rs +9 -130
- data/vendor/kreuzberg/src/api/mod.rs +3 -18
- data/vendor/kreuzberg/src/api/server.rs +71 -236
- data/vendor/kreuzberg/src/api/types.rs +7 -43
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/config.rs +23 -905
- data/vendor/kreuzberg/src/core/extractor.rs +106 -403
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +3 -22
- data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
- data/vendor/kreuzberg/src/embeddings.rs +21 -169
- data/vendor/kreuzberg/src/error.rs +2 -2
- data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
- data/vendor/kreuzberg/src/extraction/email.rs +11 -12
- data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
- data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
- data/vendor/kreuzberg/src/extraction/image.rs +14 -138
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
- data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
- data/vendor/kreuzberg/src/extraction/table.rs +1 -2
- data/vendor/kreuzberg/src/extraction/text.rs +10 -18
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
- data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
- data/vendor/kreuzberg/src/extractors/email.rs +9 -37
- data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
- data/vendor/kreuzberg/src/extractors/html.rs +173 -182
- data/vendor/kreuzberg/src/extractors/image.rs +8 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
- data/vendor/kreuzberg/src/extractors/text.rs +7 -30
- data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
- data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
- data/vendor/kreuzberg/src/lib.rs +5 -17
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
- data/vendor/kreuzberg/src/mcp/server.rs +21 -145
- data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
- data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
- data/vendor/kreuzberg/src/pdf/error.rs +1 -93
- data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
- data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
- data/vendor/kreuzberg/src/pdf/table.rs +64 -61
- data/vendor/kreuzberg/src/pdf/text.rs +24 -416
- data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
- data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
- data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/mod.rs +0 -8
- data/vendor/kreuzberg/src/text/quality.rs +15 -28
- data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
- data/vendor/kreuzberg/src/types.rs +67 -907
- data/vendor/kreuzberg/src/utils/mod.rs +0 -14
- data/vendor/kreuzberg/src/utils/quality.rs +3 -12
- data/vendor/kreuzberg/tests/api_tests.rs +0 -506
- data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
- data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
- data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
- data/vendor/kreuzberg/tests/config_features.rs +1 -33
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
- data/vendor/kreuzberg/tests/core_integration.rs +9 -35
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/email_integration.rs +1 -3
- data/vendor/kreuzberg/tests/error_handling.rs +34 -43
- data/vendor/kreuzberg/tests/format_integration.rs +1 -7
- data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
- data/vendor/kreuzberg/tests/image_integration.rs +0 -2
- data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
- data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
- data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
- data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
- data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
- data/vendor/kreuzberg/tests/security_validation.rs +1 -13
- data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
- metadata +25 -171
- data/.rubocop.yml +0 -543
- data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
- data/lib/kreuzberg/error_context.rb +0 -136
- data/lib/kreuzberg/types.rb +0 -170
- data/lib/libpdfium.so +0 -0
- data/spec/binding/async_operations_spec.rb +0 -473
- data/spec/binding/batch_operations_spec.rb +0 -595
- data/spec/binding/batch_spec.rb +0 -359
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/embeddings_spec.rb +0 -816
- data/spec/binding/error_recovery_spec.rb +0 -488
- data/spec/binding/font_config_spec.rb +0 -220
- data/spec/binding/images_spec.rb +0 -738
- data/spec/binding/keywords_extraction_spec.rb +0 -600
- data/spec/binding/metadata_types_spec.rb +0 -1228
- data/spec/binding/pages_extraction_spec.rb +0 -471
- data/spec/binding/tables_spec.rb +0 -641
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -438
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
- data/test/metadata_types_test.rb +0 -959
- data/vendor/Cargo.toml +0 -61
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/formats.rs +0 -235
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
- data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
- data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
- data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
- data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
- data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
- data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
- data/vendor/kreuzberg/src/utils/pool.rs +0 -503
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
- data/vendor/kreuzberg/tests/api_embed.rs +0 -360
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/page_markers.rs +0 -297
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
- data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
- data/vendor/kreuzberg-ffi/build.rs +0 -168
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
- data/vendor/kreuzberg-ffi/src/error.rs +0 -901
- data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
- data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
- data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
- data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
- data/vendor/kreuzberg-ffi/src/result.rs +0 -510
- data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
- data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
- data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
- data/vendor/kreuzberg-ffi/src/types.rs +0 -363
- data/vendor/kreuzberg-ffi/src/util.rs +0 -210
- data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
- data/vendor/kreuzberg-tesseract/LICENSE +0 -22
- data/vendor/kreuzberg-tesseract/README.md +0 -399
- data/vendor/kreuzberg-tesseract/build.rs +0 -1127
- data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
data/lib/kreuzberg/result.rb
CHANGED
|
@@ -7,58 +7,66 @@ rescue LoadError
|
|
|
7
7
|
end
|
|
8
8
|
|
|
9
9
|
module Kreuzberg
|
|
10
|
+
# Extraction result wrapper
|
|
11
|
+
#
|
|
12
|
+
# Provides structured access to extraction results from the native extension.
|
|
13
|
+
#
|
|
10
14
|
# @example
|
|
15
|
+
# result = Kreuzberg.extract_file_sync("document.pdf")
|
|
16
|
+
# puts result.content
|
|
17
|
+
# puts "MIME type: #{result.mime_type}"
|
|
18
|
+
# puts "Metadata: #{result.metadata.inspect}"
|
|
19
|
+
# result.tables.each { |table| puts table.inspect }
|
|
20
|
+
#
|
|
11
21
|
# rubocop:disable Metrics/ClassLength
|
|
12
22
|
class Result
|
|
13
23
|
attr_reader :content, :mime_type, :metadata, :metadata_json, :tables,
|
|
14
|
-
:detected_languages, :chunks, :images
|
|
24
|
+
:detected_languages, :chunks, :images
|
|
15
25
|
|
|
26
|
+
# Table structure
|
|
27
|
+
#
|
|
16
28
|
# @!attribute [r] cells
|
|
17
29
|
# @return [Array<Array<String>>] Table cells (2D array)
|
|
18
30
|
# @!attribute [r] markdown
|
|
19
31
|
# @return [String] Markdown representation
|
|
20
32
|
# @!attribute [r] page_number
|
|
21
33
|
# @return [Integer] Page number where table was found
|
|
34
|
+
#
|
|
22
35
|
Table = Struct.new(:cells, :markdown, :page_number, keyword_init: true) do
|
|
23
36
|
def to_h
|
|
24
37
|
{ cells: cells, markdown: markdown, page_number: page_number }
|
|
25
38
|
end
|
|
26
39
|
end
|
|
27
40
|
|
|
41
|
+
# Text chunk
|
|
42
|
+
#
|
|
28
43
|
# @!attribute [r] content
|
|
29
44
|
# @return [String] Chunk content
|
|
30
|
-
# @!attribute [r]
|
|
31
|
-
# @return [Integer] Starting
|
|
32
|
-
# @!attribute [r]
|
|
33
|
-
# @return [Integer] Ending
|
|
45
|
+
# @!attribute [r] char_start
|
|
46
|
+
# @return [Integer] Starting character index
|
|
47
|
+
# @!attribute [r] char_end
|
|
48
|
+
# @return [Integer] Ending character index
|
|
34
49
|
# @!attribute [r] token_count
|
|
35
50
|
# @return [Integer, nil] Approximate token count (may be nil)
|
|
36
|
-
#
|
|
37
|
-
# @return [Integer, nil] First page number (1-indexed)
|
|
38
|
-
# @!attribute [r] last_page
|
|
39
|
-
# @return [Integer, nil] Last page number (1-indexed)
|
|
51
|
+
#
|
|
40
52
|
Chunk = Struct.new(
|
|
41
53
|
:content,
|
|
42
|
-
:
|
|
43
|
-
:
|
|
54
|
+
:char_start,
|
|
55
|
+
:char_end,
|
|
44
56
|
:token_count,
|
|
45
57
|
:chunk_index,
|
|
46
58
|
:total_chunks,
|
|
47
|
-
:first_page,
|
|
48
|
-
:last_page,
|
|
49
59
|
:embedding,
|
|
50
60
|
keyword_init: true
|
|
51
61
|
) do
|
|
52
62
|
def to_h
|
|
53
63
|
{
|
|
54
64
|
content: content,
|
|
55
|
-
|
|
56
|
-
|
|
65
|
+
char_start: char_start,
|
|
66
|
+
char_end: char_end,
|
|
57
67
|
token_count: token_count,
|
|
58
68
|
chunk_index: chunk_index,
|
|
59
69
|
total_chunks: total_chunks,
|
|
60
|
-
first_page: first_page,
|
|
61
|
-
last_page: last_page,
|
|
62
70
|
embedding: embedding
|
|
63
71
|
}
|
|
64
72
|
end
|
|
@@ -95,30 +103,12 @@ module Kreuzberg
|
|
|
95
103
|
end
|
|
96
104
|
end
|
|
97
105
|
|
|
98
|
-
# @!attribute [r] page_number
|
|
99
|
-
# @return [Integer] Page number (1-indexed)
|
|
100
|
-
# @!attribute [r] content
|
|
101
|
-
# @return [String] Text content for this page
|
|
102
|
-
# @!attribute [r] tables
|
|
103
|
-
# @return [Array<Table>] Tables on this page
|
|
104
|
-
# @!attribute [r] images
|
|
105
|
-
# @return [Array<Image>] Images on this page
|
|
106
|
-
PageContent = Struct.new(:page_number, :content, :tables, :images, keyword_init: true) do
|
|
107
|
-
def to_h
|
|
108
|
-
{
|
|
109
|
-
page_number: page_number,
|
|
110
|
-
content: content,
|
|
111
|
-
tables: tables.map(&:to_h),
|
|
112
|
-
images: images.map(&:to_h)
|
|
113
|
-
}
|
|
114
|
-
end
|
|
115
|
-
end
|
|
116
|
-
|
|
117
106
|
# Initialize from native hash result
|
|
118
107
|
#
|
|
119
108
|
# @param hash [Hash] Hash returned from native extension
|
|
120
109
|
#
|
|
121
110
|
def initialize(hash)
|
|
111
|
+
# Handle both string and symbol keys for flexibility
|
|
122
112
|
@content = get_value(hash, 'content', '')
|
|
123
113
|
@mime_type = get_value(hash, 'mime_type', '')
|
|
124
114
|
@metadata_json = get_value(hash, 'metadata_json', '{}')
|
|
@@ -127,7 +117,6 @@ module Kreuzberg
|
|
|
127
117
|
@detected_languages = parse_detected_languages(get_value(hash, 'detected_languages'))
|
|
128
118
|
@chunks = parse_chunks(get_value(hash, 'chunks'))
|
|
129
119
|
@images = parse_images(get_value(hash, 'images'))
|
|
130
|
-
@pages = parse_pages(get_value(hash, 'pages'))
|
|
131
120
|
end
|
|
132
121
|
|
|
133
122
|
# Convert to hash
|
|
@@ -139,11 +128,10 @@ module Kreuzberg
|
|
|
139
128
|
content: @content,
|
|
140
129
|
mime_type: @mime_type,
|
|
141
130
|
metadata: @metadata,
|
|
142
|
-
tables:
|
|
131
|
+
tables: @tables.map(&:to_h),
|
|
143
132
|
detected_languages: @detected_languages,
|
|
144
|
-
chunks:
|
|
145
|
-
images:
|
|
146
|
-
pages: serialize_pages
|
|
133
|
+
chunks: @chunks&.map(&:to_h),
|
|
134
|
+
images: @images&.map(&:to_h)
|
|
147
135
|
}
|
|
148
136
|
end
|
|
149
137
|
|
|
@@ -155,100 +143,8 @@ module Kreuzberg
|
|
|
155
143
|
to_h.to_json(*)
|
|
156
144
|
end
|
|
157
145
|
|
|
158
|
-
# Get the total number of pages in the document
|
|
159
|
-
#
|
|
160
|
-
# @return [Integer] Total page count (>= 0), or -1 on error
|
|
161
|
-
#
|
|
162
|
-
# @example
|
|
163
|
-
# result = Kreuzberg.extract_file_sync("document.pdf")
|
|
164
|
-
# puts "Document has #{result.page_count} pages"
|
|
165
|
-
#
|
|
166
|
-
def page_count
|
|
167
|
-
if @metadata.is_a?(Hash) && @metadata['pages'].is_a?(Hash)
|
|
168
|
-
@metadata['pages']['total_count'] || 0
|
|
169
|
-
else
|
|
170
|
-
0
|
|
171
|
-
end
|
|
172
|
-
end
|
|
173
|
-
|
|
174
|
-
# Get the total number of text chunks
|
|
175
|
-
#
|
|
176
|
-
# Returns 0 if chunking was not performed.
|
|
177
|
-
#
|
|
178
|
-
# @return [Integer] Total chunk count (>= 0), or -1 on error
|
|
179
|
-
#
|
|
180
|
-
# @example
|
|
181
|
-
# result = Kreuzberg.extract_file_sync("document.pdf")
|
|
182
|
-
# puts "Document has #{result.chunk_count} chunks"
|
|
183
|
-
#
|
|
184
|
-
def chunk_count
|
|
185
|
-
@chunks&.length || 0
|
|
186
|
-
end
|
|
187
|
-
|
|
188
|
-
# Get the primary detected language
|
|
189
|
-
#
|
|
190
|
-
# @return [String, nil] ISO 639 language code (e.g., "en", "de"), or nil if not detected
|
|
191
|
-
#
|
|
192
|
-
# @example
|
|
193
|
-
# result = Kreuzberg.extract_file_sync("document.pdf")
|
|
194
|
-
# lang = result.detected_language
|
|
195
|
-
# puts "Language: #{lang}" if lang
|
|
196
|
-
#
|
|
197
|
-
def detected_language
|
|
198
|
-
return @metadata['language'] if @metadata.is_a?(Hash) && @metadata['language']
|
|
199
|
-
return @detected_languages&.first if @detected_languages&.any?
|
|
200
|
-
|
|
201
|
-
nil
|
|
202
|
-
end
|
|
203
|
-
|
|
204
|
-
# Get a metadata field by name
|
|
205
|
-
#
|
|
206
|
-
# Supports dot notation for nested fields (e.g., "format.pages").
|
|
207
|
-
#
|
|
208
|
-
# @param name [String, Symbol] Field name
|
|
209
|
-
# @return [Object, nil] Field value, or nil if field doesn't exist
|
|
210
|
-
#
|
|
211
|
-
# @example Get a top-level field
|
|
212
|
-
# result = Kreuzberg.extract_file_sync("document.pdf")
|
|
213
|
-
# title = result.metadata_field("title")
|
|
214
|
-
# puts "Title: #{title}" if title
|
|
215
|
-
#
|
|
216
|
-
# @example Get a nested field
|
|
217
|
-
# format_info = result.metadata_field("format.pages")
|
|
218
|
-
#
|
|
219
|
-
def metadata_field(name)
|
|
220
|
-
return nil unless @metadata.is_a?(Hash)
|
|
221
|
-
|
|
222
|
-
parts = name.to_s.split('.')
|
|
223
|
-
value = @metadata
|
|
224
|
-
|
|
225
|
-
parts.each do |part|
|
|
226
|
-
return nil unless value.is_a?(Hash)
|
|
227
|
-
|
|
228
|
-
value = value[part]
|
|
229
|
-
end
|
|
230
|
-
|
|
231
|
-
value
|
|
232
|
-
end
|
|
233
|
-
|
|
234
146
|
private
|
|
235
147
|
|
|
236
|
-
def serialize_tables
|
|
237
|
-
@tables.map(&:to_h)
|
|
238
|
-
end
|
|
239
|
-
|
|
240
|
-
def serialize_chunks
|
|
241
|
-
@chunks&.map(&:to_h)
|
|
242
|
-
end
|
|
243
|
-
|
|
244
|
-
def serialize_images
|
|
245
|
-
@images&.map(&:to_h)
|
|
246
|
-
end
|
|
247
|
-
|
|
248
|
-
def serialize_pages
|
|
249
|
-
@pages&.map(&:to_h)
|
|
250
|
-
end
|
|
251
|
-
|
|
252
148
|
def get_value(hash, key, default = nil)
|
|
253
149
|
hash[key] || hash[key.to_sym] || default
|
|
254
150
|
end
|
|
@@ -274,22 +170,21 @@ module Kreuzberg
|
|
|
274
170
|
def parse_detected_languages(langs_data)
|
|
275
171
|
return nil if langs_data.nil?
|
|
276
172
|
|
|
173
|
+
# Detected languages is now just an array of strings
|
|
277
174
|
langs_data.is_a?(Array) ? langs_data : []
|
|
278
175
|
end
|
|
279
176
|
|
|
280
177
|
def parse_chunks(chunks_data)
|
|
281
|
-
return
|
|
178
|
+
return nil if chunks_data.nil?
|
|
282
179
|
|
|
283
180
|
chunks_data.map do |chunk_hash|
|
|
284
181
|
Chunk.new(
|
|
285
182
|
content: chunk_hash['content'],
|
|
286
|
-
|
|
287
|
-
|
|
183
|
+
char_start: chunk_hash['char_start'],
|
|
184
|
+
char_end: chunk_hash['char_end'],
|
|
288
185
|
token_count: chunk_hash['token_count'],
|
|
289
186
|
chunk_index: chunk_hash['chunk_index'],
|
|
290
187
|
total_chunks: chunk_hash['total_chunks'],
|
|
291
|
-
first_page: chunk_hash['first_page'],
|
|
292
|
-
last_page: chunk_hash['last_page'],
|
|
293
188
|
embedding: chunk_hash['embedding']
|
|
294
189
|
)
|
|
295
190
|
end
|
|
@@ -316,19 +211,6 @@ module Kreuzberg
|
|
|
316
211
|
)
|
|
317
212
|
end
|
|
318
213
|
end
|
|
319
|
-
|
|
320
|
-
def parse_pages(pages_data)
|
|
321
|
-
return nil if pages_data.nil?
|
|
322
|
-
|
|
323
|
-
pages_data.map do |page_hash|
|
|
324
|
-
PageContent.new(
|
|
325
|
-
page_number: page_hash['page_number'],
|
|
326
|
-
content: page_hash['content'],
|
|
327
|
-
tables: parse_tables(page_hash['tables']),
|
|
328
|
-
images: parse_images(page_hash['images'])
|
|
329
|
-
)
|
|
330
|
-
end
|
|
331
|
-
end
|
|
332
214
|
end
|
|
333
215
|
# rubocop:enable Metrics/ClassLength
|
|
334
216
|
end
|
|
@@ -19,31 +19,10 @@ module Kreuzberg
|
|
|
19
19
|
when /linux/
|
|
20
20
|
prepend_env('LD_LIBRARY_PATH', lib_dir)
|
|
21
21
|
when /mswin|mingw|cygwin/
|
|
22
|
-
# Windows uses PATH to locate DLLs
|
|
23
22
|
prepend_env('PATH', lib_dir, separator: ';')
|
|
24
|
-
# Also check common locations for PDFium on Windows
|
|
25
|
-
setup_windows_library_paths(lib_dir)
|
|
26
23
|
end
|
|
27
24
|
end
|
|
28
25
|
|
|
29
|
-
def setup_windows_library_paths(lib_dir)
|
|
30
|
-
# Add target/release to PATH for DLL lookup during development
|
|
31
|
-
target_release = File.expand_path('../../target/release', lib_dir)
|
|
32
|
-
prepend_env('PATH', target_release, separator: ';') if Dir.exist?(target_release)
|
|
33
|
-
|
|
34
|
-
# Check for short path CARGO_TARGET_DIR (CI uses C:\t)
|
|
35
|
-
cargo_target_dir = ENV.fetch('CARGO_TARGET_DIR', nil)
|
|
36
|
-
return unless cargo_target_dir
|
|
37
|
-
|
|
38
|
-
target_release_alt = File.join(cargo_target_dir, 'release')
|
|
39
|
-
prepend_env('PATH', target_release_alt, separator: ';') if Dir.exist?(target_release_alt)
|
|
40
|
-
|
|
41
|
-
# Also check for target-specific subdirectory (Windows GNU builds)
|
|
42
|
-
gnu_release = File.join(cargo_target_dir, 'x86_64-pc-windows-gnu', 'release')
|
|
43
|
-
prepend_env('PATH', gnu_release, separator: ';') if Dir.exist?(gnu_release)
|
|
44
|
-
end
|
|
45
|
-
private_class_method :setup_windows_library_paths
|
|
46
|
-
|
|
47
26
|
def prepend_env(key, value, separator: ':')
|
|
48
27
|
current = ENV.fetch(key, nil)
|
|
49
28
|
return if current&.split(separator)&.include?(value)
|
|
@@ -58,7 +37,8 @@ module Kreuzberg
|
|
|
58
37
|
|
|
59
38
|
ensure_install_name(bundle)
|
|
60
39
|
ensure_loader_rpath(bundle)
|
|
61
|
-
rescue Errno::ENOENT, IOError
|
|
40
|
+
rescue Errno::ENOENT, IOError
|
|
41
|
+
# Tool not available (e.g., on CI). The dynamic loader can still use the updated env vars.
|
|
62
42
|
end
|
|
63
43
|
private_class_method :fix_macos_install_name
|
|
64
44
|
|
|
@@ -1,14 +1,87 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module Kreuzberg
|
|
4
|
+
# Validator protocol interface.
|
|
5
|
+
#
|
|
6
|
+
# This module defines the protocol that all Ruby validators must implement
|
|
7
|
+
# to be registered with the Rust core via the FFI bridge.
|
|
8
|
+
#
|
|
9
|
+
# Validators are called during extraction to validate results. If validation fails,
|
|
10
|
+
# the validator should raise a Kreuzberg::Errors::ValidationError, which will
|
|
11
|
+
# cause the extraction to fail.
|
|
12
|
+
#
|
|
4
13
|
# @example Implementing a minimum length validator
|
|
14
|
+
# class MinimumLengthValidator
|
|
15
|
+
# include Kreuzberg::ValidatorProtocol
|
|
16
|
+
#
|
|
17
|
+
# def initialize(min_length = 10)
|
|
18
|
+
# @min_length = min_length
|
|
19
|
+
# end
|
|
20
|
+
#
|
|
21
|
+
# def call(result)
|
|
22
|
+
# if result["content"].length < @min_length
|
|
23
|
+
# raise Kreuzberg::Errors::ValidationError.new(
|
|
24
|
+
# "Content too short: #{result["content"].length} < #{@min_length}"
|
|
25
|
+
# )
|
|
26
|
+
# end
|
|
27
|
+
# end
|
|
28
|
+
# end
|
|
29
|
+
#
|
|
30
|
+
# Kreuzberg.register_validator("min_length", MinimumLengthValidator.new(100))
|
|
31
|
+
#
|
|
5
32
|
# @example Implementing a content quality validator
|
|
33
|
+
# class QualityValidator
|
|
34
|
+
# include Kreuzberg::ValidatorProtocol
|
|
35
|
+
#
|
|
36
|
+
# def call(result)
|
|
37
|
+
# # Check if content has sufficient quality
|
|
38
|
+
# if result["content"].strip.empty?
|
|
39
|
+
# raise Kreuzberg::Errors::ValidationError.new("Empty content extracted")
|
|
40
|
+
# end
|
|
41
|
+
#
|
|
42
|
+
# # Check if metadata is present
|
|
43
|
+
# if result["metadata"].empty?
|
|
44
|
+
# raise Kreuzberg::Errors::ValidationError.new("No metadata extracted")
|
|
45
|
+
# end
|
|
46
|
+
# end
|
|
47
|
+
# end
|
|
48
|
+
#
|
|
49
|
+
# Kreuzberg.register_validator("quality", QualityValidator.new)
|
|
50
|
+
#
|
|
6
51
|
# @example Using a Proc as a validator
|
|
52
|
+
# Kreuzberg.register_validator("not_empty", ->(result) {
|
|
53
|
+
# if result["content"].strip.empty?
|
|
54
|
+
# raise Kreuzberg::Errors::ValidationError.new("Content cannot be empty")
|
|
55
|
+
# end
|
|
56
|
+
# })
|
|
57
|
+
#
|
|
7
58
|
module ValidatorProtocol
|
|
59
|
+
# Validate an extraction result.
|
|
60
|
+
#
|
|
61
|
+
# This method is called during extraction to validate results. If validation fails,
|
|
62
|
+
# raise a Kreuzberg::Errors::ValidationError with a descriptive message explaining
|
|
63
|
+
# why validation failed. If validation passes, return without raising.
|
|
64
|
+
#
|
|
65
|
+
# The validator receives the extraction result as a hash with the same structure
|
|
66
|
+
# as the one passed to post-processors (see PostProcessorProtocol for details).
|
|
67
|
+
#
|
|
8
68
|
# @param result [Hash] Extraction result to validate with the following structure:
|
|
69
|
+
# - "content" [String] - Extracted text content
|
|
70
|
+
# - "mime_type" [String] - MIME type of the source document
|
|
71
|
+
# - "metadata" [Hash] - Document metadata (title, author, etc.)
|
|
72
|
+
# - "tables" [Array<Hash>] - Extracted tables
|
|
73
|
+
# - "detected_languages" [Array<String>, nil] - Detected language codes
|
|
74
|
+
# - "chunks" [Array<String>, nil] - Content chunks (if chunking enabled)
|
|
75
|
+
#
|
|
9
76
|
# @return [void]
|
|
10
77
|
# @raise [Kreuzberg::Errors::ValidationError] if validation fails
|
|
78
|
+
#
|
|
11
79
|
# @example
|
|
80
|
+
# def call(result)
|
|
81
|
+
# if result["content"].length < 10
|
|
82
|
+
# raise Kreuzberg::Errors::ValidationError.new("Content too short")
|
|
83
|
+
# end
|
|
84
|
+
# end
|
|
12
85
|
def call(result)
|
|
13
86
|
raise NotImplementedError, "#{self.class} must implement #call(result)"
|
|
14
87
|
end
|
data/lib/kreuzberg/version.rb
CHANGED
data/lib/kreuzberg.rb
CHANGED
|
@@ -6,8 +6,6 @@ Kreuzberg::SetupLibPath.configure
|
|
|
6
6
|
require_relative 'kreuzberg/version'
|
|
7
7
|
require 'kreuzberg_rb'
|
|
8
8
|
|
|
9
|
-
# Kreuzberg is a Ruby binding for the Rust core library providing document extraction,
|
|
10
|
-
# text extraction, and OCR capabilities.
|
|
11
9
|
module Kreuzberg
|
|
12
10
|
autoload :Config, 'kreuzberg/config'
|
|
13
11
|
autoload :Result, 'kreuzberg/result'
|
|
@@ -16,28 +14,17 @@ module Kreuzberg
|
|
|
16
14
|
autoload :APIProxy, 'kreuzberg/api_proxy'
|
|
17
15
|
autoload :MCPProxy, 'kreuzberg/mcp_proxy'
|
|
18
16
|
autoload :Errors, 'kreuzberg/errors'
|
|
19
|
-
autoload :ErrorContext, 'kreuzberg/error_context'
|
|
20
17
|
autoload :PostProcessorProtocol, 'kreuzberg/post_processor_protocol'
|
|
21
18
|
autoload :ValidatorProtocol, 'kreuzberg/validator_protocol'
|
|
22
19
|
autoload :OcrBackendProtocol, 'kreuzberg/ocr_backend_protocol'
|
|
23
20
|
|
|
24
|
-
|
|
25
|
-
autoload :HeaderMetadata, 'kreuzberg/types'
|
|
26
|
-
autoload :LinkMetadata, 'kreuzberg/types'
|
|
27
|
-
autoload :ImageMetadata, 'kreuzberg/types'
|
|
28
|
-
autoload :StructuredData, 'kreuzberg/types'
|
|
29
|
-
|
|
21
|
+
# Alias for API consistency with other language bindings
|
|
30
22
|
ExtractionConfig = Config::Extraction
|
|
31
|
-
PageConfig = Config::PageConfig
|
|
32
|
-
|
|
33
|
-
module KeywordAlgorithm
|
|
34
|
-
YAKE = :yake
|
|
35
|
-
RAKE = :rake
|
|
36
|
-
end
|
|
37
23
|
|
|
38
24
|
@__cache_tracker = { entries: 0, bytes: 0 }
|
|
39
25
|
|
|
40
26
|
class << self
|
|
27
|
+
# Keep the native methods reachable under private native_* aliases
|
|
41
28
|
alias native_extract_file_sync extract_file_sync
|
|
42
29
|
alias native_extract_bytes_sync extract_bytes_sync
|
|
43
30
|
alias native_batch_extract_files_sync batch_extract_files_sync
|
|
@@ -54,39 +41,38 @@ module Kreuzberg
|
|
|
54
41
|
private :native_batch_extract_bytes_sync, :native_batch_extract_bytes
|
|
55
42
|
end
|
|
56
43
|
|
|
44
|
+
# Register a Ruby post-processor that conforms to PostProcessorProtocol.
|
|
57
45
|
module_function :register_post_processor
|
|
58
46
|
|
|
47
|
+
# Remove a post-processor by name.
|
|
59
48
|
module_function :unregister_post_processor
|
|
60
49
|
|
|
50
|
+
# Purge all registered post-processors.
|
|
61
51
|
module_function :clear_post_processors
|
|
62
52
|
|
|
53
|
+
# Register a validator that follows ValidatorProtocol.
|
|
63
54
|
module_function :register_validator
|
|
64
55
|
|
|
56
|
+
# Remove a validator by name.
|
|
65
57
|
module_function :unregister_validator
|
|
66
58
|
|
|
59
|
+
# Purge all registered validators.
|
|
67
60
|
module_function :clear_validators
|
|
68
61
|
|
|
62
|
+
# List all registered validators.
|
|
69
63
|
module_function :list_validators
|
|
70
64
|
|
|
65
|
+
# List all registered post-processors.
|
|
71
66
|
module_function :list_post_processors
|
|
72
67
|
|
|
68
|
+
# Register an OCR backend instance implementing OcrBackendProtocol.
|
|
73
69
|
module_function :register_ocr_backend
|
|
74
70
|
|
|
71
|
+
# Unregister an OCR backend by name.
|
|
75
72
|
module_function :unregister_ocr_backend
|
|
76
73
|
|
|
74
|
+
# List all registered OCR backends.
|
|
77
75
|
module_function :list_ocr_backends
|
|
78
|
-
|
|
79
|
-
module_function :detect_mime_type
|
|
80
|
-
|
|
81
|
-
module_function :detect_mime_type_from_path
|
|
82
|
-
|
|
83
|
-
module_function :validate_mime_type
|
|
84
|
-
|
|
85
|
-
module_function :get_extensions_for_mime
|
|
86
|
-
|
|
87
|
-
module_function :list_embedding_presets
|
|
88
|
-
|
|
89
|
-
module_function :get_embedding_preset
|
|
90
76
|
end
|
|
91
77
|
|
|
92
78
|
require_relative 'kreuzberg/cache_api'
|
|
Binary file
|