kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +28 -116
- data/README.md +269 -629
- data/Rakefile +0 -9
- data/Steepfile +4 -8
- data/examples/async_patterns.rb +58 -1
- data/ext/kreuzberg_rb/extconf.rb +5 -35
- data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
- data/ext/kreuzberg_rb/native/build.rs +14 -12
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
- data/extconf.rb +6 -38
- data/kreuzberg.gemspec +20 -114
- data/lib/kreuzberg/api_proxy.rb +18 -2
- data/lib/kreuzberg/cache_api.rb +0 -22
- data/lib/kreuzberg/cli.rb +10 -2
- data/lib/kreuzberg/cli_proxy.rb +10 -0
- data/lib/kreuzberg/config.rb +22 -274
- data/lib/kreuzberg/errors.rb +7 -73
- data/lib/kreuzberg/extraction_api.rb +8 -237
- data/lib/kreuzberg/mcp_proxy.rb +11 -2
- data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
- data/lib/kreuzberg/post_processor_protocol.rb +71 -0
- data/lib/kreuzberg/result.rb +33 -151
- data/lib/kreuzberg/setup_lib_path.rb +2 -22
- data/lib/kreuzberg/validator_protocol.rb +73 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +13 -27
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +12 -105
- data/spec/binding/cache_spec.rb +22 -22
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/binding/config_spec.rb +0 -74
- data/spec/binding/config_validation_spec.rb +6 -100
- data/spec/binding/error_handling_spec.rb +97 -283
- data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
- data/spec/binding/plugins/postprocessor_spec.rb +11 -11
- data/spec/binding/plugins/validator_spec.rb +13 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +1 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/fixtures/invalid_config.toml +1 -0
- data/spec/smoke/package_spec.rb +3 -2
- data/spec/spec_helper.rb +3 -1
- data/vendor/kreuzberg/Cargo.toml +67 -192
- data/vendor/kreuzberg/README.md +9 -97
- data/vendor/kreuzberg/build.rs +194 -516
- data/vendor/kreuzberg/src/api/handlers.rs +9 -130
- data/vendor/kreuzberg/src/api/mod.rs +3 -18
- data/vendor/kreuzberg/src/api/server.rs +71 -236
- data/vendor/kreuzberg/src/api/types.rs +7 -43
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/config.rs +23 -905
- data/vendor/kreuzberg/src/core/extractor.rs +106 -403
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +3 -22
- data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
- data/vendor/kreuzberg/src/embeddings.rs +21 -169
- data/vendor/kreuzberg/src/error.rs +2 -2
- data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
- data/vendor/kreuzberg/src/extraction/email.rs +11 -12
- data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
- data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
- data/vendor/kreuzberg/src/extraction/image.rs +14 -138
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
- data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
- data/vendor/kreuzberg/src/extraction/table.rs +1 -2
- data/vendor/kreuzberg/src/extraction/text.rs +10 -18
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
- data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
- data/vendor/kreuzberg/src/extractors/email.rs +9 -37
- data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
- data/vendor/kreuzberg/src/extractors/html.rs +173 -182
- data/vendor/kreuzberg/src/extractors/image.rs +8 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
- data/vendor/kreuzberg/src/extractors/text.rs +7 -30
- data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
- data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
- data/vendor/kreuzberg/src/lib.rs +5 -17
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
- data/vendor/kreuzberg/src/mcp/server.rs +21 -145
- data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
- data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
- data/vendor/kreuzberg/src/pdf/error.rs +1 -93
- data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
- data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
- data/vendor/kreuzberg/src/pdf/table.rs +64 -61
- data/vendor/kreuzberg/src/pdf/text.rs +24 -416
- data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
- data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
- data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/mod.rs +0 -8
- data/vendor/kreuzberg/src/text/quality.rs +15 -28
- data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
- data/vendor/kreuzberg/src/types.rs +67 -907
- data/vendor/kreuzberg/src/utils/mod.rs +0 -14
- data/vendor/kreuzberg/src/utils/quality.rs +3 -12
- data/vendor/kreuzberg/tests/api_tests.rs +0 -506
- data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
- data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
- data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
- data/vendor/kreuzberg/tests/config_features.rs +1 -33
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
- data/vendor/kreuzberg/tests/core_integration.rs +9 -35
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/email_integration.rs +1 -3
- data/vendor/kreuzberg/tests/error_handling.rs +34 -43
- data/vendor/kreuzberg/tests/format_integration.rs +1 -7
- data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
- data/vendor/kreuzberg/tests/image_integration.rs +0 -2
- data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
- data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
- data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
- data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
- data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
- data/vendor/kreuzberg/tests/security_validation.rs +1 -13
- data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
- metadata +25 -171
- data/.rubocop.yml +0 -543
- data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
- data/lib/kreuzberg/error_context.rb +0 -136
- data/lib/kreuzberg/types.rb +0 -170
- data/lib/libpdfium.so +0 -0
- data/spec/binding/async_operations_spec.rb +0 -473
- data/spec/binding/batch_operations_spec.rb +0 -595
- data/spec/binding/batch_spec.rb +0 -359
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/embeddings_spec.rb +0 -816
- data/spec/binding/error_recovery_spec.rb +0 -488
- data/spec/binding/font_config_spec.rb +0 -220
- data/spec/binding/images_spec.rb +0 -738
- data/spec/binding/keywords_extraction_spec.rb +0 -600
- data/spec/binding/metadata_types_spec.rb +0 -1228
- data/spec/binding/pages_extraction_spec.rb +0 -471
- data/spec/binding/tables_spec.rb +0 -641
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -438
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
- data/test/metadata_types_test.rb +0 -959
- data/vendor/Cargo.toml +0 -61
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/formats.rs +0 -235
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
- data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
- data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
- data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
- data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
- data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
- data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
- data/vendor/kreuzberg/src/utils/pool.rs +0 -503
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
- data/vendor/kreuzberg/tests/api_embed.rs +0 -360
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/page_markers.rs +0 -297
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
- data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
- data/vendor/kreuzberg-ffi/build.rs +0 -168
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
- data/vendor/kreuzberg-ffi/src/error.rs +0 -901
- data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
- data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
- data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
- data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
- data/vendor/kreuzberg-ffi/src/result.rs +0 -510
- data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
- data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
- data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
- data/vendor/kreuzberg-ffi/src/types.rs +0 -363
- data/vendor/kreuzberg-ffi/src/util.rs +0 -210
- data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
- data/vendor/kreuzberg-tesseract/LICENSE +0 -22
- data/vendor/kreuzberg-tesseract/README.md +0 -399
- data/vendor/kreuzberg-tesseract/build.rs +0 -1127
- data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
data/Rakefile
CHANGED
|
@@ -6,13 +6,6 @@ require 'rspec/core/rake_task'
|
|
|
6
6
|
|
|
7
7
|
GEMSPEC = Gem::Specification.load(File.expand_path('kreuzberg.gemspec', __dir__))
|
|
8
8
|
|
|
9
|
-
# Vendor kreuzberg core crates before compilation
|
|
10
|
-
task :vendor do
|
|
11
|
-
vendor_script = File.expand_path('../../scripts/ci/ruby/vendor-kreuzberg-core.sh', __dir__)
|
|
12
|
-
puts 'Vendoring kreuzberg core crates...'
|
|
13
|
-
sh "bash #{vendor_script}"
|
|
14
|
-
end
|
|
15
|
-
|
|
16
9
|
Rake::ExtensionTask.new('kreuzberg_rb', GEMSPEC) do |ext|
|
|
17
10
|
ext.lib_dir = 'lib'
|
|
18
11
|
ext.ext_dir = 'ext/kreuzberg_rb'
|
|
@@ -23,12 +16,10 @@ Rake::ExtensionTask.new('kreuzberg_rb', GEMSPEC) do |ext|
|
|
|
23
16
|
x86_64-darwin
|
|
24
17
|
arm64-darwin
|
|
25
18
|
x64-mingw32
|
|
26
|
-
x64-mingw-ucrt
|
|
27
19
|
]
|
|
28
20
|
end
|
|
29
21
|
|
|
30
22
|
RSpec::Core::RakeTask.new(:spec)
|
|
31
23
|
|
|
32
|
-
task compile: :vendor
|
|
33
24
|
task spec: :compile
|
|
34
25
|
task default: :spec
|
data/Steepfile
CHANGED
|
@@ -15,23 +15,19 @@ target :lib do
|
|
|
15
15
|
|
|
16
16
|
# Strategic ignores for steep limitations (not fixable, safe to ignore):
|
|
17
17
|
|
|
18
|
-
# 1.
|
|
19
|
-
# This file uses Sorbet exclusively for type definitions
|
|
20
|
-
ignore 'lib/kreuzberg/types.rb'
|
|
21
|
-
|
|
22
|
-
# 2. Struct.new with keyword_init - steep cannot understand implicit attr_readers
|
|
18
|
+
# 1. Struct.new with keyword_init - steep cannot understand implicit attr_readers
|
|
23
19
|
# defined by Struct.new in blocks (Table and Chunk classes)
|
|
24
20
|
ignore 'lib/kreuzberg/result.rb'
|
|
25
21
|
|
|
26
|
-
#
|
|
22
|
+
# 2. Generic type parameters in normalize_config - steep has difficulty with
|
|
27
23
|
# methods that take Class as parameter and return instances
|
|
28
24
|
ignore 'lib/kreuzberg/config.rb'
|
|
29
25
|
|
|
30
|
-
#
|
|
26
|
+
# 3. Interface types - steep doesn't recognize that all Ruby objects have nil? and is_a?
|
|
31
27
|
# even for interface types like _ToH
|
|
32
28
|
ignore 'lib/kreuzberg/extraction_api.rb'
|
|
33
29
|
|
|
34
|
-
#
|
|
30
|
+
# 4. Open3 methods - steep's built-in Open3 RBS signatures incomplete
|
|
35
31
|
# (capture2, capture3, popen3 are standard library methods)
|
|
36
32
|
ignore 'lib/kreuzberg/setup_lib_path.rb'
|
|
37
33
|
ignore 'lib/kreuzberg/cli_proxy.rb'
|
data/examples/async_patterns.rb
CHANGED
|
@@ -1,5 +1,10 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
# Async Patterns for Kreuzberg Ruby Bindings
|
|
4
|
+
#
|
|
5
|
+
# This example demonstrates async patterns and concurrency approaches for Ruby,
|
|
6
|
+
# with comparison to the underlying Rust implementation.
|
|
7
|
+
|
|
3
8
|
require 'kreuzberg'
|
|
4
9
|
|
|
5
10
|
# NOTE: Ruby bindings use Tokio runtime with block_on() internally.
|
|
@@ -21,6 +26,8 @@ end
|
|
|
21
26
|
# ============================================================================
|
|
22
27
|
|
|
23
28
|
def basic_async_extraction
|
|
29
|
+
# This LOOKS async but actually blocks the Ruby thread
|
|
30
|
+
# Internally uses: runtime.block_on(async { ... })
|
|
24
31
|
result = Kreuzberg.extract_file('document.pdf')
|
|
25
32
|
puts "Content: #{result[:content]}"
|
|
26
33
|
end
|
|
@@ -32,6 +39,8 @@ end
|
|
|
32
39
|
def concurrent_with_threads
|
|
33
40
|
files = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf']
|
|
34
41
|
|
|
42
|
+
# Use Ruby threads to achieve parallelism
|
|
43
|
+
# Each thread calls the synchronous API
|
|
35
44
|
threads = files.map do |file|
|
|
36
45
|
Thread.new do
|
|
37
46
|
Kreuzberg.extract_file_sync(file)
|
|
@@ -51,6 +60,8 @@ end
|
|
|
51
60
|
def batch_processing
|
|
52
61
|
files = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf']
|
|
53
62
|
|
|
63
|
+
# The batch API handles concurrency internally via Rust/Tokio
|
|
64
|
+
# This is more efficient than Ruby threads
|
|
54
65
|
results = Kreuzberg.batch_extract_files_sync(files)
|
|
55
66
|
|
|
56
67
|
puts "Processed #{results.length} files"
|
|
@@ -64,6 +75,7 @@ end
|
|
|
64
75
|
# ============================================================================
|
|
65
76
|
|
|
66
77
|
def extraction_with_config
|
|
78
|
+
# Configure OCR
|
|
67
79
|
config = {
|
|
68
80
|
ocr: {
|
|
69
81
|
backend: 'tesseract',
|
|
@@ -129,8 +141,11 @@ end
|
|
|
129
141
|
# Example ActiveJob for async processing in Rails
|
|
130
142
|
# < ApplicationJob
|
|
131
143
|
class DocumentExtractionJob
|
|
144
|
+
# queue_as :default
|
|
145
|
+
|
|
132
146
|
def perform(file_path)
|
|
133
147
|
result = Kreuzberg.extract_file_sync(file_path)
|
|
148
|
+
# Store result in database or process further
|
|
134
149
|
puts "Background extraction complete: #{result[:content][0..100]}"
|
|
135
150
|
end
|
|
136
151
|
end
|
|
@@ -147,6 +162,7 @@ def concurrent_with_parallel_gem
|
|
|
147
162
|
|
|
148
163
|
files = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf', 'doc4.pdf']
|
|
149
164
|
|
|
165
|
+
# Process files in parallel using multiple CPU cores
|
|
150
166
|
results = Parallel.map(files, in_processes: 4) do |file|
|
|
151
167
|
Kreuzberg.extract_file_sync(file)
|
|
152
168
|
end
|
|
@@ -176,6 +192,7 @@ end
|
|
|
176
192
|
# ============================================================================
|
|
177
193
|
|
|
178
194
|
def register_postprocessor
|
|
195
|
+
# Register a Ruby-based post-processor
|
|
179
196
|
uppercase_processor = lambda do |result|
|
|
180
197
|
result[:content] = result[:content].upcase
|
|
181
198
|
result
|
|
@@ -183,9 +200,11 @@ def register_postprocessor
|
|
|
183
200
|
|
|
184
201
|
Kreuzberg.register_post_processor('uppercase', uppercase_processor, 100)
|
|
185
202
|
|
|
203
|
+
# Now all extractions will use the uppercase processor
|
|
186
204
|
result = Kreuzberg.extract_file_sync('document.pdf')
|
|
187
205
|
puts "Uppercase content: #{result[:content]}"
|
|
188
206
|
|
|
207
|
+
# Clean up
|
|
189
208
|
Kreuzberg.unregister_post_processor('uppercase')
|
|
190
209
|
end
|
|
191
210
|
|
|
@@ -194,12 +213,14 @@ end
|
|
|
194
213
|
# ============================================================================
|
|
195
214
|
|
|
196
215
|
def register_validator
|
|
216
|
+
# Register a Ruby-based validator
|
|
197
217
|
min_length_validator = lambda do |result|
|
|
198
218
|
raise 'Content too short' if result[:content].length < 100
|
|
199
219
|
end
|
|
200
220
|
|
|
201
221
|
Kreuzberg.register_validator('min_length', min_length_validator, 100)
|
|
202
222
|
|
|
223
|
+
# Validation will run automatically during extraction
|
|
203
224
|
begin
|
|
204
225
|
result = Kreuzberg.extract_file_sync('short_document.pdf')
|
|
205
226
|
puts "Validation passed: #{result[:content]}"
|
|
@@ -207,6 +228,7 @@ def register_validator
|
|
|
207
228
|
puts "Validation failed: #{e.message}"
|
|
208
229
|
end
|
|
209
230
|
|
|
231
|
+
# Clean up
|
|
210
232
|
Kreuzberg.unregister_validator('min_length')
|
|
211
233
|
end
|
|
212
234
|
|
|
@@ -214,9 +236,12 @@ end
|
|
|
214
236
|
# Pattern 15: Custom Ruby OCR Backend Plugin
|
|
215
237
|
# ============================================================================
|
|
216
238
|
|
|
217
|
-
# Example OCR backend implementation for custom processing.
|
|
218
239
|
class CustomOcrBackend
|
|
219
240
|
def process_image(image_bytes, language)
|
|
241
|
+
# In a real implementation, you would:
|
|
242
|
+
# 1. Call an external OCR service
|
|
243
|
+
# 2. Use an HTTP API
|
|
244
|
+
# 3. Process with a Ruby gem
|
|
220
245
|
"Extracted text from #{image_bytes.length} bytes using #{language}"
|
|
221
246
|
end
|
|
222
247
|
|
|
@@ -229,6 +254,7 @@ def register_ocr_backend
|
|
|
229
254
|
backend = CustomOcrBackend.new
|
|
230
255
|
Kreuzberg.register_ocr_backend('custom', backend)
|
|
231
256
|
|
|
257
|
+
# Now you can use the custom backend
|
|
232
258
|
config = {
|
|
233
259
|
ocr: {
|
|
234
260
|
backend: 'custom',
|
|
@@ -280,4 +306,35 @@ def main
|
|
|
280
306
|
register_validator
|
|
281
307
|
end
|
|
282
308
|
|
|
309
|
+
# Run if executed directly
|
|
283
310
|
main if __FILE__ == $PROGRAM_NAME
|
|
311
|
+
|
|
312
|
+
# ============================================================================
|
|
313
|
+
# Key Takeaways:
|
|
314
|
+
#
|
|
315
|
+
# 1. Ruby bindings use Tokio runtime with block_on() internally
|
|
316
|
+
# 2. "Async" functions block the Ruby GVL - no concurrency benefit
|
|
317
|
+
# 3. Use _sync variants for clarity (same performance)
|
|
318
|
+
# 4. Use Ruby threads or Parallel gem for concurrent processing
|
|
319
|
+
# 5. Batch API is most efficient for multiple files
|
|
320
|
+
# 6. ActiveJob for background processing in Rails
|
|
321
|
+
# 7. Ruby plugins (PostProcessor, Validator, OCR) are fully supported
|
|
322
|
+
#
|
|
323
|
+
# Performance Comparison:
|
|
324
|
+
# - Magnus: Blocks GVL, same overhead as sync (~Xms per call)
|
|
325
|
+
# - PyO3 (optimized): ~0.17ms overhead, GIL released during await
|
|
326
|
+
# - NAPI-RS: ~0ms overhead, automatic Promise conversion
|
|
327
|
+
#
|
|
328
|
+
# When to Use Ruby Bindings:
|
|
329
|
+
# ✅ Rails applications (ActiveJob for background processing)
|
|
330
|
+
# ✅ Ruby scripts (existing Ruby codebases)
|
|
331
|
+
# ✅ Simple extraction (single-file processing)
|
|
332
|
+
# ✅ Batch processing (batch API handles concurrency)
|
|
333
|
+
#
|
|
334
|
+
# Consider Other Bindings For:
|
|
335
|
+
# ❌ High concurrency (use Node.js/NAPI-RS instead)
|
|
336
|
+
# ❌ Real-time processing (use Node.js/NAPI-RS instead)
|
|
337
|
+
# ❌ I/O-bound workloads (use Python/PyO3 or Node.js/NAPI-RS)
|
|
338
|
+
#
|
|
339
|
+
# See packages/ruby/ext/kreuzberg_rb/native/README.md for detailed async runtime documentation.
|
|
340
|
+
# ============================================================================
|
data/ext/kreuzberg_rb/extconf.rb
CHANGED
|
@@ -3,57 +3,27 @@
|
|
|
3
3
|
require 'mkmf'
|
|
4
4
|
require 'rb_sys/mkmf'
|
|
5
5
|
require 'rbconfig'
|
|
6
|
-
require 'fileutils'
|
|
7
|
-
|
|
8
|
-
if Gem.win_platform?
|
|
9
|
-
# Use CI-provided CARGO_TARGET_DIR if available, otherwise use a short path
|
|
10
|
-
# GitHub Actions sets CARGO_TARGET_DIR=C:\t for MAX_PATH mitigation
|
|
11
|
-
if ENV['CARGO_TARGET_DIR']
|
|
12
|
-
puts "Windows detected: Using existing CARGO_TARGET_DIR=#{ENV['CARGO_TARGET_DIR']}"
|
|
13
|
-
else
|
|
14
|
-
# Try C:\t first (CI convention), fall back to D:/kz-build
|
|
15
|
-
short_target_dir = Dir.exist?('C:/t') ? 'C:/t' : 'C:/kz-build'
|
|
16
|
-
begin
|
|
17
|
-
FileUtils.mkdir_p(short_target_dir)
|
|
18
|
-
ENV['CARGO_TARGET_DIR'] = short_target_dir
|
|
19
|
-
ENV['OUT_DIR'] = short_target_dir
|
|
20
|
-
puts "Windows detected: Using short build path #{short_target_dir}"
|
|
21
|
-
rescue StandardError => e
|
|
22
|
-
puts "Warning: Could not create short path #{short_target_dir}: #{e.message}"
|
|
23
|
-
# Fall back to relative path which rb_sys will handle
|
|
24
|
-
end
|
|
25
|
-
end
|
|
26
|
-
end
|
|
27
6
|
|
|
28
7
|
if /mswin|mingw/.match?(RbConfig::CONFIG['host_os'])
|
|
29
8
|
devkit = ENV.fetch('RI_DEVKIT', nil)
|
|
30
9
|
prefix = ENV['MSYSTEM_PREFIX'] || '/ucrt64'
|
|
31
|
-
|
|
32
|
-
# Set up include paths for MSVC compatibility headers
|
|
33
|
-
native_include = File.expand_path('native/include', __dir__).tr('\\', '/')
|
|
34
10
|
compat_include = File.expand_path('native/include/msvc_compat', __dir__).tr('\\', '/')
|
|
35
11
|
|
|
36
12
|
extra_args = []
|
|
37
|
-
extra_args << "-I#{native_include}"
|
|
38
13
|
extra_args << "-I#{compat_include}"
|
|
39
14
|
|
|
40
|
-
# Add Windows-specific flags for better compatibility
|
|
41
|
-
extra_args << '-fms-extensions'
|
|
42
|
-
extra_args << '-fno-omit-frame-pointer'
|
|
43
|
-
|
|
44
15
|
if devkit
|
|
45
|
-
sysroot = "#{devkit}#{prefix}".tr('
|
|
46
|
-
extra_args.
|
|
16
|
+
sysroot = "#{devkit}#{prefix}".tr('\\\\', '/')
|
|
17
|
+
extra_args.concat([
|
|
18
|
+
'--target=x86_64-pc-windows-gnu',
|
|
19
|
+
"--sysroot=#{sysroot}"
|
|
20
|
+
])
|
|
47
21
|
end
|
|
48
22
|
|
|
49
23
|
unless extra_args.empty?
|
|
50
24
|
existing = ENV['BINDGEN_EXTRA_CLANG_ARGS'].to_s.split(/\s+/).reject(&:empty?)
|
|
51
25
|
ENV['BINDGEN_EXTRA_CLANG_ARGS'] = (existing + extra_args).uniq.join(' ')
|
|
52
|
-
puts "BINDGEN_EXTRA_CLANG_ARGS set to: #{ENV.fetch('BINDGEN_EXTRA_CLANG_ARGS', nil)}"
|
|
53
26
|
end
|
|
54
|
-
|
|
55
|
-
# Set target for Windows GNU toolchain if not already set
|
|
56
|
-
ENV['CARGO_BUILD_TARGET'] ||= 'x86_64-pc-windows-gnu' if devkit || ENV['MSYSTEM']
|
|
57
27
|
end
|
|
58
28
|
|
|
59
29
|
default_profile = ENV.fetch('CARGO_PROFILE', 'release')
|
|
@@ -1,75 +1,36 @@
|
|
|
1
|
-
[workspace]
|
|
2
|
-
|
|
3
|
-
[workspace.lints.clippy]
|
|
4
|
-
collapsible_if = "allow"
|
|
5
|
-
|
|
6
1
|
[package]
|
|
7
2
|
name = "kreuzberg-rb"
|
|
8
|
-
version = "4.0.0-rc.
|
|
3
|
+
version = "4.0.0-rc.1"
|
|
9
4
|
edition = "2024"
|
|
10
|
-
rust-version = "1.
|
|
5
|
+
rust-version = "1.85"
|
|
11
6
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
12
7
|
license = "MIT"
|
|
13
|
-
repository = "https://github.com/
|
|
8
|
+
repository = "https://github.com/Goldziher/kreuzberg"
|
|
14
9
|
homepage = "https://kreuzberg.dev"
|
|
15
10
|
documentation = "https://docs.rs/kreuzberg"
|
|
16
11
|
readme = "README.md"
|
|
17
12
|
description = "Ruby bindings (Magnus) for Kreuzberg - high-performance document intelligence framework"
|
|
18
|
-
keywords = ["
|
|
19
|
-
categories = ["api-bindings"
|
|
20
|
-
|
|
21
|
-
[lints]
|
|
22
|
-
workspace = true
|
|
13
|
+
keywords = ["document", "extraction", "ocr", "pdf", "ruby"]
|
|
14
|
+
categories = ["api-bindings"]
|
|
23
15
|
|
|
24
16
|
[lib]
|
|
25
17
|
name = "kreuzberg_rb"
|
|
26
18
|
crate-type = ["cdylib", "rlib"]
|
|
27
19
|
|
|
28
20
|
[features]
|
|
29
|
-
default = [
|
|
30
|
-
embeddings = ["kreuzberg/embeddings"]
|
|
21
|
+
default = []
|
|
31
22
|
|
|
32
23
|
[dependencies]
|
|
33
|
-
async-trait = "0.1
|
|
34
|
-
kreuzberg = {
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
"xml",
|
|
41
|
-
"archives",
|
|
42
|
-
"ocr",
|
|
43
|
-
"language-detection",
|
|
44
|
-
"chunking",
|
|
45
|
-
"embeddings",
|
|
46
|
-
"quality",
|
|
47
|
-
"keywords",
|
|
48
|
-
"api",
|
|
49
|
-
"mcp",
|
|
50
|
-
"otel",
|
|
51
|
-
"bundled-pdfium",
|
|
52
|
-
"tokio-runtime",
|
|
53
|
-
] }
|
|
54
|
-
kreuzberg-ffi = { path = "../../../vendor/kreuzberg-ffi" }
|
|
55
|
-
magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = [
|
|
56
|
-
"rb-sys",
|
|
57
|
-
] }
|
|
58
|
-
rb-sys = { version = "0.9.119", default-features = false, features = [
|
|
59
|
-
"stable-api-compiled-fallback",
|
|
60
|
-
] }
|
|
61
|
-
serde_json = "1.0.145"
|
|
62
|
-
tokio = { version = "1.48.0", features = [
|
|
63
|
-
"rt",
|
|
64
|
-
"rt-multi-thread",
|
|
65
|
-
"macros",
|
|
66
|
-
"sync",
|
|
67
|
-
"process",
|
|
68
|
-
"fs",
|
|
69
|
-
"time",
|
|
70
|
-
"io-util",
|
|
71
|
-
] }
|
|
72
|
-
html-to-markdown-rs = { version = "2.14.2", default-features = false }
|
|
24
|
+
async-trait = "0.1"
|
|
25
|
+
kreuzberg = { version = "4.0.0-rc.1", features = ["full", "embeddings"] }
|
|
26
|
+
magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = ["rb-sys"] }
|
|
27
|
+
rb-sys = { version = "0.9.117", default-features = false, features = ["stable-api-compiled-fallback"] }
|
|
28
|
+
serde_json = "1.0"
|
|
29
|
+
tokio = { version = "1.48", features = ["rt", "macros"] }
|
|
30
|
+
html-to-markdown-rs = { version = "2.9.1", default-features = false }
|
|
73
31
|
|
|
74
32
|
[dev-dependencies]
|
|
75
33
|
pretty_assertions = "1.4"
|
|
34
|
+
|
|
35
|
+
[patch.crates-io]
|
|
36
|
+
kreuzberg = { path = "../../../../vendor/kreuzberg" }
|
|
@@ -1,15 +1,17 @@
|
|
|
1
|
+
#[cfg(target_os = "macos")]
|
|
1
2
|
fn main() {
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
} else if target.contains("linux") {
|
|
8
|
-
println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN");
|
|
9
|
-
} else if target.contains("windows") {
|
|
10
|
-
// Windows doesn't need rpath or dynamic_lookup equivalents
|
|
11
|
-
// The linker flags are already configured in .cargo/config.toml
|
|
12
|
-
}
|
|
3
|
+
println!("cargo:rustc-link-arg=-Wl,-undefined,dynamic_lookup");
|
|
4
|
+
// Set rpath to look for libpdfium.dylib in the same directory as the Ruby extension
|
|
5
|
+
println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path");
|
|
6
|
+
println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path/.");
|
|
7
|
+
}
|
|
13
8
|
|
|
14
|
-
|
|
9
|
+
#[cfg(target_os = "linux")]
|
|
10
|
+
fn main() {
|
|
11
|
+
// Set rpath to look for libpdfium.so in the same directory as the Ruby extension
|
|
12
|
+
println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN");
|
|
13
|
+
println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN/.");
|
|
15
14
|
}
|
|
15
|
+
|
|
16
|
+
#[cfg(not(any(target_os = "macos", target_os = "linux")))]
|
|
17
|
+
fn main() {}
|