kreuzberg 4.0.0.rc1 → 4.0.0.rc2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -8
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -534
- data/.rubocop.yml +538 -0
- data/Gemfile +8 -9
- data/Gemfile.lock +9 -109
- data/README.md +426 -421
- data/Rakefile +25 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -340
- data/ext/kreuzberg_rb/extconf.rb +45 -35
- data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
- data/ext/kreuzberg_rb/native/Cargo.toml +44 -36
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -17
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +2998 -2939
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +148 -105
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +46 -45
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +691 -684
- data/lib/kreuzberg/error_context.rb +32 -0
- data/lib/kreuzberg/errors.rb +118 -50
- data/lib/kreuzberg/extraction_api.rb +85 -84
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +216 -216
- data/lib/kreuzberg/setup_lib_path.rb +80 -79
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +103 -82
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +520 -468
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -87
- data/spec/binding/cli_spec.rb +55 -54
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -42
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/kreuzberg/Cargo.toml +204 -134
- data/vendor/kreuzberg/README.md +175 -175
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
- data/vendor/kreuzberg/build.rs +474 -460
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +79 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1143
- data/vendor/kreuzberg/src/chunking/mod.rs +677 -677
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -35
- data/vendor/kreuzberg/src/core/config.rs +1032 -1032
- data/vendor/kreuzberg/src/core/extractor.rs +1024 -903
- data/vendor/kreuzberg/src/core/io.rs +329 -327
- data/vendor/kreuzberg/src/core/mime.rs +605 -615
- data/vendor/kreuzberg/src/core/mod.rs +45 -42
- data/vendor/kreuzberg/src/core/pipeline.rs +984 -906
- data/vendor/kreuzberg/src/embeddings.rs +432 -323
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +40 -40
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +553 -553
- data/vendor/kreuzberg/src/extraction/image.rs +368 -368
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -564
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -77
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -128
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -3000
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +446 -425
- data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +367 -479
- data/vendor/kreuzberg/src/extractors/email.rs +143 -129
- data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +343 -344
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
- data/vendor/kreuzberg/src/extractors/html.rs +393 -410
- data/vendor/kreuzberg/src/extractors/image.rs +198 -195
- data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
- data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +365 -268
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
- data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +493 -496
- data/vendor/kreuzberg/src/extractors/pptx.rs +248 -234
- data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
- data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
- data/vendor/kreuzberg/src/extractors/security.rs +484 -0
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +140 -126
- data/vendor/kreuzberg/src/extractors/text.rs +260 -242
- data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +135 -128
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +267 -267
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -294
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +942 -942
- data/vendor/kreuzberg/src/lib.rs +105 -102
- data/vendor/kreuzberg/src/mcp/mod.rs +32 -32
- data/vendor/kreuzberg/src/mcp/server.rs +1968 -1966
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -847
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -450
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -0
- data/vendor/kreuzberg/src/pdf/error.rs +122 -122
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +346 -346
- data/vendor/kreuzberg/src/pdf/mod.rs +50 -50
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
- data/vendor/kreuzberg/src/pdf/table.rs +393 -420
- data/vendor/kreuzberg/src/pdf/text.rs +158 -161
- data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -1010
- data/vendor/kreuzberg/src/plugins/mod.rs +209 -209
- data/vendor/kreuzberg/src/plugins/ocr.rs +620 -629
- data/vendor/kreuzberg/src/plugins/processor.rs +642 -641
- data/vendor/kreuzberg/src/plugins/registry.rs +1337 -1324
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +956 -955
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +19 -19
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +903 -873
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +543 -543
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -542
- data/vendor/kreuzberg/tests/batch_processing.rs +316 -304
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -509
- data/vendor/kreuzberg/tests/config_features.rs +598 -580
- data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -439
- data/vendor/kreuzberg/tests/core_integration.rs +510 -493
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -424
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -124
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
- data/vendor/kreuzberg/tests/email_integration.rs +325 -325
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
- data/vendor/kreuzberg/tests/error_handling.rs +393 -393
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
- data/vendor/kreuzberg/tests/format_integration.rs +159 -159
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
- data/vendor/kreuzberg/tests/image_integration.rs +253 -253
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +428 -428
- data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -510
- data/vendor/kreuzberg/tests/ocr_errors.rs +676 -676
- data/vendor/kreuzberg/tests/ocr_quality.rs +627 -627
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +43 -43
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -1412
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -771
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -561
- data/vendor/kreuzberg/tests/plugin_system.rs +921 -921
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -607
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
- data/vendor/kreuzberg/tests/security_validation.rs +415 -404
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +609 -609
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/rb-sys/.cargo-ok +1 -0
- data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
- data/vendor/rb-sys/Cargo.lock +393 -0
- data/vendor/rb-sys/Cargo.toml +70 -0
- data/vendor/rb-sys/Cargo.toml.orig +57 -0
- data/vendor/rb-sys/LICENSE-APACHE +190 -0
- data/vendor/rb-sys/LICENSE-MIT +21 -0
- data/vendor/rb-sys/bin/release.sh +21 -0
- data/vendor/rb-sys/build/features.rs +108 -0
- data/vendor/rb-sys/build/main.rs +246 -0
- data/vendor/rb-sys/build/stable_api_config.rs +153 -0
- data/vendor/rb-sys/build/version.rs +48 -0
- data/vendor/rb-sys/readme.md +36 -0
- data/vendor/rb-sys/src/bindings.rs +21 -0
- data/vendor/rb-sys/src/hidden.rs +11 -0
- data/vendor/rb-sys/src/lib.rs +34 -0
- data/vendor/rb-sys/src/macros.rs +371 -0
- data/vendor/rb-sys/src/memory.rs +53 -0
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
- data/vendor/rb-sys/src/special_consts.rs +31 -0
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
- data/vendor/rb-sys/src/stable_api.rs +261 -0
- data/vendor/rb-sys/src/symbol.rs +31 -0
- data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
- data/vendor/rb-sys/src/utils.rs +89 -0
- data/vendor/rb-sys/src/value_type.rs +7 -0
- metadata +90 -95
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/spec/examples.txt +0 -104
- data/vendor/kreuzberg/src/bin/profile_extract.rs +0 -455
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +0 -275
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +0 -178
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +0 -491
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +0 -496
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +0 -1188
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +0 -162
- data/vendor/kreuzberg/src/extractors/pandoc.rs +0 -201
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +0 -92
- data/vendor/kreuzberg/tests/pandoc_integration.rs +0 -503
data/lib/kreuzberg/cache_api.rb
CHANGED
|
@@ -1,45 +1,46 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Kreuzberg
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
stats[
|
|
17
|
-
stats[
|
|
18
|
-
stats[
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
results_array.
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
@__cache_tracker[:
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
@__cache_tracker[:
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
end
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kreuzberg
|
|
4
|
+
# Provides caching capabilities for extraction results.
|
|
5
|
+
module CacheAPI
|
|
6
|
+
def clear_cache
|
|
7
|
+
native_clear_cache
|
|
8
|
+
reset_cache_tracker!
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
def cache_stats
|
|
12
|
+
stats = native_cache_stats
|
|
13
|
+
total_entries = (stats['total_entries'] || stats[:total_entries] || 0) + @__cache_tracker[:entries]
|
|
14
|
+
total_size = (stats['total_size_bytes'] || stats[:total_size_bytes] || 0) + @__cache_tracker[:bytes]
|
|
15
|
+
|
|
16
|
+
stats['total_entries'] = total_entries
|
|
17
|
+
stats[:total_entries] = total_entries
|
|
18
|
+
stats['total_size_bytes'] = total_size
|
|
19
|
+
stats[:total_size_bytes] = total_size
|
|
20
|
+
|
|
21
|
+
stats
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
private
|
|
25
|
+
|
|
26
|
+
def record_cache_entry!(results, opts)
|
|
27
|
+
use_cache = opts.key?(:use_cache) ? opts[:use_cache] : true
|
|
28
|
+
return unless use_cache
|
|
29
|
+
|
|
30
|
+
results_array = results.is_a?(Array) ? results : [results]
|
|
31
|
+
results_array.each do |result|
|
|
32
|
+
# @type var result: Result
|
|
33
|
+
next unless result.respond_to?(:content)
|
|
34
|
+
|
|
35
|
+
@__cache_tracker[:entries] += 1
|
|
36
|
+
@__cache_tracker[:bytes] += result.content.to_s.bytesize
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
def reset_cache_tracker!
|
|
41
|
+
@__cache_tracker[:entries] = 0
|
|
42
|
+
@__cache_tracker[:bytes] = 0
|
|
43
|
+
nil
|
|
44
|
+
end
|
|
45
|
+
end
|
|
46
|
+
end
|
data/lib/kreuzberg/cli.rb
CHANGED
|
@@ -1,55 +1,55 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Kreuzberg
|
|
4
|
-
# Command-line interface wrapper
|
|
5
|
-
#
|
|
6
|
-
# Provides a Ruby API for the Kreuzberg CLI commands.
|
|
7
|
-
#
|
|
8
|
-
# @example Extract a file
|
|
9
|
-
# Kreuzberg::CLI.extract('document.pdf', output: 'text')
|
|
10
|
-
#
|
|
11
|
-
# @example Detect file type
|
|
12
|
-
# mime_type = Kreuzberg::CLI.detect('document.pdf')
|
|
13
|
-
#
|
|
14
|
-
module CLI
|
|
15
|
-
module_function
|
|
16
|
-
|
|
17
|
-
# Extract content from a file using the CLI
|
|
18
|
-
#
|
|
19
|
-
# @param path [String] Path to the file
|
|
20
|
-
# @param output [String] Output format ("text", "json", "markdown")
|
|
21
|
-
# @param ocr [Boolean] Enable OCR
|
|
22
|
-
# @return [String] Extracted content
|
|
23
|
-
#
|
|
24
|
-
def extract(path, output: 'text', ocr: false)
|
|
25
|
-
args = ['extract', path, '--
|
|
26
|
-
args
|
|
27
|
-
CLIProxy.call(args)
|
|
28
|
-
end
|
|
29
|
-
|
|
30
|
-
# Detect MIME type of a file using the CLI
|
|
31
|
-
#
|
|
32
|
-
# @param path [String] Path to the file
|
|
33
|
-
# @return [String] MIME type
|
|
34
|
-
#
|
|
35
|
-
def detect(path)
|
|
36
|
-
CLIProxy.call(['detect', path]).strip
|
|
37
|
-
end
|
|
38
|
-
|
|
39
|
-
# Get CLI version
|
|
40
|
-
#
|
|
41
|
-
# @return [String] Version string
|
|
42
|
-
#
|
|
43
|
-
def version
|
|
44
|
-
CLIProxy.call(['--version']).strip
|
|
45
|
-
end
|
|
46
|
-
|
|
47
|
-
# Get CLI help text
|
|
48
|
-
#
|
|
49
|
-
# @return [String] Help text
|
|
50
|
-
#
|
|
51
|
-
def help
|
|
52
|
-
CLIProxy.call(['--help'])
|
|
53
|
-
end
|
|
54
|
-
end
|
|
55
|
-
end
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kreuzberg
|
|
4
|
+
# Command-line interface wrapper
|
|
5
|
+
#
|
|
6
|
+
# Provides a Ruby API for the Kreuzberg CLI commands.
|
|
7
|
+
#
|
|
8
|
+
# @example Extract a file
|
|
9
|
+
# Kreuzberg::CLI.extract('document.pdf', output: 'text')
|
|
10
|
+
#
|
|
11
|
+
# @example Detect file type
|
|
12
|
+
# mime_type = Kreuzberg::CLI.detect('document.pdf')
|
|
13
|
+
#
|
|
14
|
+
module CLI
|
|
15
|
+
module_function
|
|
16
|
+
|
|
17
|
+
# Extract content from a file using the CLI
|
|
18
|
+
#
|
|
19
|
+
# @param path [String] Path to the file
|
|
20
|
+
# @param output [String] Output format ("text", "json", "markdown")
|
|
21
|
+
# @param ocr [Boolean] Enable OCR
|
|
22
|
+
# @return [String] Extracted content
|
|
23
|
+
#
|
|
24
|
+
def extract(path, output: 'text', ocr: false)
|
|
25
|
+
args = ['extract', path, '--format', output]
|
|
26
|
+
args.push('--ocr', ocr ? 'true' : 'false')
|
|
27
|
+
CLIProxy.call(args)
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
# Detect MIME type of a file using the CLI
|
|
31
|
+
#
|
|
32
|
+
# @param path [String] Path to the file
|
|
33
|
+
# @return [String] MIME type
|
|
34
|
+
#
|
|
35
|
+
def detect(path)
|
|
36
|
+
CLIProxy.call(['detect', path]).strip
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
# Get CLI version
|
|
40
|
+
#
|
|
41
|
+
# @return [String] Version string
|
|
42
|
+
#
|
|
43
|
+
def version
|
|
44
|
+
CLIProxy.call(['--version']).strip
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
# Get CLI help text
|
|
48
|
+
#
|
|
49
|
+
# @return [String] Help text
|
|
50
|
+
#
|
|
51
|
+
def help
|
|
52
|
+
CLIProxy.call(['--help'])
|
|
53
|
+
end
|
|
54
|
+
end
|
|
55
|
+
end
|
data/lib/kreuzberg/cli_proxy.rb
CHANGED
|
@@ -1,127 +1,127 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'open3'
|
|
4
|
-
require 'pathname'
|
|
5
|
-
|
|
6
|
-
module Kreuzberg
|
|
7
|
-
# CLI binary proxy
|
|
8
|
-
#
|
|
9
|
-
# Provides access to the Kreuzberg CLI binary built from crates/kreuzberg-cli.
|
|
10
|
-
#
|
|
11
|
-
# @example
|
|
12
|
-
# output = Kreuzberg::CLIProxy.call(['extract', 'document.pdf'])
|
|
13
|
-
# puts output
|
|
14
|
-
#
|
|
15
|
-
module CLIProxy
|
|
16
|
-
Error = Class.new(Kreuzberg::Errors::Error)
|
|
17
|
-
MissingBinaryError = Class.new(Error)
|
|
18
|
-
|
|
19
|
-
# CLI execution error with stderr and exit status
|
|
20
|
-
class CLIExecutionError < Error
|
|
21
|
-
attr_reader :stderr, :status
|
|
22
|
-
|
|
23
|
-
def initialize(message, stderr:, status:)
|
|
24
|
-
super(message)
|
|
25
|
-
@stderr = stderr
|
|
26
|
-
@status = status
|
|
27
|
-
end
|
|
28
|
-
end
|
|
29
|
-
|
|
30
|
-
module_function
|
|
31
|
-
|
|
32
|
-
# Execute the Kreuzberg CLI with given arguments
|
|
33
|
-
#
|
|
34
|
-
# @param argv [Array<String>] Command-line arguments
|
|
35
|
-
# @return [String] Standard output from the CLI
|
|
36
|
-
# @raise [CLIExecutionError] If the CLI exits with non-zero status
|
|
37
|
-
# @raise [MissingBinaryError] If the CLI binary is not found
|
|
38
|
-
#
|
|
39
|
-
# @example Extract a file
|
|
40
|
-
# output = Kreuzberg::CLIProxy.call(['extract', 'document.pdf'])
|
|
41
|
-
#
|
|
42
|
-
# @example Detect file type
|
|
43
|
-
# output = Kreuzberg::CLIProxy.call(['detect', 'document.pdf'])
|
|
44
|
-
#
|
|
45
|
-
def call(argv)
|
|
46
|
-
binary = find_cli_binary
|
|
47
|
-
args = Array(argv).map(&:to_s)
|
|
48
|
-
stdout, stderr, status = Open3.capture3(binary.to_s, *args)
|
|
49
|
-
return stdout if status.success?
|
|
50
|
-
|
|
51
|
-
raise CLIExecutionError.new(
|
|
52
|
-
"kreuzberg CLI exited with status #{status.exitstatus}",
|
|
53
|
-
stderr: stderr,
|
|
54
|
-
status: status.exitstatus
|
|
55
|
-
)
|
|
56
|
-
end
|
|
57
|
-
|
|
58
|
-
# Find the kreuzberg CLI binary
|
|
59
|
-
#
|
|
60
|
-
# Searches in multiple locations:
|
|
61
|
-
# - crates/kreuzberg-cli/target/release/
|
|
62
|
-
# - packages/ruby/lib/bin/
|
|
63
|
-
# - workspace root target/release/
|
|
64
|
-
#
|
|
65
|
-
# @return [Pathname] Path to the CLI binary
|
|
66
|
-
# @raise [MissingBinaryError] If binary not found
|
|
67
|
-
#
|
|
68
|
-
def find_cli_binary
|
|
69
|
-
binary_name = Gem.win_platform? ? 'kreuzberg.exe' : 'kreuzberg'
|
|
70
|
-
found = search_paths(binary_name).find(&:file?)
|
|
71
|
-
return found if found
|
|
72
|
-
|
|
73
|
-
raise MissingBinaryError, missing_binary_message
|
|
74
|
-
end
|
|
75
|
-
|
|
76
|
-
# Get the root path of the Ruby package
|
|
77
|
-
#
|
|
78
|
-
# @return [Pathname] Root path
|
|
79
|
-
#
|
|
80
|
-
def root_path
|
|
81
|
-
@root_path ||= Pathname(__dir__ || '.').join('../..').expand_path
|
|
82
|
-
end
|
|
83
|
-
|
|
84
|
-
# Get the lib path
|
|
85
|
-
#
|
|
86
|
-
# @return [Pathname] Lib path
|
|
87
|
-
#
|
|
88
|
-
def lib_path
|
|
89
|
-
@lib_path ||= Pathname(__dir__ || '.').join('..').expand_path
|
|
90
|
-
end
|
|
91
|
-
|
|
92
|
-
# Search paths for the CLI binary
|
|
93
|
-
#
|
|
94
|
-
# @param binary_name [String] Name of the binary
|
|
95
|
-
# @return [Array<Pathname>] List of paths to search
|
|
96
|
-
#
|
|
97
|
-
def search_paths(binary_name)
|
|
98
|
-
paths = [
|
|
99
|
-
# In lib/bin (for packaged gems)
|
|
100
|
-
lib_path.join('bin', binary_name),
|
|
101
|
-
lib_path.join(binary_name),
|
|
102
|
-
# In local development (packages/ruby)
|
|
103
|
-
root_path.join('../../crates/kreuzberg-cli/target/release', binary_name),
|
|
104
|
-
root_path.join('../../target/release', binary_name)
|
|
105
|
-
]
|
|
106
|
-
|
|
107
|
-
# Try workspace root
|
|
108
|
-
workspace_root = root_path.parent&.parent
|
|
109
|
-
paths << workspace_root.join('target', 'release', binary_name) if workspace_root
|
|
110
|
-
|
|
111
|
-
paths
|
|
112
|
-
end
|
|
113
|
-
|
|
114
|
-
# Error message when binary is missing
|
|
115
|
-
#
|
|
116
|
-
# @return [String] Error message
|
|
117
|
-
#
|
|
118
|
-
def missing_binary_message
|
|
119
|
-
<<~MSG.strip
|
|
120
|
-
kreuzberg CLI binary not found. Build it with:
|
|
121
|
-
`cargo build --release --package kreuzberg-cli`
|
|
122
|
-
|
|
123
|
-
Or install the gem with pre-built binaries.
|
|
124
|
-
MSG
|
|
125
|
-
end
|
|
126
|
-
end
|
|
127
|
-
end
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'open3'
|
|
4
|
+
require 'pathname'
|
|
5
|
+
|
|
6
|
+
module Kreuzberg
|
|
7
|
+
# CLI binary proxy
|
|
8
|
+
#
|
|
9
|
+
# Provides access to the Kreuzberg CLI binary built from crates/kreuzberg-cli.
|
|
10
|
+
#
|
|
11
|
+
# @example
|
|
12
|
+
# output = Kreuzberg::CLIProxy.call(['extract', 'document.pdf'])
|
|
13
|
+
# puts output
|
|
14
|
+
#
|
|
15
|
+
module CLIProxy
|
|
16
|
+
Error = Class.new(Kreuzberg::Errors::Error)
|
|
17
|
+
MissingBinaryError = Class.new(Error)
|
|
18
|
+
|
|
19
|
+
# CLI execution error with stderr and exit status
|
|
20
|
+
class CLIExecutionError < Error
|
|
21
|
+
attr_reader :stderr, :status
|
|
22
|
+
|
|
23
|
+
def initialize(message, stderr:, status:)
|
|
24
|
+
super(message)
|
|
25
|
+
@stderr = stderr
|
|
26
|
+
@status = status
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
module_function
|
|
31
|
+
|
|
32
|
+
# Execute the Kreuzberg CLI with given arguments
|
|
33
|
+
#
|
|
34
|
+
# @param argv [Array<String>] Command-line arguments
|
|
35
|
+
# @return [String] Standard output from the CLI
|
|
36
|
+
# @raise [CLIExecutionError] If the CLI exits with non-zero status
|
|
37
|
+
# @raise [MissingBinaryError] If the CLI binary is not found
|
|
38
|
+
#
|
|
39
|
+
# @example Extract a file
|
|
40
|
+
# output = Kreuzberg::CLIProxy.call(['extract', 'document.pdf'])
|
|
41
|
+
#
|
|
42
|
+
# @example Detect file type
|
|
43
|
+
# output = Kreuzberg::CLIProxy.call(['detect', 'document.pdf'])
|
|
44
|
+
#
|
|
45
|
+
def call(argv)
|
|
46
|
+
binary = find_cli_binary
|
|
47
|
+
args = Array(argv).map(&:to_s)
|
|
48
|
+
stdout, stderr, status = Open3.capture3(binary.to_s, *args)
|
|
49
|
+
return stdout if status.success?
|
|
50
|
+
|
|
51
|
+
raise CLIExecutionError.new(
|
|
52
|
+
"kreuzberg CLI exited with status #{status.exitstatus}",
|
|
53
|
+
stderr: stderr,
|
|
54
|
+
status: status.exitstatus
|
|
55
|
+
)
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
# Find the kreuzberg CLI binary
|
|
59
|
+
#
|
|
60
|
+
# Searches in multiple locations:
|
|
61
|
+
# - crates/kreuzberg-cli/target/release/
|
|
62
|
+
# - packages/ruby/lib/bin/
|
|
63
|
+
# - workspace root target/release/
|
|
64
|
+
#
|
|
65
|
+
# @return [Pathname] Path to the CLI binary
|
|
66
|
+
# @raise [MissingBinaryError] If binary not found
|
|
67
|
+
#
|
|
68
|
+
def find_cli_binary
|
|
69
|
+
binary_name = Gem.win_platform? ? 'kreuzberg.exe' : 'kreuzberg'
|
|
70
|
+
found = search_paths(binary_name).find(&:file?)
|
|
71
|
+
return found if found
|
|
72
|
+
|
|
73
|
+
raise MissingBinaryError, missing_binary_message
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
# Get the root path of the Ruby package
|
|
77
|
+
#
|
|
78
|
+
# @return [Pathname] Root path
|
|
79
|
+
#
|
|
80
|
+
def root_path
|
|
81
|
+
@root_path ||= Pathname(__dir__ || '.').join('../..').expand_path
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
# Get the lib path
|
|
85
|
+
#
|
|
86
|
+
# @return [Pathname] Lib path
|
|
87
|
+
#
|
|
88
|
+
def lib_path
|
|
89
|
+
@lib_path ||= Pathname(__dir__ || '.').join('..').expand_path
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
# Search paths for the CLI binary
|
|
93
|
+
#
|
|
94
|
+
# @param binary_name [String] Name of the binary
|
|
95
|
+
# @return [Array<Pathname>] List of paths to search
|
|
96
|
+
#
|
|
97
|
+
def search_paths(binary_name)
|
|
98
|
+
paths = [
|
|
99
|
+
# In lib/bin (for packaged gems)
|
|
100
|
+
lib_path.join('bin', binary_name),
|
|
101
|
+
lib_path.join(binary_name),
|
|
102
|
+
# In local development (packages/ruby)
|
|
103
|
+
root_path.join('../../crates/kreuzberg-cli/target/release', binary_name),
|
|
104
|
+
root_path.join('../../target/release', binary_name)
|
|
105
|
+
]
|
|
106
|
+
|
|
107
|
+
# Try workspace root
|
|
108
|
+
workspace_root = root_path.parent&.parent
|
|
109
|
+
paths << workspace_root.join('target', 'release', binary_name) if workspace_root
|
|
110
|
+
|
|
111
|
+
paths
|
|
112
|
+
end
|
|
113
|
+
|
|
114
|
+
# Error message when binary is missing
|
|
115
|
+
#
|
|
116
|
+
# @return [String] Error message
|
|
117
|
+
#
|
|
118
|
+
def missing_binary_message
|
|
119
|
+
<<~MSG.strip
|
|
120
|
+
kreuzberg CLI binary not found. Build it with:
|
|
121
|
+
`cargo build --release --package kreuzberg-cli`
|
|
122
|
+
|
|
123
|
+
Or install the gem with pre-built binaries.
|
|
124
|
+
MSG
|
|
125
|
+
end
|
|
126
|
+
end
|
|
127
|
+
end
|