kreuzberg 4.0.0.pre.rc.13 → 4.0.0.pre.rc.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +538 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +104 -2
- data/README.md +454 -454
- data/Rakefile +33 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -341
- data/ext/kreuzberg_rb/extconf.rb +45 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +6750 -6941
- data/ext/kreuzberg_rb/native/Cargo.toml +53 -54
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +52 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +214 -214
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +81 -81
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +724 -724
- data/lib/kreuzberg/error_context.rb +80 -80
- data/lib/kreuzberg/errors.rb +118 -118
- data/lib/kreuzberg/extraction_api.rb +340 -340
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +279 -279
- data/lib/kreuzberg/setup_lib_path.rb +80 -80
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +109 -109
- data/lib/{pdfium.dll → libpdfium.so} +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +546 -546
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/Cargo.toml +2 -2
- data/vendor/kreuzberg/Cargo.toml +5 -5
- data/vendor/kreuzberg/README.md +230 -230
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
- data/vendor/kreuzberg/build.rs +887 -843
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +87 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/config.rs +1080 -1080
- data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +47 -47
- data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
- data/vendor/kreuzberg/src/embeddings.rs +500 -500
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +634 -601
- data/vendor/kreuzberg/src/extraction/image.rs +491 -491
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -574
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
- data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
- data/vendor/kreuzberg/src/extractors/email.rs +157 -157
- data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
- data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
- data/vendor/kreuzberg/src/extractors/html.rs +407 -407
- data/vendor/kreuzberg/src/extractors/image.rs +219 -219
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
- data/vendor/kreuzberg/src/extractors/pdf.rs +749 -749
- data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
- data/vendor/kreuzberg/src/extractors/text.rs +265 -265
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
- data/vendor/kreuzberg/src/lib.rs +113 -113
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
- data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
- data/vendor/kreuzberg/src/pdf/bundled.rs +452 -346
- data/vendor/kreuzberg/src/pdf/error.rs +130 -130
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
- data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
- data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
- data/vendor/kreuzberg/src/pdf/table.rs +420 -420
- data/vendor/kreuzberg/src/pdf/text.rs +240 -240
- data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
- data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +25 -25
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +1055 -1055
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
- data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
- data/vendor/kreuzberg/tests/config_features.rs +612 -612
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
- data/vendor/kreuzberg/tests/core_integration.rs +510 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -327
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -402
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +165 -164
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -255
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
- data/vendor/kreuzberg/tests/security_validation.rs +416 -416
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- data/vendor/kreuzberg-tesseract/LICENSE +22 -22
- data/vendor/kreuzberg-tesseract/README.md +399 -399
- data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
- data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
- data/vendor/rb-sys/Cargo.lock +393 -393
- data/vendor/rb-sys/Cargo.toml +70 -70
- data/vendor/rb-sys/Cargo.toml.orig +57 -57
- data/vendor/rb-sys/LICENSE-APACHE +190 -190
- data/vendor/rb-sys/LICENSE-MIT +21 -21
- data/vendor/rb-sys/build/features.rs +111 -111
- data/vendor/rb-sys/build/main.rs +286 -286
- data/vendor/rb-sys/build/stable_api_config.rs +155 -155
- data/vendor/rb-sys/build/version.rs +50 -50
- data/vendor/rb-sys/readme.md +36 -36
- data/vendor/rb-sys/src/bindings.rs +21 -21
- data/vendor/rb-sys/src/hidden.rs +11 -11
- data/vendor/rb-sys/src/lib.rs +35 -35
- data/vendor/rb-sys/src/macros.rs +371 -371
- data/vendor/rb-sys/src/memory.rs +53 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
- data/vendor/rb-sys/src/special_consts.rs +31 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
- data/vendor/rb-sys/src/stable_api.rs +260 -260
- data/vendor/rb-sys/src/symbol.rs +31 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
- data/vendor/rb-sys/src/utils.rs +89 -89
- data/vendor/rb-sys/src/value_type.rs +7 -7
- metadata +81 -22
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -63
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/build.rs +0 -176
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -27
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -1087
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -3616
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -247
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
data/lib/kreuzberg/api_proxy.rb
CHANGED
|
@@ -1,142 +1,142 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'open3'
|
|
4
|
-
require 'pathname'
|
|
5
|
-
|
|
6
|
-
module Kreuzberg
|
|
7
|
-
# API server proxy
|
|
8
|
-
#
|
|
9
|
-
# Starts and manages the Kreuzberg API server (Litestar/Python-based or Rust-based).
|
|
10
|
-
#
|
|
11
|
-
# @example Start the server
|
|
12
|
-
# server = Kreuzberg::APIProxy::Server.new(port: 8000)
|
|
13
|
-
# server.start
|
|
14
|
-
# # Server runs in background
|
|
15
|
-
# server.stop
|
|
16
|
-
#
|
|
17
|
-
# @example With block
|
|
18
|
-
# Kreuzberg::APIProxy.run(port: 8000) do |server|
|
|
19
|
-
# # Server runs while block executes
|
|
20
|
-
# response = Net::HTTP.get(URI('http://localhost:8000/health'))
|
|
21
|
-
# end
|
|
22
|
-
#
|
|
23
|
-
module APIProxy
|
|
24
|
-
Error = Class.new(Kreuzberg::Errors::Error)
|
|
25
|
-
MissingBinaryError = Class.new(Error)
|
|
26
|
-
ServerError = Class.new(Error)
|
|
27
|
-
|
|
28
|
-
# API server instance
|
|
29
|
-
class Server
|
|
30
|
-
attr_reader :port, :host, :pid
|
|
31
|
-
|
|
32
|
-
# Initialize server
|
|
33
|
-
#
|
|
34
|
-
# @param port [Integer] Port to run on (default: 8000)
|
|
35
|
-
# @param host [String] Host to bind to (default: "0.0.0.0")
|
|
36
|
-
#
|
|
37
|
-
def initialize(port: 8000, host: '0.0.0.0')
|
|
38
|
-
@port = port
|
|
39
|
-
@host = host
|
|
40
|
-
@pid = nil
|
|
41
|
-
@process = nil
|
|
42
|
-
end
|
|
43
|
-
|
|
44
|
-
# Start the server in the background
|
|
45
|
-
#
|
|
46
|
-
# @return [Integer] Process ID
|
|
47
|
-
# @raise [MissingBinaryError] If the kreuzberg binary cannot be found (a server that fails after spawning is not detected here)
|
|
48
|
-
#
|
|
49
|
-
def start
|
|
50
|
-
binary = APIProxy.find_api_binary
|
|
51
|
-
@pid = spawn(
|
|
52
|
-
binary.to_s,
|
|
53
|
-
'api',
|
|
54
|
-
'--host', @host,
|
|
55
|
-
'--port', @port.to_s,
|
|
56
|
-
out: $stdout,
|
|
57
|
-
err: $stderr
|
|
58
|
-
)
|
|
59
|
-
Process.detach(@pid)
|
|
60
|
-
sleep 1 # Give server time to start
|
|
61
|
-
@pid
|
|
62
|
-
end
|
|
63
|
-
|
|
64
|
-
# Stop the server
|
|
65
|
-
#
|
|
66
|
-
# @return [void]
|
|
67
|
-
#
|
|
68
|
-
def stop
|
|
69
|
-
return unless @pid
|
|
70
|
-
|
|
71
|
-
Process.kill('TERM', @pid)
|
|
72
|
-
Process.wait(@pid)
|
|
73
|
-
rescue Errno::ESRCH, Errno::ECHILD
|
|
74
|
-
# Process already dead
|
|
75
|
-
ensure
|
|
76
|
-
@pid = nil
|
|
77
|
-
end
|
|
78
|
-
|
|
79
|
-
# Check if server is running
|
|
80
|
-
#
|
|
81
|
-
# @return [Boolean]
|
|
82
|
-
#
|
|
83
|
-
def running?
|
|
84
|
-
return false unless @pid
|
|
85
|
-
|
|
86
|
-
Process.kill(0, @pid)
|
|
87
|
-
true
|
|
88
|
-
rescue Errno::ESRCH, Errno::EPERM
|
|
89
|
-
false
|
|
90
|
-
end
|
|
91
|
-
end
|
|
92
|
-
|
|
93
|
-
module_function
|
|
94
|
-
|
|
95
|
-
# Run server with a block
|
|
96
|
-
#
|
|
97
|
-
# @param port [Integer] Port to run on
|
|
98
|
-
# @param host [String] Host to bind to
|
|
99
|
-
# @yield [Server] Yields server instance
|
|
100
|
-
# @return [Object] Block result
|
|
101
|
-
#
|
|
102
|
-
# @example
|
|
103
|
-
# Kreuzberg::APIProxy.run(port: 8000) do |server|
|
|
104
|
-
# # Make API requests
|
|
105
|
-
# end
|
|
106
|
-
#
|
|
107
|
-
def run(port: 8000, host: '0.0.0.0')
|
|
108
|
-
server = Server.new(port: port, host: host)
|
|
109
|
-
server.start
|
|
110
|
-
yield server
|
|
111
|
-
ensure
|
|
112
|
-
server&.stop
|
|
113
|
-
end
|
|
114
|
-
|
|
115
|
-
# Find the API binary
|
|
116
|
-
#
|
|
117
|
-
# @return [Pathname] Path to binary
|
|
118
|
-
# @raise [MissingBinaryError] If not found
|
|
119
|
-
#
|
|
120
|
-
def find_api_binary
|
|
121
|
-
# API might be served by kreuzberg CLI or a separate binary
|
|
122
|
-
binary_name = Gem.win_platform? ? 'kreuzberg.exe' : 'kreuzberg'
|
|
123
|
-
found = CLIProxy.search_paths(binary_name).find(&:file?)
|
|
124
|
-
return found if found
|
|
125
|
-
|
|
126
|
-
raise MissingBinaryError, missing_binary_message
|
|
127
|
-
end
|
|
128
|
-
|
|
129
|
-
# Error message for missing binary
|
|
130
|
-
#
|
|
131
|
-
# @return [String]
|
|
132
|
-
#
|
|
133
|
-
def missing_binary_message
|
|
134
|
-
<<~MSG.strip
|
|
135
|
-
kreuzberg binary not found for API server. Build it with:
|
|
136
|
-
`cargo build --release --package kreuzberg-cli`
|
|
137
|
-
|
|
138
|
-
Or ensure kreuzberg is installed with API support.
|
|
139
|
-
MSG
|
|
140
|
-
end
|
|
141
|
-
end
|
|
142
|
-
end
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'open3'
|
|
4
|
+
require 'pathname'
|
|
5
|
+
|
|
6
|
+
module Kreuzberg
|
|
7
|
+
# API server proxy
|
|
8
|
+
#
|
|
9
|
+
# Starts and manages the Kreuzberg API server (Litestar/Python-based or Rust-based).
|
|
10
|
+
#
|
|
11
|
+
# @example Start the server
|
|
12
|
+
# server = Kreuzberg::APIProxy::Server.new(port: 8000)
|
|
13
|
+
# server.start
|
|
14
|
+
# # Server runs in background
|
|
15
|
+
# server.stop
|
|
16
|
+
#
|
|
17
|
+
# @example With block
|
|
18
|
+
# Kreuzberg::APIProxy.run(port: 8000) do |server|
|
|
19
|
+
# # Server runs while block executes
|
|
20
|
+
# response = Net::HTTP.get(URI('http://localhost:8000/health'))
|
|
21
|
+
# end
|
|
22
|
+
#
|
|
23
|
+
module APIProxy
|
|
24
|
+
Error = Class.new(Kreuzberg::Errors::Error)
|
|
25
|
+
MissingBinaryError = Class.new(Error)
|
|
26
|
+
ServerError = Class.new(Error)
|
|
27
|
+
|
|
28
|
+
# API server instance
|
|
29
|
+
class Server
|
|
30
|
+
attr_reader :port, :host, :pid
|
|
31
|
+
|
|
32
|
+
# Initialize server
|
|
33
|
+
#
|
|
34
|
+
# @param port [Integer] Port to run on (default: 8000)
|
|
35
|
+
# @param host [String] Host to bind to (default: "0.0.0.0")
|
|
36
|
+
#
|
|
37
|
+
def initialize(port: 8000, host: '0.0.0.0')
|
|
38
|
+
@port = port
|
|
39
|
+
@host = host
|
|
40
|
+
@pid = nil
|
|
41
|
+
@process = nil
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
# Start the server in the background
|
|
45
|
+
#
|
|
46
|
+
# @return [Integer] Process ID
|
|
47
|
+
# @raise [MissingBinaryError] If the kreuzberg binary cannot be found (a server that fails after spawning is not detected here)
|
|
48
|
+
#
|
|
49
|
+
def start
|
|
50
|
+
binary = APIProxy.find_api_binary
|
|
51
|
+
@pid = spawn(
|
|
52
|
+
binary.to_s,
|
|
53
|
+
'api',
|
|
54
|
+
'--host', @host,
|
|
55
|
+
'--port', @port.to_s,
|
|
56
|
+
out: $stdout,
|
|
57
|
+
err: $stderr
|
|
58
|
+
)
|
|
59
|
+
Process.detach(@pid)
|
|
60
|
+
sleep 1 # Give server time to start
|
|
61
|
+
@pid
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
# Stop the server
|
|
65
|
+
#
|
|
66
|
+
# @return [void]
|
|
67
|
+
#
|
|
68
|
+
def stop
|
|
69
|
+
return unless @pid
|
|
70
|
+
|
|
71
|
+
Process.kill('TERM', @pid)
|
|
72
|
+
Process.wait(@pid)
|
|
73
|
+
rescue Errno::ESRCH, Errno::ECHILD
|
|
74
|
+
# Process already dead
|
|
75
|
+
ensure
|
|
76
|
+
@pid = nil
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
# Check if server is running
|
|
80
|
+
#
|
|
81
|
+
# @return [Boolean]
|
|
82
|
+
#
|
|
83
|
+
def running?
|
|
84
|
+
return false unless @pid
|
|
85
|
+
|
|
86
|
+
Process.kill(0, @pid)
|
|
87
|
+
true
|
|
88
|
+
rescue Errno::ESRCH, Errno::EPERM
|
|
89
|
+
false
|
|
90
|
+
end
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
module_function
|
|
94
|
+
|
|
95
|
+
# Run server with a block
|
|
96
|
+
#
|
|
97
|
+
# @param port [Integer] Port to run on
|
|
98
|
+
# @param host [String] Host to bind to
|
|
99
|
+
# @yield [Server] Yields server instance
|
|
100
|
+
# @return [Object] Block result
|
|
101
|
+
#
|
|
102
|
+
# @example
|
|
103
|
+
# Kreuzberg::APIProxy.run(port: 8000) do |server|
|
|
104
|
+
# # Make API requests
|
|
105
|
+
# end
|
|
106
|
+
#
|
|
107
|
+
def run(port: 8000, host: '0.0.0.0')
|
|
108
|
+
server = Server.new(port: port, host: host)
|
|
109
|
+
server.start
|
|
110
|
+
yield server
|
|
111
|
+
ensure
|
|
112
|
+
server&.stop
|
|
113
|
+
end
|
|
114
|
+
|
|
115
|
+
# Find the API binary
|
|
116
|
+
#
|
|
117
|
+
# @return [Pathname] Path to binary
|
|
118
|
+
# @raise [MissingBinaryError] If not found
|
|
119
|
+
#
|
|
120
|
+
def find_api_binary
|
|
121
|
+
# API might be served by kreuzberg CLI or a separate binary
|
|
122
|
+
binary_name = Gem.win_platform? ? 'kreuzberg.exe' : 'kreuzberg'
|
|
123
|
+
found = CLIProxy.search_paths(binary_name).find(&:file?)
|
|
124
|
+
return found if found
|
|
125
|
+
|
|
126
|
+
raise MissingBinaryError, missing_binary_message
|
|
127
|
+
end
|
|
128
|
+
|
|
129
|
+
# Error message for missing binary
|
|
130
|
+
#
|
|
131
|
+
# @return [String]
|
|
132
|
+
#
|
|
133
|
+
def missing_binary_message
|
|
134
|
+
<<~MSG.strip
|
|
135
|
+
kreuzberg binary not found for API server. Build it with:
|
|
136
|
+
`cargo build --release --package kreuzberg-cli`
|
|
137
|
+
|
|
138
|
+
Or ensure kreuzberg is installed with API support.
|
|
139
|
+
MSG
|
|
140
|
+
end
|
|
141
|
+
end
|
|
142
|
+
end
|
data/lib/kreuzberg/cache_api.rb
CHANGED
|
@@ -1,81 +1,81 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Kreuzberg
  # Provides caching capabilities for extraction results.
  #
  # This module manages the cache for document extraction results. Results are cached
  # based on document content, configuration, and MIME type, improving performance for
  # repeated extractions of the same documents.
  #
  # The object that mixes this module in must respond to +native_clear_cache+ and
  # +native_cache_stats+. Local bookkeeping is kept in the host's +@__cache_tracker+
  # instance variable, which is created lazily on first use so the host does not
  # have to initialise it before calling any method here.
  module CacheAPI
    # Clear all cached extraction results.
    #
    # Removes all entries from both the native Rust cache and the local tracking state.
    # After calling this method, all extraction results will be recomputed on subsequent
    # requests (unless caching is disabled).
    #
    # @return [nil] No meaningful return value
    #
    # @example Clear cache
    #   Kreuzberg.clear_cache
    #   puts "Cache cleared"
    def clear_cache
      native_clear_cache
      reset_cache_tracker!
    end

    # Retrieve cache statistics.
    #
    # Returns information about the current state of the extraction result cache,
    # including the number of cached entries and total memory used. Statistics include
    # both native Rust cache metrics and local tracker metrics.
    #
    # The hash returned by +native_cache_stats+ is updated in place and returned;
    # the totals are written under both String and Symbol keys.
    #
    # @return [Hash{Symbol | String => Integer}] Cache statistics hash containing:
    #   - :total_entries [Integer] Total number of cached extraction results
    #   - :total_size_bytes [Integer] Total memory used by cached results in bytes
    #
    # @example Get cache statistics
    #   stats = Kreuzberg.cache_stats
    #   puts "Cached entries: #{stats[:total_entries]}"
    #   puts "Cache size: #{stats[:total_size_bytes]} bytes"
    #
    # @example Check if cache is full
    #   stats = Kreuzberg.cache_stats
    #   if stats[:total_size_bytes] > 1_000_000_000  # 1GB
    #     Kreuzberg.clear_cache
    #   end
    def cache_stats
      stats = native_cache_stats
      tracker = cache_tracker

      # The native hash may use String or Symbol keys; a missing value counts as 0.
      total_entries = (stats['total_entries'] || stats[:total_entries] || 0) + tracker[:entries]
      total_size = (stats['total_size_bytes'] || stats[:total_size_bytes] || 0) + tracker[:bytes]

      # Publish the combined totals under both key styles so either lookup works.
      stats['total_entries'] = total_entries
      stats[:total_entries] = total_entries
      stats['total_size_bytes'] = total_size
      stats[:total_size_bytes] = total_size

      stats
    end

    private

    # Local bookkeeping hash with +:entries+ and +:bytes+ counters.
    #
    # Created on first access so that methods of this module never operate on a
    # nil tracker; a tracker already assigned by the host is reused as-is.
    #
    # @return [Hash{Symbol => Integer}]
    def cache_tracker
      @__cache_tracker ||= { entries: 0, bytes: 0 }
    end

    # Record one or more extraction results in the local tracker.
    #
    # Does nothing when +opts[:use_cache]+ is present and falsy; a missing key
    # means caching is on.
    #
    # @param results [Object, Array<Object>] A single result or an array of results.
    #   Elements that do not respond to +content+ are skipped.
    # @param opts [Hash] Options hash; only +:use_cache+ is read
    # @return [Object, nil] No meaningful return value
    def record_cache_entry!(results, opts)
      return unless opts.fetch(:use_cache, true)

      tracker = cache_tracker
      results_array = results.is_a?(Array) ? results : [results]
      results_array.each do |result|
        # @type var result: Result
        next unless result.respond_to?(:content)

        tracker[:entries] += 1
        # Size is measured in bytes of the stringified content, not characters.
        tracker[:bytes] += result.content.to_s.bytesize
      end
    end

    # Reset both local tracker counters to zero.
    #
    # @return [nil]
    def reset_cache_tracker!
      tracker = cache_tracker
      tracker[:entries] = 0
      tracker[:bytes] = 0
      nil
    end
  end
end
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kreuzberg
  # Provides caching capabilities for extraction results.
  #
  # This module manages the cache for document extraction results. Results are cached
  # based on document content, configuration, and MIME type, improving performance for
  # repeated extractions of the same documents.
  #
  # The object that mixes this module in must respond to +native_clear_cache+ and
  # +native_cache_stats+. Local bookkeeping is kept in the host's +@__cache_tracker+
  # instance variable, which is created lazily on first use so the host does not
  # have to initialise it before calling any method here.
  module CacheAPI
    # Clear all cached extraction results.
    #
    # Removes all entries from both the native Rust cache and the local tracking state.
    # After calling this method, all extraction results will be recomputed on subsequent
    # requests (unless caching is disabled).
    #
    # @return [nil] No meaningful return value
    #
    # @example Clear cache
    #   Kreuzberg.clear_cache
    #   puts "Cache cleared"
    def clear_cache
      native_clear_cache
      reset_cache_tracker!
    end

    # Retrieve cache statistics.
    #
    # Returns information about the current state of the extraction result cache,
    # including the number of cached entries and total memory used. Statistics include
    # both native Rust cache metrics and local tracker metrics.
    #
    # The hash returned by +native_cache_stats+ is updated in place and returned;
    # the totals are written under both String and Symbol keys.
    #
    # @return [Hash{Symbol | String => Integer}] Cache statistics hash containing:
    #   - :total_entries [Integer] Total number of cached extraction results
    #   - :total_size_bytes [Integer] Total memory used by cached results in bytes
    #
    # @example Get cache statistics
    #   stats = Kreuzberg.cache_stats
    #   puts "Cached entries: #{stats[:total_entries]}"
    #   puts "Cache size: #{stats[:total_size_bytes]} bytes"
    #
    # @example Check if cache is full
    #   stats = Kreuzberg.cache_stats
    #   if stats[:total_size_bytes] > 1_000_000_000  # 1GB
    #     Kreuzberg.clear_cache
    #   end
    def cache_stats
      stats = native_cache_stats
      tracker = cache_tracker

      # The native hash may use String or Symbol keys; a missing value counts as 0.
      total_entries = (stats['total_entries'] || stats[:total_entries] || 0) + tracker[:entries]
      total_size = (stats['total_size_bytes'] || stats[:total_size_bytes] || 0) + tracker[:bytes]

      # Publish the combined totals under both key styles so either lookup works.
      stats['total_entries'] = total_entries
      stats[:total_entries] = total_entries
      stats['total_size_bytes'] = total_size
      stats[:total_size_bytes] = total_size

      stats
    end

    private

    # Local bookkeeping hash with +:entries+ and +:bytes+ counters.
    #
    # Created on first access so that methods of this module never operate on a
    # nil tracker; a tracker already assigned by the host is reused as-is.
    #
    # @return [Hash{Symbol => Integer}]
    def cache_tracker
      @__cache_tracker ||= { entries: 0, bytes: 0 }
    end

    # Record one or more extraction results in the local tracker.
    #
    # Does nothing when +opts[:use_cache]+ is present and falsy; a missing key
    # means caching is on.
    #
    # @param results [Object, Array<Object>] A single result or an array of results.
    #   Elements that do not respond to +content+ are skipped.
    # @param opts [Hash] Options hash; only +:use_cache+ is read
    # @return [Object, nil] No meaningful return value
    def record_cache_entry!(results, opts)
      return unless opts.fetch(:use_cache, true)

      tracker = cache_tracker
      results_array = results.is_a?(Array) ? results : [results]
      results_array.each do |result|
        # @type var result: Result
        next unless result.respond_to?(:content)

        tracker[:entries] += 1
        # Size is measured in bytes of the stringified content, not characters.
        tracker[:bytes] += result.content.to_s.bytesize
      end
    end

    # Reset both local tracker counters to zero.
    #
    # @return [nil]
    def reset_cache_tracker!
      tracker = cache_tracker
      tracker[:entries] = 0
      tracker[:bytes] = 0
      nil
    end
  end
end
|
data/lib/kreuzberg/cli.rb
CHANGED
|
@@ -1,55 +1,55 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Kreuzberg
  # Command-line interface wrapper
  #
  # Provides a Ruby API for the Kreuzberg CLI commands. Each method builds an
  # argument list and passes it to +CLIProxy.call+.
  #
  # @example Extract a file
  #   Kreuzberg::CLI.extract('document.pdf', output: 'text')
  #
  # @example Detect file type
  #   mime_type = Kreuzberg::CLI.detect('document.pdf')
  #
  module CLI
    module_function

    # Extract content from a file using the CLI
    #
    # The path and format are normalised to Strings before being placed in the
    # argument list, so a +Pathname+ path or a Symbol format is accepted as well.
    #
    # @param path [String, #to_path] Path to the file
    # @param output [String, Symbol] Output format ("text", "json", "markdown")
    # @param ocr [Boolean] Enable OCR
    # @return [String] Extracted content
    # @raise [TypeError] if +path+ is neither a String nor responds to +to_path+
    #
    def extract(path, output: 'text', ocr: false)
      # --ocr is always passed explicitly, as the literal "true" or "false".
      args = ['extract', File.path(path), '--format', output.to_s, '--ocr', ocr ? 'true' : 'false']
      CLIProxy.call(args)
    end

    # Detect MIME type of a file using the CLI
    #
    # @param path [String, #to_path] Path to the file
    # @return [String] MIME type, with surrounding whitespace stripped
    # @raise [TypeError] if +path+ is neither a String nor responds to +to_path+
    #
    def detect(path)
      CLIProxy.call(['detect', File.path(path)]).strip
    end

    # Get CLI version
    #
    # @return [String] Version string, with surrounding whitespace stripped
    #
    def version
      CLIProxy.call(['--version']).strip
    end

    # Get CLI help text
    #
    # @return [String] Help text, exactly as returned by the proxy
    #
    def help
      CLIProxy.call(['--help'])
    end
  end
end
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kreuzberg
  # Command-line interface wrapper
  #
  # Provides a Ruby API for the Kreuzberg CLI commands. Each method builds an
  # argument list and passes it to +CLIProxy.call+.
  #
  # @example Extract a file
  #   Kreuzberg::CLI.extract('document.pdf', output: 'text')
  #
  # @example Detect file type
  #   mime_type = Kreuzberg::CLI.detect('document.pdf')
  #
  module CLI
    module_function

    # Extract content from a file using the CLI
    #
    # The path and format are normalised to Strings before being placed in the
    # argument list, so a +Pathname+ path or a Symbol format is accepted as well.
    #
    # @param path [String, #to_path] Path to the file
    # @param output [String, Symbol] Output format ("text", "json", "markdown")
    # @param ocr [Boolean] Enable OCR
    # @return [String] Extracted content
    # @raise [TypeError] if +path+ is neither a String nor responds to +to_path+
    #
    def extract(path, output: 'text', ocr: false)
      # --ocr is always passed explicitly, as the literal "true" or "false".
      args = ['extract', File.path(path), '--format', output.to_s, '--ocr', ocr ? 'true' : 'false']
      CLIProxy.call(args)
    end

    # Detect MIME type of a file using the CLI
    #
    # @param path [String, #to_path] Path to the file
    # @return [String] MIME type, with surrounding whitespace stripped
    # @raise [TypeError] if +path+ is neither a String nor responds to +to_path+
    #
    def detect(path)
      CLIProxy.call(['detect', File.path(path)]).strip
    end

    # Get CLI version
    #
    # @return [String] Version string, with surrounding whitespace stripped
    #
    def version
      CLIProxy.call(['--version']).strip
    end

    # Get CLI help text
    #
    # @return [String] Help text, exactly as returned by the proxy
    #
    def help
      CLIProxy.call(['--help'])
    end
  end
end
|