kreuzberg 4.0.0.pre.rc.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/.rubocop.yaml +1 -0
- data/.rubocop.yml +538 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +157 -0
- data/README.md +426 -0
- data/Rakefile +25 -0
- data/Steepfile +47 -0
- data/examples/async_patterns.rb +341 -0
- data/ext/kreuzberg_rb/extconf.rb +45 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
- data/ext/kreuzberg_rb/native/Cargo.toml +44 -0
- data/ext/kreuzberg_rb/native/README.md +425 -0
- data/ext/kreuzberg_rb/native/build.rs +15 -0
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
- data/ext/kreuzberg_rb/native/include/strings.h +20 -0
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +2998 -0
- data/extconf.rb +28 -0
- data/kreuzberg.gemspec +148 -0
- data/lib/kreuzberg/api_proxy.rb +142 -0
- data/lib/kreuzberg/cache_api.rb +46 -0
- data/lib/kreuzberg/cli.rb +55 -0
- data/lib/kreuzberg/cli_proxy.rb +127 -0
- data/lib/kreuzberg/config.rb +691 -0
- data/lib/kreuzberg/error_context.rb +32 -0
- data/lib/kreuzberg/errors.rb +118 -0
- data/lib/kreuzberg/extraction_api.rb +85 -0
- data/lib/kreuzberg/mcp_proxy.rb +186 -0
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
- data/lib/kreuzberg/post_processor_protocol.rb +86 -0
- data/lib/kreuzberg/result.rb +216 -0
- data/lib/kreuzberg/setup_lib_path.rb +80 -0
- data/lib/kreuzberg/validator_protocol.rb +89 -0
- data/lib/kreuzberg/version.rb +5 -0
- data/lib/kreuzberg.rb +103 -0
- data/sig/kreuzberg/internal.rbs +184 -0
- data/sig/kreuzberg.rbs +520 -0
- data/spec/binding/cache_spec.rb +227 -0
- data/spec/binding/cli_proxy_spec.rb +85 -0
- data/spec/binding/cli_spec.rb +55 -0
- data/spec/binding/config_spec.rb +345 -0
- data/spec/binding/config_validation_spec.rb +283 -0
- data/spec/binding/error_handling_spec.rb +213 -0
- data/spec/binding/errors_spec.rb +66 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
- data/spec/binding/plugins/postprocessor_spec.rb +269 -0
- data/spec/binding/plugins/validator_spec.rb +274 -0
- data/spec/fixtures/config.toml +39 -0
- data/spec/fixtures/config.yaml +41 -0
- data/spec/fixtures/invalid_config.toml +4 -0
- data/spec/smoke/package_spec.rb +178 -0
- data/spec/spec_helper.rb +42 -0
- data/vendor/kreuzberg/Cargo.toml +204 -0
- data/vendor/kreuzberg/README.md +175 -0
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
- data/vendor/kreuzberg/build.rs +474 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -0
- data/vendor/kreuzberg/src/api/handlers.rs +199 -0
- data/vendor/kreuzberg/src/api/mod.rs +79 -0
- data/vendor/kreuzberg/src/api/server.rs +353 -0
- data/vendor/kreuzberg/src/api/types.rs +170 -0
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -0
- data/vendor/kreuzberg/src/core/config.rs +1032 -0
- data/vendor/kreuzberg/src/core/extractor.rs +1024 -0
- data/vendor/kreuzberg/src/core/io.rs +329 -0
- data/vendor/kreuzberg/src/core/mime.rs +605 -0
- data/vendor/kreuzberg/src/core/mod.rs +45 -0
- data/vendor/kreuzberg/src/core/pipeline.rs +984 -0
- data/vendor/kreuzberg/src/embeddings.rs +432 -0
- data/vendor/kreuzberg/src/error.rs +431 -0
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
- data/vendor/kreuzberg/src/extraction/email.rs +854 -0
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
- data/vendor/kreuzberg/src/extraction/html.rs +553 -0
- data/vendor/kreuzberg/src/extraction/image.rs +368 -0
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -0
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
- data/vendor/kreuzberg/src/extraction/table.rs +328 -0
- data/vendor/kreuzberg/src/extraction/text.rs +269 -0
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +446 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +367 -0
- data/vendor/kreuzberg/src/extractors/email.rs +143 -0
- data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +343 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
- data/vendor/kreuzberg/src/extractors/html.rs +393 -0
- data/vendor/kreuzberg/src/extractors/image.rs +198 -0
- data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
- data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +365 -0
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
- data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +493 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +248 -0
- data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
- data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
- data/vendor/kreuzberg/src/extractors/security.rs +484 -0
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +140 -0
- data/vendor/kreuzberg/src/extractors/text.rs +260 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +135 -0
- data/vendor/kreuzberg/src/image/dpi.rs +164 -0
- data/vendor/kreuzberg/src/image/mod.rs +6 -0
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
- data/vendor/kreuzberg/src/image/resize.rs +89 -0
- data/vendor/kreuzberg/src/keywords/config.rs +154 -0
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -0
- data/vendor/kreuzberg/src/keywords/types.rs +68 -0
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
- data/vendor/kreuzberg/src/lib.rs +105 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
- data/vendor/kreuzberg/src/mcp/server.rs +1968 -0
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
- data/vendor/kreuzberg/src/ocr/error.rs +37 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -0
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
- data/vendor/kreuzberg/src/ocr/types.rs +393 -0
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
- data/vendor/kreuzberg/src/panic_context.rs +154 -0
- data/vendor/kreuzberg/src/pdf/error.rs +122 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
- data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
- data/vendor/kreuzberg/src/pdf/table.rs +393 -0
- data/vendor/kreuzberg/src/pdf/text.rs +158 -0
- data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +620 -0
- data/vendor/kreuzberg/src/plugins/processor.rs +642 -0
- data/vendor/kreuzberg/src/plugins/registry.rs +1337 -0
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
- data/vendor/kreuzberg/src/plugins/validator.rs +956 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
- data/vendor/kreuzberg/src/text/mod.rs +19 -0
- data/vendor/kreuzberg/src/text/quality.rs +697 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
- data/vendor/kreuzberg/src/types.rs +903 -0
- data/vendor/kreuzberg/src/utils/mod.rs +17 -0
- data/vendor/kreuzberg/src/utils/quality.rs +959 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
- data/vendor/kreuzberg/tests/api_tests.rs +966 -0
- data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +316 -0
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -0
- data/vendor/kreuzberg/tests/config_features.rs +598 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -0
- data/vendor/kreuzberg/tests/core_integration.rs +510 -0
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -0
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
- data/vendor/kreuzberg/tests/email_integration.rs +325 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
- data/vendor/kreuzberg/tests/error_handling.rs +393 -0
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
- data/vendor/kreuzberg/tests/format_integration.rs +159 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
- data/vendor/kreuzberg/tests/image_integration.rs +253 -0
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
- data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -0
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
- data/vendor/kreuzberg/tests/security_validation.rs +415 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
- data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
- data/vendor/rb-sys/.cargo-ok +1 -0
- data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
- data/vendor/rb-sys/Cargo.lock +393 -0
- data/vendor/rb-sys/Cargo.toml +70 -0
- data/vendor/rb-sys/Cargo.toml.orig +57 -0
- data/vendor/rb-sys/LICENSE-APACHE +190 -0
- data/vendor/rb-sys/LICENSE-MIT +21 -0
- data/vendor/rb-sys/bin/release.sh +21 -0
- data/vendor/rb-sys/build/features.rs +108 -0
- data/vendor/rb-sys/build/main.rs +246 -0
- data/vendor/rb-sys/build/stable_api_config.rs +153 -0
- data/vendor/rb-sys/build/version.rs +48 -0
- data/vendor/rb-sys/readme.md +36 -0
- data/vendor/rb-sys/src/bindings.rs +21 -0
- data/vendor/rb-sys/src/hidden.rs +11 -0
- data/vendor/rb-sys/src/lib.rs +34 -0
- data/vendor/rb-sys/src/macros.rs +371 -0
- data/vendor/rb-sys/src/memory.rs +53 -0
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
- data/vendor/rb-sys/src/special_consts.rs +31 -0
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
- data/vendor/rb-sys/src/stable_api.rs +261 -0
- data/vendor/rb-sys/src/symbol.rs +31 -0
- data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
- data/vendor/rb-sys/src/utils.rs +89 -0
- data/vendor/rb-sys/src/value_type.rs +7 -0
- metadata +536 -0
data/extconf.rb
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'mkmf'
|
|
4
|
+
require 'rb_sys/mkmf'
|
|
5
|
+
require 'rbconfig'
|
|
6
|
+
|
|
7
|
+
# Windows hosts only (host_os matching mswin or mingw): when RI_DEVKIT is set
# (presumably the RubyInstaller DevKit root -- confirm), append a GNU target
# triple and a sysroot built from RI_DEVKIT + MSYSTEM_PREFIX to the
# BINDGEN_EXTRA_CLANG_ARGS environment variable.
if /mswin|mingw/.match?(RbConfig::CONFIG['host_os'])
  devkit = ENV.fetch('RI_DEVKIT', nil)
  prefix = ENV['MSYSTEM_PREFIX'] || '/ucrt64'

  if devkit
    # Backslashes are converted so the sysroot is a forward-slash path.
    sysroot = "#{devkit}#{prefix}".tr('\\\\', '/')
    extra_args = [
      '--target=x86_64-pc-windows-gnu',
      "--sysroot=#{sysroot}"
    ]

    # Argument-less split is whitespace-aware: unlike split(/\s+/) it never
    # produces an empty leading token when the variable starts with blanks,
    # so the re-joined value carries no stray leading space.
    existing = ENV['BINDGEN_EXTRA_CLANG_ARGS'].to_s.split
    ENV['BINDGEN_EXTRA_CLANG_ARGS'] = (existing + extra_args).uniq.join(' ')
  end
end

# Cargo profile for the build; CARGO_PROFILE overrides the 'release' default.
default_profile = ENV.fetch('CARGO_PROFILE', 'release')

create_rust_makefile('kreuzberg_rb') do |config|
  config.profile = default_profile.to_sym
  config.ext_dir = File.expand_path('ext/kreuzberg_rb/native', __dir__)
end
|
data/kreuzberg.gemspec
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'lib/kreuzberg/version'
|
|
4
|
+
|
|
5
|
+
# Repository root, taken to be two directories above this gemspec.
repo_root = File.expand_path('../..', __dir__)

# Returns the git-tracked paths below +prefix+ (relative to the repository
# root). The list is empty when git prints nothing.
git_tracked = lambda do |prefix|
  listing = `git -C "#{repo_root}" ls-files -z #{prefix}`
  listing.split("\x0").select { |path| path.start_with?(prefix) }
end

# Include files from packages/ruby
ruby_prefix = 'packages/ruby/'
ruby_files = git_tracked.call(ruby_prefix).map { |path| path.delete_prefix(ruby_prefix) }

# Include the kreuzberg core crate (needed for path patch in Cargo.toml),
# re-rooted from crates/ to vendor/.
core_prefix = 'crates/kreuzberg/'
core_files = git_tracked.call(core_prefix).map { |path| "vendor/#{path.delete_prefix('crates/')}" }

# Glob-based file list used when git reports no tracked files.
fallback_files = Dir.chdir(__dir__) do
  ruby_fallback = Dir.glob(
    %w[
      README.md
      LICENSE
      ext/**/*.rs
      ext/**/*.rb
      ext/**/*.toml
      ext/**/*.lock
      ext/**/*.md
      ext/**/build.rs
      ext/**/Cargo.*
      exe/*
      lib/**/*.rb
      sig/**/*.rbs
      spec/**/*.rb
    ],
    File::FNM_DOTMATCH
  )

  # Fallback for core crate - copy from repo root
  core_fallback = Dir.chdir(repo_root) do
    Dir.glob('crates/kreuzberg/**/*', File::FNM_DOTMATCH)
       .reject { |f| File.directory?(f) }
       .map { |path| "vendor/#{path.delete_prefix('crates/')}" }
  end

  ruby_fallback + core_fallback
end

# Check for vendored crates (copied during CI/packaging)
vendor_files = Dir.chdir(__dir__) do
  # Lists regular files below +dir+, skipping any path containing one of
  # +excluded_segments+ as well as editor/backup leftovers. Returns [] when
  # +dir+ does not exist.
  vendored_tree = lambda do |dir, excluded_segments|
    next [] unless Dir.exist?(dir)

    Dir.glob("#{dir}/**/*", File::FNM_DOTMATCH)
       .reject { |f| File.directory?(f) }
       .reject { |f| excluded_segments.any? { |segment| f.include?(segment) } }
       .grep_v(/\.(swp|bak|tmp)$/)
       .grep_v(/~$/)
  end

  kreuzberg_files = vendored_tree.call('vendor/kreuzberg', ['/.fastembed_cache/', '/target/'])
  rb_sys_files = vendored_tree.call('vendor/rb-sys', ['/target/'])
  workspace_toml = File.exist?('vendor/Cargo.toml') ? ['vendor/Cargo.toml'] : []

  kreuzberg_files + rb_sys_files + workspace_toml
end

# Use git-tracked files if available, otherwise fallback to glob
# Always include vendored files if they exist on disk (for CI packaging)
files = if (ruby_files + core_files).empty?
          # The glob fallback alone would miss vendor/rb-sys and
          # vendor/Cargo.toml, so merge in whatever is vendored on disk.
          (fallback_files + vendor_files).uniq
        elsif vendor_files.any?
          ruby_files + vendor_files
        else
          ruby_files + core_files
        end

# Filter to only include files that actually exist. Paths are relative to the
# gemspec directory, so resolve them there rather than against the caller's
# working directory (which differs when the gemspec is loaded from elsewhere).
files = files.select { |f| File.exist?(File.expand_path(f, __dir__)) }
|
|
98
|
+
|
|
99
|
+
# Gem specification for the kreuzberg Ruby bindings. The packaged file list
# comes from the +files+ local computed earlier in this gemspec.
Gem::Specification.new do |spec|
  spec.name = 'kreuzberg'
  spec.version = Kreuzberg::VERSION
  spec.authors = ['Na\'aman Hirschfeld']
  spec.email = ['nhirschfeld@gmail.com']

  spec.summary = 'High-performance document intelligence framework'
  spec.description = <<~DESC
    Kreuzberg is a multi-language document intelligence framework with a high-performance
    Rust core. Supports extraction, OCR, chunking, and language detection for 30+ file formats
    including PDF, DOCX, PPTX, XLSX, images, and more.
  DESC
  spec.homepage = 'https://github.com/kreuzberg-dev/kreuzberg'
  spec.license = 'MIT'
  spec.required_ruby_version = '>= 3.2.0'

  spec.metadata = {
    'homepage_uri' => spec.homepage,
    'source_code_uri' => 'https://github.com/kreuzberg-dev/kreuzberg',
    'changelog_uri' => 'https://github.com/kreuzberg-dev/kreuzberg/blob/main/CHANGELOG.md',
    'documentation_uri' => 'https://docs.kreuzberg.dev',
    'bug_tracker_uri' => 'https://github.com/kreuzberg-dev/kreuzberg/issues',
    'rubygems_mfa_required' => 'true',
    'keywords' => 'document-intelligence,document-extraction,ocr,rust,bindings'
  }

  spec.files = files
  spec.bindir = 'exe'
  spec.executables = spec.files.grep(%r{^exe/}).map { |exe_path| File.basename(exe_path) }
  spec.require_paths = ['lib']
  spec.extensions = ['ext/kreuzberg_rb/extconf.rb']

  # No runtime dependencies: the gem is self-contained with the Rust extension.

  # Development dependencies, declared in this order. The second group is
  # skipped on Windows platforms.
  build_and_test = [
    ['bundler', '~> 4.0'],
    ['rake', '~> 13.0'],
    ['rake-compiler', '~> 1.2'],
    ['rb_sys', '~> 0.9.119'],
    ['rspec', '~> 3.12']
  ]
  non_windows_tooling = [
    ['rbs', '~> 3.0'],
    ['rubocop', '~> 1.66'],
    ['rubocop-performance', '~> 1.21'],
    ['rubocop-rspec', '~> 3.0'],
    ['steep', '~> 1.8']
  ]
  docs = [['yard', '~> 0.9']]

  dev_dependencies = build_and_test
  dev_dependencies += non_windows_tooling unless Gem.win_platform?
  dev_dependencies += docs

  dev_dependencies.each do |gem_name, requirement|
    spec.add_development_dependency gem_name, requirement
  end
end
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'open3'
|
|
4
|
+
require 'pathname'
|
|
5
|
+
|
|
6
|
+
module Kreuzberg
|
|
7
|
+
# API server proxy
|
|
8
|
+
#
|
|
9
|
+
# Starts and manages the Kreuzberg API server (Litestar/Python-based or Rust-based).
|
|
10
|
+
#
|
|
11
|
+
# @example Start the server
|
|
12
|
+
# server = Kreuzberg::APIProxy.new(port: 8000)
|
|
13
|
+
# server.start
|
|
14
|
+
# # Server runs in background
|
|
15
|
+
# server.stop
|
|
16
|
+
#
|
|
17
|
+
# @example With block
|
|
18
|
+
# Kreuzberg::APIProxy.run(port: 8000) do |server|
|
|
19
|
+
# # Server runs while block executes
|
|
20
|
+
# response = Net::HTTP.get(URI('http://localhost:8000/health'))
|
|
21
|
+
# end
|
|
22
|
+
#
|
|
23
|
+
module APIProxy
|
|
24
|
+
Error = Class.new(Kreuzberg::Errors::Error)
|
|
25
|
+
MissingBinaryError = Class.new(Error)
|
|
26
|
+
ServerError = Class.new(Error)
|
|
27
|
+
|
|
28
|
+
# API server instance
|
|
29
|
+
# Handle for one background API server process.
class Server
  attr_reader :port, :host, :pid

  # Initialize server
  #
  # @param port [Integer] Port to run on (default: 8000)
  # @param host [String] Host to bind to (default: "0.0.0.0")
  #
  def initialize(port: 8000, host: '0.0.0.0')
    @port = port
    @host = host
    @pid = nil
    @process = nil # detach thread watching the spawned child, once started
  end

  # Start the server in the background.
  #
  # Calling this while the server is already running is a no-op that returns
  # the existing PID instead of spawning (and orphaning) a second process.
  #
  # @return [Integer] Process ID
  # @raise [ServerError] If the process cannot be spawned or exits during
  #   the one-second startup grace period
  #
  def start
    return @pid if running?

    binary = APIProxy.find_api_binary
    begin
      @pid = spawn(
        binary.to_s,
        'api',
        '--host', @host.to_s,
        '--port', @port.to_s,
        out: $stdout,
        err: $stderr
      )
    rescue SystemCallError => e
      raise ServerError, "Failed to start API server: #{e.message}"
    end

    # Keep the detach thread: it reaps the child and tells us when it exits.
    @process = Process.detach(@pid)
    sleep 1 # Give server time to start
    return @pid if @process.alive?

    # The child is already gone, so the server never came up.
    exited_pid = @pid
    @pid = nil
    @process = nil
    raise ServerError, "API server process #{exited_pid} exited during startup"
  end

  # Stop the server: send TERM and wait for the process to exit.
  #
  # @return [void]
  #
  def stop
    return unless @pid

    Process.kill('TERM', @pid)
    # When a detach thread exists it is already waiting on the child, so
    # join it rather than issuing a competing wait on the same PID.
    @process ? @process.join : Process.wait(@pid)
    nil
  rescue Errno::ESRCH, Errno::ECHILD
    # Process already dead
    nil
  ensure
    @pid = nil
    @process = nil
  end

  # Check if server is running by probing the PID with signal 0.
  #
  # @return [Boolean]
  #
  def running?
    return false unless @pid

    Process.kill(0, @pid)
    true
  rescue Errno::ESRCH, Errno::EPERM
    false
  end
end
|
|
92
|
+
|
|
93
|
+
module_function
|
|
94
|
+
|
|
95
|
+
# Run server with a block
|
|
96
|
+
#
|
|
97
|
+
# @param port [Integer] Port to run on
|
|
98
|
+
# @param host [String] Host to bind to
|
|
99
|
+
# @yield [Server] Yields server instance
|
|
100
|
+
# @return [Object] Block result
|
|
101
|
+
#
|
|
102
|
+
# @example
|
|
103
|
+
# Kreuzberg::APIProxy.run(port: 8000) do |server|
|
|
104
|
+
# # Make API requests
|
|
105
|
+
# end
|
|
106
|
+
#
|
|
107
|
+
# Starts a server, yields it to the block, and always stops it afterwards.
#
# @param port [Integer] Port to run on
# @param host [String] Host to bind to
# @yield [Server] Yields server instance
# @return [Object] Block result
# @raise [ArgumentError] If no block is given
def run(port: 8000, host: '0.0.0.0')
  # Fail before spawning anything: without a block the server would be
  # started only to be torn down again by the LocalJumpError from yield.
  raise ArgumentError, 'APIProxy.run requires a block' unless block_given?

  server = Server.new(port: port, host: host)
  server.start
  yield server
ensure
  server&.stop
end
|
|
114
|
+
|
|
115
|
+
# Find the API binary
|
|
116
|
+
#
|
|
117
|
+
# @return [Pathname] Path to binary
|
|
118
|
+
# @raise [MissingBinaryError] If not found
|
|
119
|
+
#
|
|
120
|
+
# Locates the kreuzberg executable used to serve the API.
#
# Walks the candidates from CLIProxy.search_paths and returns the first one
# that is a regular file.
#
# @return [Pathname] Path to binary
# @raise [MissingBinaryError] If no candidate is a file
def find_api_binary
  # API might be served by kreuzberg CLI or a separate binary
  executable = Gem.win_platform? ? 'kreuzberg.exe' : 'kreuzberg'

  CLIProxy.search_paths(executable).each do |candidate|
    return candidate if candidate.file?
  end

  raise MissingBinaryError, missing_binary_message
end
|
|
128
|
+
|
|
129
|
+
# Error message for missing binary
|
|
130
|
+
#
|
|
131
|
+
# @return [String]
|
|
132
|
+
#
|
|
133
|
+
# Builds the text shown when the kreuzberg binary cannot be located.
#
# @return [String] Multi-line message without leading or trailing whitespace
def missing_binary_message
  [
    'kreuzberg binary not found for API server. Build it with:',
    '`cargo build --release --package kreuzberg-cli`',
    '',
    'Or ensure kreuzberg is installed with API support.'
  ].join("\n")
end
|
|
141
|
+
end
|
|
142
|
+
end
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kreuzberg
|
|
4
|
+
# Provides caching capabilities for extraction results.
|
|
5
|
+
# Cache helpers layered over the native cache calls. The host object must
# provide +native_clear_cache+ and +native_cache_stats+; a Ruby-side tracker
# (entries/bytes) is kept in @__cache_tracker.
module CacheAPI
  # Clears the native cache and zeroes the Ruby-side tracker.
  #
  # @return [nil]
  def clear_cache
    native_clear_cache
    reset_cache_tracker!
  end

  # Returns the native cache statistics with the tracker's counts added to
  # the totals. Totals are stored under both String and Symbol keys; the
  # hash returned by +native_cache_stats+ is updated in place.
  #
  # @return [Hash] Statistics including total_entries and total_size_bytes
  def cache_stats
    tracker = cache_tracker
    stats = native_cache_stats
    total_entries = (stats['total_entries'] || stats[:total_entries] || 0) + tracker[:entries]
    total_size = (stats['total_size_bytes'] || stats[:total_size_bytes] || 0) + tracker[:bytes]

    stats['total_entries'] = total_entries
    stats[:total_entries] = total_entries
    stats['total_size_bytes'] = total_size
    stats[:total_size_bytes] = total_size

    stats
  end

  private

  # Tracker hash, created on first use so the module also works on hosts
  # that never assigned @__cache_tracker themselves.
  #
  # @return [Hash] Hash with :entries and :bytes counters
  def cache_tracker
    @__cache_tracker ||= { entries: 0, bytes: 0 }
  end

  # Adds one entry (and its content size in bytes) to the tracker for every
  # result that responds to +content+. Does nothing when +opts+ sets
  # :use_cache to a falsy value; a missing key counts as enabled.
  #
  # @param results [Object, Array<Object>] One result or an array of results
  # @param opts [Hash] Options; only :use_cache is read
  def record_cache_entry!(results, opts)
    return unless opts.fetch(:use_cache, true)

    tracker = cache_tracker
    # Deliberately not Array(results): a single result may itself respond
    # to to_a and must still be counted as one entry.
    results_array = results.is_a?(Array) ? results : [results]
    results_array.each do |result|
      # @type var result: Result
      next unless result.respond_to?(:content)

      tracker[:entries] += 1
      tracker[:bytes] += result.content.to_s.bytesize
    end
  end

  # Resets both tracker counters to zero.
  #
  # @return [nil]
  def reset_cache_tracker!
    tracker = cache_tracker
    tracker[:entries] = 0
    tracker[:bytes] = 0
    nil
  end
end
|
|
46
|
+
end
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kreuzberg
|
|
4
|
+
# Command-line interface wrapper
|
|
5
|
+
#
|
|
6
|
+
# Provides a Ruby API for the Kreuzberg CLI commands.
|
|
7
|
+
#
|
|
8
|
+
# @example Extract a file
|
|
9
|
+
# Kreuzberg::CLI.extract('document.pdf', output: 'text')
|
|
10
|
+
#
|
|
11
|
+
# @example Detect file type
|
|
12
|
+
# mime_type = Kreuzberg::CLI.detect('document.pdf')
|
|
13
|
+
#
|
|
14
|
+
# Thin Ruby wrappers that build argument lists and hand them to CLIProxy.call.
module CLI
  module_function

  # Extract content from a file using the CLI
  #
  # @param path [String, Pathname] Path to the file
  # @param output [String] Output format ("text", "json", "markdown")
  # @param ocr [Boolean] Enable OCR
  # @return [String] Extracted content
  #
  def extract(path, output: 'text', ocr: false)
    # File.path turns Pathname-like objects into a String so every argv
    # element is a plain String.
    args = ['extract', File.path(path), '--format', output]
    args.push('--ocr', ocr ? 'true' : 'false')
    CLIProxy.call(args)
  end

  # Detect MIME type of a file using the CLI
  #
  # @param path [String, Pathname] Path to the file
  # @return [String] MIME type (CLI output with surrounding whitespace removed)
  #
  def detect(path)
    CLIProxy.call(['detect', File.path(path)]).strip
  end

  # Get CLI version
  #
  # @return [String] Version string (CLI output with surrounding whitespace removed)
  #
  def version
    CLIProxy.call(['--version']).strip
  end

  # Get CLI help text
  #
  # @return [String] Help text
  #
  def help
    CLIProxy.call(['--help'])
  end
end
|
|
55
|
+
end
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'open3'
|
|
4
|
+
require 'pathname'
|
|
5
|
+
|
|
6
|
+
module Kreuzberg
  # CLI binary proxy
  #
  # Locates the Kreuzberg CLI binary (built from crates/kreuzberg-cli) and
  # runs it as a subprocess, returning its standard output.
  #
  # @example
  #   output = Kreuzberg::CLIProxy.call(['extract', 'document.pdf'])
  #   puts output
  #
  module CLIProxy
    Error = Class.new(Kreuzberg::Errors::Error)
    MissingBinaryError = Class.new(Error)

    # CLI execution error with stderr and exit status
    class CLIExecutionError < Error
      # @return [String] captured standard error of the CLI process
      # @return [Integer, nil] exit status; nil when the process did not exit
      #   normally (for example, it was killed by a signal)
      attr_reader :stderr, :status

      # @param message [String] human-readable failure description
      # @param stderr [String] captured standard error
      # @param status [Integer, nil] process exit status
      def initialize(message, stderr:, status:)
        super(message)
        @stderr = stderr
        @status = status
      end
    end

    module_function

    # Execute the Kreuzberg CLI with given arguments
    #
    # @param argv [Array<String>] Command-line arguments (each is converted with #to_s)
    # @return [String] Standard output from the CLI
    # @raise [CLIExecutionError] If the CLI exits with non-zero status or is
    #   terminated by a signal
    # @raise [MissingBinaryError] If the CLI binary is not found
    #
    # @example Extract a file
    #   output = Kreuzberg::CLIProxy.call(['extract', 'document.pdf'])
    #
    # @example Detect file type
    #   output = Kreuzberg::CLIProxy.call(['detect', 'document.pdf'])
    #
    def call(argv)
      binary = find_cli_binary
      args = Array(argv).map(&:to_s)
      stdout, stderr, status = Open3.capture3(binary.to_s, *args)
      return stdout if status.success?

      # exitstatus is nil when the process was killed by a signal; report the
      # signal instead of an empty status in that case.
      reason = if status.signaled?
                 "was terminated by signal #{status.termsig}"
               else
                 "exited with status #{status.exitstatus}"
               end

      raise CLIExecutionError.new(
        "kreuzberg CLI #{reason}",
        stderr: stderr,
        status: status.exitstatus
      )
    end

    # Find the kreuzberg CLI binary
    #
    # Returns the first existing regular file among the candidates from
    # {search_paths}, which are, in order:
    # - <lib>/bin/
    # - <lib>/
    # - <package root>/../../crates/kreuzberg-cli/target/release/
    # - <package root>/../../target/release/
    # - workspace root target/release/
    #
    # @return [Pathname] Path to the CLI binary
    # @raise [MissingBinaryError] If binary not found
    #
    def find_cli_binary
      binary_name = Gem.win_platform? ? 'kreuzberg.exe' : 'kreuzberg'
      found = search_paths(binary_name).find(&:file?)
      return found if found

      raise MissingBinaryError, missing_binary_message
    end

    # Get the root path of the Ruby package
    #
    # Resolved as two directories above this file's directory and memoized.
    #
    # @return [Pathname] Root path
    #
    def root_path
      @root_path ||= Pathname(__dir__ || '.').join('../..').expand_path
    end

    # Get the lib path
    #
    # Resolved as one directory above this file's directory and memoized.
    #
    # @return [Pathname] Lib path
    #
    def lib_path
      @lib_path ||= Pathname(__dir__ || '.').join('..').expand_path
    end

    # Search paths for the CLI binary
    #
    # @param binary_name [String] Name of the binary
    # @return [Array<Pathname>] List of paths to search, in priority order
    #
    def search_paths(binary_name)
      # Pathname#parent always returns a Pathname, so no nil guard is needed.
      workspace_root = root_path.parent.parent

      [
        # In lib/bin (for packaged gems)
        lib_path.join('bin', binary_name),
        lib_path.join(binary_name),
        # In local development (packages/ruby)
        root_path.join('../../crates/kreuzberg-cli/target/release', binary_name),
        root_path.join('../../target/release', binary_name),
        # Workspace root
        workspace_root.join('target', 'release', binary_name)
      ]
    end

    # Error message when binary is missing
    #
    # @return [String] Error message
    #
    def missing_binary_message
      <<~MSG.strip
        kreuzberg CLI binary not found. Build it with:
        `cargo build --release --package kreuzberg-cli`

        Or install the gem with pre-built binaries.
      MSG
    end
  end
end
|