kreuzberg 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +8 -0
- data/.rspec +3 -0
- data/.rubocop.yaml +534 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +157 -0
- data/README.md +421 -0
- data/Rakefile +25 -0
- data/Steepfile +47 -0
- data/examples/async_patterns.rb +340 -0
- data/ext/kreuzberg_rb/extconf.rb +35 -0
- data/ext/kreuzberg_rb/native/Cargo.toml +36 -0
- data/ext/kreuzberg_rb/native/README.md +425 -0
- data/ext/kreuzberg_rb/native/build.rs +17 -0
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
- data/ext/kreuzberg_rb/native/include/strings.h +20 -0
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +2939 -0
- data/extconf.rb +28 -0
- data/kreuzberg.gemspec +105 -0
- data/lib/kreuzberg/api_proxy.rb +142 -0
- data/lib/kreuzberg/cache_api.rb +45 -0
- data/lib/kreuzberg/cli.rb +55 -0
- data/lib/kreuzberg/cli_proxy.rb +127 -0
- data/lib/kreuzberg/config.rb +684 -0
- data/lib/kreuzberg/errors.rb +50 -0
- data/lib/kreuzberg/extraction_api.rb +84 -0
- data/lib/kreuzberg/mcp_proxy.rb +186 -0
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
- data/lib/kreuzberg/post_processor_protocol.rb +86 -0
- data/lib/kreuzberg/result.rb +216 -0
- data/lib/kreuzberg/setup_lib_path.rb +79 -0
- data/lib/kreuzberg/validator_protocol.rb +89 -0
- data/lib/kreuzberg/version.rb +5 -0
- data/lib/kreuzberg.rb +82 -0
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg/internal.rbs +184 -0
- data/sig/kreuzberg.rbs +468 -0
- data/spec/binding/cache_spec.rb +227 -0
- data/spec/binding/cli_proxy_spec.rb +87 -0
- data/spec/binding/cli_spec.rb +54 -0
- data/spec/binding/config_spec.rb +345 -0
- data/spec/binding/config_validation_spec.rb +283 -0
- data/spec/binding/error_handling_spec.rb +213 -0
- data/spec/binding/errors_spec.rb +66 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
- data/spec/binding/plugins/postprocessor_spec.rb +269 -0
- data/spec/binding/plugins/validator_spec.rb +274 -0
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +39 -0
- data/spec/fixtures/config.yaml +42 -0
- data/spec/fixtures/invalid_config.toml +4 -0
- data/spec/smoke/package_spec.rb +178 -0
- data/spec/spec_helper.rb +42 -0
- data/vendor/kreuzberg/Cargo.toml +134 -0
- data/vendor/kreuzberg/README.md +175 -0
- data/vendor/kreuzberg/build.rs +460 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -0
- data/vendor/kreuzberg/src/api/handlers.rs +199 -0
- data/vendor/kreuzberg/src/api/mod.rs +79 -0
- data/vendor/kreuzberg/src/api/server.rs +353 -0
- data/vendor/kreuzberg/src/api/types.rs +170 -0
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +1143 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +35 -0
- data/vendor/kreuzberg/src/core/config.rs +1032 -0
- data/vendor/kreuzberg/src/core/extractor.rs +903 -0
- data/vendor/kreuzberg/src/core/io.rs +327 -0
- data/vendor/kreuzberg/src/core/mime.rs +615 -0
- data/vendor/kreuzberg/src/core/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/pipeline.rs +906 -0
- data/vendor/kreuzberg/src/embeddings.rs +323 -0
- data/vendor/kreuzberg/src/error.rs +431 -0
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
- data/vendor/kreuzberg/src/extraction/email.rs +854 -0
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
- data/vendor/kreuzberg/src/extraction/html.rs +553 -0
- data/vendor/kreuzberg/src/extraction/image.rs +368 -0
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +564 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +77 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +128 -0
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
- data/vendor/kreuzberg/src/extraction/table.rs +328 -0
- data/vendor/kreuzberg/src/extraction/text.rs +269 -0
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +425 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +479 -0
- data/vendor/kreuzberg/src/extractors/email.rs +129 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +344 -0
- data/vendor/kreuzberg/src/extractors/html.rs +410 -0
- data/vendor/kreuzberg/src/extractors/image.rs +195 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +268 -0
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +496 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +234 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +126 -0
- data/vendor/kreuzberg/src/extractors/text.rs +242 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +128 -0
- data/vendor/kreuzberg/src/image/dpi.rs +164 -0
- data/vendor/kreuzberg/src/image/mod.rs +6 -0
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
- data/vendor/kreuzberg/src/image/resize.rs +89 -0
- data/vendor/kreuzberg/src/keywords/config.rs +154 -0
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
- data/vendor/kreuzberg/src/keywords/rake.rs +294 -0
- data/vendor/kreuzberg/src/keywords/types.rs +68 -0
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
- data/vendor/kreuzberg/src/lib.rs +102 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
- data/vendor/kreuzberg/src/mcp/server.rs +1966 -0
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
- data/vendor/kreuzberg/src/ocr/error.rs +37 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
- data/vendor/kreuzberg/src/ocr/processor.rs +847 -0
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
- data/vendor/kreuzberg/src/ocr/types.rs +393 -0
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
- data/vendor/kreuzberg/src/pdf/error.rs +122 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
- data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
- data/vendor/kreuzberg/src/pdf/table.rs +420 -0
- data/vendor/kreuzberg/src/pdf/text.rs +161 -0
- data/vendor/kreuzberg/src/plugins/extractor.rs +1010 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +629 -0
- data/vendor/kreuzberg/src/plugins/processor.rs +641 -0
- data/vendor/kreuzberg/src/plugins/registry.rs +1324 -0
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
- data/vendor/kreuzberg/src/plugins/validator.rs +955 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
- data/vendor/kreuzberg/src/text/mod.rs +19 -0
- data/vendor/kreuzberg/src/text/quality.rs +697 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
- data/vendor/kreuzberg/src/types.rs +873 -0
- data/vendor/kreuzberg/src/utils/mod.rs +17 -0
- data/vendor/kreuzberg/src/utils/quality.rs +959 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
- data/vendor/kreuzberg/tests/api_tests.rs +966 -0
- data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
- data/vendor/kreuzberg/tests/batch_orchestration.rs +542 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +304 -0
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +509 -0
- data/vendor/kreuzberg/tests/config_features.rs +580 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +439 -0
- data/vendor/kreuzberg/tests/core_integration.rs +493 -0
- data/vendor/kreuzberg/tests/csv_integration.rs +424 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +124 -0
- data/vendor/kreuzberg/tests/email_integration.rs +325 -0
- data/vendor/kreuzberg/tests/error_handling.rs +393 -0
- data/vendor/kreuzberg/tests/format_integration.rs +159 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
- data/vendor/kreuzberg/tests/image_integration.rs +253 -0
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
- data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1412 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +561 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +607 -0
- data/vendor/kreuzberg/tests/security_validation.rs +404 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
- data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
- metadata +471 -0
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kreuzberg
  # Exception hierarchy for Kreuzberg.
  #
  # Every class defined here descends from {Errors::Error}, so
  # `rescue Kreuzberg::Errors::Error` catches all of them.
  module Errors
    # Base error class for all Kreuzberg errors
    class Error < StandardError; end

    # Raised when validation fails
    class ValidationError < Error; end

    # Raised when document parsing fails
    class ParsingError < Error
      # @return [Object, nil] extra detail supplied by whoever raised the error
      attr_reader :context

      # @param message [String, nil] error message; optional so the plain
      #   `raise ParsingError` form works (the message then defaults to the
      #   class name, as for any Ruby exception)
      # @param context [Object, nil] extra detail to expose through {#context}
      def initialize(message = nil, context: nil)
        super(message)
        @context = context
      end
    end

    # Raised when OCR processing fails
    class OCRError < Error
      # @return [Object, nil] extra detail supplied by whoever raised the error
      attr_reader :context

      # @param message [String, nil] error message; optional so the plain
      #   `raise OCRError` form works
      # @param context [Object, nil] extra detail to expose through {#context}
      def initialize(message = nil, context: nil)
        super(message)
        @context = context
      end
    end

    # Raised when a required dependency is missing
    class MissingDependencyError < Error
      # @return [Object, nil] the missing dependency, when the raiser named it
      attr_reader :dependency

      # @param message [String, nil] error message; optional so the plain
      #   `raise MissingDependencyError` form works
      # @param dependency [Object, nil] identifies the missing dependency
      def initialize(message = nil, dependency: nil)
        super(message)
        @dependency = dependency
      end
    end

    # Raised when an I/O operation fails
    #
    # NOTE: inside the Kreuzberg::Errors namespace the bare name `IOError`
    # resolves to this class; write `::IOError` to reach Ruby's built-in one.
    class IOError < Error; end

    # Raised when plugin operations fail
    class PluginError < Error; end

    # Raised when an unsupported file format or MIME type is encountered
    class UnsupportedFormatError < Error; end
  end
end
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kreuzberg
  # Ruby-side wrappers around the native extraction functions.
  #
  # Every wrapper follows the same steps: turn `config` into an options hash
  # with {#normalize_config}, call the matching `native_*` function, wrap the
  # returned hash (or each hash of a batch) in a Result, hand the wrapped value
  # and the options to `record_cache_entry!`, and return the wrapped value.
  #
  # `native_*`, `record_cache_entry!` and `Result` are defined outside this
  # module. The non-`_sync` variants call the native function of the same name
  # without the `_sync` suffix (presumably the asynchronous implementation;
  # not visible from here).
  module ExtractionAPI
    # Extract a single file via `native_extract_file_sync`.
    #
    # @param path [#to_s] file to extract
    # @param mime_type [#to_s, nil] passed to the native call only when truthy
    # @param config [Hash, #to_h, nil] extraction options
    # @return [Result]
    def extract_file_sync(path, mime_type: nil, config: nil)
      options = normalize_config(config)
      positional = [path.to_s]
      positional << mime_type.to_s if mime_type
      extracted = Result.new(native_extract_file_sync(*positional, **options))
      record_cache_entry!(extracted, options)
      extracted
    end

    # Extract in-memory data via `native_extract_bytes_sync`.
    #
    # @param data [#to_s] document content
    # @param mime_type [#to_s] MIME type of `data`
    # @param config [Hash, #to_h, nil] extraction options
    # @return [Result]
    def extract_bytes_sync(data, mime_type, config: nil)
      options = normalize_config(config)
      extracted = Result.new(native_extract_bytes_sync(data.to_s, mime_type.to_s, **options))
      record_cache_entry!(extracted, options)
      extracted
    end

    # Extract several files via `native_batch_extract_files_sync`.
    #
    # @param paths [Array<#to_s>] files to extract
    # @param config [Hash, #to_h, nil] extraction options
    # @return [Array<Result>] one Result per hash returned by the native call
    def batch_extract_files_sync(paths, config: nil)
      options = normalize_config(config)
      raw_results = native_batch_extract_files_sync(paths.map(&:to_s), **options)
      extracted = raw_results.map { |raw| Result.new(raw) }
      record_cache_entry!(extracted, options)
      extracted
    end

    # Extract a single file via `native_extract_file`.
    #
    # @param path [#to_s] file to extract
    # @param mime_type [#to_s, nil] passed to the native call only when truthy
    # @param config [Hash, #to_h, nil] extraction options
    # @return [Result]
    def extract_file(path, mime_type: nil, config: nil)
      options = normalize_config(config)
      positional = [path.to_s]
      positional << mime_type.to_s if mime_type
      extracted = Result.new(native_extract_file(*positional, **options))
      record_cache_entry!(extracted, options)
      extracted
    end

    # Extract in-memory data via `native_extract_bytes`.
    #
    # @param data [#to_s] document content
    # @param mime_type [#to_s] MIME type of `data`
    # @param config [Hash, #to_h, nil] extraction options
    # @return [Result]
    def extract_bytes(data, mime_type, config: nil)
      options = normalize_config(config)
      extracted = Result.new(native_extract_bytes(data.to_s, mime_type.to_s, **options))
      record_cache_entry!(extracted, options)
      extracted
    end

    # Extract several files via `native_batch_extract_files`.
    #
    # @param paths [Array<#to_s>] files to extract
    # @param config [Hash, #to_h, nil] extraction options
    # @return [Array<Result>] one Result per hash returned by the native call
    def batch_extract_files(paths, config: nil)
      options = normalize_config(config)
      raw_results = native_batch_extract_files(paths.map(&:to_s), **options)
      extracted = raw_results.map { |raw| Result.new(raw) }
      record_cache_entry!(extracted, options)
      extracted
    end

    # Extract several in-memory documents via `native_batch_extract_bytes_sync`.
    #
    # @param data_array [Array<#to_s>] document contents
    # @param mime_types [Array<#to_s>] MIME types, passed alongside `data_array`
    # @param config [Hash, #to_h, nil] extraction options
    # @return [Array<Result>] one Result per hash returned by the native call
    def batch_extract_bytes_sync(data_array, mime_types, config: nil)
      options = normalize_config(config)
      payloads = data_array.map(&:to_s)
      types = mime_types.map(&:to_s)
      raw_results = native_batch_extract_bytes_sync(payloads, types, **options)
      extracted = raw_results.map { |raw| Result.new(raw) }
      record_cache_entry!(extracted, options)
      extracted
    end

    # Extract several in-memory documents via `native_batch_extract_bytes`.
    #
    # @param data_array [Array<#to_s>] document contents
    # @param mime_types [Array<#to_s>] MIME types, passed alongside `data_array`
    # @param config [Hash, #to_h, nil] extraction options
    # @return [Array<Result>] one Result per hash returned by the native call
    def batch_extract_bytes(data_array, mime_types, config: nil)
      options = normalize_config(config)
      payloads = data_array.map(&:to_s)
      types = mime_types.map(&:to_s)
      raw_results = native_batch_extract_bytes(payloads, types, **options)
      extracted = raw_results.map { |raw| Result.new(raw) }
      record_cache_entry!(extracted, options)
      extracted
    end

    # Turn a `config:` argument into the hash that is splatted into native calls.
    #
    # @param config [Hash, #to_h, nil]
    # @return [Hash] `{}` for nil, the very same object for a Hash, otherwise
    #   the result of `config.to_h`
    def normalize_config(config)
      if config.nil?
        {}
      elsif config.is_a?(Hash)
        config
      else
        config.to_h
      end
    end
  end
end
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'open3'
|
|
4
|
+
require 'pathname'
|
|
5
|
+
require 'json'
|
|
6
|
+
|
|
7
|
+
module Kreuzberg
|
|
8
|
+
# MCP (Model Context Protocol) server proxy
|
|
9
|
+
#
|
|
10
|
+
# Starts and manages the Kreuzberg MCP server for Claude Desktop integration.
|
|
11
|
+
#
|
|
12
|
+
# @example Start MCP server
|
|
13
|
+
# server = Kreuzberg::MCPProxy::Server.new
|
|
14
|
+
# server.start
|
|
15
|
+
#
|
|
16
|
+
module MCPProxy
|
|
17
|
+
Error = Class.new(Kreuzberg::Errors::Error)
|
|
18
|
+
MissingBinaryError = Class.new(Error)
|
|
19
|
+
ServerError = Class.new(Error)
|
|
20
|
+
|
|
21
|
+
# MCP server instance
#
# Tracks one child process running `<binary> mcp --transport <transport>`.
# With the "stdio" transport the child is started through Open3.popen3 and
# JSON messages are exchanged one per line over its pipes. With the "sse"
# transport the child is spawned with its output attached to this process's
# $stdout/$stderr and then detached.
class Server
  # @return [Integer, nil] PID of the tracked child, nil before start / after stop
  attr_reader :pid
  # @return [String] transport name given at construction, in String form
  attr_reader :transport

  # Initialize MCP server
  #
  # @param transport [String, Symbol] Transport method ("stdio" or "sse").
  #   The value is stored as a String, so `transport: :stdio` behaves exactly
  #   like `transport: 'stdio'` in every later comparison.
  #
  def initialize(transport: 'stdio')
    @transport = transport.to_s
    @pid = nil
    @stdin = nil
    @stdout = nil
    @stderr = nil
  end

  # Start the MCP server
  #
  # @return [Integer, nil] Process ID (for SSE) or nil (for stdio)
  # @raise [ServerError] if the transport is neither "stdio" nor "sse"
  #
  def start
    binary = MCPProxy.find_mcp_binary

    case @transport
    when 'stdio'
      start_stdio(binary)
    when 'sse'
      start_sse(binary)
    else
      raise ServerError, "Unknown transport: #{@transport}"
    end
  end

  # Stop the server
  #
  # Sends TERM to the child and waits for it. Whatever happens, the stored PID
  # is cleared and the pipes are closed. A no-op (apart from that cleanup)
  # when no child is tracked.
  #
  # @return [void]
  #
  def stop
    return unless @pid

    Process.kill('TERM', @pid)
    Process.wait(@pid)
  rescue Errno::ESRCH, Errno::ECHILD
    # Process already dead
  ensure
    @pid = nil
    close_pipes
  end

  # Send a message to the server (stdio only)
  #
  # The message is serialized with JSON.generate and written as one line.
  #
  # @param message [Hash] JSON-RPC message
  # @return [void]
  # @raise [ServerError] if the transport is not "stdio" or the server was not started
  #
  def send_message(message)
    raise ServerError, 'Can only send messages in stdio mode' unless @transport == 'stdio'
    raise ServerError, 'Server not started' unless @stdin

    @stdin.puts(JSON.generate(message))
    @stdin.flush
  end

  # Read a message from the server (stdio only)
  #
  # Reads one line from the child's stdout and parses it as JSON.
  #
  # @return [Hash, nil] JSON-RPC message, or nil when the stream is at EOF
  # @raise [ServerError] if the transport is not "stdio" or the server was not started
  #
  def read_message
    raise ServerError, 'Can only read messages in stdio mode' unless @transport == 'stdio'
    raise ServerError, 'Server not started' unless @stdout

    line = @stdout.gets
    JSON.parse(line) if line
  end

  # Check if server is running
  #
  # Probes the stored PID with signal 0, which checks for the process without
  # delivering a signal.
  #
  # @return [Boolean] false when no PID is stored or the probe raises
  #   ESRCH/EPERM
  #
  def running?
    return false unless @pid

    Process.kill(0, @pid)
    true
  rescue Errno::ESRCH, Errno::EPERM
    false
  end

  private

  # Launch the child with pipes for stdin/stdout/stderr and remember its PID.
  # Always returns nil (see #start).
  def start_stdio(binary)
    @stdin, @stdout, @stderr, wait_thr = Open3.popen3(binary.to_s, 'mcp', '--transport', 'stdio')
    @pid = wait_thr.pid
    nil
  end

  # Launch the child with its output attached to this process's
  # $stdout/$stderr, detach it, and return its PID.
  def start_sse(binary)
    @pid = spawn(
      binary.to_s,
      'mcp',
      '--transport', 'sse',
      out: $stdout,
      err: $stderr
    )
    Process.detach(@pid)
    sleep 1 # Give server time to start
    @pid
  end

  # Close whichever pipes are open and forget them.
  def close_pipes
    @stdin&.close
    @stdout&.close
    @stderr&.close
    @stdin = @stdout = @stderr = nil
  end
end
|
|
136
|
+
|
|
137
|
+
module_function
|
|
138
|
+
|
|
139
|
+
# Start an MCP server, yield it, and always stop it afterwards
#
# The server is stopped when the block returns and also when starting the
# server or the block itself raises.
#
# @param transport [String] Transport method
# @yield [Server] Yields the started server instance
# @return [Object] Block result
#
# @example
#   Kreuzberg::MCPProxy.run(transport: 'stdio') do |server|
#     server.send_message({ method: 'tools/list' })
#     response = server.read_message
#   end
#
def run(transport: 'stdio')
  server = Server.new(transport: transport)
  begin
    server.start
    yield server
  ensure
    server.stop
  end
end
|
|
158
|
+
|
|
159
|
+
# Locate the executable that serves MCP
#
# MCP is served by the kreuzberg CLI, so this looks for `kreuzberg`
# (`kreuzberg.exe` on Windows) among the candidates returned by
# CLIProxy.search_paths and picks the first one that is an existing file.
#
# @return [Pathname] Path to binary
# @raise [MissingBinaryError] If not found
#
def find_mcp_binary
  executable = Gem.win_platform? ? 'kreuzberg.exe' : 'kreuzberg'

  CLIProxy.search_paths(executable).each do |candidate|
    return candidate if candidate.file?
  end

  raise MissingBinaryError, missing_binary_message
end
|
|
172
|
+
|
|
173
|
+
# Build the text used when the kreuzberg binary cannot be located
#
# The heredoc is stripped, so the message has no leading or trailing
# whitespace.
#
# @return [String]
#
def missing_binary_message
  text = <<~MSG
    kreuzberg binary not found for MCP server. Build it with:
    `cargo build --release --package kreuzberg-cli`

    Or ensure kreuzberg is installed with MCP support.
  MSG
  text.strip
end
|
|
185
|
+
end
|
|
186
|
+
end
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kreuzberg
  # Protocol for OCR backends written in Ruby.
  #
  # Include this module in any class that should be registered with the Rust
  # core (via the FFI bridge) as an OCR backend, and override both methods
  # below. Until they are overridden, calling them raises NotImplementedError.
  #
  # OCR backends perform optical character recognition on images and scanned
  # documents. They are called when OCR is enabled in the extraction
  # configuration.
  #
  # @example Implementing a custom OCR backend
  #   class CustomOcrBackend
  #     include Kreuzberg::OcrBackendProtocol
  #
  #     def name
  #       "custom-ocr"
  #     end
  #
  #     def process_image(image_bytes, config)
  #       # Perform OCR on image_bytes
  #       # This is a placeholder - integrate with a real OCR engine
  #       text = my_ocr_engine.recognize(image_bytes, language: config["language"])
  #       text
  #     end
  #   end
  #
  #   backend = CustomOcrBackend.new
  #   Kreuzberg.register_ocr_backend(backend.name, backend)
  #
  #   # Use in extraction
  #   result = Kreuzberg.extract_file_sync(
  #     "scanned.pdf",
  #     config: { ocr: { backend: "custom-ocr", language: "eng" } }
  #   )
  #
  # @example Implementing an OCR backend with initialization
  #   class ModelBasedOcr
  #     include Kreuzberg::OcrBackendProtocol
  #
  #     def initialize
  #       @model = nil
  #     end
  #
  #     def name
  #       "model-ocr"
  #     end
  #
  #     def process_image(image_bytes, config)
  #       # Load model on first use (lazy initialization)
  #       @model ||= load_model
  #
  #       # Run OCR
  #       @model.recognize(image_bytes, config)
  #     end
  #
  #     private
  #
  #     def load_model
  #       # Load ML model for OCR
  #       MyOcrModel.load("path/to/model")
  #     end
  #   end
  #
  #   Kreuzberg.register_ocr_backend("model-ocr", ModelBasedOcr.new)
  #
  module OcrBackendProtocol
    # Unique name of this OCR backend.
    #
    # ExtractionConfig selects the backend by this name:
    #
    #   config = { ocr: { backend: "custom-ocr", language: "eng" } }
    #
    # Use a lowercase string with hyphens (e.g., "custom-ocr", "tesseract").
    #
    # @return [String] Unique backend identifier
    # @raise [NotImplementedError] unless the including class overrides it
    #
    # @example
    #   def name
    #     "custom-ocr"
    #   end
    def name
      implementer = self.class
      raise NotImplementedError, "#{implementer} must implement #name"
    end

    # Run OCR on image bytes and return the recognized text.
    #
    # Receives raw image data (PNG, JPEG, TIFF, etc.) together with an OCR
    # configuration hash and must return the extracted text as a string.
    #
    # The config hash contains OCR settings such as:
    # - "language" [String] - Language code (e.g., "eng", "deu", "fra")
    # - "backend" [String] - Backend name (same as #name)
    # - Additional backend-specific settings
    #
    # @param image_bytes [String] Binary image data (PNG, JPEG, TIFF, etc.)
    # @param config [Hash] OCR configuration with the following keys:
    #   - "language" [String] - Language code for OCR (e.g., "eng", "deu")
    #   - "backend" [String] - Backend name
    #
    # @return [String] Extracted text content
    # @raise [NotImplementedError] unless the including class overrides it
    #
    # @example
    #   def process_image(image_bytes, config)
    #     language = config["language"] || "eng"
    #     text = my_ocr_engine.recognize(image_bytes, language: language)
    #     text
    #   end
    def process_image(image_bytes, config)
      implementer = self.class
      raise NotImplementedError, "#{implementer} must implement #process_image(image_bytes, config)"
    end
  end
end
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kreuzberg
  # Protocol for post-processors written in Ruby.
  #
  # Include this module in any class that should be registered with the Rust
  # core (via the FFI bridge) as a post-processor, and override {#call}. Until
  # it is overridden, calling it raises NotImplementedError.
  #
  # Post-processors enrich extraction results by adding metadata, transforming
  # content, or performing additional analysis. They are called after
  # extraction completes.
  #
  # @example Implementing a simple post-processor
  #   class UpcaseProcessor
  #     include Kreuzberg::PostProcessorProtocol
  #
  #     def call(result)
  #       result["content"] = result["content"].upcase
  #       result
  #     end
  #   end
  #
  #   Kreuzberg.register_post_processor("upcase", UpcaseProcessor.new)
  #
  # @example Implementing a post-processor that adds metadata
  #   class EntityExtractor
  #     include Kreuzberg::PostProcessorProtocol
  #
  #     def call(result)
  #       entities = extract_entities(result["content"])
  #       result["metadata"]["entities"] = entities
  #       result
  #     end
  #
  #     private
  #
  #     def extract_entities(text)
  #       # Extract named entities from text
  #       # This is a placeholder - use a real NER library in production
  #       text.scan(/[A-Z][a-z]+(?:\s[A-Z][a-z]+)*/)
  #     end
  #   end
  #
  #   Kreuzberg.register_post_processor("entities", EntityExtractor.new)
  #
  # @example Using a Proc as a post-processor
  #   Kreuzberg.register_post_processor("word_count", ->(result) {
  #     word_count = result["content"].split.length
  #     result["metadata"]["word_count"] = word_count
  #     result
  #   })
  #
  module PostProcessorProtocol
    # Process and enrich an extraction result.
    #
    # Called after extraction completes. Receives the extraction result as a
    # hash and must return the modified hash. The processor can:
    # - Add new keys to result["metadata"]
    # - Transform result["content"]
    # - Add entries to result["tables"]
    # - Modify any other result fields
    #
    # Existing metadata keys will not be overwritten by the FFI bridge, so it's
    # safe to add new keys without worrying about conflicts.
    #
    # @param result [Hash] Extraction result with the following structure:
    #   - "content" [String] - Extracted text content
    #   - "mime_type" [String] - MIME type of the source document
    #   - "metadata" [Hash] - Document metadata (title, author, etc.)
    #   - "tables" [Array<Hash>] - Extracted tables
    #   - "detected_languages" [Array<String>, nil] - Detected language codes
    #   - "chunks" [Array<String>, nil] - Content chunks (if chunking enabled)
    #
    # @return [Hash] Modified extraction result with enriched metadata
    # @raise [NotImplementedError] unless the including class overrides it
    #
    # @example
    #   def call(result)
    #     text = result["content"]
    #     entities = extract_entities(text)
    #     result["metadata"]["entities"] = entities
    #     result
    #   end
    def call(result)
      implementer = self.class
      raise NotImplementedError, "#{implementer} must implement #call(result)"
    end
  end
end
|