kreuzberg 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +8 -0
- data/.rspec +3 -0
- data/.rubocop.yaml +534 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +157 -0
- data/README.md +421 -0
- data/Rakefile +25 -0
- data/Steepfile +47 -0
- data/examples/async_patterns.rb +340 -0
- data/ext/kreuzberg_rb/extconf.rb +35 -0
- data/ext/kreuzberg_rb/native/Cargo.toml +36 -0
- data/ext/kreuzberg_rb/native/README.md +425 -0
- data/ext/kreuzberg_rb/native/build.rs +17 -0
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
- data/ext/kreuzberg_rb/native/include/strings.h +20 -0
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +2939 -0
- data/extconf.rb +28 -0
- data/kreuzberg.gemspec +105 -0
- data/lib/kreuzberg/api_proxy.rb +142 -0
- data/lib/kreuzberg/cache_api.rb +45 -0
- data/lib/kreuzberg/cli.rb +55 -0
- data/lib/kreuzberg/cli_proxy.rb +127 -0
- data/lib/kreuzberg/config.rb +684 -0
- data/lib/kreuzberg/errors.rb +50 -0
- data/lib/kreuzberg/extraction_api.rb +84 -0
- data/lib/kreuzberg/mcp_proxy.rb +186 -0
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
- data/lib/kreuzberg/post_processor_protocol.rb +86 -0
- data/lib/kreuzberg/result.rb +216 -0
- data/lib/kreuzberg/setup_lib_path.rb +79 -0
- data/lib/kreuzberg/validator_protocol.rb +89 -0
- data/lib/kreuzberg/version.rb +5 -0
- data/lib/kreuzberg.rb +82 -0
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg/internal.rbs +184 -0
- data/sig/kreuzberg.rbs +468 -0
- data/spec/binding/cache_spec.rb +227 -0
- data/spec/binding/cli_proxy_spec.rb +87 -0
- data/spec/binding/cli_spec.rb +54 -0
- data/spec/binding/config_spec.rb +345 -0
- data/spec/binding/config_validation_spec.rb +283 -0
- data/spec/binding/error_handling_spec.rb +213 -0
- data/spec/binding/errors_spec.rb +66 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
- data/spec/binding/plugins/postprocessor_spec.rb +269 -0
- data/spec/binding/plugins/validator_spec.rb +274 -0
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +39 -0
- data/spec/fixtures/config.yaml +42 -0
- data/spec/fixtures/invalid_config.toml +4 -0
- data/spec/smoke/package_spec.rb +178 -0
- data/spec/spec_helper.rb +42 -0
- data/vendor/kreuzberg/Cargo.toml +134 -0
- data/vendor/kreuzberg/README.md +175 -0
- data/vendor/kreuzberg/build.rs +460 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -0
- data/vendor/kreuzberg/src/api/handlers.rs +199 -0
- data/vendor/kreuzberg/src/api/mod.rs +79 -0
- data/vendor/kreuzberg/src/api/server.rs +353 -0
- data/vendor/kreuzberg/src/api/types.rs +170 -0
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +1143 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +35 -0
- data/vendor/kreuzberg/src/core/config.rs +1032 -0
- data/vendor/kreuzberg/src/core/extractor.rs +903 -0
- data/vendor/kreuzberg/src/core/io.rs +327 -0
- data/vendor/kreuzberg/src/core/mime.rs +615 -0
- data/vendor/kreuzberg/src/core/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/pipeline.rs +906 -0
- data/vendor/kreuzberg/src/embeddings.rs +323 -0
- data/vendor/kreuzberg/src/error.rs +431 -0
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
- data/vendor/kreuzberg/src/extraction/email.rs +854 -0
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
- data/vendor/kreuzberg/src/extraction/html.rs +553 -0
- data/vendor/kreuzberg/src/extraction/image.rs +368 -0
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +564 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +77 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +128 -0
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
- data/vendor/kreuzberg/src/extraction/table.rs +328 -0
- data/vendor/kreuzberg/src/extraction/text.rs +269 -0
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +425 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +479 -0
- data/vendor/kreuzberg/src/extractors/email.rs +129 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +344 -0
- data/vendor/kreuzberg/src/extractors/html.rs +410 -0
- data/vendor/kreuzberg/src/extractors/image.rs +195 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +268 -0
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +496 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +234 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +126 -0
- data/vendor/kreuzberg/src/extractors/text.rs +242 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +128 -0
- data/vendor/kreuzberg/src/image/dpi.rs +164 -0
- data/vendor/kreuzberg/src/image/mod.rs +6 -0
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
- data/vendor/kreuzberg/src/image/resize.rs +89 -0
- data/vendor/kreuzberg/src/keywords/config.rs +154 -0
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
- data/vendor/kreuzberg/src/keywords/rake.rs +294 -0
- data/vendor/kreuzberg/src/keywords/types.rs +68 -0
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
- data/vendor/kreuzberg/src/lib.rs +102 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
- data/vendor/kreuzberg/src/mcp/server.rs +1966 -0
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
- data/vendor/kreuzberg/src/ocr/error.rs +37 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
- data/vendor/kreuzberg/src/ocr/processor.rs +847 -0
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
- data/vendor/kreuzberg/src/ocr/types.rs +393 -0
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
- data/vendor/kreuzberg/src/pdf/error.rs +122 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
- data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
- data/vendor/kreuzberg/src/pdf/table.rs +420 -0
- data/vendor/kreuzberg/src/pdf/text.rs +161 -0
- data/vendor/kreuzberg/src/plugins/extractor.rs +1010 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +629 -0
- data/vendor/kreuzberg/src/plugins/processor.rs +641 -0
- data/vendor/kreuzberg/src/plugins/registry.rs +1324 -0
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
- data/vendor/kreuzberg/src/plugins/validator.rs +955 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
- data/vendor/kreuzberg/src/text/mod.rs +19 -0
- data/vendor/kreuzberg/src/text/quality.rs +697 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
- data/vendor/kreuzberg/src/types.rs +873 -0
- data/vendor/kreuzberg/src/utils/mod.rs +17 -0
- data/vendor/kreuzberg/src/utils/quality.rs +959 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
- data/vendor/kreuzberg/tests/api_tests.rs +966 -0
- data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
- data/vendor/kreuzberg/tests/batch_orchestration.rs +542 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +304 -0
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +509 -0
- data/vendor/kreuzberg/tests/config_features.rs +580 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +439 -0
- data/vendor/kreuzberg/tests/core_integration.rs +493 -0
- data/vendor/kreuzberg/tests/csv_integration.rs +424 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +124 -0
- data/vendor/kreuzberg/tests/email_integration.rs +325 -0
- data/vendor/kreuzberg/tests/error_handling.rs +393 -0
- data/vendor/kreuzberg/tests/format_integration.rs +159 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
- data/vendor/kreuzberg/tests/image_integration.rs +253 -0
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
- data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1412 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +561 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +607 -0
- data/vendor/kreuzberg/tests/security_validation.rs +404 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
- data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
- metadata +471 -0
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
begin
|
|
4
|
+
require 'json'
|
|
5
|
+
rescue LoadError
|
|
6
|
+
require 'json/pure'
|
|
7
|
+
end
|
|
8
|
+
|
|
9
|
+
module Kreuzberg
  # Extraction result wrapper
  #
  # Provides structured access to extraction results from the native extension.
  # Both the top-level hash and the nested table/chunk/image hashes may use
  # either string or symbol keys.
  #
  # @example
  #   result = Kreuzberg.extract_file_sync("document.pdf")
  #   puts result.content
  #   puts "MIME type: #{result.mime_type}"
  #   puts "Metadata: #{result.metadata.inspect}"
  #   result.tables.each { |table| puts table.inspect }
  #
  # rubocop:disable Metrics/ClassLength
  class Result
    attr_reader :content, :mime_type, :metadata, :metadata_json, :tables,
                :detected_languages, :chunks, :images

    # Table structure
    #
    # @!attribute [r] cells
    #   @return [Array<Array<String>>] Table cells (2D array)
    # @!attribute [r] markdown
    #   @return [String] Markdown representation
    # @!attribute [r] page_number
    #   @return [Integer] Page number where table was found
    #
    Table = Struct.new(:cells, :markdown, :page_number, keyword_init: true) do
      # @return [Hash] Symbol-keyed hash of the table fields
      def to_h
        { cells: cells, markdown: markdown, page_number: page_number }
      end
    end

    # Text chunk
    #
    # @!attribute [r] content
    #   @return [String] Chunk content
    # @!attribute [r] char_start
    #   @return [Integer] Starting character index
    # @!attribute [r] char_end
    #   @return [Integer] Ending character index
    # @!attribute [r] token_count
    #   @return [Integer, nil] Approximate token count (may be nil)
    # @!attribute [r] chunk_index
    #   @return [Object, nil] Passed through from the native hash unchanged
    #     (presumably the position of this chunk - confirm against the extension)
    # @!attribute [r] total_chunks
    #   @return [Object, nil] Passed through from the native hash unchanged
    #     (presumably the number of chunks produced - confirm against the extension)
    # @!attribute [r] embedding
    #   @return [Object, nil] Passed through from the native hash unchanged
    #
    Chunk = Struct.new(
      :content,
      :char_start,
      :char_end,
      :token_count,
      :chunk_index,
      :total_chunks,
      :embedding,
      keyword_init: true
    ) do
      # @return [Hash] Symbol-keyed hash of the chunk fields
      def to_h
        {
          content: content,
          char_start: char_start,
          char_end: char_end,
          token_count: token_count,
          chunk_index: chunk_index,
          total_chunks: total_chunks,
          embedding: embedding
        }
      end
    end

    # Extracted image
    #
    # All fields are copied from the native hash as-is, except:
    # - +data+ is duplicated and tagged as binary when it is a String
    # - +ocr_result+ is wrapped in a nested {Result} when present, otherwise nil
    #
    Image = Struct.new(
      :data,
      :format,
      :image_index,
      :page_number,
      :width,
      :height,
      :colorspace,
      :bits_per_component,
      :is_mask,
      :description,
      :ocr_result,
      keyword_init: true
    ) do
      # @return [Hash] Symbol-keyed hash; a nested OCR result is converted with #to_h
      def to_h
        {
          data: data,
          format: format,
          image_index: image_index,
          page_number: page_number,
          width: width,
          height: height,
          colorspace: colorspace,
          bits_per_component: bits_per_component,
          is_mask: is_mask,
          description: description,
          ocr_result: ocr_result&.to_h
        }
      end
    end

    # Initialize from native hash result
    #
    # @param hash [Hash] Hash returned from native extension
    #
    def initialize(hash)
      # Handle both string and symbol keys for flexibility
      @content = get_value(hash, 'content', '')
      @mime_type = get_value(hash, 'mime_type', '')
      @metadata_json = get_value(hash, 'metadata_json', '{}')
      @metadata = parse_metadata(@metadata_json)
      @tables = parse_tables(get_value(hash, 'tables'))
      @detected_languages = parse_detected_languages(get_value(hash, 'detected_languages'))
      @chunks = parse_chunks(get_value(hash, 'chunks'))
      @images = parse_images(get_value(hash, 'images'))
    end

    # Convert to hash
    #
    # @return [Hash] Hash representation
    #
    def to_h
      {
        content: @content,
        mime_type: @mime_type,
        metadata: @metadata,
        tables: @tables.map(&:to_h),
        detected_languages: @detected_languages,
        chunks: @chunks&.map(&:to_h),
        images: @images&.map(&:to_h)
      }
    end

    # Convert to JSON
    #
    # @param args [Array] Generator arguments, forwarded to Hash#to_json
    # @return [String] JSON representation
    #
    # An explicit splat is used instead of anonymous `*` forwarding so the file
    # also loads on Ruby versions older than 3.2.
    def to_json(*args)
      to_h.to_json(*args)
    end

    private

    # Look up +key+ (a String) in +hash+, trying the string key first and the
    # symbol key second. Only nil counts as "missing", so a stored +false+
    # (e.g. an image's is_mask flag) is returned instead of the default.
    def get_value(hash, key, default = nil)
      value = hash[key]
      value = hash[key.to_sym] if value.nil?
      value.nil? ? default : value
    end

    # Parse the metadata JSON string. Malformed JSON and non-String payloads
    # (JSON.parse raises TypeError for those) both fall back to an empty hash.
    def parse_metadata(metadata_json)
      JSON.parse(metadata_json)
    rescue JSON::ParserError, TypeError
      {}
    end

    # Build Table structs; nil or empty input yields an empty array.
    def parse_tables(tables_data)
      return [] if tables_data.nil? || tables_data.empty?

      tables_data.map do |table_hash|
        Table.new(
          cells: get_value(table_hash, 'cells', []),
          markdown: get_value(table_hash, 'markdown', ''),
          page_number: get_value(table_hash, 'page_number', 0)
        )
      end
    end

    # nil stays nil; any non-Array value is replaced by an empty array.
    def parse_detected_languages(langs_data)
      return nil if langs_data.nil?

      # Detected languages is now just an array of strings
      langs_data.is_a?(Array) ? langs_data : []
    end

    # Build Chunk structs; nil input stays nil.
    def parse_chunks(chunks_data)
      return nil if chunks_data.nil?

      chunks_data.map do |chunk_hash|
        Chunk.new(
          content: get_value(chunk_hash, 'content'),
          char_start: get_value(chunk_hash, 'char_start'),
          char_end: get_value(chunk_hash, 'char_end'),
          token_count: get_value(chunk_hash, 'token_count'),
          chunk_index: get_value(chunk_hash, 'chunk_index'),
          total_chunks: get_value(chunk_hash, 'total_chunks'),
          embedding: get_value(chunk_hash, 'embedding')
        )
      end
    end

    # Build Image structs; nil input stays nil.
    def parse_images(images_data)
      return nil if images_data.nil?

      images_data.map do |image_hash|
        data = get_value(image_hash, 'data')
        # dup first: the incoming string may be frozen or shared with the caller
        data = data.dup.force_encoding(Encoding::BINARY) if data.respond_to?(:force_encoding)
        ocr_hash = get_value(image_hash, 'ocr_result')
        Image.new(
          data: data,
          format: get_value(image_hash, 'format'),
          image_index: get_value(image_hash, 'image_index'),
          page_number: get_value(image_hash, 'page_number'),
          width: get_value(image_hash, 'width'),
          height: get_value(image_hash, 'height'),
          colorspace: get_value(image_hash, 'colorspace'),
          bits_per_component: get_value(image_hash, 'bits_per_component'),
          is_mask: get_value(image_hash, 'is_mask'),
          description: get_value(image_hash, 'description'),
          ocr_result: ocr_hash ? Result.new(ocr_hash) : nil
        )
      end
    end
  end
  # rubocop:enable Metrics/ClassLength
end
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'rbconfig'
|
|
4
|
+
require 'open3'
|
|
5
|
+
|
|
6
|
+
module Kreuzberg
  # Adjusts the process environment so the directory above this file's
  # directory is on the platform's dynamic-library search variable, and, on
  # macOS, rewrites the extension bundle's reference to libpdfium so it is
  # resolved relative to the bundle.
  module SetupLibPath
    module_function

    # Entry point: pick the search-path variable(s) for the host OS and
    # prepend the library directory. Unrecognised platforms are left untouched.
    #
    # @return [void]
    def configure
      lib_dir = File.expand_path('..', __dir__ || '.')
      host_os = RbConfig::CONFIG['host_os']

      case host_os
      when /darwin/
        prepend_env('DYLD_LIBRARY_PATH', lib_dir)
        prepend_env('DYLD_FALLBACK_LIBRARY_PATH', "#{lib_dir}:/usr/local/lib:/usr/lib")
        fix_macos_install_name(lib_dir)
      when /linux/
        prepend_env('LD_LIBRARY_PATH', lib_dir)
      when /mswin|mingw|cygwin/
        prepend_env('PATH', lib_dir, separator: ';')
      end
    end

    # Prepend the entries of +value+ to ENV[key], skipping entries that are
    # already present.
    #
    # +value+ may itself contain several separator-joined entries. Each entry is
    # compared individually, so calling this repeatedly with the same value does
    # not grow the variable. The existing content of the variable is kept
    # verbatim after the newly added entries.
    #
    # @param key [String] Environment variable name
    # @param value [String] One or more entries joined by +separator+
    # @param separator [String] Entry separator used by the variable
    # @return [void]
    def prepend_env(key, value, separator: ':')
      current = ENV.fetch(key, nil)
      existing = current.to_s.split(separator)
      additions = value.split(separator).uniq - existing
      return if additions.empty?

      prefix = additions.join(separator)
      ENV[key] = current.nil? || current.empty? ? prefix : "#{prefix}#{separator}#{current}"
    end
    private_class_method :prepend_env

    # Best-effort fix-up of the macOS bundle's library references. Does nothing
    # unless both the bundle and libpdfium are present in +lib_dir+.
    #
    # @param lib_dir [String] Directory expected to contain the bundle and dylib
    # @return [void]
    def fix_macos_install_name(lib_dir)
      bundle = macos_bundle(lib_dir)
      return unless bundle

      ensure_install_name(bundle)
      ensure_loader_rpath(bundle)
    rescue SystemCallError, IOError
      # Tool missing or not runnable (e.g., on CI). This step is optional: the dynamic
      # loader can still use the updated env vars, so never let it abort loading the gem.
    end
    private_class_method :fix_macos_install_name

    # @param lib_dir [String]
    # @return [String, nil] Path of the extension bundle, or nil when the bundle
    #   or libpdfium.dylib is missing from +lib_dir+
    def macos_bundle(lib_dir)
      bundle = File.join(lib_dir, 'kreuzberg_rb.bundle')
      pdfium = File.join(lib_dir, 'libpdfium.dylib')
      return unless File.exist?(bundle) && File.exist?(pdfium)

      bundle
    end
    private_class_method :macos_bundle

    # Inspect the bundle with `otool -L` and, for every known libpdfium
    # reference found in its output, rewrite it to `@loader_path/libpdfium.dylib`
    # via `install_name_tool -change`. The exit status of install_name_tool is
    # not checked.
    #
    # @param bundle [String] Path of the extension bundle
    # @return [void]
    def ensure_install_name(bundle)
      output, status = Open3.capture2('otool', '-L', bundle)
      return unless status.success?

      replacements = {
        './libpdfium.dylib' => '@loader_path/libpdfium.dylib',
        '@rpath/libpdfium.dylib' => '@loader_path/libpdfium.dylib'
      }

      replacements.each do |current, desired|
        next unless output.include?(current)

        Open3.capture2('install_name_tool', '-change', current, desired, bundle)
      end
    end
    private_class_method :ensure_install_name

    # Add an `@loader_path` rpath to the bundle unless `otool -l` already
    # mentions `@loader_path`.
    #
    # NOTE(review): the check is a plain substring match over the whole
    # `otool -l` output, so any occurrence of "@loader_path" suppresses the
    # rpath addition - confirm this is intended.
    #
    # @param bundle [String] Path of the extension bundle
    # @return [void]
    def ensure_loader_rpath(bundle)
      rpath_output, rpath_status = Open3.capture2('otool', '-l', bundle)
      return unless rpath_status.success? && !rpath_output.include?('@loader_path')

      Open3.capture2('install_name_tool', '-add_rpath', '@loader_path', bundle)
    end
    private_class_method :ensure_loader_rpath
  end
end
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kreuzberg
  # Contract for Ruby validators.
  #
  # Every Ruby validator registered with the Rust core through the FFI bridge
  # is expected to follow this protocol. Validators run during extraction and
  # inspect the result; to reject a result, a validator raises
  # Kreuzberg::Errors::ValidationError, which makes the extraction fail.
  #
  # @example Rejecting content below a minimum length
  #   class MinimumLengthValidator
  #     include Kreuzberg::ValidatorProtocol
  #
  #     def initialize(min_length = 10)
  #       @min_length = min_length
  #     end
  #
  #     def call(result)
  #       if result["content"].length < @min_length
  #         raise Kreuzberg::Errors::ValidationError.new(
  #           "Content too short: #{result["content"].length} < #{@min_length}"
  #         )
  #       end
  #     end
  #   end
  #
  #   Kreuzberg.register_validator("min_length", MinimumLengthValidator.new(100))
  #
  # @example Checking content and metadata quality
  #   class QualityValidator
  #     include Kreuzberg::ValidatorProtocol
  #
  #     def call(result)
  #       # Content must not be blank
  #       if result["content"].strip.empty?
  #         raise Kreuzberg::Errors::ValidationError.new("Empty content extracted")
  #       end
  #
  #       # Metadata must be present
  #       if result["metadata"].empty?
  #         raise Kreuzberg::Errors::ValidationError.new("No metadata extracted")
  #       end
  #     end
  #   end
  #
  #   Kreuzberg.register_validator("quality", QualityValidator.new)
  #
  # @example Registering a lambda instead of a class
  #   Kreuzberg.register_validator("not_empty", ->(result) {
  #     if result["content"].strip.empty?
  #       raise Kreuzberg::Errors::ValidationError.new("Content cannot be empty")
  #     end
  #   })
  #
  module ValidatorProtocol
    # Check a single extraction result.
    #
    # Invoked during extraction. Return normally to accept the result; raise
    # Kreuzberg::Errors::ValidationError with a message describing the problem
    # to reject it.
    #
    # The hash passed in has the same layout as the one given to
    # post-processors (PostProcessorProtocol documents it in detail).
    #
    # @param result [Hash] Extraction result to validate, structured as:
    #   - "content" [String] - Extracted text content
    #   - "mime_type" [String] - MIME type of the source document
    #   - "metadata" [Hash] - Document metadata (title, author, etc.)
    #   - "tables" [Array<Hash>] - Extracted tables
    #   - "detected_languages" [Array<String>, nil] - Detected language codes
    #   - "chunks" [Array<String>, nil] - Content chunks (if chunking enabled)
    #
    # @return [void]
    # @raise [Kreuzberg::Errors::ValidationError] if validation fails
    #
    # @example
    #   def call(result)
    #     if result["content"].length < 10
    #       raise Kreuzberg::Errors::ValidationError.new("Content too short")
    #     end
    #   end
    def call(result)
      # Default body: including classes are required to override this method.
      implementer = self.class
      raise NotImplementedError, "#{implementer} must implement #call(result)"
    end
  end
end
|
data/lib/kreuzberg.rb
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'kreuzberg/setup_lib_path'
|
|
4
|
+
Kreuzberg::SetupLibPath.configure
|
|
5
|
+
|
|
6
|
+
require_relative 'kreuzberg/version'
|
|
7
|
+
require 'kreuzberg_rb'
|
|
8
|
+
|
|
9
|
+
module Kreuzberg
  # Components loaded lazily on first constant reference.
  {
    Config: 'kreuzberg/config',
    Result: 'kreuzberg/result',
    CLI: 'kreuzberg/cli',
    CLIProxy: 'kreuzberg/cli_proxy',
    APIProxy: 'kreuzberg/api_proxy',
    MCPProxy: 'kreuzberg/mcp_proxy',
    Errors: 'kreuzberg/errors',
    PostProcessorProtocol: 'kreuzberg/post_processor_protocol',
    ValidatorProtocol: 'kreuzberg/validator_protocol',
    OcrBackendProtocol: 'kreuzberg/ocr_backend_protocol'
  }.each { |const_name, path| autoload const_name, path }

  # Alias for API consistency with other language bindings
  ExtractionConfig = Config::Extraction

  @__cache_tracker = { entries: 0, bytes: 0 }

  class << self
    # Keep each native extraction method reachable under a private
    # `native_`-prefixed alias.
    %i[
      extract_file_sync
      extract_bytes_sync
      batch_extract_files_sync
      extract_file
      extract_bytes
      batch_extract_files
      batch_extract_bytes_sync
      batch_extract_bytes
    ].each do |native_method|
      hidden_name = :"native_#{native_method}"
      alias_method hidden_name, native_method
      private hidden_name
    end

    # The cache helpers get `native_` aliases as well; these two are not made private.
    %i[clear_cache cache_stats].each do |native_method|
      alias_method :"native_#{native_method}", native_method
    end
  end

  # Plugin-registry entry points exposed as module functions:
  #
  # - register_post_processor:   register a Ruby post-processor that conforms to PostProcessorProtocol
  # - unregister_post_processor: remove a post-processor by name
  # - clear_post_processors:     purge all registered post-processors
  # - register_validator:        register a validator that follows ValidatorProtocol
  # - unregister_validator:      remove a validator by name
  # - clear_validators:          purge all validators
  # - list_validators:           list all registered validators
  # - list_post_processors:      list all registered post-processors
  # - register_ocr_backend:      register an OCR backend instance implementing OcrBackendProtocol
  # - unregister_ocr_backend:    unregister an OCR backend by name
  # - list_ocr_backends:         list all registered OCR backends
  module_function :register_post_processor,
                  :unregister_post_processor,
                  :clear_post_processors,
                  :register_validator,
                  :unregister_validator,
                  :clear_validators,
                  :list_validators,
                  :list_post_processors,
                  :register_ocr_backend,
                  :unregister_ocr_backend,
                  :list_ocr_backends
end
|
|
77
|
+
|
|
78
|
+
require_relative 'kreuzberg/cache_api'
|
|
79
|
+
require_relative 'kreuzberg/extraction_api'
|
|
80
|
+
|
|
81
|
+
# Layer the Ruby-side API modules in front of the module's singleton methods.
# CacheAPI is prepended first and ExtractionAPI second, as before.
[Kreuzberg::CacheAPI, Kreuzberg::ExtractionAPI].each do |api_module|
  Kreuzberg.singleton_class.prepend(api_module)
end
|
|
Binary file
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
# Internal modules - not part of public API but fully typed
|
|
2
|
+
|
|
3
|
+
module Kreuzberg
|
|
4
|
+
module SetupLibPath
|
|
5
|
+
# Public method
|
|
6
|
+
def configure: () -> void
|
|
7
|
+
def self.configure: () -> void
|
|
8
|
+
|
|
9
|
+
# Internal helpers (not public API; module_function exposes each as both an instance method and a module-level method)
|
|
10
|
+
def prepend_env: (String key, String value, ?separator: String) -> void
|
|
11
|
+
def self.prepend_env: (String key, String value, ?separator: String) -> void
|
|
12
|
+
def fix_macos_install_name: (String lib_dir) -> void
|
|
13
|
+
def self.fix_macos_install_name: (String lib_dir) -> void
|
|
14
|
+
def macos_bundle: (String lib_dir) -> String?
|
|
15
|
+
def self.macos_bundle: (String lib_dir) -> String?
|
|
16
|
+
def ensure_install_name: (String bundle) -> void
|
|
17
|
+
def self.ensure_install_name: (String bundle) -> void
|
|
18
|
+
def ensure_loader_rpath: (String bundle) -> void
|
|
19
|
+
def self.ensure_loader_rpath: (String bundle) -> void
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
module CLI
|
|
23
|
+
# All methods are both instance and class methods due to module_function
|
|
24
|
+
def extract: (String path, ?output: String, ?ocr: bool) -> String
|
|
25
|
+
def self.extract: (String path, ?output: String, ?ocr: bool) -> String
|
|
26
|
+
def detect: (String path) -> String
|
|
27
|
+
def self.detect: (String path) -> String
|
|
28
|
+
def version: () -> String
|
|
29
|
+
def self.version: () -> String
|
|
30
|
+
def help: () -> String
|
|
31
|
+
def self.help: () -> String
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
module CLIProxy
|
|
35
|
+
class Error < Kreuzberg::Errors::Error
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
class MissingBinaryError < Error
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
class CLIExecutionError < Error
|
|
42
|
+
attr_reader stderr: String
|
|
43
|
+
attr_reader status: Integer
|
|
44
|
+
|
|
45
|
+
def initialize: (String message, stderr: String, status: Integer) -> void
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
# All methods are both instance and class methods due to module_function
|
|
49
|
+
def call: (Array[String] argv) -> String
|
|
50
|
+
def self.call: (Array[String] argv) -> String
|
|
51
|
+
def find_cli_binary: () -> Pathname
|
|
52
|
+
def self.find_cli_binary: () -> Pathname
|
|
53
|
+
def root_path: () -> Pathname
|
|
54
|
+
def self.root_path: () -> Pathname
|
|
55
|
+
def lib_path: () -> Pathname
|
|
56
|
+
def self.lib_path: () -> Pathname
|
|
57
|
+
def search_paths: (String binary_name) -> Array[Pathname]
|
|
58
|
+
def self.search_paths: (String binary_name) -> Array[Pathname]
|
|
59
|
+
def missing_binary_message: () -> String
|
|
60
|
+
def self.missing_binary_message: () -> String
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
module APIProxy
|
|
64
|
+
class Error < Kreuzberg::Errors::Error
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
class MissingBinaryError < Error
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
class ServerError < Error
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
class Server
|
|
74
|
+
attr_reader port: Integer
|
|
75
|
+
attr_reader host: String
|
|
76
|
+
attr_reader process: Process::Status?
|
|
77
|
+
|
|
78
|
+
def initialize: (?port: Integer, ?host: String) -> void
|
|
79
|
+
def start: () -> void
|
|
80
|
+
def stop: () -> void
|
|
81
|
+
def running?: () -> bool
|
|
82
|
+
def health_check: () -> bool
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
# Module function methods are both instance and class methods
|
|
86
|
+
def run: (?port: Integer, ?host: String) { (Server) -> untyped } -> untyped
|
|
87
|
+
def self.run: (?port: Integer, ?host: String) { (Server) -> untyped } -> untyped
|
|
88
|
+
def start_server: (?port: Integer?) -> Server
|
|
89
|
+
def self.start_server: (?port: Integer?) -> Server
|
|
90
|
+
def extract_file: (String path, ?mime_type: String?, ?config: Hash[Symbol, untyped]?, ?server: Server?) -> Kreuzberg::Result
|
|
91
|
+
def self.extract_file: (String path, ?mime_type: String?, ?config: Hash[Symbol, untyped]?, ?server: Server?) -> Kreuzberg::Result
|
|
92
|
+
def batch_extract: (Array[String] paths, ?config: Hash[Symbol, untyped]?, ?server: Server?) -> Array[Kreuzberg::Result]
|
|
93
|
+
def self.batch_extract: (Array[String] paths, ?config: Hash[Symbol, untyped]?, ?server: Server?) -> Array[Kreuzberg::Result]
|
|
94
|
+
def find_api_binary: () -> Pathname
|
|
95
|
+
def self.find_api_binary: () -> Pathname
|
|
96
|
+
def missing_binary_message: () -> String
|
|
97
|
+
def self.missing_binary_message: () -> String
|
|
98
|
+
end
|
|
99
|
+
|
|
100
|
+
module MCPProxy
|
|
101
|
+
class Error < Kreuzberg::Errors::Error
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
class MissingBinaryError < Error
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
class ServerError < Error
|
|
108
|
+
end
|
|
109
|
+
|
|
110
|
+
class Server
|
|
111
|
+
attr_reader pid: Integer?
|
|
112
|
+
attr_reader transport: String
|
|
113
|
+
|
|
114
|
+
def initialize: (?transport: String) -> void
|
|
115
|
+
def start: () -> Integer?
|
|
116
|
+
def stop: () -> void
|
|
117
|
+
def running?: () -> bool
|
|
118
|
+
def send_message: (Hash[untyped, untyped] message) -> void
|
|
119
|
+
def read_message: () -> Hash[untyped, untyped]?
|
|
120
|
+
|
|
121
|
+
private
|
|
122
|
+
|
|
123
|
+
def start_stdio: (Pathname binary) -> nil
|
|
124
|
+
def start_sse: (Pathname binary) -> Integer?
|
|
125
|
+
def close_pipes: () -> void
|
|
126
|
+
end
|
|
127
|
+
|
|
128
|
+
# Module function methods are both instance and class methods
|
|
129
|
+
def run: (?transport: String) { (Server) -> untyped } -> untyped
|
|
130
|
+
def self.run: (?transport: String) { (Server) -> untyped } -> untyped
|
|
131
|
+
def extract_file: (String path, ?mime_type: String?, ?config: Hash[Symbol, untyped]?) -> Kreuzberg::Result
|
|
132
|
+
def self.extract_file: (String path, ?mime_type: String?, ?config: Hash[Symbol, untyped]?) -> Kreuzberg::Result
|
|
133
|
+
def batch_extract: (Array[String] paths, ?config: Hash[Symbol, untyped]?) -> Array[Kreuzberg::Result]
|
|
134
|
+
def self.batch_extract: (Array[String] paths, ?config: Hash[Symbol, untyped]?) -> Array[Kreuzberg::Result]
|
|
135
|
+
def find_mcp_binary: () -> Pathname
|
|
136
|
+
def self.find_mcp_binary: () -> Pathname
|
|
137
|
+
def missing_binary_message: () -> String
|
|
138
|
+
def self.missing_binary_message: () -> String
|
|
139
|
+
end
|
|
140
|
+
|
|
141
|
+
# Cache API module (prepended to Kreuzberg singleton class)
|
|
142
|
+
module CacheAPI
|
|
143
|
+
@__cache_tracker: Hash[Symbol, Integer]
|
|
144
|
+
|
|
145
|
+
def clear_cache: () -> void
|
|
146
|
+
def cache_stats: () -> Hash[Symbol | String, Integer]
|
|
147
|
+
|
|
148
|
+
private
|
|
149
|
+
|
|
150
|
+
def record_cache_entry!: (Kreuzberg::Result | Array[Kreuzberg::Result] results, Hash[Symbol, untyped] opts) -> void
|
|
151
|
+
def reset_cache_tracker!: () -> nil
|
|
152
|
+
|
|
153
|
+
# Native methods (defined in Rust)
|
|
154
|
+
def native_clear_cache: () -> void
|
|
155
|
+
def native_cache_stats: () -> Hash[Symbol | String, Integer]
|
|
156
|
+
end
|
|
157
|
+
|
|
158
|
+
# Extraction API module (prepended to Kreuzberg singleton class)
|
|
159
|
+
module ExtractionAPI
|
|
160
|
+
def extract_file_sync: (String | Pathname path, ?mime_type: String?, ?config: config_input?) -> Result
|
|
161
|
+
def extract_bytes_sync: (String data, String mime_type, ?config: config_input?) -> Result
|
|
162
|
+
def batch_extract_files_sync: (Array[String | Pathname] paths, ?config: config_input?) -> Array[Result]
|
|
163
|
+
def extract_file: (String | Pathname path, ?mime_type: String?, ?config: config_input?) -> Result
|
|
164
|
+
def extract_bytes: (String data, String mime_type, ?config: config_input?) -> Result
|
|
165
|
+
def batch_extract_files: (Array[String | Pathname] paths, ?config: config_input?) -> Array[Result]
|
|
166
|
+
def batch_extract_bytes_sync: (Array[String] data_array, Array[String] mime_types, ?config: config_input?) -> Array[Result]
|
|
167
|
+
def batch_extract_bytes: (Array[String] data_array, Array[String] mime_types, ?config: config_input?) -> Array[Result]
|
|
168
|
+
|
|
169
|
+
def normalize_config: (config_input? config) -> Hash[Symbol, untyped]
|
|
170
|
+
|
|
171
|
+
# Native methods (defined in Rust)
|
|
172
|
+
def native_extract_file_sync: (String path, ?String? mime_type, **untyped opts) -> extraction_result_hash
|
|
173
|
+
def native_extract_bytes_sync: (String data, String mime_type, **untyped opts) -> extraction_result_hash
|
|
174
|
+
def native_batch_extract_files_sync: (Array[String] paths, **untyped opts) -> Array[extraction_result_hash]
|
|
175
|
+
def native_extract_file: (String path, ?String? mime_type, **untyped opts) -> extraction_result_hash
|
|
176
|
+
def native_extract_bytes: (String data, String mime_type, **untyped opts) -> extraction_result_hash
|
|
177
|
+
def native_batch_extract_files: (Array[String] paths, **untyped opts) -> Array[extraction_result_hash]
|
|
178
|
+
def native_batch_extract_bytes_sync: (Array[String] data_array, Array[String] mime_types, **untyped opts) -> Array[extraction_result_hash]
|
|
179
|
+
def native_batch_extract_bytes: (Array[String] data_array, Array[String] mime_types, **untyped opts) -> Array[extraction_result_hash]
|
|
180
|
+
|
|
181
|
+
# Cache API methods (from prepended CacheAPI module)
|
|
182
|
+
def record_cache_entry!: (Kreuzberg::Result | Array[Kreuzberg::Result] results, Hash[Symbol, untyped] opts) -> void
|
|
183
|
+
end
|
|
184
|
+
end
|