kreuzberg 4.1.2 → 4.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
- data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
- data/kreuzberg.gemspec +13 -1
- data/lib/kreuzberg/cli.rb +16 -6
- data/lib/kreuzberg/cli_proxy.rb +3 -1
- data/lib/kreuzberg/config.rb +121 -39
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/extraction_api.rb +20 -4
- data/lib/kreuzberg/result.rb +12 -2
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +1 -0
- data/sig/kreuzberg.rbs +28 -12
- data/spec/binding/batch_operations_spec.rb +80 -0
- data/spec/binding/batch_spec.rb +6 -5
- data/spec/binding/error_recovery_spec.rb +3 -3
- data/spec/binding/metadata_types_spec.rb +77 -57
- data/spec/binding/tables_spec.rb +11 -2
- data/spec/serialization_spec.rb +134 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +1 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/startup.rs +15 -1
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
- data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
- data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
- data/vendor/kreuzberg/src/core/io.rs +7 -7
- data/vendor/kreuzberg/src/core/mime.rs +4 -4
- data/vendor/kreuzberg/src/embeddings.rs +4 -4
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
- data/vendor/kreuzberg/src/mcp/format.rs +237 -39
- data/vendor/kreuzberg/src/mcp/params.rs +26 -33
- data/vendor/kreuzberg/src/mcp/server.rs +6 -3
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
- data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
- data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
- data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
- data/vendor/kreuzberg/tests/api_embed.rs +84 -50
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
- data/vendor/kreuzberg/tests/api_tests.rs +298 -139
- data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
- data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
- data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
- data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
- data/vendor/kreuzberg/tests/config_behavioral.rs +416 -0
- data/vendor/kreuzberg/tests/config_features.rs +19 -15
- data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
- data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
- data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
- data/vendor/kreuzberg/tests/core_integration.rs +57 -57
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
- data/vendor/kreuzberg/tests/email_integration.rs +7 -7
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/error_handling.rs +13 -11
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
- data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +75 -43
- data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
- data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/page_markers.rs +1 -1
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
- data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +324 -31
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
- data/vendor/kreuzberg/tests/security_validation.rs +20 -19
- data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
- data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +12 -2
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
begin
|
|
4
|
+
require 'json'
|
|
5
|
+
rescue LoadError
|
|
6
|
+
require 'json/pure'
|
|
7
|
+
end
|
|
8
|
+
|
|
9
|
+
module Kreuzberg
|
|
10
|
+
class Result
|
|
11
|
+
# Djot structured content representation
|
|
12
|
+
#
|
|
13
|
+
# Represents document content in Djot format with structured metadata about
|
|
14
|
+
# blocks, images, links, footnotes, and other document elements.
|
|
15
|
+
#
|
|
16
|
+
class DjotContent
|
|
17
|
+
attr_reader :plain_text, :blocks, :metadata_json, :tables, :images, :links, :footnotes, :attributes
|
|
18
|
+
|
|
19
|
+
# Represents a formatted block in Djot content
|
|
20
|
+
class FormattedBlock
|
|
21
|
+
attr_reader :block_type, :children, :attributes, :content, :level
|
|
22
|
+
|
|
23
|
+
# rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
24
|
+
def initialize(hash_or_type = nil, children: nil, attributes: nil, content: nil, level: nil, block_type: nil)
|
|
25
|
+
if hash_or_type.is_a?(Hash)
|
|
26
|
+
# Initialize from hash
|
|
27
|
+
@block_type = hash_or_type[:block_type] || hash_or_type['block_type'] || ''
|
|
28
|
+
@children = hash_or_type[:children] || hash_or_type['children']
|
|
29
|
+
@attributes = hash_or_type[:attributes] || hash_or_type['attributes'] || {}
|
|
30
|
+
@content = hash_or_type[:content] || hash_or_type['content']
|
|
31
|
+
@level = hash_or_type[:level] || hash_or_type['level']
|
|
32
|
+
else
|
|
33
|
+
# Initialize from keyword arguments (for backward compatibility)
|
|
34
|
+
@block_type = block_type || hash_or_type || ''
|
|
35
|
+
@children = children || []
|
|
36
|
+
@attributes = attributes || {}
|
|
37
|
+
@content = content
|
|
38
|
+
@level = level
|
|
39
|
+
end
|
|
40
|
+
end
|
|
41
|
+
# rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
42
|
+
|
|
43
|
+
def to_h
|
|
44
|
+
{
|
|
45
|
+
block_type: @block_type,
|
|
46
|
+
children: @children,
|
|
47
|
+
attributes: @attributes,
|
|
48
|
+
content: @content,
|
|
49
|
+
level: @level
|
|
50
|
+
}.compact
|
|
51
|
+
end
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
# Represents an image in Djot content
|
|
55
|
+
class DjotImage
|
|
56
|
+
attr_reader :url, :alt, :title, :width, :height
|
|
57
|
+
alias src url
|
|
58
|
+
|
|
59
|
+
# rubocop:disable Metrics/CyclomaticComplexity
|
|
60
|
+
def initialize(hash_or_url = nil, alt: nil, title: nil, width: nil, height: nil, url: nil, src: nil)
|
|
61
|
+
if hash_or_url.is_a?(Hash)
|
|
62
|
+
# Initialize from hash (supports both 'url' and 'src' keys)
|
|
63
|
+
@url = hash_or_url[:url] || hash_or_url['url'] || hash_or_url[:src] || hash_or_url['src']
|
|
64
|
+
@alt = hash_or_url[:alt] || hash_or_url['alt']
|
|
65
|
+
@title = hash_or_url[:title] || hash_or_url['title']
|
|
66
|
+
@width = hash_or_url[:width] || hash_or_url['width']
|
|
67
|
+
@height = hash_or_url[:height] || hash_or_url['height']
|
|
68
|
+
else
|
|
69
|
+
# Initialize from keyword arguments
|
|
70
|
+
@url = url || src || hash_or_url
|
|
71
|
+
@alt = alt
|
|
72
|
+
@title = title
|
|
73
|
+
@width = width
|
|
74
|
+
@height = height
|
|
75
|
+
end
|
|
76
|
+
end
|
|
77
|
+
# rubocop:enable Metrics/CyclomaticComplexity
|
|
78
|
+
|
|
79
|
+
def to_h
|
|
80
|
+
{
|
|
81
|
+
url: @url,
|
|
82
|
+
alt: @alt,
|
|
83
|
+
title: @title,
|
|
84
|
+
width: @width,
|
|
85
|
+
height: @height
|
|
86
|
+
}.compact
|
|
87
|
+
end
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
# Represents a link in Djot content
|
|
91
|
+
class DjotLink
|
|
92
|
+
attr_reader :url, :text, :title, :link_type
|
|
93
|
+
alias href url
|
|
94
|
+
|
|
95
|
+
# rubocop:disable Metrics/CyclomaticComplexity
|
|
96
|
+
def initialize(hash_or_url = nil, text: nil, title: nil, url: nil, href: nil, link_type: nil)
|
|
97
|
+
if hash_or_url.is_a?(Hash)
|
|
98
|
+
# Initialize from hash (supports both 'url' and 'href' keys)
|
|
99
|
+
@url = hash_or_url[:url] || hash_or_url['url'] || hash_or_url[:href] || hash_or_url['href']
|
|
100
|
+
@text = hash_or_url[:text] || hash_or_url['text']
|
|
101
|
+
@title = hash_or_url[:title] || hash_or_url['title']
|
|
102
|
+
@link_type = hash_or_url[:link_type] || hash_or_url['link_type']
|
|
103
|
+
else
|
|
104
|
+
# Initialize from keyword arguments
|
|
105
|
+
@url = url || href || hash_or_url
|
|
106
|
+
@text = text
|
|
107
|
+
@title = title
|
|
108
|
+
@link_type = link_type
|
|
109
|
+
end
|
|
110
|
+
end
|
|
111
|
+
# rubocop:enable Metrics/CyclomaticComplexity
|
|
112
|
+
|
|
113
|
+
def to_h
|
|
114
|
+
{
|
|
115
|
+
url: @url,
|
|
116
|
+
text: @text,
|
|
117
|
+
title: @title,
|
|
118
|
+
link_type: @link_type
|
|
119
|
+
}.compact
|
|
120
|
+
end
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
# Represents a footnote in Djot content
|
|
124
|
+
class Footnote
|
|
125
|
+
attr_reader :label, :content
|
|
126
|
+
|
|
127
|
+
def initialize(label:, content:)
|
|
128
|
+
@label = label
|
|
129
|
+
@content = content
|
|
130
|
+
end
|
|
131
|
+
|
|
132
|
+
def to_h
|
|
133
|
+
{
|
|
134
|
+
label: @label,
|
|
135
|
+
content: @content
|
|
136
|
+
}
|
|
137
|
+
end
|
|
138
|
+
end
|
|
139
|
+
|
|
140
|
+
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
141
|
+
def initialize(hash)
|
|
142
|
+
@plain_text = hash['plain_text'] || hash[:plain_text] || ''
|
|
143
|
+
@blocks = parse_blocks(hash['blocks'] || hash[:blocks] || [])
|
|
144
|
+
@metadata_json = hash['metadata_json'] || hash[:metadata_json] || '{}'
|
|
145
|
+
@tables = hash['tables'] || hash[:tables] || []
|
|
146
|
+
@images = parse_images(hash['images'] || hash[:images] || [])
|
|
147
|
+
@links = parse_links(hash['links'] || hash[:links] || [])
|
|
148
|
+
@footnotes = parse_footnotes(hash['footnotes'] || hash[:footnotes] || [])
|
|
149
|
+
@attributes = hash['attributes'] || hash[:attributes] || {}
|
|
150
|
+
end
|
|
151
|
+
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
152
|
+
|
|
153
|
+
def metadata
|
|
154
|
+
@metadata ||= parse_metadata(@metadata_json)
|
|
155
|
+
end
|
|
156
|
+
|
|
157
|
+
def to_h
|
|
158
|
+
{
|
|
159
|
+
plain_text: @plain_text,
|
|
160
|
+
blocks: @blocks.map(&:to_h),
|
|
161
|
+
metadata_json: @metadata_json,
|
|
162
|
+
tables: @tables,
|
|
163
|
+
images: @images.map(&:to_h),
|
|
164
|
+
links: @links.map(&:to_h),
|
|
165
|
+
footnotes: @footnotes.map(&:to_h),
|
|
166
|
+
attributes: @attributes
|
|
167
|
+
}
|
|
168
|
+
end
|
|
169
|
+
|
|
170
|
+
private
|
|
171
|
+
|
|
172
|
+
def parse_metadata(metadata_json)
|
|
173
|
+
JSON.parse(metadata_json)
|
|
174
|
+
rescue JSON::ParserError
|
|
175
|
+
{}
|
|
176
|
+
end
|
|
177
|
+
|
|
178
|
+
def parse_blocks(blocks_data)
|
|
179
|
+
blocks_data.map do |block|
|
|
180
|
+
FormattedBlock.new(
|
|
181
|
+
block_type: block['block_type'] || block[:block_type] || '',
|
|
182
|
+
children: block['children'] || block[:children],
|
|
183
|
+
attributes: block['attributes'] || block[:attributes]
|
|
184
|
+
)
|
|
185
|
+
end
|
|
186
|
+
end
|
|
187
|
+
|
|
188
|
+
# rubocop:disable Metrics/CyclomaticComplexity
|
|
189
|
+
def parse_images(images_data)
|
|
190
|
+
images_data.map do |image|
|
|
191
|
+
DjotImage.new(
|
|
192
|
+
url: image['url'] || image[:url] || image['src'] || image[:src],
|
|
193
|
+
alt: image['alt'] || image[:alt],
|
|
194
|
+
title: image['title'] || image[:title],
|
|
195
|
+
width: image['width'] || image[:width],
|
|
196
|
+
height: image['height'] || image[:height]
|
|
197
|
+
)
|
|
198
|
+
end
|
|
199
|
+
end
|
|
200
|
+
# rubocop:enable Metrics/CyclomaticComplexity
|
|
201
|
+
|
|
202
|
+
# rubocop:disable Metrics/CyclomaticComplexity
|
|
203
|
+
def parse_links(links_data)
|
|
204
|
+
links_data.map do |link|
|
|
205
|
+
DjotLink.new(
|
|
206
|
+
url: link['url'] || link[:url] || link['href'] || link[:href],
|
|
207
|
+
text: link['text'] || link[:text],
|
|
208
|
+
title: link['title'] || link[:title],
|
|
209
|
+
link_type: link['link_type'] || link[:link_type]
|
|
210
|
+
)
|
|
211
|
+
end
|
|
212
|
+
end
|
|
213
|
+
# rubocop:enable Metrics/CyclomaticComplexity
|
|
214
|
+
|
|
215
|
+
def parse_footnotes(footnotes_data)
|
|
216
|
+
footnotes_data.map do |note|
|
|
217
|
+
Footnote.new(
|
|
218
|
+
label: note['label'] || note[:label],
|
|
219
|
+
content: note['content'] || note[:content]
|
|
220
|
+
)
|
|
221
|
+
end
|
|
222
|
+
end
|
|
223
|
+
end
|
|
224
|
+
end
|
|
225
|
+
end
|
|
@@ -15,11 +15,15 @@ module Kreuzberg
|
|
|
15
15
|
# @example Extract with explicit MIME type
|
|
16
16
|
# @example Extract with OCR enabled
|
|
17
17
|
def extract_file_sync(path:, mime_type: nil, config: nil)
|
|
18
|
+
# Validate that the file exists
|
|
19
|
+
path_str = path.to_s
|
|
20
|
+
raise Errors::IOError, "File not found: #{path_str}" unless File.exist?(path_str)
|
|
21
|
+
|
|
18
22
|
opts = normalize_config(config)
|
|
19
23
|
hash = if mime_type
|
|
20
|
-
native_extract_file_sync(
|
|
24
|
+
native_extract_file_sync(path_str, mime_type.to_s, **opts)
|
|
21
25
|
else
|
|
22
|
-
native_extract_file_sync(
|
|
26
|
+
native_extract_file_sync(path_str, **opts)
|
|
23
27
|
end
|
|
24
28
|
result = Result.new(hash)
|
|
25
29
|
record_cache_entry!(result, opts)
|
|
@@ -53,6 +57,8 @@ module Kreuzberg
|
|
|
53
57
|
# response = HTTParty.get("https://example.com/document.docx")
|
|
54
58
|
# result = Kreuzberg.extract_bytes_sync(response.body, "application/vnd.openxmlformats-officedocument.wordprocessingml.document")
|
|
55
59
|
def extract_bytes_sync(data:, mime_type:, config: nil)
|
|
60
|
+
raise TypeError, "mime_type must be a String, got #{mime_type.inspect}" if mime_type.nil?
|
|
61
|
+
|
|
56
62
|
opts = normalize_config(config)
|
|
57
63
|
hash = native_extract_bytes_sync(data.to_s, mime_type.to_s, **opts)
|
|
58
64
|
result = Result.new(hash)
|
|
@@ -92,6 +98,12 @@ module Kreuzberg
|
|
|
92
98
|
# config = Kreuzberg::Config::Extraction.new(force_ocr: true)
|
|
93
99
|
# results = Kreuzberg.batch_extract_files_sync(paths, config: config)
|
|
94
100
|
def batch_extract_files_sync(paths:, config: nil)
|
|
101
|
+
# Validate that all files exist
|
|
102
|
+
paths.each do |path|
|
|
103
|
+
path_str = path.to_s
|
|
104
|
+
raise Errors::IOError, "File not found: #{path_str}" unless File.exist?(path_str)
|
|
105
|
+
end
|
|
106
|
+
|
|
95
107
|
opts = normalize_config(config)
|
|
96
108
|
hashes = native_batch_extract_files_sync(paths.map(&:to_s), **opts)
|
|
97
109
|
results = hashes.map { |hash| Result.new(hash) }
|
|
@@ -130,11 +142,15 @@ module Kreuzberg
|
|
|
130
142
|
# )
|
|
131
143
|
# result = Kreuzberg.extract_file("document.pdf", config: config)
|
|
132
144
|
def extract_file(path:, mime_type: nil, config: nil)
|
|
145
|
+
# Validate that the file exists
|
|
146
|
+
path_str = path.to_s
|
|
147
|
+
raise Errors::IOError, "File not found: #{path_str}" unless File.exist?(path_str)
|
|
148
|
+
|
|
133
149
|
opts = normalize_config(config)
|
|
134
150
|
hash = if mime_type
|
|
135
|
-
native_extract_file(
|
|
151
|
+
native_extract_file(path_str, mime_type.to_s, **opts)
|
|
136
152
|
else
|
|
137
|
-
native_extract_file(
|
|
153
|
+
native_extract_file(path_str, **opts)
|
|
138
154
|
end
|
|
139
155
|
result = Result.new(hash)
|
|
140
156
|
record_cache_entry!(result, opts)
|
data/lib/kreuzberg/result.rb
CHANGED
|
@@ -11,7 +11,7 @@ module Kreuzberg
|
|
|
11
11
|
# rubocop:disable Metrics/ClassLength
|
|
12
12
|
class Result
|
|
13
13
|
attr_reader :content, :mime_type, :metadata, :metadata_json, :tables,
|
|
14
|
-
:detected_languages, :chunks, :images, :pages, :elements
|
|
14
|
+
:detected_languages, :chunks, :images, :pages, :elements, :djot_content
|
|
15
15
|
|
|
16
16
|
# @!attribute [r] cells
|
|
17
17
|
# @return [Array<Array<String>>] Table cells (2D array)
|
|
@@ -180,6 +180,7 @@ module Kreuzberg
|
|
|
180
180
|
#
|
|
181
181
|
# @param hash [Hash] Hash returned from native extension
|
|
182
182
|
#
|
|
183
|
+
# rubocop:disable Metrics/AbcSize
|
|
183
184
|
def initialize(hash)
|
|
184
185
|
@content = get_value(hash, 'content', '')
|
|
185
186
|
@mime_type = get_value(hash, 'mime_type', '')
|
|
@@ -191,7 +192,9 @@ module Kreuzberg
|
|
|
191
192
|
@images = parse_images(get_value(hash, 'images'))
|
|
192
193
|
@pages = parse_pages(get_value(hash, 'pages'))
|
|
193
194
|
@elements = parse_elements(get_value(hash, 'elements'))
|
|
195
|
+
@djot_content = parse_djot_content(get_value(hash, 'djot_content'))
|
|
194
196
|
end
|
|
197
|
+
# rubocop:enable Metrics/AbcSize
|
|
195
198
|
|
|
196
199
|
# Convert to hash
|
|
197
200
|
#
|
|
@@ -207,7 +210,8 @@ module Kreuzberg
|
|
|
207
210
|
chunks: serialize_chunks,
|
|
208
211
|
images: serialize_images,
|
|
209
212
|
pages: serialize_pages,
|
|
210
|
-
elements: serialize_elements
|
|
213
|
+
elements: serialize_elements,
|
|
214
|
+
djot_content: @djot_content&.to_h
|
|
211
215
|
}
|
|
212
216
|
end
|
|
213
217
|
|
|
@@ -434,6 +438,12 @@ module Kreuzberg
|
|
|
434
438
|
y1: coordinates_data['y1'].to_f
|
|
435
439
|
)
|
|
436
440
|
end
|
|
441
|
+
|
|
442
|
+
def parse_djot_content(djot_data)
|
|
443
|
+
return nil if djot_data.nil?
|
|
444
|
+
|
|
445
|
+
DjotContent.new(djot_data)
|
|
446
|
+
end
|
|
437
447
|
end
|
|
438
448
|
# rubocop:enable Metrics/ClassLength
|
|
439
449
|
end
|
data/lib/kreuzberg/version.rb
CHANGED
data/lib/kreuzberg.rb
CHANGED
|
@@ -87,6 +87,7 @@ end
|
|
|
87
87
|
|
|
88
88
|
require_relative 'kreuzberg/cache_api'
|
|
89
89
|
require_relative 'kreuzberg/extraction_api'
|
|
90
|
+
require_relative 'kreuzberg/djot_content'
|
|
90
91
|
|
|
91
92
|
Kreuzberg.singleton_class.prepend(Kreuzberg::CacheAPI)
|
|
92
93
|
Kreuzberg.singleton_class.prepend(Kreuzberg::ExtractionAPI)
|
data/sig/kreuzberg.rbs
CHANGED
|
@@ -202,6 +202,8 @@ module Kreuzberg
|
|
|
202
202
|
attr_reader html_options: HtmlOptions?
|
|
203
203
|
attr_reader pages: PageConfig?
|
|
204
204
|
attr_reader max_concurrent_extractions: Integer?
|
|
205
|
+
attr_reader output_format: String?
|
|
206
|
+
attr_reader result_format: String?
|
|
205
207
|
|
|
206
208
|
def self.from_file: (String path) -> Extraction
|
|
207
209
|
def initialize: (
|
|
@@ -219,7 +221,9 @@ module Kreuzberg
|
|
|
219
221
|
?keywords: (Keywords | Hash[Symbol, untyped])?,
|
|
220
222
|
?html_options: (HtmlOptions | Hash[Symbol, untyped])?,
|
|
221
223
|
?pages: (PageConfig | Hash[Symbol, untyped])?,
|
|
222
|
-
?max_concurrent_extractions: Integer
|
|
224
|
+
?max_concurrent_extractions: Integer?,
|
|
225
|
+
?output_format: String?,
|
|
226
|
+
?result_format: String?
|
|
223
227
|
) -> void
|
|
224
228
|
def to_h: () -> Hash[Symbol, untyped]
|
|
225
229
|
|
|
@@ -413,14 +417,23 @@ module Kreuzberg
|
|
|
413
417
|
attr_reader plain_text: String
|
|
414
418
|
attr_reader blocks: Array[DjotContent::FormattedBlock]
|
|
415
419
|
attr_reader metadata: Hash[untyped, untyped]
|
|
416
|
-
attr_reader
|
|
420
|
+
attr_reader metadata_json: String
|
|
421
|
+
attr_reader tables: Array[untyped]
|
|
417
422
|
attr_reader images: Array[DjotContent::DjotImage]
|
|
418
423
|
attr_reader links: Array[DjotContent::DjotLink]
|
|
419
424
|
attr_reader footnotes: Array[DjotContent::Footnote]
|
|
420
425
|
attr_reader attributes: Hash[String, untyped]?
|
|
421
426
|
|
|
422
|
-
def initialize: (
|
|
423
|
-
def to_h: () ->
|
|
427
|
+
def initialize: (untyped hash) -> void
|
|
428
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
429
|
+
|
|
430
|
+
private
|
|
431
|
+
|
|
432
|
+
def parse_metadata: (String metadata_json) -> Hash[untyped, untyped]
|
|
433
|
+
def parse_blocks: (Array[untyped] blocks_data) -> Array[FormattedBlock]
|
|
434
|
+
def parse_images: (Array[untyped] images_data) -> Array[DjotImage]
|
|
435
|
+
def parse_links: (Array[untyped] links_data) -> Array[DjotLink]
|
|
436
|
+
def parse_footnotes: (Array[untyped] footnotes_data) -> Array[Footnote]
|
|
424
437
|
|
|
425
438
|
class FormattedBlock
|
|
426
439
|
attr_reader block_type: String
|
|
@@ -429,28 +442,31 @@ module Kreuzberg
|
|
|
429
442
|
attr_reader children: Array[FormattedBlock]?
|
|
430
443
|
attr_reader attributes: Hash[String, untyped]?
|
|
431
444
|
|
|
432
|
-
def initialize: (
|
|
433
|
-
def to_h: () ->
|
|
445
|
+
def initialize: (?untyped hash_or_type, ?children: untyped, ?attributes: untyped, ?content: untyped, ?level: untyped, ?block_type: untyped) -> void
|
|
446
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
434
447
|
end
|
|
435
448
|
|
|
436
449
|
class DjotImage
|
|
437
450
|
attr_reader url: String
|
|
438
451
|
attr_reader alt: String?
|
|
439
452
|
attr_reader title: String?
|
|
440
|
-
attr_reader
|
|
453
|
+
attr_reader width: Integer?
|
|
454
|
+
attr_reader height: Integer?
|
|
441
455
|
|
|
442
|
-
def initialize: (
|
|
443
|
-
def
|
|
456
|
+
def initialize: (?untyped hash_or_url, ?alt: untyped, ?title: untyped, ?width: untyped, ?height: untyped, ?url: untyped, ?src: untyped) -> void
|
|
457
|
+
def src: () -> String
|
|
458
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
444
459
|
end
|
|
445
460
|
|
|
446
461
|
class DjotLink
|
|
447
462
|
attr_reader url: String
|
|
448
|
-
attr_reader text: String
|
|
463
|
+
attr_reader text: String?
|
|
449
464
|
attr_reader title: String?
|
|
450
465
|
attr_reader link_type: String?
|
|
451
466
|
|
|
452
|
-
def initialize: (
|
|
453
|
-
def
|
|
467
|
+
def initialize: (?untyped hash_or_url, ?text: untyped, ?title: untyped, ?url: untyped, ?href: untyped, ?link_type: untyped) -> void
|
|
468
|
+
def href: () -> String
|
|
469
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
454
470
|
end
|
|
455
471
|
|
|
456
472
|
class Footnote
|
|
@@ -592,4 +592,84 @@ RSpec.describe 'Batch Operations' do
|
|
|
592
592
|
paths.each { |p| FileUtils.rm_f(p) }
|
|
593
593
|
end
|
|
594
594
|
end
|
|
595
|
+
|
|
596
|
+
describe 'batch with output and result formats' do
|
|
597
|
+
it 'batch processes with output_format' do
|
|
598
|
+
paths = []
|
|
599
|
+
file = Tempfile.new(['format_test', '.txt']).tap do |f|
|
|
600
|
+
f.write('Test content for output format')
|
|
601
|
+
f.close
|
|
602
|
+
end
|
|
603
|
+
paths << file.path
|
|
604
|
+
|
|
605
|
+
config = Kreuzberg::Config::Extraction.new(output_format: 'markdown')
|
|
606
|
+
results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
|
|
607
|
+
|
|
608
|
+
expect(results).to be_an Array
|
|
609
|
+
expect(results.length).to eq 1
|
|
610
|
+
expect(results[0]).to be_a Kreuzberg::Result
|
|
611
|
+
|
|
612
|
+
paths.each { |p| FileUtils.rm_f(p) }
|
|
613
|
+
end
|
|
614
|
+
|
|
615
|
+
it 'batch processes with result_format' do
|
|
616
|
+
paths = []
|
|
617
|
+
file = Tempfile.new(['format_test', '.txt']).tap do |f|
|
|
618
|
+
f.write('Test content for result format')
|
|
619
|
+
f.close
|
|
620
|
+
end
|
|
621
|
+
paths << file.path
|
|
622
|
+
|
|
623
|
+
config = Kreuzberg::Config::Extraction.new(result_format: 'unified')
|
|
624
|
+
results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
|
|
625
|
+
|
|
626
|
+
expect(results).to be_an Array
|
|
627
|
+
expect(results.length).to eq 1
|
|
628
|
+
expect(results[0]).to be_a Kreuzberg::Result
|
|
629
|
+
|
|
630
|
+
paths.each { |p| FileUtils.rm_f(p) }
|
|
631
|
+
end
|
|
632
|
+
|
|
633
|
+
it 'batch processes with both output and result formats' do
|
|
634
|
+
paths = []
|
|
635
|
+
file = Tempfile.new(['format_test', '.txt']).tap do |f|
|
|
636
|
+
f.write('Test content for both formats')
|
|
637
|
+
f.close
|
|
638
|
+
end
|
|
639
|
+
paths << file.path
|
|
640
|
+
|
|
641
|
+
config = Kreuzberg::Config::Extraction.new(
|
|
642
|
+
output_format: 'plain',
|
|
643
|
+
result_format: 'element_based'
|
|
644
|
+
)
|
|
645
|
+
results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
|
|
646
|
+
|
|
647
|
+
expect(results).to be_an Array
|
|
648
|
+
expect(results.length).to eq 1
|
|
649
|
+
expect(results[0]).to be_a Kreuzberg::Result
|
|
650
|
+
|
|
651
|
+
paths.each { |p| FileUtils.rm_f(p) }
|
|
652
|
+
end
|
|
653
|
+
|
|
654
|
+
it 'batch processes with chunking and output_format' do
|
|
655
|
+
paths = []
|
|
656
|
+
file = Tempfile.new(['format_test', '.txt']).tap do |f|
|
|
657
|
+
f.write('Test content ' * 100)
|
|
658
|
+
f.close
|
|
659
|
+
end
|
|
660
|
+
paths << file.path
|
|
661
|
+
|
|
662
|
+
config = Kreuzberg::Config::Extraction.new(
|
|
663
|
+
output_format: 'markdown',
|
|
664
|
+
chunking: { max_chars: 1000 }
|
|
665
|
+
)
|
|
666
|
+
results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
|
|
667
|
+
|
|
668
|
+
expect(results).to be_an Array
|
|
669
|
+
expect(results.length).to eq 1
|
|
670
|
+
expect(results[0]).to be_a Kreuzberg::Result
|
|
671
|
+
|
|
672
|
+
paths.each { |p| FileUtils.rm_f(p) }
|
|
673
|
+
end
|
|
674
|
+
end
|
|
595
675
|
end
|
data/spec/binding/batch_spec.rb
CHANGED
|
@@ -295,7 +295,7 @@ RSpec.describe Kreuzberg do
|
|
|
295
295
|
end
|
|
296
296
|
|
|
297
297
|
describe 'batch error handling' do
|
|
298
|
-
it '
|
|
298
|
+
it 'raises IOError for missing files in batch' do
|
|
299
299
|
paths = [
|
|
300
300
|
'/nonexistent/file1.txt',
|
|
301
301
|
'/nonexistent/file2.txt'
|
|
@@ -303,10 +303,10 @@ RSpec.describe Kreuzberg do
|
|
|
303
303
|
|
|
304
304
|
expect do
|
|
305
305
|
described_class.batch_extract_files_sync(paths: paths)
|
|
306
|
-
end.
|
|
306
|
+
end.to raise_error(Kreuzberg::Errors::IOError, /not found/)
|
|
307
307
|
end
|
|
308
308
|
|
|
309
|
-
it '
|
|
309
|
+
it 'raises IOError when batch contains invalid paths' do
|
|
310
310
|
paths = []
|
|
311
311
|
temp_dir = Dir.mktmpdir
|
|
312
312
|
|
|
@@ -316,8 +316,9 @@ RSpec.describe Kreuzberg do
|
|
|
316
316
|
|
|
317
317
|
paths << '/nonexistent/invalid.txt'
|
|
318
318
|
|
|
319
|
-
|
|
320
|
-
|
|
319
|
+
expect do
|
|
320
|
+
described_class.batch_extract_files_sync(paths: paths)
|
|
321
|
+
end.to raise_error(Kreuzberg::Errors::IOError, /not found/)
|
|
321
322
|
ensure
|
|
322
323
|
FileUtils.remove_entry(temp_dir)
|
|
323
324
|
end
|
|
@@ -57,7 +57,7 @@ RSpec.describe 'Error Recovery' do
|
|
|
57
57
|
nonexistent_path = '/nonexistent/file/that/does/not/exist.pdf'
|
|
58
58
|
|
|
59
59
|
expect { Kreuzberg.extract_file_sync(path: nonexistent_path, config: config) }
|
|
60
|
-
.to raise_error(Kreuzberg::Errors::
|
|
60
|
+
.to raise_error(Kreuzberg::Errors::IOError, /not found|does not exist|no such file/)
|
|
61
61
|
end
|
|
62
62
|
|
|
63
63
|
it 'provides descriptive error messages for invalid MIME types' do
|
|
@@ -293,7 +293,7 @@ RSpec.describe 'Error Recovery' do
|
|
|
293
293
|
|
|
294
294
|
expect(validation_error).to be_a(ArgumentError)
|
|
295
295
|
|
|
296
|
-
# Runtime error (file not found)
|
|
296
|
+
# Runtime error (file not found) - IOError since the file doesn't exist
|
|
297
297
|
runtime_error = nil
|
|
298
298
|
begin
|
|
299
299
|
Kreuzberg.extract_file_sync(path: '/nonexistent/file.pdf')
|
|
@@ -301,7 +301,7 @@ RSpec.describe 'Error Recovery' do
|
|
|
301
301
|
runtime_error = e
|
|
302
302
|
end
|
|
303
303
|
|
|
304
|
-
expect(runtime_error).to be_a(Kreuzberg::Errors::
|
|
304
|
+
expect(runtime_error).to be_a(Kreuzberg::Errors::IOError)
|
|
305
305
|
end
|
|
306
306
|
|
|
307
307
|
it 'provides error recovery suggestions in messages' do
|