kreuzberg 4.3.5-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/.rubocop.yaml +1 -0
- data/.rubocop.yml +543 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +260 -0
- data/README.md +399 -0
- data/Rakefile +34 -0
- data/Steepfile +51 -0
- data/examples/async_patterns.rb +283 -0
- data/extconf.rb +60 -0
- data/kreuzberg.gemspec +253 -0
- data/lib/kreuzberg/api_proxy.rb +125 -0
- data/lib/kreuzberg/cache_api.rb +67 -0
- data/lib/kreuzberg/cli.rb +57 -0
- data/lib/kreuzberg/cli_proxy.rb +118 -0
- data/lib/kreuzberg/config.rb +1241 -0
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/document_structure.rb +204 -0
- data/lib/kreuzberg/error_context.rb +136 -0
- data/lib/kreuzberg/errors.rb +116 -0
- data/lib/kreuzberg/extraction_api.rb +329 -0
- data/lib/kreuzberg/mcp_proxy.rb +176 -0
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
- data/lib/kreuzberg/post_processor_protocol.rb +15 -0
- data/lib/kreuzberg/result.rb +712 -0
- data/lib/kreuzberg/setup_lib_path.rb +99 -0
- data/lib/kreuzberg/types.rb +414 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -0
- data/lib/kreuzberg/version.rb +5 -0
- data/lib/kreuzberg.rb +102 -0
- data/lib/kreuzberg_rb.so +0 -0
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -0
- data/sig/kreuzberg.rbs +1337 -0
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +677 -0
- data/spec/binding/batch_spec.rb +360 -0
- data/spec/binding/cache_spec.rb +227 -0
- data/spec/binding/cli_proxy_spec.rb +85 -0
- data/spec/binding/cli_spec.rb +55 -0
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -0
- data/spec/binding/config_validation_spec.rb +377 -0
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -0
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -0
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +732 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1253 -0
- data/spec/binding/pages_extraction_spec.rb +550 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
- data/spec/binding/plugins/postprocessor_spec.rb +269 -0
- data/spec/binding/plugins/validator_spec.rb +273 -0
- data/spec/binding/tables_spec.rb +650 -0
- data/spec/fixtures/config.toml +38 -0
- data/spec/fixtures/config.yaml +41 -0
- data/spec/fixtures/invalid_config.toml +3 -0
- data/spec/serialization_spec.rb +134 -0
- data/spec/smoke/package_spec.rb +177 -0
- data/spec/spec_helper.rb +40 -0
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +434 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- metadata +292 -0
|
@@ -0,0 +1,712 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
begin
|
|
4
|
+
require 'json'
|
|
5
|
+
rescue LoadError
|
|
6
|
+
require 'json/pure'
|
|
7
|
+
end
|
|
8
|
+
|
|
9
|
+
require_relative 'document_structure'
|
|
10
|
+
|
|
11
|
+
module Kreuzberg
|
|
12
|
+
# Result of a document extraction, built from the Hash returned by the native extension.
|
|
13
|
+
# rubocop:disable Metrics/ClassLength
|
|
14
|
+
class Result
|
|
15
|
+
attr_reader :content, :mime_type, :metadata, :metadata_json, :tables,
|
|
16
|
+
:detected_languages, :chunks, :images, :pages, :elements, :ocr_elements, :djot_content,
|
|
17
|
+
:document, :extracted_keywords, :quality_score, :processing_warnings
|
|
18
|
+
|
|
19
|
+
# @!attribute [r] cells
|
|
20
|
+
# @return [Array<Array<String>>] Table cells (2D array)
|
|
21
|
+
# @!attribute [r] markdown
|
|
22
|
+
# @return [String] Markdown representation
|
|
23
|
+
# @!attribute [r] page_number
|
|
24
|
+
# @return [Integer] Page number where table was found
|
|
25
|
+
# @!attribute [r] bounding_box
|
|
26
|
+
# @return [BoundingBox, nil] Bounding box of the table on the page
|
|
27
|
+
Table = Struct.new(:cells, :markdown, :page_number, :bounding_box, keyword_init: true) do
  # Plain-Hash view of the table. The bounding box, when not nil, is
  # converted through its own #to_h.
  def to_h
    box = bounding_box
    {
      cells: cells,
      markdown: markdown,
      page_number: page_number,
      bounding_box: box.nil? ? nil : box.to_h
    }
  end
end
|
|
32
|
+
|
|
33
|
+
# @!attribute [r] content
|
|
34
|
+
# @return [String] Chunk content
|
|
35
|
+
# @!attribute [r] byte_start
|
|
36
|
+
# @return [Integer] Starting byte offset (UTF-8)
|
|
37
|
+
# @!attribute [r] byte_end
|
|
38
|
+
# @return [Integer] Ending byte offset (UTF-8)
|
|
39
|
+
# @!attribute [r] token_count
|
|
40
|
+
# @return [Integer, nil] Approximate token count (may be nil)
|
|
41
|
+
# @!attribute [r] first_page
|
|
42
|
+
# @return [Integer, nil] First page number (1-indexed)
|
|
43
|
+
# @!attribute [r] last_page
|
|
44
|
+
# @return [Integer, nil] Last page number (1-indexed)
|
|
45
|
+
Chunk = Struct.new(
  :content, :byte_start, :byte_end, :token_count, :chunk_index,
  :total_chunks, :first_page, :last_page, :embedding,
  keyword_init: true
) do
  # Plain-Hash view of the chunk: one symbol key per member, in
  # declaration order, with each value copied as-is.
  def to_h
    members.to_h { |field| [field, self[field]] }
  end
end
|
|
71
|
+
|
|
72
|
+
Image = Struct.new(
  :data, :format, :image_index, :page_number, :width, :height,
  :colorspace, :bits_per_component, :is_mask, :description,
  :bounding_box, :ocr_result,
  keyword_init: true
) do
  # Plain-Hash view of the image. Scalar members are copied as-is; the
  # bounding box and OCR result are converted via their own #to_h unless
  # they are nil.
  def to_h
    nested = %i[bounding_box ocr_result]
    members.to_h do |field|
      value = self[field]
      value = value.to_h if nested.include?(field) && !value.nil?
      [field, value]
    end
  end
end
|
|
104
|
+
|
|
105
|
+
# @!attribute [r] page_number
|
|
106
|
+
# @return [Integer] Page number (1-indexed)
|
|
107
|
+
# @!attribute [r] content
|
|
108
|
+
# @return [String] Text content for this page
|
|
109
|
+
# @!attribute [r] tables
|
|
110
|
+
# @return [Array<Table>] Tables on this page
|
|
111
|
+
# @!attribute [r] images
|
|
112
|
+
# @return [Array<Image>] Images on this page
|
|
113
|
+
# @!attribute [r] text
|
|
114
|
+
# @return [String] The text content of this block
|
|
115
|
+
# @!attribute [r] font_size
|
|
116
|
+
# @return [Float] The font size of the text
|
|
117
|
+
# @!attribute [r] level
|
|
118
|
+
# @return [String] The hierarchy level (h1-h6 or body)
|
|
119
|
+
# @!attribute [r] bbox
|
|
120
|
+
# @return [Array<Float>, nil] Bounding box (left, top, right, bottom)
|
|
121
|
+
HierarchicalBlock = Struct.new(:text, :font_size, :level, :bbox, keyword_init: true) do
  # Plain-Hash view of the block with one symbol key per member.
  def to_h
    %i[text font_size level bbox].to_h { |field| [field, self[field]] }
  end
end
|
|
126
|
+
|
|
127
|
+
# @!attribute [r] block_count
|
|
128
|
+
# @return [Integer] Number of hierarchy blocks
|
|
129
|
+
# @!attribute [r] blocks
|
|
130
|
+
# @return [Array<HierarchicalBlock>] Hierarchical blocks
|
|
131
|
+
PageHierarchy = Struct.new(:block_count, :blocks, keyword_init: true) do
  # Plain-Hash view of the hierarchy; every block is converted with #to_h.
  def to_h
    count = block_count
    serialized_blocks = blocks.map { |block| block.to_h }
    { block_count: count, blocks: serialized_blocks }
  end
end
|
|
136
|
+
|
|
137
|
+
# @!attribute [r] page_number
|
|
138
|
+
# @return [Integer] Page number (1-indexed)
|
|
139
|
+
# @!attribute [r] content
|
|
140
|
+
# @return [String] Text content for this page
|
|
141
|
+
# @!attribute [r] tables
|
|
142
|
+
# @return [Array<Table>] Tables on this page
|
|
143
|
+
# @!attribute [r] images
|
|
144
|
+
# @return [Array<Image>] Images on this page
|
|
145
|
+
# @!attribute [r] hierarchy
|
|
146
|
+
# @return [PageHierarchy, nil] Hierarchy information for the page
|
|
147
|
+
PageContent = Struct.new(:page_number, :content, :tables, :images, :hierarchy, :is_blank, keyword_init: true) do
  # Plain-Hash view of the page.
  #
  # Tables and images are serialized element by element. Either collection
  # may be nil (the images parser in this file returns nil when a page
  # carries no image data), so nil is serialized as an empty array instead
  # of raising NoMethodError.
  #
  # @return [Hash] Hash with :page_number, :content, :tables, :images,
  #   :hierarchy and :is_blank keys
  def to_h
    {
      page_number: page_number,
      content: content,
      tables: (tables || []).map(&:to_h),
      images: (images || []).map(&:to_h),
      hierarchy: hierarchy&.to_h,
      is_blank: is_blank
    }
  end
end
|
|
159
|
+
|
|
160
|
+
# @!attribute [r] x0
|
|
161
|
+
# @return [Float] Left x-coordinate
|
|
162
|
+
# @!attribute [r] y0
|
|
163
|
+
# @return [Float] Bottom y-coordinate
|
|
164
|
+
# @!attribute [r] x1
|
|
165
|
+
# @return [Float] Right x-coordinate
|
|
166
|
+
# @!attribute [r] y1
|
|
167
|
+
# @return [Float] Top y-coordinate
|
|
168
|
+
ElementBoundingBox = Struct.new(:x0, :y0, :x1, :y1, keyword_init: true) do
  # Plain-Hash view of the four coordinates.
  def to_h
    %i[x0 y0 x1 y1].to_h { |corner| [corner, self[corner]] }
  end
end
|
|
173
|
+
|
|
174
|
+
# @!attribute [r] page_number
|
|
175
|
+
# @return [Integer, nil] Page number (1-indexed)
|
|
176
|
+
# @!attribute [r] filename
|
|
177
|
+
# @return [String, nil] Source filename or document name
|
|
178
|
+
# @!attribute [r] coordinates
|
|
179
|
+
# @return [ElementBoundingBox, nil] Bounding box coordinates if available
|
|
180
|
+
# @!attribute [r] element_index
|
|
181
|
+
# @return [Integer, nil] Position index in the element sequence
|
|
182
|
+
# @!attribute [r] additional
|
|
183
|
+
# @return [Hash<String, String>] Additional custom metadata
|
|
184
|
+
ElementMetadataStruct = Struct.new(
  :page_number, :filename, :coordinates, :element_index, :additional,
  keyword_init: true
) do
  # Plain-Hash view of the metadata; coordinates are converted with #to_h
  # unless they are nil.
  def to_h
    box = coordinates
    {
      page_number: page_number,
      filename: filename,
      coordinates: (box.to_h unless box.nil?),
      element_index: element_index,
      additional: additional
    }
  end
end
|
|
202
|
+
|
|
203
|
+
# @!attribute [r] element_id
|
|
204
|
+
# @return [String] Unique element identifier
|
|
205
|
+
# @!attribute [r] element_type
|
|
206
|
+
# @return [String] Semantic type of the element
|
|
207
|
+
# @!attribute [r] text
|
|
208
|
+
# @return [String] Text content of the element
|
|
209
|
+
# @!attribute [r] metadata
|
|
210
|
+
# @return [ElementMetadataStruct] Metadata about the element
|
|
211
|
+
ElementStruct = Struct.new(:element_id, :element_type, :text, :metadata, keyword_init: true) do
  # Plain-Hash view of the element; metadata is converted with #to_h
  # unless it is nil.
  def to_h
    serialized = { element_id: element_id, element_type: element_type, text: text }
    serialized[:metadata] = metadata.nil? ? nil : metadata.to_h
    serialized
  end
end
|
|
221
|
+
|
|
222
|
+
# OCR bounding geometry with type and coordinates
|
|
223
|
+
class OcrBoundingGeometry
  attr_reader :type, :left, :top, :width, :height, :points

  # The type is stringified, the four numeric members are coerced to Float
  # unless nil, and points is stored untouched.
  def initialize(type:, left: nil, top: nil, width: nil, height: nil, points: nil)
    @type = type.to_s
    @left, @top, @width, @height = [left, top, width, height].map { |number| number&.to_f }
    @points = points
  end

  # Hash view of the geometry with nil-valued entries dropped.
  def to_h
    all_fields = { type: @type, left: @left, top: @top, width: @width, height: @height, points: @points }
    all_fields.reject { |_key, value| value.nil? }
  end
end
|
|
246
|
+
|
|
247
|
+
# OCR confidence scores for detection and recognition
|
|
248
|
+
class OcrConfidence
  attr_reader :detection, :recognition

  # Both scores are coerced to Float unless nil.
  def initialize(detection: nil, recognition: nil)
    @detection, @recognition = [detection, recognition].map { |score| score&.to_f }
  end

  # Hash view of the scores with nil-valued entries dropped.
  def to_h
    { detection: @detection, recognition: @recognition }.reject { |_key, score| score.nil? }
  end
end
|
|
263
|
+
|
|
264
|
+
# OCR rotation information
|
|
265
|
+
class OcrRotation
  attr_reader :angle_degrees, :confidence

  # Both values are coerced to Float unless nil.
  def initialize(angle_degrees: nil, confidence: nil)
    @angle_degrees = angle_degrees.nil? ? nil : angle_degrees.to_f
    @confidence = confidence.nil? ? nil : confidence.to_f
  end

  # Hash view of the rotation with nil-valued entries dropped.
  def to_h
    serialized = {}
    serialized[:angle_degrees] = @angle_degrees unless @angle_degrees.nil?
    serialized[:confidence] = @confidence unless @confidence.nil?
    serialized
  end
end
|
|
280
|
+
|
|
281
|
+
# OCR text element with geometry and metadata
|
|
282
|
+
class OcrElement
  attr_reader :text, :geometry, :confidence, :level, :rotation,
              :page_number, :parent_id, :backend_metadata

  # text is always stringified; level and parent_id are stringified and
  # page_number is converted with #to_i unless nil. The remaining
  # arguments are stored untouched.
  def initialize(text:, geometry: nil, confidence: nil, level: nil, rotation: nil,
                 page_number: nil, parent_id: nil, backend_metadata: nil)
    @text = text.to_s
    @geometry = geometry
    @confidence = confidence
    @level = level.nil? ? nil : level.to_s
    @rotation = rotation
    @page_number = page_number.nil? ? nil : page_number.to_i
    @parent_id = parent_id.nil? ? nil : parent_id.to_s
    @backend_metadata = backend_metadata
  end

  # Hash view of the element. Geometry, confidence and rotation are
  # converted with #to_h when not nil; nil-valued entries are dropped.
  def to_h
    serialized = {
      text: @text,
      geometry: @geometry&.to_h,
      confidence: @confidence&.to_h,
      level: @level,
      rotation: @rotation&.to_h,
      page_number: @page_number,
      parent_id: @parent_id,
      backend_metadata: @backend_metadata
    }
    serialized.reject { |_key, value| value.nil? }
  end
end
|
|
319
|
+
|
|
320
|
+
# Initialize from native hash result
|
|
321
|
+
#
|
|
322
|
+
# @param hash [Hash] Hash returned from native extension
|
|
323
|
+
#
|
|
324
|
+
# rubocop:disable Metrics/AbcSize
|
|
325
|
+
def initialize(hash)
  # Every field is looked up through get_value, with an optional fallback.
  read = ->(key, fallback = nil) { get_value(hash, key, fallback) }

  # Plain fields.
  @content = read.call('content', '')
  @mime_type = read.call('mime_type', '')

  # Metadata is kept both as the raw JSON payload and in parsed form.
  @metadata_json = read.call('metadata_json', '{}')
  @metadata = parse_metadata(@metadata_json)

  # Nested payloads are handed to their dedicated parse_* helpers.
  @tables = parse_tables(read.call('tables'))
  @detected_languages = parse_detected_languages(read.call('detected_languages'))
  @chunks = parse_chunks(read.call('chunks'))
  @images = parse_images(read.call('images'))
  @pages = parse_pages(read.call('pages'))
  @elements = parse_elements(read.call('elements'))
  @ocr_elements = parse_ocr_elements(read.call('ocr_elements'))
  @djot_content = parse_djot_content(read.call('djot_content'))
  @document = parse_document_structure(read.call('document'))
  @extracted_keywords = parse_extracted_keywords(read.call('extracted_keywords'))
  @quality_score = read.call('quality_score')
  @processing_warnings = parse_processing_warnings(read.call('processing_warnings'))
end
|
|
343
|
+
# rubocop:enable Metrics/AbcSize
|
|
344
|
+
|
|
345
|
+
# Convert to hash
|
|
346
|
+
#
|
|
347
|
+
# @return [Hash] Hash representation
|
|
348
|
+
#
|
|
349
|
+
def to_h
  # Nested collections are serialized first, in the same order in which
  # they appear in the resulting hash.
  tables = serialize_tables
  chunks = serialize_chunks
  images = serialize_images
  pages = serialize_pages
  elements = serialize_elements
  ocr_elements = serialize_ocr_elements
  djot = @djot_content&.to_h
  document = @document&.to_h
  keywords = @extracted_keywords&.map { |keyword| keyword.to_h }
  warnings = @processing_warnings.map { |warning| warning.to_h }

  {
    content: @content,
    mime_type: @mime_type,
    metadata: @metadata,
    tables: tables,
    detected_languages: @detected_languages,
    chunks: chunks,
    images: images,
    pages: pages,
    elements: elements,
    ocr_elements: ocr_elements,
    djot_content: djot,
    document: document,
    extracted_keywords: keywords,
    quality_score: @quality_score,
    processing_warnings: warnings
  }
end
|
|
368
|
+
|
|
369
|
+
# Convert to JSON
|
|
370
|
+
#
|
|
371
|
+
# @return [String] JSON representation
|
|
372
|
+
#
|
|
373
|
+
def to_json(*generator_args)
  # Serialize the Hash representation, forwarding any generator arguments.
  hash_form = to_h
  hash_form.to_json(*generator_args)
end
|
|
376
|
+
|
|
377
|
+
# Get the total number of pages in the document
|
|
378
|
+
#
|
|
379
|
+
# @return [Integer] Total page count (>= 0); 0 when page metadata is unavailable
|
|
380
|
+
#
|
|
381
|
+
# @example
|
|
382
|
+
# result = Kreuzberg.extract_file_sync("document.pdf")
|
|
383
|
+
# puts "Document has #{result.page_count} pages"
|
|
384
|
+
#
|
|
385
|
+
def page_count
  # Without parsed metadata, or without a "pages" hash in it, report 0.
  return 0 unless @metadata.is_a?(Hash)

  pages = @metadata['pages']
  return 0 unless pages.is_a?(Hash)

  pages['total_count'] || 0
end
|
|
392
|
+
|
|
393
|
+
# Get the total number of text chunks
|
|
394
|
+
#
|
|
395
|
+
# Returns 0 if chunking was not performed.
|
|
396
|
+
#
|
|
397
|
+
# @return [Integer] Total chunk count (>= 0)
|
|
398
|
+
#
|
|
399
|
+
# @example
|
|
400
|
+
# result = Kreuzberg.extract_file_sync("document.pdf")
|
|
401
|
+
# puts "Document has #{result.chunk_count} chunks"
|
|
402
|
+
#
|
|
403
|
+
def chunk_count
  # A missing chunk list counts as zero chunks.
  return 0 if @chunks.nil?

  @chunks.length || 0
end
|
|
406
|
+
|
|
407
|
+
# Get the primary detected language
|
|
408
|
+
#
|
|
409
|
+
# @return [String, nil] ISO 639 language code (e.g., "en", "de"), or nil if not detected
|
|
410
|
+
#
|
|
411
|
+
# @example
|
|
412
|
+
# result = Kreuzberg.extract_file_sync("document.pdf")
|
|
413
|
+
# lang = result.detected_language
|
|
414
|
+
# puts "Language: #{lang}" if lang
|
|
415
|
+
#
|
|
416
|
+
def detected_language
  # The metadata "language" entry wins when present.
  from_metadata = @metadata['language'] if @metadata.is_a?(Hash)
  return from_metadata if from_metadata

  # Otherwise fall back to the first detected language, if any.
  languages = @detected_languages
  languages.first if languages&.any?
end
|
|
422
|
+
|
|
423
|
+
# Get a metadata field by name
|
|
424
|
+
#
|
|
425
|
+
# Supports dot notation for nested fields (e.g., "format.pages").
|
|
426
|
+
#
|
|
427
|
+
# @param name [String, Symbol] Field name
|
|
428
|
+
# @return [Object, nil] Field value, or nil if field doesn't exist
|
|
429
|
+
#
|
|
430
|
+
# @example Get a top-level field
|
|
431
|
+
# result = Kreuzberg.extract_file_sync("document.pdf")
|
|
432
|
+
# title = result.metadata_field("title")
|
|
433
|
+
# puts "Title: #{title}" if title
|
|
434
|
+
#
|
|
435
|
+
# @example Get a nested field
|
|
436
|
+
# format_info = result.metadata_field("format.pages")
|
|
437
|
+
#
|
|
438
|
+
def metadata_field(name)
  return nil unless @metadata.is_a?(Hash)

  # Walk the dot-separated path one segment at a time; stop with nil as
  # soon as the current node is not a Hash.
  name.to_s.split('.').reduce(@metadata) do |node, segment|
    return nil unless node.is_a?(Hash)

    node[segment]
  end
end
|
|
452
|
+
|
|
453
|
+
private
|
|
454
|
+
|
|
455
|
+
# Tables converted to plain Hashes.
def serialize_tables
  @tables.map { |table| table.to_h }
end
|
|
458
|
+
|
|
459
|
+
# Chunks converted to plain Hashes; nil when there is no chunk list.
def serialize_chunks
  return nil if @chunks.nil?

  @chunks.map { |chunk| chunk.to_h }
end
|
|
462
|
+
|
|
463
|
+
# Images converted to plain Hashes; nil when there is no image list.
def serialize_images
  return nil if @images.nil?

  @images.map { |image| image.to_h }
end
|
|
466
|
+
|
|
467
|
+
# Pages converted to plain Hashes; nil when there is no page list.
def serialize_pages
  return nil if @pages.nil?

  @pages.map { |page| page.to_h }
end
|
|
470
|
+
|
|
471
|
+
# Elements converted to plain Hashes; nil when there is no element list.
def serialize_elements
  return nil if @elements.nil?

  @elements.map { |element| element.to_h }
end
|
|
474
|
+
|
|
475
|
+
# OCR elements converted to plain Hashes; nil when there is no OCR list.
def serialize_ocr_elements
  return nil if @ocr_elements.nil?

  @ocr_elements.map { |ocr_element| ocr_element.to_h }
end
|
|
478
|
+
|
|
479
|
+
# Look up a field under its String key, then under its Symbol key.
#
# Only nil counts as "missing": a stored `false` is returned as-is rather
# than being replaced by the default, which is what chaining the lookups
# with `||` would do.
#
# @param hash [Hash] Hash to read from
# @param key [String] Key name; the Symbol form is tried as a fallback
# @param default [Object, nil] Value returned when both lookups yield nil
# @return [Object, nil] First non-nil value found, otherwise the default
def get_value(hash, key, default = nil)
  [key, key.to_sym].each do |candidate|
    value = hash[candidate]
    return value unless value.nil?
  end

  default
end
|
|
482
|
+
|
|
483
|
+
# Parse the metadata JSON payload on a best-effort basis.
#
# Malformed JSON and non-String input (for which JSON.parse raises
# TypeError) both fall back to an empty Hash, so a bad payload never
# aborts construction of the result.
#
# @param metadata_json [String] JSON document to parse
# @return [Object] Parsed JSON value, or {} when the input cannot be parsed
def parse_metadata(metadata_json)
  JSON.parse(metadata_json)
rescue JSON::ParserError, TypeError
  {}
end
|
|
488
|
+
|
|
489
|
+
# Build Table structs from raw table hashes; nil or empty input gives [].
def parse_tables(tables_data)
  return [] if tables_data.nil?
  return [] if tables_data.empty?

  tables_data.map do |raw|
    box = parse_bounding_box(raw['bounding_box'])
    attributes = {
      cells: raw['cells'] || [],
      markdown: raw['markdown'] || '',
      page_number: raw['page_number'] || 0,
      bounding_box: box
    }
    Table.new(**attributes)
  end
end
|
|
502
|
+
|
|
503
|
+
# nil stays nil, an Array is passed through, anything else becomes [].
def parse_detected_languages(langs_data)
  if langs_data.nil?
    nil
  elsif langs_data.is_a?(Array)
    langs_data
  else
    []
  end
end
|
|
508
|
+
|
|
509
|
+
# Build Chunk structs from raw chunk hashes; nil or empty input gives [].
def parse_chunks(chunks_data)
  return [] if chunks_data.nil?
  return [] if chunks_data.empty?

  # Each chunk field is copied from the identically named String key.
  field_names = %w[content byte_start byte_end token_count chunk_index total_chunks first_page last_page embedding]
  chunks_data.map do |raw|
    attributes = field_names.to_h { |field| [field.to_sym, raw[field]] }
    Chunk.new(**attributes)
  end
end
|
|
526
|
+
|
|
527
|
+
# Parse every raw image hash; nil input is passed through as nil.
def parse_images(images_data)
  images_data&.map { |raw| parse_single_image(raw) }
end
|
|
532
|
+
|
|
533
|
+
# Build one Image struct from a raw image hash.
def parse_single_image(image_hash)
  # Tag a copy of the payload as binary when it supports re-encoding.
  raw_bytes = image_hash['data']
  raw_bytes = raw_bytes.dup.force_encoding(Encoding::BINARY) if raw_bytes.respond_to?(:force_encoding)

  attributes = { data: raw_bytes }
  scalar_keys = %w[format image_index page_number width height colorspace bits_per_component is_mask description]
  scalar_keys.each { |key| attributes[key.to_sym] = image_hash[key] }

  attributes[:bounding_box] = parse_bounding_box(image_hash['bounding_box'])
  # A nested OCR payload is wrapped in its own Result.
  attributes[:ocr_result] = image_hash['ocr_result'] ? Result.new(image_hash['ocr_result']) : nil

  Image.new(**attributes)
end
|
|
551
|
+
|
|
552
|
+
# Build PageContent structs from raw page hashes; nil input stays nil.
def parse_pages(pages_data)
  return nil if pages_data.nil?

  pages_data.map do |raw|
    attributes = {
      page_number: raw['page_number'],
      content: raw['content'],
      tables: parse_tables(raw['tables']),
      images: parse_images(raw['images']),
      hierarchy: parse_page_hierarchy(raw['hierarchy']),
      is_blank: raw['is_blank']
    }
    PageContent.new(**attributes)
  end
end
|
|
566
|
+
|
|
567
|
+
# Build a PageHierarchy from its raw hash; nil input stays nil.
def parse_page_hierarchy(hierarchy_data)
  return nil if hierarchy_data.nil?

  raw_blocks = hierarchy_data['blocks'] || []
  parsed_blocks = raw_blocks.map do |raw|
    size = raw['font_size']
    HierarchicalBlock.new(
      text: raw['text'],
      font_size: size.nil? ? nil : size.to_f,
      level: raw['level'],
      bbox: raw['bbox']
    )
  end

  # A missing block count defaults to 0.
  count = hierarchy_data['block_count'] || 0
  PageHierarchy.new(block_count: count, blocks: parsed_blocks)
end
|
|
584
|
+
|
|
585
|
+
# Parse every raw element hash; nil input stays nil.
def parse_elements(elements_data)
  if elements_data.nil?
    nil
  else
    elements_data.map { |raw| parse_element(raw) }
  end
end
|
|
590
|
+
|
|
591
|
+
# Build one ElementStruct, including its metadata struct, from a raw hash.
def parse_element(element_hash)
  # A missing metadata section is treated as an empty hash.
  raw_meta = element_hash['metadata'] || {}
  box = parse_element_coordinates(raw_meta['coordinates'])

  meta_attributes = {
    page_number: raw_meta['page_number'],
    filename: raw_meta['filename'],
    coordinates: box,
    element_index: raw_meta['element_index'],
    additional: raw_meta['additional'] || {}
  }
  element_metadata = ElementMetadataStruct.new(**meta_attributes)

  element_attributes = {
    element_id: element_hash['element_id'],
    element_type: element_hash['element_type'],
    text: element_hash['text'],
    metadata: element_metadata
  }
  ElementStruct.new(**element_attributes)
end
|
|
610
|
+
|
|
611
|
+
# Build an ElementBoundingBox from raw coordinates; nil input stays nil.
def parse_element_coordinates(coordinates_data)
  return nil if coordinates_data.nil?

  # Each coordinate is read from its String key and converted with #to_f.
  corners = %w[x0 y0 x1 y1].to_h { |axis| [axis.to_sym, coordinates_data[axis].to_f] }
  ElementBoundingBox.new(**corners)
end
|
|
621
|
+
|
|
622
|
+
# Normalize a bounding box: nil stays nil, an existing BoundingBox is
# returned unchanged, and anything else is read as a hash of coordinates.
def parse_bounding_box(bounding_box_data)
  if bounding_box_data.nil?
    nil
  elsif bounding_box_data.is_a?(BoundingBox)
    bounding_box_data
  else
    corners = %w[x0 y0 x1 y1].to_h { |axis| [axis.to_sym, bounding_box_data[axis].to_f] }
    BoundingBox.new(**corners)
  end
end
|
|
636
|
+
|
|
637
|
+
# Build OcrElement objects from raw OCR hashes; nil input stays nil.
def parse_ocr_elements(ocr_elements_data)
  return nil if ocr_elements_data.nil?

  ocr_elements_data.map do |raw|
    attributes = {
      text: raw['text'],
      geometry: parse_ocr_geometry(raw['geometry']),
      confidence: parse_ocr_confidence(raw['confidence']),
      level: raw['level'],
      rotation: parse_ocr_rotation(raw['rotation']),
      page_number: raw['page_number'],
      parent_id: raw['parent_id'],
      backend_metadata: raw['backend_metadata']
    }
    OcrElement.new(**attributes)
  end
end
|
|
653
|
+
|
|
654
|
+
# Build an OcrBoundingGeometry from a Hash; any other input gives nil.
def parse_ocr_geometry(data)
  return nil unless data.is_a?(Hash)

  attributes = %w[type left top width height points].to_h { |field| [field.to_sym, data[field]] }
  OcrBoundingGeometry.new(**attributes)
end
|
|
662
|
+
|
|
663
|
+
# Build an OcrConfidence from a Hash; any other input gives nil.
def parse_ocr_confidence(data)
  return nil unless data.is_a?(Hash)

  detection_score = data['detection']
  recognition_score = data['recognition']
  OcrConfidence.new(detection: detection_score, recognition: recognition_score)
end
|
|
668
|
+
|
|
669
|
+
# Build an OcrRotation from a Hash; any other input gives nil.
def parse_ocr_rotation(data)
  return nil unless data.is_a?(Hash)

  angle = data['angle_degrees']
  certainty = data['confidence']
  OcrRotation.new(angle_degrees: angle, confidence: certainty)
end
|
|
674
|
+
|
|
675
|
+
# Wrap the raw djot payload in a DjotContent; nil input stays nil.
def parse_djot_content(djot_data)
  djot_data.nil? ? nil : DjotContent.new(djot_data)
end
|
|
680
|
+
|
|
681
|
+
# Wrap the raw document payload in a DocumentStructure; nil input stays nil.
def parse_document_structure(document_data)
  DocumentStructure.new(document_data) unless document_data.nil?
end
|
|
686
|
+
|
|
687
|
+
# Build ExtractedKeyword objects from raw keyword hashes; nil input stays nil.
def parse_extracted_keywords(keywords_data)
  return nil if keywords_data.nil?

  keywords_data.map do |raw|
    attributes = {
      text: raw['text'] || '',
      # A missing score defaults to 0.0 before the Float conversion.
      score: (raw['score'] || 0.0).to_f,
      algorithm: raw['algorithm'] || '',
      positions: raw['positions']
    }
    Kreuzberg::ExtractedKeyword.new(**attributes)
  end
end
|
|
699
|
+
|
|
700
|
+
# Build ProcessingWarning objects from raw warning hashes; nil input gives [].
def parse_processing_warnings(warnings_data)
  return [] if warnings_data.nil?

  warnings_data.map do |raw|
    origin = raw['source'] || ''
    text = raw['message'] || ''
    Kreuzberg::ProcessingWarning.new(source: origin, message: text)
  end
end
|
|
710
|
+
end
|
|
711
|
+
# rubocop:enable Metrics/ClassLength
|
|
712
|
+
end
|