kreuzberg 4.3.5-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +543 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +260 -0
  8. data/README.md +399 -0
  9. data/Rakefile +34 -0
  10. data/Steepfile +51 -0
  11. data/examples/async_patterns.rb +283 -0
  12. data/extconf.rb +60 -0
  13. data/kreuzberg.gemspec +253 -0
  14. data/lib/kreuzberg/api_proxy.rb +125 -0
  15. data/lib/kreuzberg/cache_api.rb +67 -0
  16. data/lib/kreuzberg/cli.rb +57 -0
  17. data/lib/kreuzberg/cli_proxy.rb +118 -0
  18. data/lib/kreuzberg/config.rb +1241 -0
  19. data/lib/kreuzberg/djot_content.rb +225 -0
  20. data/lib/kreuzberg/document_structure.rb +204 -0
  21. data/lib/kreuzberg/error_context.rb +136 -0
  22. data/lib/kreuzberg/errors.rb +116 -0
  23. data/lib/kreuzberg/extraction_api.rb +329 -0
  24. data/lib/kreuzberg/mcp_proxy.rb +176 -0
  25. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
  26. data/lib/kreuzberg/post_processor_protocol.rb +15 -0
  27. data/lib/kreuzberg/result.rb +712 -0
  28. data/lib/kreuzberg/setup_lib_path.rb +99 -0
  29. data/lib/kreuzberg/types.rb +414 -0
  30. data/lib/kreuzberg/validator_protocol.rb +16 -0
  31. data/lib/kreuzberg/version.rb +5 -0
  32. data/lib/kreuzberg.rb +102 -0
  33. data/lib/kreuzberg_rb.so +0 -0
  34. data/lib/libpdfium.so +0 -0
  35. data/sig/kreuzberg/internal.rbs +184 -0
  36. data/sig/kreuzberg.rbs +1337 -0
  37. data/spec/binding/async_operations_spec.rb +473 -0
  38. data/spec/binding/batch_operations_spec.rb +677 -0
  39. data/spec/binding/batch_spec.rb +360 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +85 -0
  42. data/spec/binding/cli_spec.rb +55 -0
  43. data/spec/binding/config_result_spec.rb +377 -0
  44. data/spec/binding/config_spec.rb +419 -0
  45. data/spec/binding/config_validation_spec.rb +377 -0
  46. data/spec/binding/embeddings_spec.rb +816 -0
  47. data/spec/binding/error_handling_spec.rb +399 -0
  48. data/spec/binding/error_recovery_spec.rb +488 -0
  49. data/spec/binding/errors_spec.rb +66 -0
  50. data/spec/binding/font_config_spec.rb +220 -0
  51. data/spec/binding/images_spec.rb +732 -0
  52. data/spec/binding/keywords_extraction_spec.rb +600 -0
  53. data/spec/binding/metadata_types_spec.rb +1253 -0
  54. data/spec/binding/pages_extraction_spec.rb +550 -0
  55. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  56. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  57. data/spec/binding/plugins/validator_spec.rb +273 -0
  58. data/spec/binding/tables_spec.rb +650 -0
  59. data/spec/fixtures/config.toml +38 -0
  60. data/spec/fixtures/config.yaml +41 -0
  61. data/spec/fixtures/invalid_config.toml +3 -0
  62. data/spec/serialization_spec.rb +134 -0
  63. data/spec/smoke/package_spec.rb +177 -0
  64. data/spec/spec_helper.rb +40 -0
  65. data/spec/unit/config/chunking_config_spec.rb +213 -0
  66. data/spec/unit/config/embedding_config_spec.rb +343 -0
  67. data/spec/unit/config/extraction_config_spec.rb +434 -0
  68. data/spec/unit/config/font_config_spec.rb +285 -0
  69. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  70. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  71. data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
  72. data/spec/unit/config/keyword_config_spec.rb +229 -0
  73. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  74. data/spec/unit/config/ocr_config_spec.rb +171 -0
  75. data/spec/unit/config/output_format_spec.rb +380 -0
  76. data/spec/unit/config/page_config_spec.rb +221 -0
  77. data/spec/unit/config/pdf_config_spec.rb +267 -0
  78. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  79. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  80. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  81. data/test/metadata_types_test.rb +959 -0
  82. metadata +292 -0
@@ -0,0 +1,712 @@
1
+ # frozen_string_literal: true
2
+
3
+ begin
4
+ require 'json'
5
+ rescue LoadError
6
+ require 'json/pure'
7
+ end
8
+
9
+ require_relative 'document_structure'
10
+
11
+ module Kreuzberg
12
+ # Extraction result built from the hash returned by the native extension.
13
+ # rubocop:disable Metrics/ClassLength
14
+ class Result
15
+ attr_reader :content, :mime_type, :metadata, :metadata_json, :tables,
16
+ :detected_languages, :chunks, :images, :pages, :elements, :ocr_elements, :djot_content,
17
+ :document, :extracted_keywords, :quality_score, :processing_warnings
18
+
19
# A table extracted from a document.
#
# @!attribute [r] cells
#   @return [Array<Array<String>>] Table cells (2D array)
# @!attribute [r] markdown
#   @return [String] Markdown representation
# @!attribute [r] page_number
#   @return [Integer] Page number where table was found
# @!attribute [r] bounding_box
#   @return [BoundingBox, nil] Bounding box of the table on the page
Table = Struct.new(:cells, :markdown, :page_number, :bounding_box, keyword_init: true) do
  # @return [Hash] members keyed by symbol; the bounding box is converted
  #   with +to_h+ when present and left as nil otherwise
  def to_h
    members.zip(to_a).to_h.merge(bounding_box: bounding_box&.to_h)
  end
end
32
+
33
# A text chunk produced by the chunking step.
#
# @!attribute [r] content
#   @return [String] Chunk content
# @!attribute [r] byte_start
#   @return [Integer] Starting byte offset (UTF-8)
# @!attribute [r] byte_end
#   @return [Integer] Ending byte offset (UTF-8)
# @!attribute [r] token_count
#   @return [Integer, nil] Approximate token count (may be nil)
# @!attribute [r] chunk_index
#   @return [Object] Chunk index (NOTE(review): presumably an Integer position; confirm)
# @!attribute [r] total_chunks
#   @return [Object] Total chunk count (NOTE(review): presumably an Integer; confirm)
# @!attribute [r] first_page
#   @return [Integer, nil] First page number (1-indexed)
# @!attribute [r] last_page
#   @return [Integer, nil] Last page number (1-indexed)
# @!attribute [r] embedding
#   @return [Object, nil] Embedding value (NOTE(review): presumably Array<Float>; confirm)
Chunk = Struct.new(
  :content,
  :byte_start,
  :byte_end,
  :token_count,
  :chunk_index,
  :total_chunks,
  :first_page,
  :last_page,
  :embedding,
  keyword_init: true
) do
  # @return [Hash] every member, keyed by symbol, in declaration order
  def to_h
    members.zip(to_a).to_h
  end
end
71
+
72
# An image extracted from a document, with optional bounding box and
# optional nested OCR result.
Image = Struct.new(
  :data,
  :format,
  :image_index,
  :page_number,
  :width,
  :height,
  :colorspace,
  :bits_per_component,
  :is_mask,
  :description,
  :bounding_box,
  :ocr_result,
  keyword_init: true
) do
  # @return [Hash] members keyed by symbol; +bounding_box+ and +ocr_result+
  #   are converted with +to_h+ when present
  def to_h
    plain = members.zip(to_a).to_h
    plain.merge(bounding_box: bounding_box&.to_h, ocr_result: ocr_result&.to_h)
  end
end
104
+
105
# A block of text within a page hierarchy.
#
# @!attribute [r] text
#   @return [String] The text content of this block
# @!attribute [r] font_size
#   @return [Float] The font size of the text
# @!attribute [r] level
#   @return [String] The hierarchy level (h1-h6 or body)
# @!attribute [r] bbox
#   @return [Array<Float>, nil] Bounding box (left, top, right, bottom)
HierarchicalBlock = Struct.new(:text, :font_size, :level, :bbox, keyword_init: true) do
  # @return [Hash] every member, keyed by symbol, in declaration order
  def to_h
    each_pair.to_h
  end
end
126
+
127
# Hierarchy information for a single page.
#
# @!attribute [r] block_count
#   @return [Integer] Number of hierarchy blocks
# @!attribute [r] blocks
#   @return [Array<HierarchicalBlock>] Hierarchical blocks
PageHierarchy = Struct.new(:block_count, :blocks, keyword_init: true) do
  # @return [Hash] the block count plus each block converted with +to_h+
  def to_h
    each_pair.to_h.merge(blocks: blocks.map(&:to_h))
  end
end
136
+
137
# Content extracted from a single page.
#
# @!attribute [r] page_number
#   @return [Integer] Page number (1-indexed)
# @!attribute [r] content
#   @return [String] Text content for this page
# @!attribute [r] tables
#   @return [Array<Table>] Tables on this page
# @!attribute [r] images
#   @return [Array<Image>, nil] Images on this page (nil when none were supplied)
# @!attribute [r] hierarchy
#   @return [PageHierarchy, nil] Hierarchy information for the page
# @!attribute [r] is_blank
#   @return [Object] Blank-page flag (NOTE(review): presumably Boolean or nil; confirm)
PageContent = Struct.new(:page_number, :content, :tables, :images, :hierarchy, :is_blank, keyword_init: true) do
  # Convert the page to a plain hash.
  #
  # +tables+ and +images+ may be nil (a page parsed without an image list
  # stores nil), so both are serialized nil-safely instead of raising
  # NoMethodError.
  #
  # @return [Hash] members keyed by symbol, nested values converted with +to_h+
  def to_h
    {
      page_number: page_number,
      content: content,
      tables: tables&.map(&:to_h),
      images: images&.map(&:to_h),
      hierarchy: hierarchy&.to_h,
      is_blank: is_blank
    }
  end
end
159
+
160
# Bounding box coordinates of an element.
#
# @!attribute [r] x0
#   @return [Float] Left x-coordinate
# @!attribute [r] y0
#   @return [Float] Bottom y-coordinate
# @!attribute [r] x1
#   @return [Float] Right x-coordinate
# @!attribute [r] y1
#   @return [Float] Top y-coordinate
ElementBoundingBox = Struct.new(:x0, :y0, :x1, :y1, keyword_init: true) do
  # @return [Hash] the four coordinates keyed by symbol
  def to_h
    each_pair.to_h
  end
end
173
+
174
# Metadata attached to an element.
#
# @!attribute [r] page_number
#   @return [Integer, nil] Page number (1-indexed)
# @!attribute [r] filename
#   @return [String, nil] Source filename or document name
# @!attribute [r] coordinates
#   @return [ElementBoundingBox, nil] Bounding box coordinates if available
# @!attribute [r] element_index
#   @return [Integer, nil] Position index in the element sequence
# @!attribute [r] additional
#   @return [Hash<String, String>] Additional custom metadata
ElementMetadataStruct = Struct.new(
  :page_number,
  :filename,
  :coordinates,
  :element_index,
  :additional,
  keyword_init: true
) do
  # @return [Hash] members keyed by symbol; coordinates converted with +to_h+ when present
  def to_h
    each_pair.to_h.merge(coordinates: coordinates&.to_h)
  end
end
202
+
203
# A semantic element extracted from a document.
#
# @!attribute [r] element_id
#   @return [String] Unique element identifier
# @!attribute [r] element_type
#   @return [String] Semantic type of the element
# @!attribute [r] text
#   @return [String] Text content of the element
# @!attribute [r] metadata
#   @return [ElementMetadataStruct] Metadata about the element
ElementStruct = Struct.new(:element_id, :element_type, :text, :metadata, keyword_init: true) do
  # @return [Hash] members keyed by symbol; metadata converted with +to_h+ when present
  def to_h
    each_pair.to_h.merge(metadata: metadata&.to_h)
  end
end
221
+
222
# OCR bounding geometry with type and coordinates
class OcrBoundingGeometry
  attr_reader :type, :left, :top, :width, :height, :points

  # @param type [#to_s] geometry kind; stored as a String
  # @param left [#to_f, nil] stored as a Float when given
  # @param top [#to_f, nil] stored as a Float when given
  # @param width [#to_f, nil] stored as a Float when given
  # @param height [#to_f, nil] stored as a Float when given
  # @param points [Object, nil] stored unchanged
  def initialize(type:, left: nil, top: nil, width: nil, height: nil, points: nil)
    @type = type.to_s
    @left, @top, @width, @height = [left, top, width, height].map { |value| value&.to_f }
    @points = points
  end

  # @return [Hash] attributes keyed by symbol, omitting those that are nil
  def to_h
    %i[type left top width height points].each_with_object({}) do |name, out|
      value = public_send(name)
      out[name] = value unless value.nil?
    end
  end
end
246
+
247
# OCR confidence scores for detection and recognition
class OcrConfidence
  attr_reader :detection, :recognition

  # @param detection [#to_f, nil] stored as a Float when given
  # @param recognition [#to_f, nil] stored as a Float when given
  def initialize(detection: nil, recognition: nil)
    @detection, @recognition = [detection, recognition].map { |score| score&.to_f }
  end

  # @return [Hash] the scores keyed by symbol, omitting those that are nil
  def to_h
    { detection: detection, recognition: recognition }.reject { |_name, score| score.nil? }
  end
end
263
+
264
# OCR rotation information
class OcrRotation
  attr_reader :angle_degrees, :confidence

  # @param angle_degrees [#to_f, nil] stored as a Float when given
  # @param confidence [#to_f, nil] stored as a Float when given
  def initialize(angle_degrees: nil, confidence: nil)
    @angle_degrees, @confidence = [angle_degrees, confidence].map { |value| value&.to_f }
  end

  # @return [Hash] the attributes keyed by symbol, omitting those that are nil
  def to_h
    { angle_degrees: angle_degrees, confidence: confidence }.reject { |_name, value| value.nil? }
  end
end
280
+
281
# OCR text element with geometry and metadata
class OcrElement
  attr_reader :text, :geometry, :confidence, :level, :rotation,
              :page_number, :parent_id, :backend_metadata

  # @param text [#to_s] recognized text; stored as a String
  # @param geometry [#to_h, nil] stored unchanged
  # @param confidence [#to_h, nil] stored unchanged
  # @param level [#to_s, nil] stored as a String when given
  # @param rotation [#to_h, nil] stored unchanged
  # @param page_number [#to_i, nil] stored as an Integer when given
  # @param parent_id [#to_s, nil] stored as a String when given
  # @param backend_metadata [Object, nil] stored unchanged
  def initialize(
    text:,
    geometry: nil,
    confidence: nil,
    level: nil,
    rotation: nil,
    page_number: nil,
    parent_id: nil,
    backend_metadata: nil
  )
    @text = text.to_s
    @geometry = geometry
    @confidence = confidence
    @level = level&.to_s
    @rotation = rotation
    @page_number = page_number&.to_i
    @parent_id = parent_id&.to_s
    @backend_metadata = backend_metadata
  end

  # @return [Hash] attributes keyed by symbol with nested objects converted
  #   via +to_h+; entries whose value is nil are omitted
  def to_h
    serialized = {
      text: text,
      geometry: geometry&.to_h,
      confidence: confidence&.to_h,
      level: level,
      rotation: rotation&.to_h,
      page_number: page_number,
      parent_id: parent_id,
      backend_metadata: backend_metadata
    }
    serialized.reject { |_name, value| value.nil? }
  end
end
319
+
320
+ # Initialize from native hash result
321
+ #
322
+ # @param hash [Hash] Hash returned from native extension
323
+ #
324
+ # rubocop:disable Metrics/AbcSize
325
def initialize(hash)
  # Every lookup goes through get_value, which accepts String or Symbol keys.
  field = ->(key, fallback = nil) { get_value(hash, key, fallback) }

  @content = field.call('content', '')
  @mime_type = field.call('mime_type', '')
  @metadata_json = field.call('metadata_json', '{}')
  # Metadata is decoded from the raw JSON kept in @metadata_json.
  @metadata = parse_metadata(@metadata_json)
  @tables = parse_tables(field.call('tables'))
  @detected_languages = parse_detected_languages(field.call('detected_languages'))
  @chunks = parse_chunks(field.call('chunks'))
  @images = parse_images(field.call('images'))
  @pages = parse_pages(field.call('pages'))
  @elements = parse_elements(field.call('elements'))
  @ocr_elements = parse_ocr_elements(field.call('ocr_elements'))
  @djot_content = parse_djot_content(field.call('djot_content'))
  @document = parse_document_structure(field.call('document'))
  @extracted_keywords = parse_extracted_keywords(field.call('extracted_keywords'))
  @quality_score = field.call('quality_score')
  @processing_warnings = parse_processing_warnings(field.call('processing_warnings'))
end
343
+ # rubocop:enable Metrics/AbcSize
344
+
345
+ # Convert to hash
346
+ #
347
+ # @return [Hash] Hash representation
348
+ #
349
def to_h
  # Built incrementally in the same key order as the public attributes.
  result = { content: @content, mime_type: @mime_type, metadata: @metadata }
  result[:tables] = serialize_tables
  result[:detected_languages] = @detected_languages
  result[:chunks] = serialize_chunks
  result[:images] = serialize_images
  result[:pages] = serialize_pages
  result[:elements] = serialize_elements
  result[:ocr_elements] = serialize_ocr_elements
  # Optional sections stay nil when they were not produced.
  result[:djot_content] = @djot_content&.to_h
  result[:document] = @document&.to_h
  result[:extracted_keywords] = @extracted_keywords&.map(&:to_h)
  result[:quality_score] = @quality_score
  result[:processing_warnings] = @processing_warnings.map(&:to_h)
  result
end
368
+
369
+ # Convert to JSON
370
+ #
371
+ # @return [String] JSON representation
372
+ #
373
def to_json(*args)
  # Serialize the hash form, forwarding any generator arguments unchanged.
  to_h.to_json(*args)
end
376
+
377
# Get the total number of pages in the document
#
# Reads +pages.total_count+ from the parsed metadata.
#
# @return [Integer] Total page count, or 0 when the metadata does not provide one
#
# @example
#   result = Kreuzberg.extract_file_sync("document.pdf")
#   puts "Document has #{result.page_count} pages"
#
def page_count
  return 0 unless @metadata.is_a?(Hash)

  pages = @metadata['pages']
  return 0 unless pages.is_a?(Hash)

  pages['total_count'] || 0
end
392
+
393
# Get the total number of text chunks
#
# Returns 0 if chunking was not performed.
#
# @return [Integer] Number of chunks, or 0 when there are none
#
# @example
#   result = Kreuzberg.extract_file_sync("document.pdf")
#   puts "Document has #{result.chunk_count} chunks"
#
def chunk_count
  count = @chunks.length unless @chunks.nil?
  count || 0
end
406
+
407
+ # Get the primary detected language
408
+ #
409
+ # @return [String, nil] ISO 639 language code (e.g., "en", "de"), or nil if not detected
410
+ #
411
+ # @example
412
+ # result = Kreuzberg.extract_file_sync("document.pdf")
413
+ # lang = result.detected_language
414
+ # puts "Language: #{lang}" if lang
415
+ #
416
def detected_language
  # The metadata entry wins whenever it is present.
  from_metadata = @metadata.is_a?(Hash) ? @metadata['language'] : nil
  return from_metadata if from_metadata

  # Otherwise fall back to the first detected language, if any were detected.
  candidates = @detected_languages
  candidates&.any? ? candidates.first : nil
end
422
+
423
+ # Get a metadata field by name
424
+ #
425
+ # Supports dot notation for nested fields (e.g., "format.pages").
426
+ #
427
+ # @param name [String, Symbol] Field name
428
+ # @return [Object, nil] Field value, or nil if field doesn't exist
429
+ #
430
+ # @example Get a top-level field
431
+ # result = Kreuzberg.extract_file_sync("document.pdf")
432
+ # title = result.metadata_field("title")
433
+ # puts "Title: #{title}" if title
434
+ #
435
+ # @example Get a nested field
436
+ # format_info = result.metadata_field("format.pages")
437
+ #
438
def metadata_field(name)
  return nil unless @metadata.is_a?(Hash)

  # Walk the dot-separated path one key at a time; stop with nil as soon as
  # the current node is not a Hash.
  name.to_s.split('.').reduce(@metadata) do |node, key|
    break nil unless node.is_a?(Hash)

    node[key]
  end
end
452
+
453
+ private
454
+
455
# Hash forms of the parsed collections, used by #to_h. Every collection except
# @tables is serialized nil-safely, so a nil collection stays nil.
def serialize_tables = @tables.map(&:to_h)

def serialize_chunks = @chunks&.map(&:to_h)

def serialize_images = @images&.map(&:to_h)

def serialize_pages = @pages&.map(&:to_h)

def serialize_elements = @elements&.map(&:to_h)

def serialize_ocr_elements = @ocr_elements&.map(&:to_h)
478
+
479
# Look up +key+ in +hash+, trying the key as given and then its Symbol form.
#
# @param hash [Hash] source hash
# @param key [String] key to look up
# @param default [Object, nil] returned when neither lookup yields a truthy value
# @return [Object, nil]
def get_value(hash, key, default = nil)
  found = hash[key]
  found = hash[key.to_sym] unless found
  found || default
end
482
+
483
# Decode the metadata JSON document.
#
# @param metadata_json [String, Hash, nil] JSON text, or an already-decoded Hash
# @return [Object] the decoded metadata; an empty Hash when the input is nil,
#   is not valid JSON, or is of a type JSON.parse cannot read
def parse_metadata(metadata_json)
  # An already-decoded Hash needs no parsing; JSON.parse would raise TypeError on it.
  return metadata_json if metadata_json.is_a?(Hash)

  JSON.parse(metadata_json)
rescue JSON::ParserError, TypeError
  # nil and other non-String inputs raise TypeError rather than ParserError.
  {}
end
488
+
489
# Build Table structs from raw table hashes.
#
# @param tables_data [Array<Hash>, nil] raw tables with String keys
# @return [Array<Table>] empty when the input is nil or empty
def parse_tables(tables_data)
  return [] if tables_data.nil?
  return [] if tables_data.empty?

  tables_data.map do |raw|
    box = parse_bounding_box(raw['bounding_box'])
    # Missing fields fall back to an empty grid, empty markdown and page 0.
    Table.new(cells: raw['cells'] || [], markdown: raw['markdown'] || '',
              page_number: raw['page_number'] || 0, bounding_box: box)
  end
end
502
+
503
# Normalize the detected-languages value.
#
# @param langs_data [Array, Object, nil] raw value
# @return [Array, nil] nil for nil input, the array itself for an Array,
#   and an empty array for anything else
def parse_detected_languages(langs_data)
  if langs_data.nil?
    nil
  elsif langs_data.is_a?(Array)
    langs_data
  else
    []
  end
end
508
+
509
# Build Chunk structs from raw chunk hashes.
#
# @param chunks_data [Array<Hash>, nil] raw chunks with String keys
# @return [Array<Chunk>] empty when the input is nil or empty
def parse_chunks(chunks_data)
  return [] if chunks_data.nil?
  return [] if chunks_data.empty?

  # Each field is copied verbatim from the String key of the same name.
  fields = %w[content byte_start byte_end token_count chunk_index
              total_chunks first_page last_page embedding]
  chunks_data.map do |raw|
    Chunk.new(**fields.to_h { |field| [field.to_sym, raw[field]] })
  end
end
526
+
527
# Build Image structs from raw image hashes.
#
# @param images_data [Array<Hash>, nil] raw images
# @return [Array<Image>, nil] nil when the input is nil
def parse_images(images_data)
  images_data&.map { |raw| parse_single_image(raw) }
end
532
+
533
# Build one Image from a raw image hash.
#
# @param image_hash [Hash] raw image with String keys
# @return [Image]
def parse_single_image(image_hash)
  # Work on a copy so the caller's string keeps its encoding; image bytes are
  # tagged as binary.
  bytes = image_hash['data']
  bytes = bytes.dup.force_encoding(Encoding::BINARY) if bytes.respond_to?(:force_encoding)

  copied = %w[format image_index page_number width height colorspace
              bits_per_component is_mask description]
  attributes = copied.to_h { |field| [field.to_sym, image_hash[field]] }
  box = parse_bounding_box(image_hash['bounding_box'])
  # A nested OCR result, when present, is itself wrapped in a Result.
  nested = image_hash['ocr_result']

  Image.new(data: bytes, **attributes, bounding_box: box,
            ocr_result: nested ? Result.new(nested) : nil)
end
551
+
552
# Build PageContent structs from raw page hashes.
#
# @param pages_data [Array<Hash>, nil] raw pages with String keys
# @return [Array<PageContent>, nil] nil when the input is nil
def parse_pages(pages_data)
  pages_data&.map do |raw|
    page_tables = parse_tables(raw['tables'])
    page_images = parse_images(raw['images'])
    page_hierarchy = parse_page_hierarchy(raw['hierarchy'])

    PageContent.new(page_number: raw['page_number'], content: raw['content'],
                    tables: page_tables, images: page_images,
                    hierarchy: page_hierarchy, is_blank: raw['is_blank'])
  end
end
566
+
567
# Build a PageHierarchy from raw hierarchy data.
#
# @param hierarchy_data [Hash, nil] raw hierarchy with String keys
# @return [PageHierarchy, nil] nil when the input is nil
def parse_page_hierarchy(hierarchy_data)
  return nil if hierarchy_data.nil?

  raw_blocks = hierarchy_data['blocks'] || []
  parsed_blocks = raw_blocks.map do |raw|
    # The font size is coerced to Float when present.
    HierarchicalBlock.new(text: raw['text'], font_size: raw['font_size']&.to_f,
                          level: raw['level'], bbox: raw['bbox'])
  end

  PageHierarchy.new(block_count: hierarchy_data['block_count'] || 0, blocks: parsed_blocks)
end
584
+
585
# Build ElementStruct values from raw element hashes.
#
# @param elements_data [Array<Hash>, nil] raw elements
# @return [Array<ElementStruct>, nil] nil when the input is nil
def parse_elements(elements_data)
  elements_data&.map { |raw| parse_element(raw) }
end
590
+
591
# Build one ElementStruct, including its metadata, from a raw element hash.
#
# @param element_hash [Hash] raw element with String keys
# @return [ElementStruct]
def parse_element(element_hash)
  # Absent metadata is treated as an empty hash, so every metadata field is nil
  # except +additional+, which defaults to {}.
  raw_meta = element_hash['metadata'] || {}
  box = parse_element_coordinates(raw_meta['coordinates'])

  meta = ElementMetadataStruct.new(
    page_number: raw_meta['page_number'],
    filename: raw_meta['filename'],
    coordinates: box,
    element_index: raw_meta['element_index'],
    additional: raw_meta['additional'] || {}
  )

  own_fields = %w[element_id element_type text].to_h { |field| [field.to_sym, element_hash[field]] }
  ElementStruct.new(**own_fields, metadata: meta)
end
610
+
611
# Build an ElementBoundingBox from raw coordinates.
#
# @param coordinates_data [Hash, nil] raw coordinates keyed 'x0', 'y0', 'x1', 'y1'
# @return [ElementBoundingBox, nil] nil when the input is nil
def parse_element_coordinates(coordinates_data)
  return nil if coordinates_data.nil?

  # Each coordinate is coerced with to_f, so a missing entry becomes 0.0.
  coords = %w[x0 y0 x1 y1].to_h { |axis| [axis.to_sym, coordinates_data[axis].to_f] }
  ElementBoundingBox.new(**coords)
end
621
+
622
# Build a BoundingBox from raw data.
#
# @param bounding_box_data [Hash, BoundingBox, nil] raw coordinates keyed
#   'x0', 'y0', 'x1', 'y1', or an existing BoundingBox
# @return [BoundingBox, nil] nil for nil input; an existing BoundingBox is
#   returned as-is
def parse_bounding_box(bounding_box_data)
  return nil if bounding_box_data.nil?
  return bounding_box_data if bounding_box_data.is_a?(BoundingBox)

  # Each coordinate is coerced with to_f, so a missing entry becomes 0.0.
  coords = %w[x0 y0 x1 y1].to_h { |axis| [axis.to_sym, bounding_box_data[axis].to_f] }
  BoundingBox.new(**coords)
end
636
+
637
# Build OcrElement objects from raw OCR element hashes.
#
# @param ocr_elements_data [Array<Hash>, nil] raw OCR elements with String keys
# @return [Array<OcrElement>, nil] nil when the input is nil
def parse_ocr_elements(ocr_elements_data)
  ocr_elements_data&.map do |raw|
    # Nested geometry, confidence and rotation get their own value objects.
    shape = parse_ocr_geometry(raw['geometry'])
    scores = parse_ocr_confidence(raw['confidence'])
    turn = parse_ocr_rotation(raw['rotation'])

    OcrElement.new(text: raw['text'], geometry: shape, confidence: scores,
                   level: raw['level'], rotation: turn,
                   page_number: raw['page_number'], parent_id: raw['parent_id'],
                   backend_metadata: raw['backend_metadata'])
  end
end
653
+
654
# @param data [Hash, Object] raw geometry with String keys
# @return [OcrBoundingGeometry, nil] nil unless +data+ is a Hash
def parse_ocr_geometry(data)
  return nil unless data.is_a?(Hash)

  kind, left, top, width, height, points =
    data.values_at('type', 'left', 'top', 'width', 'height', 'points')
  OcrBoundingGeometry.new(type: kind, left: left, top: top, width: width, height: height, points: points)
end

# @param data [Hash, Object] raw confidence with String keys
# @return [OcrConfidence, nil] nil unless +data+ is a Hash
def parse_ocr_confidence(data)
  return nil unless data.is_a?(Hash)

  detection, recognition = data.values_at('detection', 'recognition')
  OcrConfidence.new(detection: detection, recognition: recognition)
end

# @param data [Hash, Object] raw rotation with String keys
# @return [OcrRotation, nil] nil unless +data+ is a Hash
def parse_ocr_rotation(data)
  return nil unless data.is_a?(Hash)

  angle, confidence = data.values_at('angle_degrees', 'confidence')
  OcrRotation.new(angle_degrees: angle, confidence: confidence)
end
674
+
675
# @param djot_data [Object, nil] raw djot content
# @return [DjotContent, nil] nil when the input is nil
def parse_djot_content(djot_data)
  DjotContent.new(djot_data) unless djot_data.nil?
end

# @param document_data [Object, nil] raw document structure
# @return [DocumentStructure, nil] nil when the input is nil
def parse_document_structure(document_data)
  DocumentStructure.new(document_data) unless document_data.nil?
end
686
+
687
# Build ExtractedKeyword values from raw keyword hashes.
#
# @param keywords_data [Array<Hash>, nil] raw keywords with String keys
# @return [Array<Kreuzberg::ExtractedKeyword>, nil] nil when the input is nil
def parse_extracted_keywords(keywords_data)
  keywords_data&.map do |raw|
    # Missing text/algorithm become '', a missing score becomes 0.0.
    score = (raw['score'] || 0.0).to_f
    Kreuzberg::ExtractedKeyword.new(text: raw['text'] || '', score: score,
                                    algorithm: raw['algorithm'] || '', positions: raw['positions'])
  end
end
699
+
700
# Build ProcessingWarning values from raw warning hashes.
#
# @param warnings_data [Array<Hash>, nil] raw warnings with String keys
# @return [Array<Kreuzberg::ProcessingWarning>] empty when the input is nil
def parse_processing_warnings(warnings_data)
  return [] if warnings_data.nil?

  warnings_data.map do |raw|
    # A missing source or message becomes an empty string.
    origin = raw['source'] || ''
    text = raw['message'] || ''
    Kreuzberg::ProcessingWarning.new(source: origin, message: text)
  end
end
710
+ end
711
+ # rubocop:enable Metrics/ClassLength
712
+ end