kreuzberg 4.1.2 → 4.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
  5. data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
  6. data/kreuzberg.gemspec +13 -1
  7. data/lib/kreuzberg/cli.rb +16 -6
  8. data/lib/kreuzberg/cli_proxy.rb +3 -1
  9. data/lib/kreuzberg/config.rb +121 -39
  10. data/lib/kreuzberg/djot_content.rb +225 -0
  11. data/lib/kreuzberg/extraction_api.rb +20 -4
  12. data/lib/kreuzberg/result.rb +12 -2
  13. data/lib/kreuzberg/version.rb +1 -1
  14. data/lib/kreuzberg.rb +1 -0
  15. data/sig/kreuzberg.rbs +28 -12
  16. data/spec/binding/batch_operations_spec.rb +80 -0
  17. data/spec/binding/batch_spec.rb +6 -5
  18. data/spec/binding/error_recovery_spec.rb +3 -3
  19. data/spec/binding/metadata_types_spec.rb +77 -57
  20. data/spec/binding/tables_spec.rb +11 -2
  21. data/spec/serialization_spec.rb +134 -0
  22. data/spec/unit/config/output_format_spec.rb +380 -0
  23. data/vendor/Cargo.toml +1 -1
  24. data/vendor/kreuzberg/Cargo.toml +1 -1
  25. data/vendor/kreuzberg/README.md +1 -1
  26. data/vendor/kreuzberg/src/api/startup.rs +15 -1
  27. data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
  28. data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
  29. data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
  30. data/vendor/kreuzberg/src/core/io.rs +7 -7
  31. data/vendor/kreuzberg/src/core/mime.rs +4 -4
  32. data/vendor/kreuzberg/src/embeddings.rs +4 -4
  33. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
  34. data/vendor/kreuzberg/src/mcp/format.rs +237 -39
  35. data/vendor/kreuzberg/src/mcp/params.rs +26 -33
  36. data/vendor/kreuzberg/src/mcp/server.rs +6 -3
  37. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
  38. data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
  39. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
  40. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
  41. data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
  42. data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
  43. data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
  44. data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
  45. data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
  46. data/vendor/kreuzberg/tests/api_embed.rs +84 -50
  47. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
  48. data/vendor/kreuzberg/tests/api_tests.rs +298 -139
  49. data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
  50. data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
  51. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
  52. data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
  53. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
  54. data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
  55. data/vendor/kreuzberg/tests/config_behavioral.rs +416 -0
  56. data/vendor/kreuzberg/tests/config_features.rs +19 -15
  57. data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
  58. data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
  59. data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
  60. data/vendor/kreuzberg/tests/core_integration.rs +57 -57
  61. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
  62. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
  63. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
  64. data/vendor/kreuzberg/tests/email_integration.rs +7 -7
  65. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
  66. data/vendor/kreuzberg/tests/error_handling.rs +13 -11
  67. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
  68. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  69. data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
  70. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
  71. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
  72. data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
  73. data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
  74. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
  75. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
  76. data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
  77. data/vendor/kreuzberg/tests/mime_detection.rs +75 -43
  78. data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
  79. data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
  80. data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
  81. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
  82. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
  83. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
  84. data/vendor/kreuzberg/tests/page_markers.rs +1 -1
  85. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
  86. data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
  87. data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
  88. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
  89. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
  90. data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
  91. data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
  92. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +324 -31
  93. data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
  94. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
  95. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
  96. data/vendor/kreuzberg/tests/security_validation.rs +20 -19
  97. data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
  98. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
  99. data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
  100. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
  101. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
  102. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  103. metadata +12 -2
@@ -0,0 +1,225 @@
1
+ # frozen_string_literal: true
2
+
3
+ begin
4
+ require 'json'
5
+ rescue LoadError
6
+ require 'json/pure'
7
+ end
8
+
9
+ module Kreuzberg
10
+ class Result
11
+ # Djot structured content representation
12
+ #
13
+ # Represents document content in Djot format with structured metadata about
14
+ # blocks, images, links, footnotes, and other document elements.
15
+ #
16
class DjotContent
  attr_reader :plain_text, :blocks, :metadata_json, :tables, :images, :links, :footnotes, :attributes

  # Represents a formatted block in Djot content.
  class FormattedBlock
    attr_reader :block_type, :children, :attributes, :content, :level

    # Builds a block from either a Hash (symbol or string keys) or keyword
    # arguments.
    #
    # Note the two construction paths default differently: the Hash path
    # leaves +children+ as nil when the key is absent, while the keyword
    # path defaults it to an empty array.
    #
    # @param hash_or_type [Hash, String, nil] attribute Hash, or the block type
    # @param children [Array, nil] nested blocks (stored as given)
    # @param attributes [Hash, nil] block attributes
    # @param content [Object, nil] block content
    # @param level [Object, nil] block level
    # @param block_type [String, nil] block type; wins over a positional type
    #
    # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
    def initialize(hash_or_type = nil, children: nil, attributes: nil, content: nil, level: nil, block_type: nil)
      if hash_or_type.is_a?(Hash)
        # Initialize from hash
        @block_type = hash_or_type[:block_type] || hash_or_type['block_type'] || ''
        @children = hash_or_type[:children] || hash_or_type['children']
        @attributes = hash_or_type[:attributes] || hash_or_type['attributes'] || {}
        @content = hash_or_type[:content] || hash_or_type['content']
        @level = hash_or_type[:level] || hash_or_type['level']
      else
        # Initialize from keyword arguments (for backward compatibility)
        @block_type = block_type || hash_or_type || ''
        @children = children || []
        @attributes = attributes || {}
        @content = content
        @level = level
      end
    end
    # rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity

    # @return [Hash{Symbol => Object}] the block's fields with nil values omitted
    def to_h
      {
        block_type: @block_type,
        children: @children,
        attributes: @attributes,
        content: @content,
        level: @level
      }.compact
    end
  end

  # Represents an image in Djot content.
  class DjotImage
    attr_reader :url, :alt, :title, :width, :height
    alias src url

    # Builds an image from either a Hash (symbol or string keys; both
    # 'url' and 'src' are accepted) or keyword arguments.
    #
    # @param hash_or_url [Hash, String, nil] attribute Hash, or the image URL
    # @param url [String, nil] image URL; wins over +src+ and a positional URL
    # @param src [String, nil] alternative name for +url+
    #
    # rubocop:disable Metrics/CyclomaticComplexity
    def initialize(hash_or_url = nil, alt: nil, title: nil, width: nil, height: nil, url: nil, src: nil)
      if hash_or_url.is_a?(Hash)
        # Initialize from hash (supports both 'url' and 'src' keys)
        @url = hash_or_url[:url] || hash_or_url['url'] || hash_or_url[:src] || hash_or_url['src']
        @alt = hash_or_url[:alt] || hash_or_url['alt']
        @title = hash_or_url[:title] || hash_or_url['title']
        @width = hash_or_url[:width] || hash_or_url['width']
        @height = hash_or_url[:height] || hash_or_url['height']
      else
        # Initialize from keyword arguments
        @url = url || src || hash_or_url
        @alt = alt
        @title = title
        @width = width
        @height = height
      end
    end
    # rubocop:enable Metrics/CyclomaticComplexity

    # @return [Hash{Symbol => Object}] the image's fields with nil values omitted
    def to_h
      {
        url: @url,
        alt: @alt,
        title: @title,
        width: @width,
        height: @height
      }.compact
    end
  end

  # Represents a link in Djot content.
  class DjotLink
    attr_reader :url, :text, :title, :link_type
    alias href url

    # Builds a link from either a Hash (symbol or string keys; both
    # 'url' and 'href' are accepted) or keyword arguments.
    #
    # @param hash_or_url [Hash, String, nil] attribute Hash, or the link URL
    # @param url [String, nil] link URL; wins over +href+ and a positional URL
    # @param href [String, nil] alternative name for +url+
    #
    # rubocop:disable Metrics/CyclomaticComplexity
    def initialize(hash_or_url = nil, text: nil, title: nil, url: nil, href: nil, link_type: nil)
      if hash_or_url.is_a?(Hash)
        # Initialize from hash (supports both 'url' and 'href' keys)
        @url = hash_or_url[:url] || hash_or_url['url'] || hash_or_url[:href] || hash_or_url['href']
        @text = hash_or_url[:text] || hash_or_url['text']
        @title = hash_or_url[:title] || hash_or_url['title']
        @link_type = hash_or_url[:link_type] || hash_or_url['link_type']
      else
        # Initialize from keyword arguments
        @url = url || href || hash_or_url
        @text = text
        @title = title
        @link_type = link_type
      end
    end
    # rubocop:enable Metrics/CyclomaticComplexity

    # @return [Hash{Symbol => Object}] the link's fields with nil values omitted
    def to_h
      {
        url: @url,
        text: @text,
        title: @title,
        link_type: @link_type
      }.compact
    end
  end

  # Represents a footnote in Djot content.
  class Footnote
    attr_reader :label, :content

    # @param label [Object] footnote label
    # @param content [Object] footnote content
    def initialize(label:, content:)
      @label = label
      @content = content
    end

    # @return [Hash{Symbol => Object}] label and content (nil values are kept)
    def to_h
      {
        label: @label,
        content: @content
      }
    end
  end

  # Builds the structured content from a Hash with string or symbol keys.
  # Missing collections default to empty, +plain_text+ to '' and
  # +metadata_json+ to '{}'. +tables+ and +attributes+ are stored as given.
  #
  # @param hash [Hash] djot content payload
  #
  # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
  def initialize(hash)
    @plain_text = hash['plain_text'] || hash[:plain_text] || ''
    @blocks = parse_blocks(hash['blocks'] || hash[:blocks] || [])
    @metadata_json = hash['metadata_json'] || hash[:metadata_json] || '{}'
    @tables = hash['tables'] || hash[:tables] || []
    @images = parse_images(hash['images'] || hash[:images] || [])
    @links = parse_links(hash['links'] || hash[:links] || [])
    @footnotes = parse_footnotes(hash['footnotes'] || hash[:footnotes] || [])
    @attributes = hash['attributes'] || hash[:attributes] || {}
  end
  # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity

  # Lazily parsed form of +metadata_json+ (memoized after the first call).
  #
  # @return [Hash] parsed metadata, or {} when the JSON is unusable
  def metadata
    @metadata ||= parse_metadata(@metadata_json)
  end

  # @return [Hash{Symbol => Object}] plain-Hash representation; nested
  #   blocks, images, links and footnotes are converted via their own #to_h
  def to_h
    {
      plain_text: @plain_text,
      blocks: @blocks.map(&:to_h),
      metadata_json: @metadata_json,
      tables: @tables,
      images: @images.map(&:to_h),
      links: @links.map(&:to_h),
      footnotes: @footnotes.map(&:to_h),
      attributes: @attributes
    }
  end

  private

  # Parses the metadata JSON, always yielding a Hash.
  #
  # Malformed JSON, a non-String payload (TypeError from JSON.parse) and
  # valid JSON that is not an object (e.g. "null" or "[]") all fall back
  # to {} so that #metadata keeps its Hash contract and its memoization.
  def parse_metadata(metadata_json)
    parsed = JSON.parse(metadata_json)
    parsed.is_a?(Hash) ? parsed : {}
  rescue JSON::ParserError, TypeError
    {}
  end

  # Wraps each raw block Hash in a FormattedBlock, carrying over every
  # field the block class exposes (including +content+ and +level+).
  #
  # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
  def parse_blocks(blocks_data)
    blocks_data.map do |block|
      FormattedBlock.new(
        block_type: block['block_type'] || block[:block_type] || '',
        children: block['children'] || block[:children],
        attributes: block['attributes'] || block[:attributes],
        content: block['content'] || block[:content],
        level: block['level'] || block[:level]
      )
    end
  end
  # rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity

  # Wraps each raw image Hash in a DjotImage ('src' is accepted for 'url').
  #
  # rubocop:disable Metrics/CyclomaticComplexity
  def parse_images(images_data)
    images_data.map do |image|
      DjotImage.new(
        url: image['url'] || image[:url] || image['src'] || image[:src],
        alt: image['alt'] || image[:alt],
        title: image['title'] || image[:title],
        width: image['width'] || image[:width],
        height: image['height'] || image[:height]
      )
    end
  end
  # rubocop:enable Metrics/CyclomaticComplexity

  # Wraps each raw link Hash in a DjotLink ('href' is accepted for 'url').
  #
  # rubocop:disable Metrics/CyclomaticComplexity
  def parse_links(links_data)
    links_data.map do |link|
      DjotLink.new(
        url: link['url'] || link[:url] || link['href'] || link[:href],
        text: link['text'] || link[:text],
        title: link['title'] || link[:title],
        link_type: link['link_type'] || link[:link_type]
      )
    end
  end
  # rubocop:enable Metrics/CyclomaticComplexity

  # Wraps each raw footnote Hash in a Footnote.
  def parse_footnotes(footnotes_data)
    footnotes_data.map do |note|
      Footnote.new(
        label: note['label'] || note[:label],
        content: note['content'] || note[:content]
      )
    end
  end
end
224
+ end
225
+ end
@@ -15,11 +15,15 @@ module Kreuzberg
15
15
  # @example Extract with explicit MIME type
16
16
  # @example Extract with OCR enabled
17
17
  def extract_file_sync(path:, mime_type: nil, config: nil)
18
+ # Validate that the file exists
19
+ path_str = path.to_s
20
+ raise Errors::IOError, "File not found: #{path_str}" unless File.exist?(path_str)
21
+
18
22
  opts = normalize_config(config)
19
23
  hash = if mime_type
20
- native_extract_file_sync(path.to_s, mime_type.to_s, **opts)
24
+ native_extract_file_sync(path_str, mime_type.to_s, **opts)
21
25
  else
22
- native_extract_file_sync(path.to_s, **opts)
26
+ native_extract_file_sync(path_str, **opts)
23
27
  end
24
28
  result = Result.new(hash)
25
29
  record_cache_entry!(result, opts)
@@ -53,6 +57,8 @@ module Kreuzberg
53
57
  # response = HTTParty.get("https://example.com/document.docx")
54
58
  # result = Kreuzberg.extract_bytes_sync(response.body, "application/vnd.openxmlformats-officedocument.wordprocessingml.document")
55
59
  def extract_bytes_sync(data:, mime_type:, config: nil)
60
+ raise TypeError, "mime_type must be a String, got #{mime_type.inspect}" if mime_type.nil?
61
+
56
62
  opts = normalize_config(config)
57
63
  hash = native_extract_bytes_sync(data.to_s, mime_type.to_s, **opts)
58
64
  result = Result.new(hash)
@@ -92,6 +98,12 @@ module Kreuzberg
92
98
  # config = Kreuzberg::Config::Extraction.new(force_ocr: true)
93
99
  # results = Kreuzberg.batch_extract_files_sync(paths, config: config)
94
100
  def batch_extract_files_sync(paths:, config: nil)
101
+ # Validate that all files exist
102
+ paths.each do |path|
103
+ path_str = path.to_s
104
+ raise Errors::IOError, "File not found: #{path_str}" unless File.exist?(path_str)
105
+ end
106
+
95
107
  opts = normalize_config(config)
96
108
  hashes = native_batch_extract_files_sync(paths.map(&:to_s), **opts)
97
109
  results = hashes.map { |hash| Result.new(hash) }
@@ -130,11 +142,15 @@ module Kreuzberg
130
142
  # )
131
143
  # result = Kreuzberg.extract_file("document.pdf", config: config)
132
144
  def extract_file(path:, mime_type: nil, config: nil)
145
+ # Validate that the file exists
146
+ path_str = path.to_s
147
+ raise Errors::IOError, "File not found: #{path_str}" unless File.exist?(path_str)
148
+
133
149
  opts = normalize_config(config)
134
150
  hash = if mime_type
135
- native_extract_file(path.to_s, mime_type.to_s, **opts)
151
+ native_extract_file(path_str, mime_type.to_s, **opts)
136
152
  else
137
- native_extract_file(path.to_s, **opts)
153
+ native_extract_file(path_str, **opts)
138
154
  end
139
155
  result = Result.new(hash)
140
156
  record_cache_entry!(result, opts)
@@ -11,7 +11,7 @@ module Kreuzberg
11
11
  # rubocop:disable Metrics/ClassLength
12
12
  class Result
13
13
  attr_reader :content, :mime_type, :metadata, :metadata_json, :tables,
14
- :detected_languages, :chunks, :images, :pages, :elements
14
+ :detected_languages, :chunks, :images, :pages, :elements, :djot_content
15
15
 
16
16
  # @!attribute [r] cells
17
17
  # @return [Array<Array<String>>] Table cells (2D array)
@@ -180,6 +180,7 @@ module Kreuzberg
180
180
  #
181
181
  # @param hash [Hash] Hash returned from native extension
182
182
  #
183
+ # rubocop:disable Metrics/AbcSize
183
184
  def initialize(hash)
184
185
  @content = get_value(hash, 'content', '')
185
186
  @mime_type = get_value(hash, 'mime_type', '')
@@ -191,7 +192,9 @@ module Kreuzberg
191
192
  @images = parse_images(get_value(hash, 'images'))
192
193
  @pages = parse_pages(get_value(hash, 'pages'))
193
194
  @elements = parse_elements(get_value(hash, 'elements'))
195
+ @djot_content = parse_djot_content(get_value(hash, 'djot_content'))
194
196
  end
197
+ # rubocop:enable Metrics/AbcSize
195
198
 
196
199
  # Convert to hash
197
200
  #
@@ -207,7 +210,8 @@ module Kreuzberg
207
210
  chunks: serialize_chunks,
208
211
  images: serialize_images,
209
212
  pages: serialize_pages,
210
- elements: serialize_elements
213
+ elements: serialize_elements,
214
+ djot_content: @djot_content&.to_h
211
215
  }
212
216
  end
213
217
 
@@ -434,6 +438,12 @@ module Kreuzberg
434
438
  y1: coordinates_data['y1'].to_f
435
439
  )
436
440
  end
441
+
442
# Wraps the raw djot payload in a DjotContent object.
#
# @param djot_data [Hash, nil] value read from the result hash's 'djot_content' key
# @return [DjotContent, nil] nil when no payload was supplied
def parse_djot_content(djot_data)
  DjotContent.new(djot_data) unless djot_data.nil?
end
437
447
  end
438
448
  # rubocop:enable Metrics/ClassLength
439
449
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.1.2'
4
+ VERSION = '4.2.1'
5
5
  end
data/lib/kreuzberg.rb CHANGED
@@ -87,6 +87,7 @@ end
87
87
 
88
88
  require_relative 'kreuzberg/cache_api'
89
89
  require_relative 'kreuzberg/extraction_api'
90
+ require_relative 'kreuzberg/djot_content'
90
91
 
91
92
  Kreuzberg.singleton_class.prepend(Kreuzberg::CacheAPI)
92
93
  Kreuzberg.singleton_class.prepend(Kreuzberg::ExtractionAPI)
data/sig/kreuzberg.rbs CHANGED
@@ -202,6 +202,8 @@ module Kreuzberg
202
202
  attr_reader html_options: HtmlOptions?
203
203
  attr_reader pages: PageConfig?
204
204
  attr_reader max_concurrent_extractions: Integer?
205
+ attr_reader output_format: String?
206
+ attr_reader result_format: String?
205
207
 
206
208
  def self.from_file: (String path) -> Extraction
207
209
  def initialize: (
@@ -219,7 +221,9 @@ module Kreuzberg
219
221
  ?keywords: (Keywords | Hash[Symbol, untyped])?,
220
222
  ?html_options: (HtmlOptions | Hash[Symbol, untyped])?,
221
223
  ?pages: (PageConfig | Hash[Symbol, untyped])?,
222
- ?max_concurrent_extractions: Integer?
224
+ ?max_concurrent_extractions: Integer?,
225
+ ?output_format: String?,
226
+ ?result_format: String?
223
227
  ) -> void
224
228
  def to_h: () -> Hash[Symbol, untyped]
225
229
 
@@ -413,14 +417,23 @@ module Kreuzberg
413
417
  attr_reader plain_text: String
414
418
  attr_reader blocks: Array[DjotContent::FormattedBlock]
415
419
  attr_reader metadata: Hash[untyped, untyped]
416
- attr_reader tables: Array[Table]
420
+ attr_reader metadata_json: String
421
+ attr_reader tables: Array[untyped]
417
422
  attr_reader images: Array[DjotContent::DjotImage]
418
423
  attr_reader links: Array[DjotContent::DjotLink]
419
424
  attr_reader footnotes: Array[DjotContent::Footnote]
420
425
  attr_reader attributes: Hash[String, untyped]?
421
426
 
422
- def initialize: (djot_content_hash hash) -> void
423
- def to_h: () -> djot_content_hash
427
+ def initialize: (untyped hash) -> void
428
+ def to_h: () -> Hash[Symbol, untyped]
429
+
430
+ private
431
+
432
+ def parse_metadata: (String metadata_json) -> Hash[untyped, untyped]
433
+ def parse_blocks: (Array[untyped] blocks_data) -> Array[FormattedBlock]
434
+ def parse_images: (Array[untyped] images_data) -> Array[DjotImage]
435
+ def parse_links: (Array[untyped] links_data) -> Array[DjotLink]
436
+ def parse_footnotes: (Array[untyped] footnotes_data) -> Array[Footnote]
424
437
 
425
438
  class FormattedBlock
426
439
  attr_reader block_type: String
@@ -429,28 +442,31 @@ module Kreuzberg
429
442
  attr_reader children: Array[FormattedBlock]?
430
443
  attr_reader attributes: Hash[String, untyped]?
431
444
 
432
- def initialize: (formatted_block_hash hash) -> void
433
- def to_h: () -> formatted_block_hash
445
+ def initialize: (?untyped hash_or_type, ?children: untyped, ?attributes: untyped, ?content: untyped, ?level: untyped, ?block_type: untyped) -> void
446
+ def to_h: () -> Hash[Symbol, untyped]
434
447
  end
435
448
 
436
449
  class DjotImage
437
450
  attr_reader url: String
438
451
  attr_reader alt: String?
439
452
  attr_reader title: String?
440
- attr_reader attributes: Hash[String, untyped]?
453
+ attr_reader width: Integer?
454
+ attr_reader height: Integer?
441
455
 
442
- def initialize: (djot_image_hash hash) -> void
443
- def to_h: () -> djot_image_hash
456
+ def initialize: (?untyped hash_or_url, ?alt: untyped, ?title: untyped, ?width: untyped, ?height: untyped, ?url: untyped, ?src: untyped) -> void
457
+ def src: () -> String
458
+ def to_h: () -> Hash[Symbol, untyped]
444
459
  end
445
460
 
446
461
  class DjotLink
447
462
  attr_reader url: String
448
- attr_reader text: String
463
+ attr_reader text: String?
449
464
  attr_reader title: String?
450
465
  attr_reader link_type: String?
451
466
 
452
- def initialize: (djot_link_hash hash) -> void
453
- def to_h: () -> djot_link_hash
467
+ def initialize: (?untyped hash_or_url, ?text: untyped, ?title: untyped, ?url: untyped, ?href: untyped, ?link_type: untyped) -> void
468
+ def href: () -> String
469
+ def to_h: () -> Hash[Symbol, untyped]
454
470
  end
455
471
 
456
472
  class Footnote
@@ -592,4 +592,84 @@ RSpec.describe 'Batch Operations' do
592
592
  paths.each { |p| FileUtils.rm_f(p) }
593
593
  end
594
594
  end
595
+
596
describe 'batch with output and result formats' do
  # Writes +content+ to a temporary .txt file, yields its path, and removes
  # the file afterwards even when an expectation inside the block fails.
  def with_format_fixture(content)
    file = Tempfile.new(['format_test', '.txt'])
    file.write(content)
    file.close
    yield file.path
  ensure
    FileUtils.rm_f(file.path) if file&.path
  end

  # Shared assertions: a batch of one input yields exactly one Result.
  def expect_single_result(results)
    expect(results).to be_an Array
    expect(results.length).to eq 1
    expect(results[0]).to be_a Kreuzberg::Result
  end

  it 'batch processes with output_format' do
    with_format_fixture('Test content for output format') do |path|
      config = Kreuzberg::Config::Extraction.new(output_format: 'markdown')
      results = Kreuzberg.batch_extract_files_sync(paths: [path], config: config)

      expect_single_result(results)
    end
  end

  it 'batch processes with result_format' do
    with_format_fixture('Test content for result format') do |path|
      config = Kreuzberg::Config::Extraction.new(result_format: 'unified')
      results = Kreuzberg.batch_extract_files_sync(paths: [path], config: config)

      expect_single_result(results)
    end
  end

  it 'batch processes with both output and result formats' do
    with_format_fixture('Test content for both formats') do |path|
      config = Kreuzberg::Config::Extraction.new(
        output_format: 'plain',
        result_format: 'element_based'
      )
      results = Kreuzberg.batch_extract_files_sync(paths: [path], config: config)

      expect_single_result(results)
    end
  end

  it 'batch processes with chunking and output_format' do
    with_format_fixture('Test content ' * 100) do |path|
      config = Kreuzberg::Config::Extraction.new(
        output_format: 'markdown',
        chunking: { max_chars: 1000 }
      )
      results = Kreuzberg.batch_extract_files_sync(paths: [path], config: config)

      expect_single_result(results)
    end
  end
end
595
675
  end
@@ -295,7 +295,7 @@ RSpec.describe Kreuzberg do
295
295
  end
296
296
 
297
297
  describe 'batch error handling' do
298
- it 'handles missing files gracefully in batch' do
298
+ it 'raises IOError for missing files in batch' do
299
299
  paths = [
300
300
  '/nonexistent/file1.txt',
301
301
  '/nonexistent/file2.txt'
@@ -303,10 +303,10 @@ RSpec.describe Kreuzberg do
303
303
 
304
304
  expect do
305
305
  described_class.batch_extract_files_sync(paths: paths)
306
- end.not_to raise_error
306
+ end.to raise_error(Kreuzberg::Errors::IOError, /not found/)
307
307
  end
308
308
 
309
- it 'handles mixed valid and invalid paths' do
309
+ it 'raises IOError when batch contains invalid paths' do
310
310
  paths = []
311
311
  temp_dir = Dir.mktmpdir
312
312
 
@@ -316,8 +316,9 @@ RSpec.describe Kreuzberg do
316
316
 
317
317
  paths << '/nonexistent/invalid.txt'
318
318
 
319
- results = described_class.batch_extract_files_sync(paths: paths)
320
- expect(results).to be_a(Array)
319
+ expect do
320
+ described_class.batch_extract_files_sync(paths: paths)
321
+ end.to raise_error(Kreuzberg::Errors::IOError, /not found/)
321
322
  ensure
322
323
  FileUtils.remove_entry(temp_dir)
323
324
  end
@@ -57,7 +57,7 @@ RSpec.describe 'Error Recovery' do
57
57
  nonexistent_path = '/nonexistent/file/that/does/not/exist.pdf'
58
58
 
59
59
  expect { Kreuzberg.extract_file_sync(path: nonexistent_path, config: config) }
60
- .to raise_error(Kreuzberg::Errors::ValidationError, /not found|does not exist|no such file/)
60
+ .to raise_error(Kreuzberg::Errors::IOError, /not found|does not exist|no such file/)
61
61
  end
62
62
 
63
63
  it 'provides descriptive error messages for invalid MIME types' do
@@ -293,7 +293,7 @@ RSpec.describe 'Error Recovery' do
293
293
 
294
294
  expect(validation_error).to be_a(ArgumentError)
295
295
 
296
- # Runtime error (file not found)
296
+ # Runtime error (file not found) - IOError since the file doesn't exist
297
297
  runtime_error = nil
298
298
  begin
299
299
  Kreuzberg.extract_file_sync(path: '/nonexistent/file.pdf')
@@ -301,7 +301,7 @@ RSpec.describe 'Error Recovery' do
301
301
  runtime_error = e
302
302
  end
303
303
 
304
- expect(runtime_error).to be_a(Kreuzberg::Errors::ValidationError)
304
+ expect(runtime_error).to be_a(Kreuzberg::Errors::IOError)
305
305
  end
306
306
 
307
307
  it 'provides error recovery suggestions in messages' do