kreuzberg 4.1.2 → 4.2.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
  5. data/kreuzberg.gemspec +13 -1
  6. data/lib/kreuzberg/config.rb +70 -35
  7. data/lib/kreuzberg/version.rb +1 -1
  8. data/sig/kreuzberg.rbs +5 -1
  9. data/spec/binding/batch_operations_spec.rb +80 -0
  10. data/spec/binding/metadata_types_spec.rb +77 -57
  11. data/spec/serialization_spec.rb +134 -0
  12. data/spec/unit/config/output_format_spec.rb +380 -0
  13. data/vendor/Cargo.toml +1 -1
  14. data/vendor/kreuzberg/Cargo.toml +1 -1
  15. data/vendor/kreuzberg/README.md +1 -1
  16. data/vendor/kreuzberg/src/embeddings.rs +4 -4
  17. data/vendor/kreuzberg/src/mcp/format.rs +237 -39
  18. data/vendor/kreuzberg/src/mcp/params.rs +26 -33
  19. data/vendor/kreuzberg/src/mcp/server.rs +6 -3
  20. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
  21. data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
  22. data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
  23. data/vendor/kreuzberg/tests/api_embed.rs +84 -50
  24. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
  25. data/vendor/kreuzberg/tests/api_tests.rs +298 -139
  26. data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
  27. data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
  28. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
  29. data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
  30. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
  31. data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
  32. data/vendor/kreuzberg/tests/config_behavioral.rs +414 -0
  33. data/vendor/kreuzberg/tests/config_features.rs +19 -15
  34. data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
  35. data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
  36. data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
  37. data/vendor/kreuzberg/tests/core_integration.rs +55 -53
  38. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
  39. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
  40. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
  41. data/vendor/kreuzberg/tests/email_integration.rs +7 -7
  42. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
  43. data/vendor/kreuzberg/tests/error_handling.rs +13 -11
  44. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
  45. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  46. data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
  47. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
  48. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
  49. data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
  50. data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
  51. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
  52. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
  53. data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
  54. data/vendor/kreuzberg/tests/mime_detection.rs +72 -41
  55. data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
  56. data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
  57. data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
  58. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
  59. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
  60. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
  61. data/vendor/kreuzberg/tests/page_markers.rs +1 -1
  62. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
  63. data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
  64. data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
  65. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
  66. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
  67. data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
  68. data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
  69. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +40 -30
  70. data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
  71. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
  72. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
  73. data/vendor/kreuzberg/tests/security_validation.rs +20 -19
  74. data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
  75. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
  76. data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
  77. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
  78. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
  79. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  80. metadata +10 -2
@@ -0,0 +1,380 @@
1
+ # frozen_string_literal: true
2
+
3
+ # rubocop:disable RSpec/RepeatedExample
4
+ RSpec.describe 'Output Format and Result Format Configuration' do
5
+ describe Kreuzberg::Config::Extraction do
6
+ describe 'output_format' do
7
+ it 'accepts output_format as initialization parameter' do
8
+ config = described_class.new(output_format: 'markdown')
9
+
10
+ expect(config.output_format).to eq 'markdown'
11
+ end
12
+
13
+ it 'defaults to nil when not specified' do
14
+ config = described_class.new
15
+
16
+ expect(config.output_format).to be_nil
17
+ end
18
+
19
+ it 'accepts plain format' do
20
+ config = described_class.new(output_format: 'plain')
21
+
22
+ expect(config.output_format).to eq 'plain'
23
+ end
24
+
25
+ it 'accepts markdown format' do
26
+ config = described_class.new(output_format: 'markdown')
27
+
28
+ expect(config.output_format).to eq 'markdown'
29
+ end
30
+
31
+ it 'accepts djot format' do
32
+ config = described_class.new(output_format: 'djot')
33
+
34
+ expect(config.output_format).to eq 'djot'
35
+ end
36
+
37
+ it 'accepts html format' do
38
+ config = described_class.new(output_format: 'html')
39
+
40
+ expect(config.output_format).to eq 'html'
41
+ end
42
+
43
+ it 'converts output_format to string' do
44
+ config = described_class.new(output_format: :markdown)
45
+
46
+ expect(config.output_format).to eq 'markdown'
47
+ expect(config.output_format).to be_a String
48
+ end
49
+
50
+ it 'includes output_format in to_h' do
51
+ config = described_class.new(output_format: 'markdown')
52
+ hash = config.to_h
53
+
54
+ expect(hash[:output_format]).to eq 'markdown'
55
+ end
56
+
57
+ it 'excludes nil output_format from to_h' do
58
+ config = described_class.new(output_format: nil)
59
+ hash = config.to_h
60
+
61
+ expect(hash.key?(:output_format)).to be false
62
+ end
63
+
64
+ it 'includes output_format in JSON' do
65
+ config = described_class.new(output_format: 'markdown')
66
+ json = config.to_json
67
+ parsed = JSON.parse(json)
68
+
69
+ expect(parsed['output_format']).to eq 'markdown'
70
+ end
71
+
72
+ it 'retrieves output_format with get_field' do
73
+ config = described_class.new(output_format: 'djot')
74
+
75
+ expect(config.get_field('output_format')).to eq 'djot'
76
+ end
77
+
78
+ it 'can be set with []=' do
79
+ config = described_class.new
80
+ config[:output_format] = 'html'
81
+
82
+ expect(config.output_format).to eq 'html'
83
+ end
84
+
85
+ it 'can be set with []= using symbol' do
86
+ config = described_class.new
87
+ config[:output_format] = :plain
88
+
89
+ expect(config.output_format).to eq 'plain'
90
+ end
91
+
92
+ it 'can be retrieved with []' do
93
+ config = described_class.new(output_format: 'markdown')
94
+
95
+ expect(config[:output_format]).to eq 'markdown'
96
+ end
97
+ end
98
+
99
+ describe 'result_format' do
100
+ it 'accepts result_format as initialization parameter' do
101
+ config = described_class.new(result_format: 'unified')
102
+
103
+ expect(config.result_format).to eq 'unified'
104
+ end
105
+
106
+ it 'defaults to nil when not specified' do
107
+ config = described_class.new
108
+
109
+ expect(config.result_format).to be_nil
110
+ end
111
+
112
+ it 'accepts unified format' do
113
+ config = described_class.new(result_format: 'unified')
114
+
115
+ expect(config.result_format).to eq 'unified'
116
+ end
117
+
118
+ it 'accepts element_based format' do
119
+ config = described_class.new(result_format: 'element_based')
120
+
121
+ expect(config.result_format).to eq 'element_based'
122
+ end
123
+
124
+ it 'converts result_format to string' do
125
+ config = described_class.new(result_format: :unified)
126
+
127
+ expect(config.result_format).to eq 'unified'
128
+ expect(config.result_format).to be_a String
129
+ end
130
+
131
+ it 'includes result_format in to_h' do
132
+ config = described_class.new(result_format: 'element_based')
133
+ hash = config.to_h
134
+
135
+ expect(hash[:result_format]).to eq 'element_based'
136
+ end
137
+
138
+ it 'excludes nil result_format from to_h' do
139
+ config = described_class.new(result_format: nil)
140
+ hash = config.to_h
141
+
142
+ expect(hash.key?(:result_format)).to be false
143
+ end
144
+
145
+ it 'includes result_format in JSON' do
146
+ config = described_class.new(result_format: 'element_based')
147
+ json = config.to_json
148
+ parsed = JSON.parse(json)
149
+
150
+ expect(parsed['result_format']).to eq 'element_based'
151
+ end
152
+
153
+ it 'retrieves result_format with get_field' do
154
+ config = described_class.new(result_format: 'unified')
155
+
156
+ expect(config.get_field('result_format')).to eq 'unified'
157
+ end
158
+
159
+ it 'can be set with []=' do
160
+ config = described_class.new
161
+ config[:result_format] = 'unified'
162
+
163
+ expect(config.result_format).to eq 'unified'
164
+ end
165
+
166
+ it 'can be set with []= using symbol' do
167
+ config = described_class.new
168
+ config[:result_format] = :element_based
169
+
170
+ expect(config.result_format).to eq 'element_based'
171
+ end
172
+
173
+ it 'can be retrieved with []' do
174
+ config = described_class.new(result_format: 'element_based')
175
+
176
+ expect(config[:result_format]).to eq 'element_based'
177
+ end
178
+ end
179
+
180
+ describe 'combined output and result formats' do
181
+ it 'accepts both output_format and result_format' do
182
+ config = described_class.new(
183
+ output_format: 'markdown',
184
+ result_format: 'unified'
185
+ )
186
+
187
+ expect(config.output_format).to eq 'markdown'
188
+ expect(config.result_format).to eq 'unified'
189
+ end
190
+
191
+ it 'serializes both formats in to_h' do
192
+ config = described_class.new(
193
+ output_format: 'djot',
194
+ result_format: 'element_based'
195
+ )
196
+ hash = config.to_h
197
+
198
+ expect(hash[:output_format]).to eq 'djot'
199
+ expect(hash[:result_format]).to eq 'element_based'
200
+ end
201
+
202
+ it 'serializes both formats in JSON' do
203
+ config = described_class.new(
204
+ output_format: 'html',
205
+ result_format: 'unified'
206
+ )
207
+ json = config.to_json
208
+ parsed = JSON.parse(json)
209
+
210
+ expect(parsed['output_format']).to eq 'html'
211
+ expect(parsed['result_format']).to eq 'unified'
212
+ end
213
+
214
+ it 'merges both formats correctly' do
215
+ base = described_class.new(
216
+ output_format: 'markdown',
217
+ result_format: 'unified'
218
+ )
219
+ override = described_class.new(output_format: 'html')
220
+ merged = base.merge(override)
221
+
222
+ expect(merged.output_format).to eq 'html'
223
+ expect(merged.result_format).to eq 'unified'
224
+ end
225
+
226
+ it 'merges both formats with merge!' do
227
+ config = described_class.new(
228
+ output_format: 'markdown',
229
+ result_format: 'unified'
230
+ )
231
+ override = described_class.new(
232
+ output_format: 'djot',
233
+ result_format: 'element_based'
234
+ )
235
+ config.merge!(override)
236
+
237
+ expect(config.output_format).to eq 'djot'
238
+ expect(config.result_format).to eq 'element_based'
239
+ end
240
+
241
+ it 'handles merge with hash containing both formats' do
242
+ config = described_class.new(
243
+ output_format: 'plain',
244
+ result_format: 'unified'
245
+ )
246
+ merged = config.merge({ output_format: 'markdown' })
247
+
248
+ expect(merged.output_format).to eq 'markdown'
249
+ expect(merged.result_format).to eq 'unified'
250
+ end
251
+ end
252
+
253
+ describe 'format persistence across operations' do
254
+ it 'persists output_format through multiple conversions' do
255
+ config = described_class.new(output_format: 'markdown')
256
+ hash = config.to_h
257
+ new_config = described_class.new(**hash)
258
+
259
+ expect(new_config.output_format).to eq 'markdown'
260
+ end
261
+
262
+ it 'persists result_format through multiple conversions' do
263
+ config = described_class.new(result_format: 'element_based')
264
+ hash = config.to_h
265
+ new_config = described_class.new(**hash)
266
+
267
+ expect(new_config.result_format).to eq 'element_based'
268
+ end
269
+
270
+ it 'round-trips through JSON' do
271
+ config = described_class.new(
272
+ output_format: 'djot',
273
+ result_format: 'unified'
274
+ )
275
+ json = config.to_json
276
+ parsed = JSON.parse(json)
277
+ new_config = described_class.new(**parsed.transform_keys(&:to_sym))
278
+
279
+ expect(new_config.output_format).to eq 'djot'
280
+ expect(new_config.result_format).to eq 'unified'
281
+ end
282
+ end
283
+
284
+ describe 'format validation and edge cases' do
285
+ it 'handles empty string output_format' do
286
+ config = described_class.new(output_format: '')
287
+
288
+ expect(config.output_format).to eq ''
289
+ end
290
+
291
+ it 'handles empty string result_format' do
292
+ config = described_class.new(result_format: '')
293
+
294
+ expect(config.result_format).to eq ''
295
+ end
296
+
297
+ it 'handles whitespace in output_format' do
298
+ config = described_class.new(output_format: ' plain ')
299
+
300
+ expect(config.output_format).to eq ' plain '
301
+ end
302
+
303
+ it 'handles case sensitivity in output_format' do
304
+ config = described_class.new(output_format: 'MarkDown')
305
+
306
+ expect(config.output_format).to eq 'MarkDown'
307
+ end
308
+
309
+ it 'handles custom string in result_format' do
310
+ config = described_class.new(result_format: 'custom_format')
311
+
312
+ expect(config.result_format).to eq 'custom_format'
313
+ end
314
+ end
315
+
316
+ describe 'integration with other config fields' do
317
+ it 'works with output_format and chunking together' do
318
+ config = described_class.new(
319
+ output_format: 'markdown',
320
+ chunking: { max_chars: 500 }
321
+ )
322
+
323
+ expect(config.output_format).to eq 'markdown'
324
+ expect(config.chunking.max_chars).to eq 500
325
+ end
326
+
327
+ it 'works with result_format and OCR together' do
328
+ config = described_class.new(
329
+ result_format: 'element_based',
330
+ ocr: { backend: 'tesseract' }
331
+ )
332
+
333
+ expect(config.result_format).to eq 'element_based'
334
+ expect(config.ocr.backend).to eq 'tesseract'
335
+ end
336
+
337
+ it 'works with both formats and language detection' do
338
+ config = described_class.new(
339
+ output_format: 'html',
340
+ result_format: 'unified',
341
+ language_detection: { enabled: true }
342
+ )
343
+
344
+ expect(config.output_format).to eq 'html'
345
+ expect(config.result_format).to eq 'unified'
346
+ expect(config.language_detection.enabled).to be true
347
+ end
348
+
349
+ it 'preserves formats in complex config merge' do
350
+ base = described_class.new(
351
+ output_format: 'markdown',
352
+ result_format: 'unified',
353
+ chunking: { max_chars: 500 },
354
+ ocr: { backend: 'tesseract' }
355
+ )
356
+ override = described_class.new(
357
+ output_format: 'djot',
358
+ chunking: { max_chars: 750 }
359
+ )
360
+ merged = base.merge(override)
361
+
362
+ expect(merged.output_format).to eq 'djot'
363
+ expect(merged.result_format).to eq 'unified'
364
+ expect(merged.chunking.max_chars).to eq 750
365
+ expect(merged.ocr.backend).to eq 'tesseract'
366
+ end
367
+ end
368
+
369
+ describe 'allowed keys integration' do
370
+ it 'includes output_format in ALLOWED_KEYS' do
371
+ expect(Kreuzberg::Config::Extraction::ALLOWED_KEYS).to include(:output_format)
372
+ end
373
+
374
+ it 'includes result_format in ALLOWED_KEYS' do
375
+ expect(Kreuzberg::Config::Extraction::ALLOWED_KEYS).to include(:result_format)
376
+ end
377
+ end
378
+ end
379
+ end
380
+ # rubocop:enable RSpec/RepeatedExample
data/vendor/Cargo.toml CHANGED
@@ -3,7 +3,7 @@ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
3
3
  resolver = "2"
4
4
 
5
5
  [workspace.package]
6
- version = "4.1.2"
6
+ version = "4.2.0"
7
7
  edition = "2024"
8
8
  rust-version = "1.91"
9
9
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.1.2"
3
+ version = "4.2.0"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
17
17
 
18
18
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
19
19
 
20
- > **🚀 Version 4.1.2 Release**
20
+ > **🚀 Version 4.2.0 Release**
21
21
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
22
22
  >
23
23
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
@@ -219,10 +219,10 @@ pub fn get_or_init_model(
219
219
  // This prevents panics that cannot unwind through FFI boundaries
220
220
  fn ensure_onnx_available() -> Result<(), String> {
221
221
  // Check if ORT_DYLIB_PATH is already set and valid
222
- if let Ok(path) = std::env::var("ORT_DYLIB_PATH") {
223
- if std::path::Path::new(&path).exists() {
224
- return Ok(());
225
- }
222
+ if let Ok(path) = std::env::var("ORT_DYLIB_PATH")
223
+ && std::path::Path::new(&path).exists()
224
+ {
225
+ return Ok(());
226
226
  }
227
227
 
228
228
  // Check common installation paths and set ORT_DYLIB_PATH if found