kreuzberg 4.1.1 → 4.2.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +4 -4
  3. data/README.md +8 -5
  4. data/ext/kreuzberg_rb/native/Cargo.toml +2 -2
  5. data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
  6. data/ext/kreuzberg_rb/native/src/config/types.rs +23 -13
  7. data/kreuzberg.gemspec +14 -2
  8. data/lib/kreuzberg/api_proxy.rb +0 -1
  9. data/lib/kreuzberg/cli_proxy.rb +0 -1
  10. data/lib/kreuzberg/config.rb +70 -35
  11. data/lib/kreuzberg/mcp_proxy.rb +0 -1
  12. data/lib/kreuzberg/version.rb +1 -1
  13. data/sig/kreuzberg.rbs +5 -1
  14. data/spec/binding/batch_operations_spec.rb +80 -0
  15. data/spec/binding/metadata_types_spec.rb +77 -57
  16. data/spec/serialization_spec.rb +134 -0
  17. data/spec/unit/config/output_format_spec.rb +380 -0
  18. data/vendor/Cargo.toml +1 -1
  19. data/vendor/kreuzberg/Cargo.toml +3 -3
  20. data/vendor/kreuzberg/README.md +1 -1
  21. data/vendor/kreuzberg/src/embeddings.rs +4 -4
  22. data/vendor/kreuzberg/src/mcp/format.rs +237 -39
  23. data/vendor/kreuzberg/src/mcp/params.rs +26 -33
  24. data/vendor/kreuzberg/src/mcp/server.rs +6 -3
  25. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
  26. data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
  27. data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
  28. data/vendor/kreuzberg/tests/api_embed.rs +84 -50
  29. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
  30. data/vendor/kreuzberg/tests/api_tests.rs +298 -139
  31. data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
  32. data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
  33. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
  34. data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
  35. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
  36. data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
  37. data/vendor/kreuzberg/tests/config_behavioral.rs +414 -0
  38. data/vendor/kreuzberg/tests/config_features.rs +19 -15
  39. data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
  40. data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
  41. data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
  42. data/vendor/kreuzberg/tests/core_integration.rs +55 -53
  43. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
  44. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
  45. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
  46. data/vendor/kreuzberg/tests/email_integration.rs +7 -7
  47. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
  48. data/vendor/kreuzberg/tests/error_handling.rs +13 -11
  49. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
  50. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  51. data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
  52. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
  53. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
  54. data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
  55. data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
  56. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
  57. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
  58. data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
  59. data/vendor/kreuzberg/tests/mime_detection.rs +72 -41
  60. data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
  61. data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
  62. data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
  63. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
  64. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
  65. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
  66. data/vendor/kreuzberg/tests/page_markers.rs +1 -1
  67. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
  68. data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
  69. data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
  70. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
  71. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
  72. data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
  73. data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
  74. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +40 -30
  75. data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
  76. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
  77. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
  78. data/vendor/kreuzberg/tests/security_validation.rs +20 -19
  79. data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
  80. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
  81. data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
  82. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
  83. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
  84. data/vendor/kreuzberg-tesseract/Cargo.toml +3 -3
  85. data/vendor/kreuzberg-tesseract/build.rs +4 -4
  86. data/vendor/kreuzberg-tesseract/src/lib.rs +6 -6
  87. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +3 -3
  88. metadata +13 -2
@@ -29,16 +29,20 @@ async fn test_embed_valid_texts() {
29
29
  .method("POST")
30
30
  .uri("/embed")
31
31
  .header("content-type", "application/json")
32
- .body(Body::from(serde_json::to_string(&request_body).unwrap()))
33
- .unwrap(),
32
+ .body(Body::from(
33
+ serde_json::to_string(&request_body).expect("Operation failed"),
34
+ ))
35
+ .expect("Operation failed"),
34
36
  )
35
37
  .await
36
- .unwrap();
38
+ .expect("Operation failed");
37
39
 
38
40
  assert_eq!(response.status(), StatusCode::OK);
39
41
 
40
- let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
41
- let embed_response: EmbedResponse = serde_json::from_slice(&body).unwrap();
42
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX)
43
+ .await
44
+ .expect("Failed to convert to bytes");
45
+ let embed_response: EmbedResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
42
46
 
43
47
  assert_eq!(embed_response.count, 2);
44
48
  assert_eq!(embed_response.embeddings.len(), 2);
@@ -66,11 +70,13 @@ async fn test_embed_empty_texts() {
66
70
  .method("POST")
67
71
  .uri("/embed")
68
72
  .header("content-type", "application/json")
69
- .body(Body::from(serde_json::to_string(&request_body).unwrap()))
70
- .unwrap(),
73
+ .body(Body::from(
74
+ serde_json::to_string(&request_body).expect("Operation failed"),
75
+ ))
76
+ .expect("Operation failed"),
71
77
  )
72
78
  .await
73
- .unwrap();
79
+ .expect("Operation failed");
74
80
 
75
81
  assert_eq!(response.status(), StatusCode::BAD_REQUEST);
76
82
  }
@@ -97,16 +103,20 @@ async fn test_embed_with_custom_config() {
97
103
  .method("POST")
98
104
  .uri("/embed")
99
105
  .header("content-type", "application/json")
100
- .body(Body::from(serde_json::to_string(&request_body).unwrap()))
101
- .unwrap(),
106
+ .body(Body::from(
107
+ serde_json::to_string(&request_body).expect("Operation failed"),
108
+ ))
109
+ .expect("Operation failed"),
102
110
  )
103
111
  .await
104
- .unwrap();
112
+ .expect("Operation failed");
105
113
 
106
114
  assert_eq!(response.status(), StatusCode::OK);
107
115
 
108
- let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
109
- let embed_response: EmbedResponse = serde_json::from_slice(&body).unwrap();
116
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX)
117
+ .await
118
+ .expect("Failed to convert to bytes");
119
+ let embed_response: EmbedResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
110
120
 
111
121
  assert_eq!(embed_response.count, 1);
112
122
  assert_eq!(embed_response.embeddings.len(), 1);
@@ -128,16 +138,20 @@ async fn test_embed_single_text() {
128
138
  .method("POST")
129
139
  .uri("/embed")
130
140
  .header("content-type", "application/json")
131
- .body(Body::from(serde_json::to_string(&request_body).unwrap()))
132
- .unwrap(),
141
+ .body(Body::from(
142
+ serde_json::to_string(&request_body).expect("Operation failed"),
143
+ ))
144
+ .expect("Operation failed"),
133
145
  )
134
146
  .await
135
- .unwrap();
147
+ .expect("Operation failed");
136
148
 
137
149
  assert_eq!(response.status(), StatusCode::OK);
138
150
 
139
- let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
140
- let embed_response: EmbedResponse = serde_json::from_slice(&body).unwrap();
151
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX)
152
+ .await
153
+ .expect("Failed to convert to bytes");
154
+ let embed_response: EmbedResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
141
155
 
142
156
  assert_eq!(embed_response.count, 1);
143
157
  assert_eq!(embed_response.embeddings.len(), 1);
@@ -160,16 +174,20 @@ async fn test_embed_batch() {
160
174
  .method("POST")
161
175
  .uri("/embed")
162
176
  .header("content-type", "application/json")
163
- .body(Body::from(serde_json::to_string(&request_body).unwrap()))
164
- .unwrap(),
177
+ .body(Body::from(
178
+ serde_json::to_string(&request_body).expect("Operation failed"),
179
+ ))
180
+ .expect("Operation failed"),
165
181
  )
166
182
  .await
167
- .unwrap();
183
+ .expect("Operation failed");
168
184
 
169
185
  assert_eq!(response.status(), StatusCode::OK);
170
186
 
171
- let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
172
- let embed_response: EmbedResponse = serde_json::from_slice(&body).unwrap();
187
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX)
188
+ .await
189
+ .expect("Failed to convert to bytes");
190
+ let embed_response: EmbedResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
173
191
 
174
192
  assert_eq!(embed_response.count, 10);
175
193
  assert_eq!(embed_response.embeddings.len(), 10);
@@ -198,16 +216,20 @@ async fn test_embed_long_text() {
198
216
  .method("POST")
199
217
  .uri("/embed")
200
218
  .header("content-type", "application/json")
201
- .body(Body::from(serde_json::to_string(&request_body).unwrap()))
202
- .unwrap(),
219
+ .body(Body::from(
220
+ serde_json::to_string(&request_body).expect("Operation failed"),
221
+ ))
222
+ .expect("Operation failed"),
203
223
  )
204
224
  .await
205
- .unwrap();
225
+ .expect("Operation failed");
206
226
 
207
227
  assert_eq!(response.status(), StatusCode::OK);
208
228
 
209
- let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
210
- let embed_response: EmbedResponse = serde_json::from_slice(&body).unwrap();
229
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX)
230
+ .await
231
+ .expect("Failed to convert to bytes");
232
+ let embed_response: EmbedResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
211
233
 
212
234
  assert_eq!(embed_response.count, 1);
213
235
  assert_eq!(embed_response.embeddings.len(), 1);
@@ -225,10 +247,10 @@ async fn test_embed_malformed_json() {
225
247
  .uri("/embed")
226
248
  .header("content-type", "application/json")
227
249
  .body(Body::from("{invalid json}"))
228
- .unwrap(),
250
+ .expect("Operation failed"),
229
251
  )
230
252
  .await
231
- .unwrap();
253
+ .expect("Operation failed");
232
254
 
233
255
  assert_eq!(response.status(), StatusCode::BAD_REQUEST);
234
256
  }
@@ -250,16 +272,20 @@ async fn test_embed_deterministic() {
250
272
  .method("POST")
251
273
  .uri("/embed")
252
274
  .header("content-type", "application/json")
253
- .body(Body::from(serde_json::to_string(&request_body).unwrap()))
254
- .unwrap(),
275
+ .body(Body::from(
276
+ serde_json::to_string(&request_body).expect("Operation failed"),
277
+ ))
278
+ .expect("Operation failed"),
255
279
  )
256
280
  .await
257
- .unwrap();
281
+ .expect("Operation failed");
258
282
 
259
283
  assert_eq!(response1.status(), StatusCode::OK);
260
284
 
261
- let body1 = axum::body::to_bytes(response1.into_body(), usize::MAX).await.unwrap();
262
- let embed_response1: EmbedResponse = serde_json::from_slice(&body1).unwrap();
285
+ let body1 = axum::body::to_bytes(response1.into_body(), usize::MAX)
286
+ .await
287
+ .expect("Failed to convert to bytes");
288
+ let embed_response1: EmbedResponse = serde_json::from_slice(&body1).expect("Failed to deserialize");
263
289
 
264
290
  // Second call with same text
265
291
  let response2 = app
@@ -268,16 +294,20 @@ async fn test_embed_deterministic() {
268
294
  .method("POST")
269
295
  .uri("/embed")
270
296
  .header("content-type", "application/json")
271
- .body(Body::from(serde_json::to_string(&request_body).unwrap()))
272
- .unwrap(),
297
+ .body(Body::from(
298
+ serde_json::to_string(&request_body).expect("Operation failed"),
299
+ ))
300
+ .expect("Operation failed"),
273
301
  )
274
302
  .await
275
- .unwrap();
303
+ .expect("Operation failed");
276
304
 
277
305
  assert_eq!(response2.status(), StatusCode::OK);
278
306
 
279
- let body2 = axum::body::to_bytes(response2.into_body(), usize::MAX).await.unwrap();
280
- let embed_response2: EmbedResponse = serde_json::from_slice(&body2).unwrap();
307
+ let body2 = axum::body::to_bytes(response2.into_body(), usize::MAX)
308
+ .await
309
+ .expect("Failed to convert to bytes");
310
+ let embed_response2: EmbedResponse = serde_json::from_slice(&body2).expect("Failed to deserialize");
281
311
 
282
312
  // Compare embeddings - they should be identical
283
313
  assert_eq!(embed_response1.embeddings.len(), embed_response2.embeddings.len());
@@ -307,18 +337,20 @@ async fn test_embed_different_presets() {
307
337
  .method("POST")
308
338
  .uri("/embed")
309
339
  .header("content-type", "application/json")
310
- .body(Body::from(serde_json::to_string(&request_fast).unwrap()))
311
- .unwrap(),
340
+ .body(Body::from(
341
+ serde_json::to_string(&request_fast).expect("Operation failed"),
342
+ ))
343
+ .expect("Operation failed"),
312
344
  )
313
345
  .await
314
- .unwrap();
346
+ .expect("Operation failed");
315
347
 
316
348
  assert_eq!(response_fast.status(), StatusCode::OK);
317
349
 
318
350
  let body_fast = axum::body::to_bytes(response_fast.into_body(), usize::MAX)
319
351
  .await
320
- .unwrap();
321
- let embed_fast: EmbedResponse = serde_json::from_slice(&body_fast).unwrap();
352
+ .expect("Operation failed");
353
+ let embed_fast: EmbedResponse = serde_json::from_slice(&body_fast).expect("Failed to deserialize");
322
354
 
323
355
  // Test with "balanced" preset
324
356
  let request_balanced = json!({
@@ -337,18 +369,20 @@ async fn test_embed_different_presets() {
337
369
  .method("POST")
338
370
  .uri("/embed")
339
371
  .header("content-type", "application/json")
340
- .body(Body::from(serde_json::to_string(&request_balanced).unwrap()))
341
- .unwrap(),
372
+ .body(Body::from(
373
+ serde_json::to_string(&request_balanced).expect("Operation failed"),
374
+ ))
375
+ .expect("Operation failed"),
342
376
  )
343
377
  .await
344
- .unwrap();
378
+ .expect("Operation failed");
345
379
 
346
380
  assert_eq!(response_balanced.status(), StatusCode::OK);
347
381
 
348
382
  let body_balanced = axum::body::to_bytes(response_balanced.into_body(), usize::MAX)
349
383
  .await
350
- .unwrap();
351
- let embed_balanced: EmbedResponse = serde_json::from_slice(&body_balanced).unwrap();
384
+ .expect("Operation failed");
385
+ let embed_balanced: EmbedResponse = serde_json::from_slice(&body_balanced).expect("Failed to deserialize");
352
386
 
353
387
  // Different presets should have different dimensions
354
388
  assert_ne!(embed_fast.dimensions, embed_balanced.dimensions);
@@ -93,7 +93,10 @@ startxref
93
93
  .expect("Failed to read response body");
94
94
 
95
95
  let parsed: Value = serde_json::from_slice(&body).expect("Failed to parse response");
96
- eprintln!("Extraction result: {}", serde_json::to_string_pretty(&parsed).unwrap());
96
+ eprintln!(
97
+ "Extraction result: {}",
98
+ serde_json::to_string_pretty(&parsed).expect("Failed to parse")
99
+ );
97
100
  }
98
101
 
99
102
  /// Test extracting a 1MB text file (control test without PDF).
@@ -187,7 +190,10 @@ async fn test_find_size_breaking_point() {
187
190
  .expect("Failed to read response body");
188
191
 
189
192
  if let Ok(parsed) = serde_json::from_slice::<Value>(&body) {
190
- eprintln!("Error response: {}", serde_json::to_string_pretty(&parsed).unwrap());
193
+ eprintln!(
194
+ "Error response: {}",
195
+ serde_json::to_string_pretty(&parsed).expect("Failed to parse")
196
+ );
191
197
  } else {
192
198
  eprintln!("Response body (not JSON): {}", String::from_utf8_lossy(&body));
193
199
  }