kreuzberg 4.1.1 → 4.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -4
- data/README.md +8 -5
- data/ext/kreuzberg_rb/native/Cargo.toml +2 -2
- data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +23 -13
- data/kreuzberg.gemspec +14 -2
- data/lib/kreuzberg/api_proxy.rb +0 -1
- data/lib/kreuzberg/cli_proxy.rb +0 -1
- data/lib/kreuzberg/config.rb +70 -35
- data/lib/kreuzberg/mcp_proxy.rb +0 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +5 -1
- data/spec/binding/batch_operations_spec.rb +80 -0
- data/spec/binding/metadata_types_spec.rb +77 -57
- data/spec/serialization_spec.rb +134 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +3 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/embeddings.rs +4 -4
- data/vendor/kreuzberg/src/mcp/format.rs +237 -39
- data/vendor/kreuzberg/src/mcp/params.rs +26 -33
- data/vendor/kreuzberg/src/mcp/server.rs +6 -3
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
- data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
- data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
- data/vendor/kreuzberg/tests/api_embed.rs +84 -50
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
- data/vendor/kreuzberg/tests/api_tests.rs +298 -139
- data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
- data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
- data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
- data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
- data/vendor/kreuzberg/tests/config_behavioral.rs +414 -0
- data/vendor/kreuzberg/tests/config_features.rs +19 -15
- data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
- data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
- data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
- data/vendor/kreuzberg/tests/core_integration.rs +55 -53
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
- data/vendor/kreuzberg/tests/email_integration.rs +7 -7
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/error_handling.rs +13 -11
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
- data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +72 -41
- data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
- data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/page_markers.rs +1 -1
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
- data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +40 -30
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
- data/vendor/kreuzberg/tests/security_validation.rs +20 -19
- data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
- data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
- data/vendor/kreuzberg-tesseract/Cargo.toml +3 -3
- data/vendor/kreuzberg-tesseract/build.rs +4 -4
- data/vendor/kreuzberg-tesseract/src/lib.rs +6 -6
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +3 -3
- metadata +13 -2
|
@@ -29,16 +29,20 @@ async fn test_embed_valid_texts() {
|
|
|
29
29
|
.method("POST")
|
|
30
30
|
.uri("/embed")
|
|
31
31
|
.header("content-type", "application/json")
|
|
32
|
-
.body(Body::from(
|
|
33
|
-
|
|
32
|
+
.body(Body::from(
|
|
33
|
+
serde_json::to_string(&request_body).expect("Operation failed"),
|
|
34
|
+
))
|
|
35
|
+
.expect("Operation failed"),
|
|
34
36
|
)
|
|
35
37
|
.await
|
|
36
|
-
.
|
|
38
|
+
.expect("Operation failed");
|
|
37
39
|
|
|
38
40
|
assert_eq!(response.status(), StatusCode::OK);
|
|
39
41
|
|
|
40
|
-
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
|
41
|
-
|
|
42
|
+
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
|
43
|
+
.await
|
|
44
|
+
.expect("Failed to convert to bytes");
|
|
45
|
+
let embed_response: EmbedResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
|
|
42
46
|
|
|
43
47
|
assert_eq!(embed_response.count, 2);
|
|
44
48
|
assert_eq!(embed_response.embeddings.len(), 2);
|
|
@@ -66,11 +70,13 @@ async fn test_embed_empty_texts() {
|
|
|
66
70
|
.method("POST")
|
|
67
71
|
.uri("/embed")
|
|
68
72
|
.header("content-type", "application/json")
|
|
69
|
-
.body(Body::from(
|
|
70
|
-
|
|
73
|
+
.body(Body::from(
|
|
74
|
+
serde_json::to_string(&request_body).expect("Operation failed"),
|
|
75
|
+
))
|
|
76
|
+
.expect("Operation failed"),
|
|
71
77
|
)
|
|
72
78
|
.await
|
|
73
|
-
.
|
|
79
|
+
.expect("Operation failed");
|
|
74
80
|
|
|
75
81
|
assert_eq!(response.status(), StatusCode::BAD_REQUEST);
|
|
76
82
|
}
|
|
@@ -97,16 +103,20 @@ async fn test_embed_with_custom_config() {
|
|
|
97
103
|
.method("POST")
|
|
98
104
|
.uri("/embed")
|
|
99
105
|
.header("content-type", "application/json")
|
|
100
|
-
.body(Body::from(
|
|
101
|
-
|
|
106
|
+
.body(Body::from(
|
|
107
|
+
serde_json::to_string(&request_body).expect("Operation failed"),
|
|
108
|
+
))
|
|
109
|
+
.expect("Operation failed"),
|
|
102
110
|
)
|
|
103
111
|
.await
|
|
104
|
-
.
|
|
112
|
+
.expect("Operation failed");
|
|
105
113
|
|
|
106
114
|
assert_eq!(response.status(), StatusCode::OK);
|
|
107
115
|
|
|
108
|
-
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
|
109
|
-
|
|
116
|
+
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
|
117
|
+
.await
|
|
118
|
+
.expect("Failed to convert to bytes");
|
|
119
|
+
let embed_response: EmbedResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
|
|
110
120
|
|
|
111
121
|
assert_eq!(embed_response.count, 1);
|
|
112
122
|
assert_eq!(embed_response.embeddings.len(), 1);
|
|
@@ -128,16 +138,20 @@ async fn test_embed_single_text() {
|
|
|
128
138
|
.method("POST")
|
|
129
139
|
.uri("/embed")
|
|
130
140
|
.header("content-type", "application/json")
|
|
131
|
-
.body(Body::from(
|
|
132
|
-
|
|
141
|
+
.body(Body::from(
|
|
142
|
+
serde_json::to_string(&request_body).expect("Operation failed"),
|
|
143
|
+
))
|
|
144
|
+
.expect("Operation failed"),
|
|
133
145
|
)
|
|
134
146
|
.await
|
|
135
|
-
.
|
|
147
|
+
.expect("Operation failed");
|
|
136
148
|
|
|
137
149
|
assert_eq!(response.status(), StatusCode::OK);
|
|
138
150
|
|
|
139
|
-
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
|
140
|
-
|
|
151
|
+
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
|
152
|
+
.await
|
|
153
|
+
.expect("Failed to convert to bytes");
|
|
154
|
+
let embed_response: EmbedResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
|
|
141
155
|
|
|
142
156
|
assert_eq!(embed_response.count, 1);
|
|
143
157
|
assert_eq!(embed_response.embeddings.len(), 1);
|
|
@@ -160,16 +174,20 @@ async fn test_embed_batch() {
|
|
|
160
174
|
.method("POST")
|
|
161
175
|
.uri("/embed")
|
|
162
176
|
.header("content-type", "application/json")
|
|
163
|
-
.body(Body::from(
|
|
164
|
-
|
|
177
|
+
.body(Body::from(
|
|
178
|
+
serde_json::to_string(&request_body).expect("Operation failed"),
|
|
179
|
+
))
|
|
180
|
+
.expect("Operation failed"),
|
|
165
181
|
)
|
|
166
182
|
.await
|
|
167
|
-
.
|
|
183
|
+
.expect("Operation failed");
|
|
168
184
|
|
|
169
185
|
assert_eq!(response.status(), StatusCode::OK);
|
|
170
186
|
|
|
171
|
-
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
|
172
|
-
|
|
187
|
+
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
|
188
|
+
.await
|
|
189
|
+
.expect("Failed to convert to bytes");
|
|
190
|
+
let embed_response: EmbedResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
|
|
173
191
|
|
|
174
192
|
assert_eq!(embed_response.count, 10);
|
|
175
193
|
assert_eq!(embed_response.embeddings.len(), 10);
|
|
@@ -198,16 +216,20 @@ async fn test_embed_long_text() {
|
|
|
198
216
|
.method("POST")
|
|
199
217
|
.uri("/embed")
|
|
200
218
|
.header("content-type", "application/json")
|
|
201
|
-
.body(Body::from(
|
|
202
|
-
|
|
219
|
+
.body(Body::from(
|
|
220
|
+
serde_json::to_string(&request_body).expect("Operation failed"),
|
|
221
|
+
))
|
|
222
|
+
.expect("Operation failed"),
|
|
203
223
|
)
|
|
204
224
|
.await
|
|
205
|
-
.
|
|
225
|
+
.expect("Operation failed");
|
|
206
226
|
|
|
207
227
|
assert_eq!(response.status(), StatusCode::OK);
|
|
208
228
|
|
|
209
|
-
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
|
210
|
-
|
|
229
|
+
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
|
230
|
+
.await
|
|
231
|
+
.expect("Failed to convert to bytes");
|
|
232
|
+
let embed_response: EmbedResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
|
|
211
233
|
|
|
212
234
|
assert_eq!(embed_response.count, 1);
|
|
213
235
|
assert_eq!(embed_response.embeddings.len(), 1);
|
|
@@ -225,10 +247,10 @@ async fn test_embed_malformed_json() {
|
|
|
225
247
|
.uri("/embed")
|
|
226
248
|
.header("content-type", "application/json")
|
|
227
249
|
.body(Body::from("{invalid json}"))
|
|
228
|
-
.
|
|
250
|
+
.expect("Operation failed"),
|
|
229
251
|
)
|
|
230
252
|
.await
|
|
231
|
-
.
|
|
253
|
+
.expect("Operation failed");
|
|
232
254
|
|
|
233
255
|
assert_eq!(response.status(), StatusCode::BAD_REQUEST);
|
|
234
256
|
}
|
|
@@ -250,16 +272,20 @@ async fn test_embed_deterministic() {
|
|
|
250
272
|
.method("POST")
|
|
251
273
|
.uri("/embed")
|
|
252
274
|
.header("content-type", "application/json")
|
|
253
|
-
.body(Body::from(
|
|
254
|
-
|
|
275
|
+
.body(Body::from(
|
|
276
|
+
serde_json::to_string(&request_body).expect("Operation failed"),
|
|
277
|
+
))
|
|
278
|
+
.expect("Operation failed"),
|
|
255
279
|
)
|
|
256
280
|
.await
|
|
257
|
-
.
|
|
281
|
+
.expect("Operation failed");
|
|
258
282
|
|
|
259
283
|
assert_eq!(response1.status(), StatusCode::OK);
|
|
260
284
|
|
|
261
|
-
let body1 = axum::body::to_bytes(response1.into_body(), usize::MAX)
|
|
262
|
-
|
|
285
|
+
let body1 = axum::body::to_bytes(response1.into_body(), usize::MAX)
|
|
286
|
+
.await
|
|
287
|
+
.expect("Failed to convert to bytes");
|
|
288
|
+
let embed_response1: EmbedResponse = serde_json::from_slice(&body1).expect("Failed to deserialize");
|
|
263
289
|
|
|
264
290
|
// Second call with same text
|
|
265
291
|
let response2 = app
|
|
@@ -268,16 +294,20 @@ async fn test_embed_deterministic() {
|
|
|
268
294
|
.method("POST")
|
|
269
295
|
.uri("/embed")
|
|
270
296
|
.header("content-type", "application/json")
|
|
271
|
-
.body(Body::from(
|
|
272
|
-
|
|
297
|
+
.body(Body::from(
|
|
298
|
+
serde_json::to_string(&request_body).expect("Operation failed"),
|
|
299
|
+
))
|
|
300
|
+
.expect("Operation failed"),
|
|
273
301
|
)
|
|
274
302
|
.await
|
|
275
|
-
.
|
|
303
|
+
.expect("Operation failed");
|
|
276
304
|
|
|
277
305
|
assert_eq!(response2.status(), StatusCode::OK);
|
|
278
306
|
|
|
279
|
-
let body2 = axum::body::to_bytes(response2.into_body(), usize::MAX)
|
|
280
|
-
|
|
307
|
+
let body2 = axum::body::to_bytes(response2.into_body(), usize::MAX)
|
|
308
|
+
.await
|
|
309
|
+
.expect("Failed to convert to bytes");
|
|
310
|
+
let embed_response2: EmbedResponse = serde_json::from_slice(&body2).expect("Failed to deserialize");
|
|
281
311
|
|
|
282
312
|
// Compare embeddings - they should be identical
|
|
283
313
|
assert_eq!(embed_response1.embeddings.len(), embed_response2.embeddings.len());
|
|
@@ -307,18 +337,20 @@ async fn test_embed_different_presets() {
|
|
|
307
337
|
.method("POST")
|
|
308
338
|
.uri("/embed")
|
|
309
339
|
.header("content-type", "application/json")
|
|
310
|
-
.body(Body::from(
|
|
311
|
-
|
|
340
|
+
.body(Body::from(
|
|
341
|
+
serde_json::to_string(&request_fast).expect("Operation failed"),
|
|
342
|
+
))
|
|
343
|
+
.expect("Operation failed"),
|
|
312
344
|
)
|
|
313
345
|
.await
|
|
314
|
-
.
|
|
346
|
+
.expect("Operation failed");
|
|
315
347
|
|
|
316
348
|
assert_eq!(response_fast.status(), StatusCode::OK);
|
|
317
349
|
|
|
318
350
|
let body_fast = axum::body::to_bytes(response_fast.into_body(), usize::MAX)
|
|
319
351
|
.await
|
|
320
|
-
.
|
|
321
|
-
let embed_fast: EmbedResponse = serde_json::from_slice(&body_fast).
|
|
352
|
+
.expect("Operation failed");
|
|
353
|
+
let embed_fast: EmbedResponse = serde_json::from_slice(&body_fast).expect("Failed to deserialize");
|
|
322
354
|
|
|
323
355
|
// Test with "balanced" preset
|
|
324
356
|
let request_balanced = json!({
|
|
@@ -337,18 +369,20 @@ async fn test_embed_different_presets() {
|
|
|
337
369
|
.method("POST")
|
|
338
370
|
.uri("/embed")
|
|
339
371
|
.header("content-type", "application/json")
|
|
340
|
-
.body(Body::from(
|
|
341
|
-
|
|
372
|
+
.body(Body::from(
|
|
373
|
+
serde_json::to_string(&request_balanced).expect("Operation failed"),
|
|
374
|
+
))
|
|
375
|
+
.expect("Operation failed"),
|
|
342
376
|
)
|
|
343
377
|
.await
|
|
344
|
-
.
|
|
378
|
+
.expect("Operation failed");
|
|
345
379
|
|
|
346
380
|
assert_eq!(response_balanced.status(), StatusCode::OK);
|
|
347
381
|
|
|
348
382
|
let body_balanced = axum::body::to_bytes(response_balanced.into_body(), usize::MAX)
|
|
349
383
|
.await
|
|
350
|
-
.
|
|
351
|
-
let embed_balanced: EmbedResponse = serde_json::from_slice(&body_balanced).
|
|
384
|
+
.expect("Operation failed");
|
|
385
|
+
let embed_balanced: EmbedResponse = serde_json::from_slice(&body_balanced).expect("Failed to deserialize");
|
|
352
386
|
|
|
353
387
|
// Different presets should have different dimensions
|
|
354
388
|
assert_ne!(embed_fast.dimensions, embed_balanced.dimensions);
|
|
@@ -93,7 +93,10 @@ startxref
|
|
|
93
93
|
.expect("Failed to read response body");
|
|
94
94
|
|
|
95
95
|
let parsed: Value = serde_json::from_slice(&body).expect("Failed to parse response");
|
|
96
|
-
eprintln!(
|
|
96
|
+
eprintln!(
|
|
97
|
+
"Extraction result: {}",
|
|
98
|
+
serde_json::to_string_pretty(&parsed).expect("Failed to parse")
|
|
99
|
+
);
|
|
97
100
|
}
|
|
98
101
|
|
|
99
102
|
/// Test extracting a 1MB text file (control test without PDF).
|
|
@@ -187,7 +190,10 @@ async fn test_find_size_breaking_point() {
|
|
|
187
190
|
.expect("Failed to read response body");
|
|
188
191
|
|
|
189
192
|
if let Ok(parsed) = serde_json::from_slice::<Value>(&body) {
|
|
190
|
-
eprintln!(
|
|
193
|
+
eprintln!(
|
|
194
|
+
"Error response: {}",
|
|
195
|
+
serde_json::to_string_pretty(&parsed).expect("Failed to parse")
|
|
196
|
+
);
|
|
191
197
|
} else {
|
|
192
198
|
eprintln!("Response body (not JSON): {}", String::from_utf8_lossy(&body));
|
|
193
199
|
}
|