kreuzberg 4.1.2 → 4.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
- data/kreuzberg.gemspec +13 -1
- data/lib/kreuzberg/config.rb +70 -35
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +5 -1
- data/spec/binding/batch_operations_spec.rb +80 -0
- data/spec/binding/metadata_types_spec.rb +77 -57
- data/spec/serialization_spec.rb +134 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +1 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/embeddings.rs +4 -4
- data/vendor/kreuzberg/src/mcp/format.rs +237 -39
- data/vendor/kreuzberg/src/mcp/params.rs +26 -33
- data/vendor/kreuzberg/src/mcp/server.rs +6 -3
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
- data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
- data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
- data/vendor/kreuzberg/tests/api_embed.rs +84 -50
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
- data/vendor/kreuzberg/tests/api_tests.rs +298 -139
- data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
- data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
- data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
- data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
- data/vendor/kreuzberg/tests/config_behavioral.rs +414 -0
- data/vendor/kreuzberg/tests/config_features.rs +19 -15
- data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
- data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
- data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
- data/vendor/kreuzberg/tests/core_integration.rs +55 -53
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
- data/vendor/kreuzberg/tests/email_integration.rs +7 -7
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/error_handling.rs +13 -11
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
- data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +72 -41
- data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
- data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/page_markers.rs +1 -1
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
- data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +40 -30
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
- data/vendor/kreuzberg/tests/security_validation.rs +20 -19
- data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
- data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +10 -2
|
@@ -11,9 +11,9 @@ fn test_file_path(filename: &str) -> PathBuf {
|
|
|
11
11
|
let manifest_dir = env!("CARGO_MANIFEST_DIR");
|
|
12
12
|
PathBuf::from(manifest_dir)
|
|
13
13
|
.parent()
|
|
14
|
-
.unwrap()
|
|
14
|
+
.expect("Operation failed")
|
|
15
15
|
.parent()
|
|
16
|
-
.unwrap()
|
|
16
|
+
.expect("Operation failed")
|
|
17
17
|
.join("test_documents")
|
|
18
18
|
.join("docbook")
|
|
19
19
|
.join(filename)
|
|
@@ -72,7 +72,7 @@ async fn test_docbook4_chapter_extraction() {
|
|
|
72
72
|
let result = extract_docbook4_file("docbook-chapter.docbook").await;
|
|
73
73
|
assert!(result.is_ok(), "Failed to extract DocBook 4 chapter");
|
|
74
74
|
|
|
75
|
-
let result = result.unwrap();
|
|
75
|
+
let result = result.expect("Operation failed");
|
|
76
76
|
assert!(!result.content.is_empty(), "Content should not be empty");
|
|
77
77
|
assert!(
|
|
78
78
|
result.content.contains("Test Chapter"),
|
|
@@ -89,7 +89,7 @@ async fn test_docbook5_reader_extraction() {
|
|
|
89
89
|
let result = extract_docbook5_file("docbook-reader.docbook").await;
|
|
90
90
|
assert!(result.is_ok(), "Failed to extract DocBook 5 file");
|
|
91
91
|
|
|
92
|
-
let result = result.unwrap();
|
|
92
|
+
let result = result.expect("Operation failed");
|
|
93
93
|
assert!(!result.content.is_empty(), "Content should not be empty");
|
|
94
94
|
assert!(
|
|
95
95
|
result.content.contains("Pandoc Test Suite"),
|
|
@@ -102,7 +102,7 @@ async fn test_docbook_xref_extraction() {
|
|
|
102
102
|
let result = extract_docbook4_file("docbook-xref.docbook").await;
|
|
103
103
|
assert!(result.is_ok(), "Failed to extract DocBook with xref elements");
|
|
104
104
|
|
|
105
|
-
let result = result.unwrap();
|
|
105
|
+
let result = result.expect("Operation failed");
|
|
106
106
|
assert!(!result.content.is_empty(), "Content should not be empty");
|
|
107
107
|
assert!(
|
|
108
108
|
result.content.contains("An Example Book"),
|
|
@@ -119,7 +119,7 @@ async fn test_docbook_tables_extraction() {
|
|
|
119
119
|
let result = extract_docbook4_file("tables.docbook4").await;
|
|
120
120
|
assert!(result.is_ok(), "Failed to extract DocBook with tables");
|
|
121
121
|
|
|
122
|
-
let result = result.unwrap();
|
|
122
|
+
let result = result.expect("Operation failed");
|
|
123
123
|
assert!(!result.content.is_empty(), "Content should not be empty");
|
|
124
124
|
assert!(!result.tables.is_empty(), "Should extract tables from DocBook");
|
|
125
125
|
}
|
|
@@ -129,7 +129,7 @@ async fn test_docbook5_tables_extraction() {
|
|
|
129
129
|
let result = extract_docbook5_file("tables.docbook5").await;
|
|
130
130
|
assert!(result.is_ok(), "Failed to extract DocBook 5 with tables");
|
|
131
131
|
|
|
132
|
-
let result = result.unwrap();
|
|
132
|
+
let result = result.expect("Operation failed");
|
|
133
133
|
assert!(!result.content.is_empty(), "Content should not be empty");
|
|
134
134
|
assert!(!result.tables.is_empty(), "Should extract tables from DocBook 5");
|
|
135
135
|
}
|
|
@@ -139,7 +139,7 @@ async fn test_docbook_metadata_extraction() {
|
|
|
139
139
|
let result = extract_docbook5_file("docbook-reader.docbook").await;
|
|
140
140
|
assert!(result.is_ok());
|
|
141
141
|
|
|
142
|
-
let result = result.unwrap();
|
|
142
|
+
let result = result.expect("Operation failed");
|
|
143
143
|
assert!(!result.content.is_empty());
|
|
144
144
|
}
|
|
145
145
|
|
|
@@ -148,7 +148,7 @@ async fn test_docbook_section_hierarchy() {
|
|
|
148
148
|
let result = extract_docbook4_file("docbook-chapter.docbook").await;
|
|
149
149
|
assert!(result.is_ok());
|
|
150
150
|
|
|
151
|
-
let result = result.unwrap();
|
|
151
|
+
let result = result.expect("Operation failed");
|
|
152
152
|
let content = &result.content;
|
|
153
153
|
|
|
154
154
|
assert!(content.contains("Like a Sect1"));
|
|
@@ -162,7 +162,7 @@ async fn test_docbook_paragraph_extraction() {
|
|
|
162
162
|
let result = extract_docbook4_file("docbook-chapter.docbook").await;
|
|
163
163
|
assert!(result.is_ok());
|
|
164
164
|
|
|
165
|
-
let result = result.unwrap();
|
|
165
|
+
let result = result.expect("Operation failed");
|
|
166
166
|
assert!(
|
|
167
167
|
result.content.contains("This chapter uses recursive sections"),
|
|
168
168
|
"Should extract paragraph content"
|
|
@@ -183,7 +183,7 @@ async fn test_docbook_paragraph_content() {
|
|
|
183
183
|
let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
|
|
184
184
|
assert!(result.is_ok());
|
|
185
185
|
|
|
186
|
-
let result = result.unwrap();
|
|
186
|
+
let result = result.expect("Operation failed");
|
|
187
187
|
assert!(result.content.contains("Test Article"));
|
|
188
188
|
assert!(result.content.contains("This is a test paragraph"));
|
|
189
189
|
assert!(result.content.contains("another paragraph"));
|
|
@@ -205,7 +205,7 @@ def hello():
|
|
|
205
205
|
let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
|
|
206
206
|
assert!(result.is_ok());
|
|
207
207
|
|
|
208
|
-
let result = result.unwrap();
|
|
208
|
+
let result = result.expect("Operation failed");
|
|
209
209
|
assert!(result.content.contains("def hello"));
|
|
210
210
|
assert!(result.content.contains("print"));
|
|
211
211
|
}
|
|
@@ -229,7 +229,7 @@ async fn test_docbook_mixed_content() {
|
|
|
229
229
|
let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
|
|
230
230
|
assert!(result.is_ok());
|
|
231
231
|
|
|
232
|
-
let result = result.unwrap();
|
|
232
|
+
let result = result.expect("Operation failed");
|
|
233
233
|
assert!(result.content.contains("Test Book"));
|
|
234
234
|
assert!(result.content.contains("Chapter 1"));
|
|
235
235
|
assert!(result.content.contains("Section 1.1"));
|
|
@@ -259,7 +259,7 @@ async fn test_docbook_namespaced_5x_parsing() {
|
|
|
259
259
|
let result = extract_docbook_bytes(docbook5.as_bytes(), "application/docbook+xml").await;
|
|
260
260
|
assert!(result.is_ok());
|
|
261
261
|
|
|
262
|
-
let result = result.unwrap();
|
|
262
|
+
let result = result.expect("Operation failed");
|
|
263
263
|
assert!(result.content.contains("DocBook 5 Article"));
|
|
264
264
|
assert!(result.content.contains("Welcome to DocBook 5"));
|
|
265
265
|
}
|
|
@@ -277,7 +277,7 @@ async fn test_docbook_link_handling() {
|
|
|
277
277
|
let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
|
|
278
278
|
assert!(result.is_ok());
|
|
279
279
|
|
|
280
|
-
let result = result.unwrap();
|
|
280
|
+
let result = result.expect("Operation failed");
|
|
281
281
|
assert!(result.content.contains("example"));
|
|
282
282
|
}
|
|
283
283
|
|
|
@@ -316,7 +316,7 @@ async fn test_docbook_empty_sections() {
|
|
|
316
316
|
let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
|
|
317
317
|
assert!(result.is_ok());
|
|
318
318
|
|
|
319
|
-
let result = result.unwrap();
|
|
319
|
+
let result = result.expect("Operation failed");
|
|
320
320
|
assert!(result.content.contains("Empty Section"));
|
|
321
321
|
assert!(result.content.contains("Section with Content"));
|
|
322
322
|
assert!(result.content.contains("Content here"));
|
|
@@ -345,7 +345,7 @@ async fn test_docbook_itemized_list() {
|
|
|
345
345
|
let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
|
|
346
346
|
assert!(result.is_ok());
|
|
347
347
|
|
|
348
|
-
let result = result.unwrap();
|
|
348
|
+
let result = result.expect("Operation failed");
|
|
349
349
|
assert!(result.content.contains("First item"));
|
|
350
350
|
assert!(result.content.contains("Second item"));
|
|
351
351
|
assert!(result.content.contains("Third item"));
|
|
@@ -375,7 +375,7 @@ async fn test_docbook_ordered_list() {
|
|
|
375
375
|
let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
|
|
376
376
|
assert!(result.is_ok());
|
|
377
377
|
|
|
378
|
-
let result = result.unwrap();
|
|
378
|
+
let result = result.expect("Operation failed");
|
|
379
379
|
assert!(result.content.contains("First step"));
|
|
380
380
|
assert!(result.content.contains("Second step"));
|
|
381
381
|
assert!(result.content.contains("Third step"));
|
|
@@ -397,7 +397,7 @@ async fn test_docbook_blockquote() {
|
|
|
397
397
|
let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
|
|
398
398
|
assert!(result.is_ok());
|
|
399
399
|
|
|
400
|
-
let result = result.unwrap();
|
|
400
|
+
let result = result.expect("Operation failed");
|
|
401
401
|
assert!(result.content.contains("quoted passage"));
|
|
402
402
|
assert!(result.content.contains("> "), "Should contain blockquote marker");
|
|
403
403
|
}
|
|
@@ -418,7 +418,7 @@ async fn test_docbook_figure() {
|
|
|
418
418
|
let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
|
|
419
419
|
assert!(result.is_ok());
|
|
420
420
|
|
|
421
|
-
let result = result.unwrap();
|
|
421
|
+
let result = result.expect("Operation failed");
|
|
422
422
|
assert!(result.content.contains("Figure"));
|
|
423
423
|
}
|
|
424
424
|
|
|
@@ -435,7 +435,7 @@ async fn test_docbook_footnote() {
|
|
|
435
435
|
let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
|
|
436
436
|
assert!(result.is_ok());
|
|
437
437
|
|
|
438
|
-
let result = result.unwrap();
|
|
438
|
+
let result = result.expect("Operation failed");
|
|
439
439
|
assert!(result.content.contains("text with a footnote"));
|
|
440
440
|
assert!(result.content.contains("footnote content"));
|
|
441
441
|
}
|
|
@@ -465,7 +465,7 @@ code example
|
|
|
465
465
|
let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
|
|
466
466
|
assert!(result.is_ok());
|
|
467
467
|
|
|
468
|
-
let result = result.unwrap();
|
|
468
|
+
let result = result.expect("Operation failed");
|
|
469
469
|
assert!(result.content.contains("Introduction paragraph"));
|
|
470
470
|
assert!(result.content.contains("List item 1"));
|
|
471
471
|
assert!(result.content.contains("List item 2"));
|
|
@@ -493,7 +493,7 @@ async fn test_docbook_namespaced_lists() {
|
|
|
493
493
|
let result = extract_docbook_bytes(docbook5.as_bytes(), "application/docbook+xml").await;
|
|
494
494
|
assert!(result.is_ok());
|
|
495
495
|
|
|
496
|
-
let result = result.unwrap();
|
|
496
|
+
let result = result.expect("Operation failed");
|
|
497
497
|
assert!(result.content.contains("Namespaced item 1"));
|
|
498
498
|
assert!(result.content.contains("Namespaced item 2"));
|
|
499
499
|
assert!(result.content.contains("- "));
|
|
@@ -8,9 +8,9 @@ use kreuzberg::{ExtractionConfig, extract_file};
|
|
|
8
8
|
async fn test_docx_full_metadata_extraction() {
|
|
9
9
|
let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
|
10
10
|
.parent()
|
|
11
|
-
.unwrap()
|
|
11
|
+
.expect("Operation failed")
|
|
12
12
|
.parent()
|
|
13
|
-
.unwrap();
|
|
13
|
+
.expect("Operation failed");
|
|
14
14
|
let test_file = workspace_root.join("test_documents/documents/word_sample.docx");
|
|
15
15
|
|
|
16
16
|
if !test_file.exists() {
|
|
@@ -91,9 +91,9 @@ async fn test_docx_full_metadata_extraction() {
|
|
|
91
91
|
async fn test_docx_minimal_metadata_extraction() {
|
|
92
92
|
let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
|
93
93
|
.parent()
|
|
94
|
-
.unwrap()
|
|
94
|
+
.expect("Operation failed")
|
|
95
95
|
.parent()
|
|
96
|
-
.unwrap();
|
|
96
|
+
.expect("Operation failed");
|
|
97
97
|
let test_file = workspace_root.join("test_documents/documents/lorem_ipsum.docx");
|
|
98
98
|
|
|
99
99
|
if !test_file.exists() {
|
|
@@ -143,25 +143,26 @@ async fn test_docx_keywords_extraction() {
|
|
|
143
143
|
let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
|
|
144
144
|
|
|
145
145
|
// Add [Content_Types].xml
|
|
146
|
-
zip.start_file("[Content_Types].xml", options).unwrap();
|
|
146
|
+
zip.start_file("[Content_Types].xml", options)
|
|
147
|
+
.expect("Operation failed");
|
|
147
148
|
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
148
149
|
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
|
|
149
150
|
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
|
|
150
151
|
<Default Extension="xml" ContentType="application/xml"/>
|
|
151
152
|
<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
|
|
152
153
|
<Override PartName="/docProps/core.xml" ContentType="application/vnd.openxmlformats-package.core-properties+xml"/>
|
|
153
|
-
</Types>"#).unwrap();
|
|
154
|
+
</Types>"#).expect("Operation failed");
|
|
154
155
|
|
|
155
156
|
// Add _rels/.rels
|
|
156
|
-
zip.start_file("_rels/.rels", options).unwrap();
|
|
157
|
+
zip.start_file("_rels/.rels", options).expect("Operation failed");
|
|
157
158
|
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
158
159
|
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
159
160
|
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
|
|
160
161
|
<Relationship Id="rId2" Type="http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties" Target="docProps/core.xml"/>
|
|
161
|
-
</Relationships>"#).unwrap();
|
|
162
|
+
</Relationships>"#).expect("Operation failed");
|
|
162
163
|
|
|
163
164
|
// Add word/document.xml with simple content
|
|
164
|
-
zip.start_file("word/document.xml", options).unwrap();
|
|
165
|
+
zip.start_file("word/document.xml", options).expect("Operation failed");
|
|
165
166
|
zip.write_all(
|
|
166
167
|
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
167
168
|
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
|
|
@@ -174,10 +175,10 @@ async fn test_docx_keywords_extraction() {
|
|
|
174
175
|
</w:body>
|
|
175
176
|
</w:document>"#,
|
|
176
177
|
)
|
|
177
|
-
.unwrap();
|
|
178
|
+
.expect("Operation failed");
|
|
178
179
|
|
|
179
180
|
// Add docProps/core.xml with keywords (comma-separated string)
|
|
180
|
-
zip.start_file("docProps/core.xml", options).unwrap();
|
|
181
|
+
zip.start_file("docProps/core.xml", options).expect("Operation failed");
|
|
181
182
|
zip.write_all(
|
|
182
183
|
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
183
184
|
<cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
|
|
@@ -189,9 +190,9 @@ async fn test_docx_keywords_extraction() {
|
|
|
189
190
|
<dc:subject>Testing keyword extraction</dc:subject>
|
|
190
191
|
</cp:coreProperties>"#,
|
|
191
192
|
)
|
|
192
|
-
.unwrap();
|
|
193
|
+
.expect("Operation failed");
|
|
193
194
|
|
|
194
|
-
zip.finish().unwrap();
|
|
195
|
+
zip.finish().expect("Operation failed");
|
|
195
196
|
}
|
|
196
197
|
|
|
197
198
|
// Extract the DOCX file
|
|
@@ -216,7 +217,7 @@ async fn test_docx_keywords_extraction() {
|
|
|
216
217
|
"Keywords should be present in metadata.keywords"
|
|
217
218
|
);
|
|
218
219
|
|
|
219
|
-
let keywords = result.metadata.keywords.as_ref().unwrap();
|
|
220
|
+
let keywords = result.metadata.keywords.as_ref().expect("Operation failed");
|
|
220
221
|
assert_eq!(
|
|
221
222
|
keywords.len(),
|
|
222
223
|
5,
|
|
@@ -10,9 +10,9 @@ use kreuzberg::plugins::DocumentExtractor;
|
|
|
10
10
|
async fn test_docx_kreuzberg_vs_pandoc_comparison() {
|
|
11
11
|
let docx_path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
|
12
12
|
.parent()
|
|
13
|
-
.unwrap()
|
|
13
|
+
.expect("Operation failed")
|
|
14
14
|
.parent()
|
|
15
|
-
.unwrap()
|
|
15
|
+
.expect("Operation failed")
|
|
16
16
|
.join("test_documents/documents/word_sample.docx");
|
|
17
17
|
|
|
18
18
|
if !docx_path.exists() {
|
|
@@ -319,9 +319,9 @@ Here are some interesting things a respectful duck could eat:
|
|
|
319
319
|
async fn test_docx_lorem_ipsum_comparison() {
|
|
320
320
|
let docx_path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
|
321
321
|
.parent()
|
|
322
|
-
.unwrap()
|
|
322
|
+
.expect("Operation failed")
|
|
323
323
|
.parent()
|
|
324
|
-
.unwrap()
|
|
324
|
+
.expect("Operation failed")
|
|
325
325
|
.join("test_documents/documents/lorem_ipsum.docx");
|
|
326
326
|
|
|
327
327
|
if !docx_path.exists() {
|
|
@@ -32,7 +32,7 @@ This is the email body content.";
|
|
|
32
32
|
assert_eq!(result.metadata.subject, Some("Test Email Subject".to_string()));
|
|
33
33
|
|
|
34
34
|
assert!(result.metadata.format.is_some());
|
|
35
|
-
let email_meta = match result.metadata.format.as_ref().unwrap() {
|
|
35
|
+
let email_meta = match result.metadata.format.as_ref().expect("Operation failed") {
|
|
36
36
|
kreuzberg::FormatMetadata::Email(meta) => meta,
|
|
37
37
|
_ => panic!("Expected Email metadata"),
|
|
38
38
|
};
|
|
@@ -44,7 +44,7 @@ This is the email body content.";
|
|
|
44
44
|
assert!(email_meta.bcc_emails.is_empty(), "BCC should be empty");
|
|
45
45
|
|
|
46
46
|
assert!(email_meta.message_id.is_some());
|
|
47
|
-
let msg_id = email_meta.message_id.clone().unwrap();
|
|
47
|
+
let msg_id = email_meta.message_id.clone().expect("Operation failed");
|
|
48
48
|
assert!(
|
|
49
49
|
msg_id.contains("unique123@example.com"),
|
|
50
50
|
"Message ID should contain unique123@example.com"
|
|
@@ -86,7 +86,7 @@ Attachment content here.\r\n\
|
|
|
86
86
|
.expect("Should extract EML with attachment");
|
|
87
87
|
|
|
88
88
|
assert!(result.metadata.format.is_some());
|
|
89
|
-
let email_meta = match result.metadata.format.as_ref().unwrap() {
|
|
89
|
+
let email_meta = match result.metadata.format.as_ref().expect("Operation failed") {
|
|
90
90
|
kreuzberg::FormatMetadata::Email(meta) => meta,
|
|
91
91
|
_ => panic!("Expected Email metadata"),
|
|
92
92
|
};
|
|
@@ -127,7 +127,7 @@ Content-Type: text/html; charset=utf-8\r\n\
|
|
|
127
127
|
assert!(result.content.contains("HTML Heading") || result.content.contains("bold"));
|
|
128
128
|
|
|
129
129
|
assert!(result.metadata.format.is_some());
|
|
130
|
-
let email_meta = match result.metadata.format.as_ref().unwrap() {
|
|
130
|
+
let email_meta = match result.metadata.format.as_ref().expect("Operation failed") {
|
|
131
131
|
kreuzberg::FormatMetadata::Email(meta) => meta,
|
|
132
132
|
_ => panic!("Expected Email metadata"),
|
|
133
133
|
};
|
|
@@ -159,7 +159,7 @@ And preserves formatting.";
|
|
|
159
159
|
assert!(result.content.contains("preserves formatting"));
|
|
160
160
|
|
|
161
161
|
assert!(result.metadata.format.is_some());
|
|
162
|
-
let email_meta = match result.metadata.format.as_ref().unwrap() {
|
|
162
|
+
let email_meta = match result.metadata.format.as_ref().expect("Operation failed") {
|
|
163
163
|
kreuzberg::FormatMetadata::Email(meta) => meta,
|
|
164
164
|
_ => panic!("Expected Email metadata"),
|
|
165
165
|
};
|
|
@@ -198,7 +198,7 @@ Content-Type: text/html\r\n\
|
|
|
198
198
|
);
|
|
199
199
|
|
|
200
200
|
assert!(result.metadata.format.is_some());
|
|
201
|
-
let email_meta = match result.metadata.format.as_ref().unwrap() {
|
|
201
|
+
let email_meta = match result.metadata.format.as_ref().expect("Operation failed") {
|
|
202
202
|
kreuzberg::FormatMetadata::Email(meta) => meta,
|
|
203
203
|
_ => panic!("Expected Email metadata"),
|
|
204
204
|
};
|
|
@@ -290,7 +290,7 @@ Email to multiple recipients.";
|
|
|
290
290
|
.expect("Should extract email with multiple recipients");
|
|
291
291
|
|
|
292
292
|
assert!(result.metadata.format.is_some());
|
|
293
|
-
let email_meta = match result.metadata.format.as_ref().unwrap() {
|
|
293
|
+
let email_meta = match result.metadata.format.as_ref().expect("Operation failed") {
|
|
294
294
|
kreuzberg::FormatMetadata::Email(meta) => meta,
|
|
295
295
|
_ => panic!("Expected Email metadata"),
|
|
296
296
|
};
|
|
@@ -17,9 +17,9 @@ use std::path::PathBuf;
|
|
|
17
17
|
fn get_test_epub_path(filename: &str) -> PathBuf {
|
|
18
18
|
let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
|
19
19
|
.parent()
|
|
20
|
-
.unwrap()
|
|
20
|
+
.expect("Operation failed")
|
|
21
21
|
.parent()
|
|
22
|
-
.unwrap();
|
|
22
|
+
.expect("Operation failed");
|
|
23
23
|
workspace_root.join(format!("test_documents/epub/{}", filename))
|
|
24
24
|
}
|
|
25
25
|
|
|
@@ -187,7 +187,7 @@ async fn test_very_large_file() {
|
|
|
187
187
|
let result = extract_bytes(large_bytes, "text/plain", &config).await;
|
|
188
188
|
|
|
189
189
|
assert!(result.is_ok(), "Large file should be processed successfully");
|
|
190
|
-
let extraction = result.unwrap();
|
|
190
|
+
let extraction = result.expect("Operation failed");
|
|
191
191
|
|
|
192
192
|
assert!(!extraction.content.is_empty(), "Large file content should not be empty");
|
|
193
193
|
assert!(extraction.content.len() > 1_000_000, "Content should be large");
|
|
@@ -213,12 +213,14 @@ async fn test_unicode_filenames() {
|
|
|
213
213
|
let config = ExtractionConfig::default();
|
|
214
214
|
|
|
215
215
|
let mut temp_file = NamedTempFile::new().expect("Should create temp file");
|
|
216
|
-
temp_file.write_all(b"Test content with Unicode filename.").unwrap();
|
|
216
|
+
temp_file
|
|
217
|
+
.write_all(b"Test content with Unicode filename.")
|
|
218
|
+
.expect("Operation failed");
|
|
217
219
|
|
|
218
220
|
let result = extract_file(temp_file.path(), Some("text/plain"), &config).await;
|
|
219
221
|
|
|
220
222
|
assert!(result.is_ok(), "Unicode filename should be handled");
|
|
221
|
-
let extraction = result.unwrap();
|
|
223
|
+
let extraction = result.expect("Operation failed");
|
|
222
224
|
|
|
223
225
|
assert!(
|
|
224
226
|
extraction.content.contains("Test content"),
|
|
@@ -249,7 +251,7 @@ Math symbols: ∑ ∫ √ ≈ ∞";
|
|
|
249
251
|
let result = extract_bytes(special_text.as_bytes(), "text/plain", &config).await;
|
|
250
252
|
|
|
251
253
|
assert!(result.is_ok(), "Special characters should be handled");
|
|
252
|
-
let extraction = result.unwrap();
|
|
254
|
+
let extraction = result.expect("Operation failed");
|
|
253
255
|
|
|
254
256
|
assert!(!extraction.content.is_empty(), "Content should not be empty");
|
|
255
257
|
assert!(extraction.content.len() > 10, "Should have substantial content");
|
|
@@ -319,17 +321,17 @@ async fn test_permission_denied() {
|
|
|
319
321
|
let config = ExtractionConfig::default();
|
|
320
322
|
|
|
321
323
|
let mut temp_file = NamedTempFile::new().expect("Should create temp file");
|
|
322
|
-
temp_file.write_all(b"Test content").unwrap();
|
|
324
|
+
temp_file.write_all(b"Test content").expect("Operation failed");
|
|
323
325
|
|
|
324
|
-
let mut perms = fs::metadata(temp_file.path()).unwrap().permissions();
|
|
326
|
+
let mut perms = fs::metadata(temp_file.path()).expect("Operation failed").permissions();
|
|
325
327
|
perms.set_mode(0o000);
|
|
326
|
-
fs::set_permissions(temp_file.path(), perms).unwrap();
|
|
328
|
+
fs::set_permissions(temp_file.path(), perms).expect("Operation failed");
|
|
327
329
|
|
|
328
330
|
let result = extract_file(temp_file.path(), Some("text/plain"), &config).await;
|
|
329
331
|
|
|
330
|
-
let mut perms = fs::metadata(temp_file.path()).unwrap().permissions();
|
|
332
|
+
let mut perms = fs::metadata(temp_file.path()).expect("Operation failed").permissions();
|
|
331
333
|
perms.set_mode(0o644);
|
|
332
|
-
fs::set_permissions(temp_file.path(), perms).unwrap();
|
|
334
|
+
fs::set_permissions(temp_file.path(), perms).expect("Operation failed");
|
|
333
335
|
|
|
334
336
|
assert!(result.is_err(), "Permission denied should return error");
|
|
335
337
|
}
|
|
@@ -356,7 +358,7 @@ async fn test_null_bytes_in_content() {
|
|
|
356
358
|
let result = extract_bytes(data_with_nulls, "text/plain", &config).await;
|
|
357
359
|
|
|
358
360
|
assert!(result.is_ok(), "Null bytes should be handled");
|
|
359
|
-
let extraction = result.unwrap();
|
|
361
|
+
let extraction = result.expect("Operation failed");
|
|
360
362
|
|
|
361
363
|
assert!(!extraction.content.is_empty(), "Content should not be empty");
|
|
362
364
|
assert!(
|
|
@@ -388,7 +390,7 @@ async fn test_concurrent_extractions() {
|
|
|
388
390
|
let result = handle.await.expect("Task should complete");
|
|
389
391
|
assert!(result.is_ok(), "Concurrent extraction should succeed");
|
|
390
392
|
|
|
391
|
-
let extraction = result.unwrap();
|
|
393
|
+
let extraction = result.expect("Operation failed");
|
|
392
394
|
assert!(
|
|
393
395
|
extraction.content.contains("Concurrent extraction"),
|
|
394
396
|
"Content should be extracted correctly"
|
|
@@ -9,9 +9,9 @@ fn test_file_path(filename: &str) -> PathBuf {
|
|
|
9
9
|
let manifest_dir = env!("CARGO_MANIFEST_DIR");
|
|
10
10
|
PathBuf::from(manifest_dir)
|
|
11
11
|
.parent()
|
|
12
|
-
.unwrap()
|
|
12
|
+
.expect("Operation failed")
|
|
13
13
|
.parent()
|
|
14
|
-
.unwrap()
|
|
14
|
+
.expect("Operation failed")
|
|
15
15
|
.join("test_documents")
|
|
16
16
|
.join("fictionbook")
|
|
17
17
|
.join(filename)
|
|
@@ -37,7 +37,7 @@ mod html_table_tests {
|
|
|
37
37
|
let result = convert_html_to_markdown(html, None, None);
|
|
38
38
|
assert!(result.is_ok(), "HTML to markdown conversion should succeed");
|
|
39
39
|
|
|
40
|
-
let markdown = result.unwrap();
|
|
40
|
+
let markdown = result.expect("Operation failed");
|
|
41
41
|
|
|
42
42
|
println!("=== Basic Table Test ===");
|
|
43
43
|
println!("Input HTML:\n{}", html);
|
|
@@ -79,7 +79,7 @@ mod html_table_tests {
|
|
|
79
79
|
let result = convert_html_to_markdown(html, None, None);
|
|
80
80
|
assert!(result.is_ok(), "Should convert to markdown");
|
|
81
81
|
|
|
82
|
-
let markdown = result.unwrap();
|
|
82
|
+
let markdown = result.expect("Operation failed");
|
|
83
83
|
|
|
84
84
|
println!("=== Table Format Test ===");
|
|
85
85
|
println!("Input HTML:\n{}", html);
|
|
@@ -143,7 +143,7 @@ mod html_table_tests {
|
|
|
143
143
|
let result = convert_html_to_markdown(html, None, None);
|
|
144
144
|
assert!(result.is_ok(), "Should convert complex table");
|
|
145
145
|
|
|
146
|
-
let markdown = result.unwrap();
|
|
146
|
+
let markdown = result.expect("Operation failed");
|
|
147
147
|
|
|
148
148
|
println!("=== Complex Table Test ===");
|
|
149
149
|
println!("Input HTML:\n{}", html);
|
|
@@ -194,7 +194,7 @@ mod html_table_tests {
|
|
|
194
194
|
let result = convert_html_to_markdown(html, None, None);
|
|
195
195
|
assert!(result.is_ok(), "Should handle merged cell table");
|
|
196
196
|
|
|
197
|
-
let markdown = result.unwrap();
|
|
197
|
+
let markdown = result.expect("Operation failed");
|
|
198
198
|
|
|
199
199
|
println!("=== Merged Cells Test ===");
|
|
200
200
|
println!("Input HTML:\n{}", html);
|
|
@@ -248,7 +248,7 @@ mod html_table_tests {
|
|
|
248
248
|
let result = convert_html_to_markdown(html, None, None);
|
|
249
249
|
assert!(result.is_ok(), "Should handle multiple tables");
|
|
250
250
|
|
|
251
|
-
let markdown = result.unwrap();
|
|
251
|
+
let markdown = result.expect("Operation failed");
|
|
252
252
|
|
|
253
253
|
println!("=== Multiple Tables Test ===");
|
|
254
254
|
println!("Input HTML:\n{}", html);
|
|
@@ -303,7 +303,7 @@ mod html_table_tests {
|
|
|
303
303
|
let result = convert_html_to_markdown(html, None, None);
|
|
304
304
|
assert!(result.is_ok(), "Should handle mixed header cells");
|
|
305
305
|
|
|
306
|
-
let markdown = result.unwrap();
|
|
306
|
+
let markdown = result.expect("Operation failed");
|
|
307
307
|
|
|
308
308
|
println!("=== Mixed Header Cells Test ===");
|
|
309
309
|
println!("Input HTML:\n{}", html);
|
|
@@ -349,7 +349,7 @@ mod html_table_tests {
|
|
|
349
349
|
let result = convert_html_to_markdown(html, None, None);
|
|
350
350
|
assert!(result.is_ok(), "Should handle table with caption");
|
|
351
351
|
|
|
352
|
-
let markdown = result.unwrap();
|
|
352
|
+
let markdown = result.expect("Operation failed");
|
|
353
353
|
|
|
354
354
|
println!("=== Table with Caption Test ===");
|
|
355
355
|
println!("Input HTML:\n{}", html);
|
|
@@ -385,7 +385,7 @@ mod html_table_tests {
|
|
|
385
385
|
let result = convert_html_to_markdown(html, None, None);
|
|
386
386
|
assert!(result.is_ok(), "Should handle flat table");
|
|
387
387
|
|
|
388
|
-
let markdown = result.
|
|
388
|
+
let markdown = result.expect("Operation failed");
|
|
389
389
|
|
|
390
390
|
println!("=== Simple Flat Table Test ===");
|
|
391
391
|
println!("Input HTML:\n{}", html);
|
|
@@ -421,7 +421,7 @@ mod html_table_tests {
|
|
|
421
421
|
let result = convert_html_to_markdown(html, None, None);
|
|
422
422
|
assert!(result.is_ok(), "Should handle empty cells");
|
|
423
423
|
|
|
424
|
-
let markdown = result.
|
|
424
|
+
let markdown = result.expect("Operation failed");
|
|
425
425
|
|
|
426
426
|
println!("=== Empty Cells Test ===");
|
|
427
427
|
println!("Input HTML:\n{}", html);
|
|
@@ -459,7 +459,7 @@ mod html_table_tests {
|
|
|
459
459
|
let result = convert_html_to_markdown(html, None, None);
|
|
460
460
|
assert!(result.is_ok(), "Should handle numeric table");
|
|
461
461
|
|
|
462
|
-
let markdown = result.
|
|
462
|
+
let markdown = result.expect("Operation failed");
|
|
463
463
|
|
|
464
464
|
println!("=== Numeric Data Test ===");
|
|
465
465
|
println!("Input HTML:\n{}", html);
|
|
@@ -502,7 +502,7 @@ mod html_table_tests {
|
|
|
502
502
|
let result = convert_html_to_markdown(html, None, None);
|
|
503
503
|
assert!(result.is_ok(), "Should handle unicode characters");
|
|
504
504
|
|
|
505
|
-
let markdown = result.
|
|
505
|
+
let markdown = result.expect("Operation failed");
|
|
506
506
|
|
|
507
507
|
println!("=== Special Characters Test ===");
|
|
508
508
|
println!("Input HTML:\n{}", html);
|
|
@@ -17,7 +17,10 @@ struct SpanCollector {
|
|
|
17
17
|
|
|
18
18
|
impl<S: Subscriber + for<'a> LookupSpan<'a>> Layer<S> for SpanCollector {
|
|
19
19
|
fn on_new_span(&self, attrs: &Attributes<'_>, _id: &Id, _ctx: Context<'_, S>) {
|
|
20
|
-
self.spans
|
|
20
|
+
self.spans
|
|
21
|
+
.lock()
|
|
22
|
+
.expect("Operation failed")
|
|
23
|
+
.push(attrs.metadata().name().to_string());
|
|
21
24
|
}
|
|
22
25
|
}
|
|
23
26
|
|
|
@@ -32,21 +35,23 @@ async fn test_cache_instrumentation() {
|
|
|
32
35
|
let subscriber = tracing_subscriber::registry().with(collector);
|
|
33
36
|
let _guard = tracing::subscriber::set_default(subscriber);
|
|
34
37
|
|
|
35
|
-
let temp_dir = tempdir().
|
|
38
|
+
let temp_dir = tempdir().expect("Operation failed");
|
|
36
39
|
let cache = GenericCache::new(
|
|
37
40
|
"test".to_string(),
|
|
38
|
-
Some(temp_dir.path().to_str().
|
|
41
|
+
Some(temp_dir.path().to_str().expect("Operation failed").to_string()),
|
|
39
42
|
30.0,
|
|
40
43
|
500.0,
|
|
41
44
|
1000.0,
|
|
42
45
|
)
|
|
43
|
-
.
|
|
46
|
+
.expect("Operation failed");
|
|
44
47
|
|
|
45
|
-
cache
|
|
48
|
+
cache
|
|
49
|
+
.set("test_key", b"test data".to_vec(), None)
|
|
50
|
+
.expect("Operation failed");
|
|
46
51
|
|
|
47
|
-
let _ = cache.get("test_key", None).
|
|
52
|
+
let _ = cache.get("test_key", None).expect("Value not found");
|
|
48
53
|
|
|
49
|
-
let span_names = spans.lock().
|
|
54
|
+
let span_names = spans.lock().expect("Operation failed");
|
|
50
55
|
assert!(span_names.contains(&"set".to_string()), "Expected 'set' span");
|
|
51
56
|
assert!(span_names.contains(&"get".to_string()), "Expected 'get' span");
|
|
52
57
|
}
|
|
@@ -64,13 +69,13 @@ async fn test_ocr_instrumentation() {
|
|
|
64
69
|
let subscriber = tracing_subscriber::registry().with(collector);
|
|
65
70
|
let _guard = tracing::subscriber::set_default(subscriber);
|
|
66
71
|
|
|
67
|
-
let temp_dir = tempdir().
|
|
68
|
-
let processor = OcrProcessor::new(Some(temp_dir.path().to_path_buf())).
|
|
72
|
+
let temp_dir = tempdir().expect("Operation failed");
|
|
73
|
+
let processor = OcrProcessor::new(Some(temp_dir.path().to_path_buf())).expect("Operation failed");
|
|
69
74
|
|
|
70
75
|
let mut test_image = Vec::new();
|
|
71
76
|
let img = image::ImageBuffer::from_fn(1, 1, |_, _| image::Rgb([255u8, 255u8, 255u8]));
|
|
72
77
|
img.write_to(&mut std::io::Cursor::new(&mut test_image), image::ImageFormat::Png)
|
|
73
|
-
.
|
|
78
|
+
.expect("Operation failed");
|
|
74
79
|
|
|
75
80
|
let config = TesseractConfig {
|
|
76
81
|
output_format: "text".to_string(),
|
|
@@ -80,7 +85,7 @@ async fn test_ocr_instrumentation() {
|
|
|
80
85
|
|
|
81
86
|
let _ = processor.process_image(&test_image, &config);
|
|
82
87
|
|
|
83
|
-
let span_names = spans.lock().
|
|
88
|
+
let span_names = spans.lock().expect("Operation failed");
|
|
84
89
|
assert!(
|
|
85
90
|
span_names.contains(&"process_image".to_string()),
|
|
86
91
|
"Expected 'process_image' span"
|
|
@@ -101,7 +106,7 @@ async fn test_registry_instrumentation() {
|
|
|
101
106
|
|
|
102
107
|
let _ = registry.get("application/pdf");
|
|
103
108
|
|
|
104
|
-
let span_names = spans.lock().
|
|
109
|
+
let span_names = spans.lock().expect("Operation failed");
|
|
105
110
|
assert!(
|
|
106
111
|
span_names.contains(&"get".to_string()),
|
|
107
112
|
"Expected 'get' span from registry"
|
|
@@ -125,7 +130,7 @@ async fn test_span_hierarchy() {
|
|
|
125
130
|
|
|
126
131
|
let _ = extract_bytes(test_content, "text/plain", &config).await;
|
|
127
132
|
|
|
128
|
-
let span_names = spans.lock().
|
|
133
|
+
let span_names = spans.lock().expect("Operation failed");
|
|
129
134
|
assert!(
|
|
130
135
|
span_names.contains(&"extract_bytes".to_string()),
|
|
131
136
|
"Expected 'extract_bytes' span"
|