kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +11 -11
- data/README.md +5 -10
- data/examples/async_patterns.rb +0 -1
- data/ext/kreuzberg_rb/extconf.rb +0 -10
- data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
- data/ext/kreuzberg_rb/native/build.rs +2 -0
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
- data/kreuzberg.gemspec +14 -57
- data/lib/kreuzberg/cache_api.rb +0 -1
- data/lib/kreuzberg/cli.rb +2 -2
- data/lib/kreuzberg/config.rb +2 -9
- data/lib/kreuzberg/errors.rb +7 -75
- data/lib/kreuzberg/extraction_api.rb +0 -1
- data/lib/kreuzberg/setup_lib_path.rb +0 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -21
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +3 -55
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/spec_helper.rb +1 -1
- data/vendor/kreuzberg/Cargo.toml +42 -112
- data/vendor/kreuzberg/README.md +2 -2
- data/vendor/kreuzberg/build.rs +4 -18
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/extractor.rs +81 -202
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +1 -4
- data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
- data/vendor/kreuzberg/src/embeddings.rs +16 -125
- data/vendor/kreuzberg/src/error.rs +1 -1
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
- data/vendor/kreuzberg/src/extraction/image.rs +13 -13
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
- data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
- data/vendor/kreuzberg/src/extractors/email.rs +0 -14
- data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
- data/vendor/kreuzberg/src/extractors/html.rs +154 -137
- data/vendor/kreuzberg/src/extractors/image.rs +4 -7
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
- data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
- data/vendor/kreuzberg/src/extractors/text.rs +5 -23
- data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/lib.rs +1 -4
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
- data/vendor/kreuzberg/src/mcp/server.rs +3 -5
- data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
- data/vendor/kreuzberg/src/pdf/error.rs +1 -1
- data/vendor/kreuzberg/src/pdf/table.rs +44 -17
- data/vendor/kreuzberg/src/pdf/text.rs +3 -0
- data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/types.rs +12 -42
- data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
- data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
- data/vendor/kreuzberg/tests/config_features.rs +0 -18
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
- data/vendor/kreuzberg/tests/core_integration.rs +7 -24
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
- data/vendor/kreuzberg/tests/security_validation.rs +1 -12
- metadata +25 -90
- data/.rubocop.yml +0 -538
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
- data/lib/kreuzberg/error_context.rb +0 -32
- data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
- data/vendor/rb-sys/Cargo.lock +0 -393
- data/vendor/rb-sys/Cargo.toml +0 -70
- data/vendor/rb-sys/Cargo.toml.orig +0 -57
- data/vendor/rb-sys/LICENSE-APACHE +0 -190
- data/vendor/rb-sys/LICENSE-MIT +0 -21
- data/vendor/rb-sys/bin/release.sh +0 -21
- data/vendor/rb-sys/build/features.rs +0 -108
- data/vendor/rb-sys/build/main.rs +0 -246
- data/vendor/rb-sys/build/stable_api_config.rs +0 -153
- data/vendor/rb-sys/build/version.rs +0 -48
- data/vendor/rb-sys/readme.md +0 -36
- data/vendor/rb-sys/src/bindings.rs +0 -21
- data/vendor/rb-sys/src/hidden.rs +0 -11
- data/vendor/rb-sys/src/lib.rs +0 -34
- data/vendor/rb-sys/src/macros.rs +0 -371
- data/vendor/rb-sys/src/memory.rs +0 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
- data/vendor/rb-sys/src/special_consts.rs +0 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
- data/vendor/rb-sys/src/stable_api.rs +0 -261
- data/vendor/rb-sys/src/symbol.rs +0 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
- data/vendor/rb-sys/src/utils.rs +0 -89
- data/vendor/rb-sys/src/value_type.rs +0 -7
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
//! CSV and spreadsheet integration tests.
|
|
2
2
|
//!
|
|
3
|
-
//! Tests for CSV and TSV extraction.
|
|
3
|
+
//! Tests for CSV and TSV extraction via Pandoc.
|
|
4
4
|
//! Validates data extraction, custom delimiters, quoted fields, and edge cases.
|
|
5
5
|
|
|
6
6
|
use kreuzberg::core::config::ExtractionConfig;
|
|
@@ -15,13 +15,14 @@ async fn test_csv_basic_extraction() {
|
|
|
15
15
|
|
|
16
16
|
let csv_content = b"Name,Age,City\nAlice,30,NYC\nBob,25,LA";
|
|
17
17
|
|
|
18
|
-
let
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
18
|
+
let result = extract_bytes(csv_content, "text/csv", &config).await;
|
|
19
|
+
|
|
20
|
+
if result.is_err() {
|
|
21
|
+
println!("Skipping test: Pandoc may not be installed");
|
|
22
|
+
return;
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
let extraction = result.unwrap();
|
|
25
26
|
|
|
26
27
|
assert_eq!(extraction.mime_type, "text/csv");
|
|
27
28
|
assert!(
|
|
@@ -54,13 +55,14 @@ async fn test_csv_with_headers() {
|
|
|
54
55
|
|
|
55
56
|
let csv_content = b"Product,Price,Quantity\nApple,1.50,100\nBanana,0.75,200\nOrange,2.00,150";
|
|
56
57
|
|
|
57
|
-
let
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
58
|
+
let result = extract_bytes(csv_content, "text/csv", &config).await;
|
|
59
|
+
|
|
60
|
+
if result.is_err() {
|
|
61
|
+
println!("Skipping test: Pandoc may not be installed");
|
|
62
|
+
return;
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
let extraction = result.unwrap();
|
|
64
66
|
|
|
65
67
|
assert!(
|
|
66
68
|
extraction.chunks.is_none(),
|
|
@@ -103,13 +105,14 @@ async fn test_csv_custom_delimiter() {
|
|
|
103
105
|
|
|
104
106
|
let csv_content = b"Name;Age;City\nAlice;30;NYC\nBob;25;LA";
|
|
105
107
|
|
|
106
|
-
let
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
108
|
+
let result = extract_bytes(csv_content, "text/csv", &config).await;
|
|
109
|
+
|
|
110
|
+
if result.is_err() {
|
|
111
|
+
println!("Skipping test: Pandoc may not be installed");
|
|
112
|
+
return;
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
let extraction = result.unwrap();
|
|
113
116
|
|
|
114
117
|
assert!(
|
|
115
118
|
extraction.chunks.is_none(),
|
|
@@ -135,13 +138,14 @@ async fn test_tsv_file() {
|
|
|
135
138
|
|
|
136
139
|
let tsv_content = b"Name\tAge\tCity\nAlice\t30\tNYC\nBob\t25\tLA";
|
|
137
140
|
|
|
138
|
-
let
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
141
|
+
let result = extract_bytes(tsv_content, "text/tab-separated-values", &config).await;
|
|
142
|
+
|
|
143
|
+
if result.is_err() {
|
|
144
|
+
println!("Skipping test: Pandoc may not be installed");
|
|
145
|
+
return;
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
let extraction = result.unwrap();
|
|
145
149
|
|
|
146
150
|
assert_eq!(extraction.mime_type, "text/tab-separated-values");
|
|
147
151
|
assert!(
|
|
@@ -171,13 +175,14 @@ async fn test_csv_quoted_fields() {
|
|
|
171
175
|
let csv_content =
|
|
172
176
|
b"Name,Description,Price\n\"Smith, John\",\"Product A, premium\",100\n\"Doe, Jane\",\"Product B, standard\",50";
|
|
173
177
|
|
|
174
|
-
let
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
178
|
+
let result = extract_bytes(csv_content, "text/csv", &config).await;
|
|
179
|
+
|
|
180
|
+
if result.is_err() {
|
|
181
|
+
println!("Skipping test: Pandoc may not be installed");
|
|
182
|
+
return;
|
|
183
|
+
}
|
|
184
|
+
|
|
185
|
+
let extraction = result.unwrap();
|
|
181
186
|
|
|
182
187
|
assert!(
|
|
183
188
|
extraction.chunks.is_none(),
|
|
@@ -207,13 +212,14 @@ async fn test_csv_special_characters() {
|
|
|
207
212
|
|
|
208
213
|
let csv_content = "Name,City,Emoji\nAlice,Tokyo 東京,🎉\nBob,París,✅\nCarlos,Москва,🌍".as_bytes();
|
|
209
214
|
|
|
210
|
-
let
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
215
|
+
let result = extract_bytes(csv_content, "text/csv", &config).await;
|
|
216
|
+
|
|
217
|
+
if result.is_err() {
|
|
218
|
+
println!("Skipping test: Pandoc may not be installed");
|
|
219
|
+
return;
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
let extraction = result.unwrap();
|
|
217
223
|
|
|
218
224
|
assert!(
|
|
219
225
|
extraction.chunks.is_none(),
|
|
@@ -245,13 +251,14 @@ async fn test_csv_large_file() {
|
|
|
245
251
|
csv_content.push_str(&format!("{},Item{},{}.00\n", i, i, i * 10));
|
|
246
252
|
}
|
|
247
253
|
|
|
248
|
-
let
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
254
|
+
let result = extract_bytes(csv_content.as_bytes(), "text/csv", &config).await;
|
|
255
|
+
|
|
256
|
+
if result.is_err() {
|
|
257
|
+
println!("Skipping test: Pandoc may not be installed");
|
|
258
|
+
return;
|
|
259
|
+
}
|
|
260
|
+
|
|
261
|
+
let extraction = result.unwrap();
|
|
255
262
|
|
|
256
263
|
assert!(
|
|
257
264
|
extraction.chunks.is_none(),
|
|
@@ -315,13 +322,14 @@ async fn test_csv_headers_only() {
|
|
|
315
322
|
|
|
316
323
|
let csv_content = b"Name,Age,City";
|
|
317
324
|
|
|
318
|
-
let
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
+
let result = extract_bytes(csv_content, "text/csv", &config).await;
|
|
326
|
+
|
|
327
|
+
if result.is_err() {
|
|
328
|
+
println!("Skipping test: Pandoc may not be installed");
|
|
329
|
+
return;
|
|
330
|
+
}
|
|
331
|
+
|
|
332
|
+
let extraction = result.unwrap();
|
|
325
333
|
|
|
326
334
|
assert!(
|
|
327
335
|
extraction.chunks.is_none(),
|
|
@@ -346,13 +354,14 @@ async fn test_csv_blank_lines() {
|
|
|
346
354
|
|
|
347
355
|
let csv_content = b"Name,Age\nAlice,30\n\nBob,25\n\nCarlos,35";
|
|
348
356
|
|
|
349
|
-
let
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
357
|
+
let result = extract_bytes(csv_content, "text/csv", &config).await;
|
|
358
|
+
|
|
359
|
+
if result.is_err() {
|
|
360
|
+
println!("Skipping test: Pandoc may not be installed");
|
|
361
|
+
return;
|
|
362
|
+
}
|
|
363
|
+
|
|
364
|
+
let extraction = result.unwrap();
|
|
356
365
|
|
|
357
366
|
assert!(
|
|
358
367
|
extraction.chunks.is_none(),
|
|
@@ -374,13 +383,14 @@ async fn test_csv_numeric_data() {
|
|
|
374
383
|
|
|
375
384
|
let csv_content = b"ID,Price,Quantity,Discount\n1,19.99,100,0.15\n2,29.99,50,0.20\n3,9.99,200,0.10";
|
|
376
385
|
|
|
377
|
-
let
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
386
|
+
let result = extract_bytes(csv_content, "text/csv", &config).await;
|
|
387
|
+
|
|
388
|
+
if result.is_err() {
|
|
389
|
+
println!("Skipping test: Pandoc may not be installed");
|
|
390
|
+
return;
|
|
391
|
+
}
|
|
392
|
+
|
|
393
|
+
let extraction = result.unwrap();
|
|
384
394
|
|
|
385
395
|
assert!(
|
|
386
396
|
extraction.chunks.is_none(),
|
|
@@ -2,10 +2,15 @@
|
|
|
2
2
|
|
|
3
3
|
#![cfg(feature = "office")]
|
|
4
4
|
|
|
5
|
-
use kreuzberg::
|
|
5
|
+
use kreuzberg::extraction::pandoc::extract_file;
|
|
6
6
|
|
|
7
7
|
#[tokio::test]
|
|
8
8
|
async fn test_docx_full_metadata_extraction() {
|
|
9
|
+
if kreuzberg::extraction::pandoc::validate_pandoc_version().await.is_err() {
|
|
10
|
+
println!("Skipping test: Pandoc not available");
|
|
11
|
+
return;
|
|
12
|
+
}
|
|
13
|
+
|
|
9
14
|
let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
|
10
15
|
.parent()
|
|
11
16
|
.unwrap()
|
|
@@ -18,7 +23,7 @@ async fn test_docx_full_metadata_extraction() {
|
|
|
18
23
|
return;
|
|
19
24
|
}
|
|
20
25
|
|
|
21
|
-
let result = extract_file(&test_file,
|
|
26
|
+
let result = extract_file(&test_file, "docx")
|
|
22
27
|
.await
|
|
23
28
|
.expect("Should extract DOCX successfully");
|
|
24
29
|
|
|
@@ -29,66 +34,63 @@ async fn test_docx_full_metadata_extraction() {
|
|
|
29
34
|
);
|
|
30
35
|
|
|
31
36
|
assert_eq!(
|
|
32
|
-
result.metadata.
|
|
37
|
+
result.metadata.get("created_by").and_then(|v| v.as_str()),
|
|
33
38
|
Some("Christoph Auer"),
|
|
34
39
|
"Should have correct creator"
|
|
35
40
|
);
|
|
36
41
|
assert_eq!(
|
|
37
|
-
result.metadata.
|
|
42
|
+
result.metadata.get("modified_by").and_then(|v| v.as_str()),
|
|
38
43
|
Some("Maxim Lysak"),
|
|
39
44
|
"Should have correct last modified by"
|
|
40
45
|
);
|
|
41
46
|
assert_eq!(
|
|
42
|
-
result.metadata.
|
|
47
|
+
result.metadata.get("created_at").and_then(|v| v.as_str()),
|
|
43
48
|
Some("2024-10-09T12:43:00Z"),
|
|
44
49
|
"Should have correct creation date"
|
|
45
50
|
);
|
|
46
51
|
assert_eq!(
|
|
47
|
-
result.metadata.
|
|
52
|
+
result.metadata.get("revision").and_then(|v| v.as_str()),
|
|
48
53
|
Some("7"),
|
|
49
54
|
"Should have revision number"
|
|
50
55
|
);
|
|
51
56
|
|
|
52
57
|
assert_eq!(
|
|
53
|
-
result.metadata.
|
|
58
|
+
result.metadata.get("page_count").and_then(|v| v.as_i64()),
|
|
54
59
|
Some(2),
|
|
55
60
|
"Should have 2 pages"
|
|
56
61
|
);
|
|
57
62
|
assert_eq!(
|
|
58
|
-
result.metadata.
|
|
63
|
+
result.metadata.get("word_count").and_then(|v| v.as_i64()),
|
|
59
64
|
Some(108),
|
|
60
65
|
"Should have 108 words"
|
|
61
66
|
);
|
|
62
67
|
assert_eq!(
|
|
63
|
-
result
|
|
64
|
-
.metadata
|
|
65
|
-
.additional
|
|
66
|
-
.get("character_count")
|
|
67
|
-
.and_then(|v| v.as_i64()),
|
|
68
|
+
result.metadata.get("character_count").and_then(|v| v.as_i64()),
|
|
68
69
|
Some(620),
|
|
69
70
|
"Should have 620 characters"
|
|
70
71
|
);
|
|
71
72
|
assert_eq!(
|
|
72
|
-
result.metadata.
|
|
73
|
+
result.metadata.get("line_count").and_then(|v| v.as_i64()),
|
|
73
74
|
Some(5),
|
|
74
75
|
"Should have 5 lines"
|
|
75
76
|
);
|
|
76
77
|
assert_eq!(
|
|
77
|
-
result
|
|
78
|
-
.metadata
|
|
79
|
-
.additional
|
|
80
|
-
.get("paragraph_count")
|
|
81
|
-
.and_then(|v| v.as_i64()),
|
|
78
|
+
result.metadata.get("paragraph_count").and_then(|v| v.as_i64()),
|
|
82
79
|
Some(1),
|
|
83
80
|
"Should have 1 paragraph"
|
|
84
81
|
);
|
|
85
82
|
|
|
86
83
|
println!("✅ DOCX metadata extraction test passed!");
|
|
87
|
-
println!(" Found {} metadata fields", result.metadata.
|
|
84
|
+
println!(" Found {} metadata fields", result.metadata.len());
|
|
88
85
|
}
|
|
89
86
|
|
|
90
87
|
#[tokio::test]
|
|
91
88
|
async fn test_docx_minimal_metadata_extraction() {
|
|
89
|
+
if kreuzberg::extraction::pandoc::validate_pandoc_version().await.is_err() {
|
|
90
|
+
println!("Skipping test: Pandoc not available");
|
|
91
|
+
return;
|
|
92
|
+
}
|
|
93
|
+
|
|
92
94
|
let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
|
93
95
|
.parent()
|
|
94
96
|
.unwrap()
|
|
@@ -101,19 +103,19 @@ async fn test_docx_minimal_metadata_extraction() {
|
|
|
101
103
|
return;
|
|
102
104
|
}
|
|
103
105
|
|
|
104
|
-
let result = extract_file(&test_file,
|
|
106
|
+
let result = extract_file(&test_file, "docx")
|
|
105
107
|
.await
|
|
106
108
|
.expect("Should extract DOCX successfully");
|
|
107
109
|
|
|
108
110
|
assert!(!result.content.is_empty(), "Content should not be empty");
|
|
109
111
|
|
|
110
112
|
assert_eq!(
|
|
111
|
-
result.metadata.
|
|
113
|
+
result.metadata.get("page_count").and_then(|v| v.as_i64()),
|
|
112
114
|
Some(1),
|
|
113
115
|
"Should have 1 page"
|
|
114
116
|
);
|
|
115
117
|
assert_eq!(
|
|
116
|
-
result.metadata.
|
|
118
|
+
result.metadata.get("word_count").and_then(|v| v.as_i64()),
|
|
117
119
|
Some(520),
|
|
118
120
|
"Should have 520 words"
|
|
119
121
|
);
|