kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175)
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +11 -11
  6. data/README.md +5 -10
  7. data/examples/async_patterns.rb +0 -1
  8. data/ext/kreuzberg_rb/extconf.rb +0 -10
  9. data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
  10. data/ext/kreuzberg_rb/native/build.rs +2 -0
  11. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  12. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  13. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  14. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  15. data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
  16. data/kreuzberg.gemspec +14 -57
  17. data/lib/kreuzberg/cache_api.rb +0 -1
  18. data/lib/kreuzberg/cli.rb +2 -2
  19. data/lib/kreuzberg/config.rb +2 -9
  20. data/lib/kreuzberg/errors.rb +7 -75
  21. data/lib/kreuzberg/extraction_api.rb +0 -1
  22. data/lib/kreuzberg/setup_lib_path.rb +0 -1
  23. data/lib/kreuzberg/version.rb +1 -1
  24. data/lib/kreuzberg.rb +0 -21
  25. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  26. data/sig/kreuzberg.rbs +3 -55
  27. data/spec/binding/cli_proxy_spec.rb +4 -2
  28. data/spec/binding/cli_spec.rb +11 -12
  29. data/spec/examples.txt +104 -0
  30. data/spec/fixtures/config.yaml +1 -0
  31. data/spec/spec_helper.rb +1 -1
  32. data/vendor/kreuzberg/Cargo.toml +42 -112
  33. data/vendor/kreuzberg/README.md +2 -2
  34. data/vendor/kreuzberg/build.rs +4 -18
  35. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  36. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  37. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  38. data/vendor/kreuzberg/src/core/extractor.rs +81 -202
  39. data/vendor/kreuzberg/src/core/io.rs +2 -4
  40. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  41. data/vendor/kreuzberg/src/core/mod.rs +1 -4
  42. data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
  43. data/vendor/kreuzberg/src/embeddings.rs +16 -125
  44. data/vendor/kreuzberg/src/error.rs +1 -1
  45. data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
  46. data/vendor/kreuzberg/src/extraction/image.rs +13 -13
  47. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
  48. data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
  49. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  50. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  51. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  52. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  53. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  54. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  55. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  56. data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
  57. data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
  58. data/vendor/kreuzberg/src/extractors/email.rs +0 -14
  59. data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
  60. data/vendor/kreuzberg/src/extractors/html.rs +154 -137
  61. data/vendor/kreuzberg/src/extractors/image.rs +4 -7
  62. data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
  63. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  64. data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
  65. data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
  66. data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
  67. data/vendor/kreuzberg/src/extractors/text.rs +5 -23
  68. data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
  69. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  70. data/vendor/kreuzberg/src/lib.rs +1 -4
  71. data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
  72. data/vendor/kreuzberg/src/mcp/server.rs +3 -5
  73. data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
  74. data/vendor/kreuzberg/src/pdf/error.rs +1 -1
  75. data/vendor/kreuzberg/src/pdf/table.rs +44 -17
  76. data/vendor/kreuzberg/src/pdf/text.rs +3 -0
  77. data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
  78. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
  79. data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
  80. data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
  81. data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
  82. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  83. data/vendor/kreuzberg/src/types.rs +12 -42
  84. data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
  85. data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
  86. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  87. data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
  88. data/vendor/kreuzberg/tests/config_features.rs +0 -18
  89. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
  90. data/vendor/kreuzberg/tests/core_integration.rs +7 -24
  91. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  92. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  93. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  94. data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
  95. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
  96. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
  97. data/vendor/kreuzberg/tests/security_validation.rs +1 -12
  98. metadata +25 -90
  99. data/.rubocop.yml +0 -538
  100. data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
  101. data/lib/kreuzberg/error_context.rb +0 -32
  102. data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
  103. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
  104. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
  105. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
  106. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
  107. data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
  108. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
  109. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  110. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
  111. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
  112. data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
  113. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
  114. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  115. data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
  116. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
  117. data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
  119. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  121. data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
  122. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  123. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  124. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  125. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
  126. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  127. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  128. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  129. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  130. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  131. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  132. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  133. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  134. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  135. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
  136. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  137. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  138. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
  139. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
  140. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
  141. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
  142. data/vendor/rb-sys/.cargo-ok +0 -1
  143. data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
  144. data/vendor/rb-sys/Cargo.lock +0 -393
  145. data/vendor/rb-sys/Cargo.toml +0 -70
  146. data/vendor/rb-sys/Cargo.toml.orig +0 -57
  147. data/vendor/rb-sys/LICENSE-APACHE +0 -190
  148. data/vendor/rb-sys/LICENSE-MIT +0 -21
  149. data/vendor/rb-sys/bin/release.sh +0 -21
  150. data/vendor/rb-sys/build/features.rs +0 -108
  151. data/vendor/rb-sys/build/main.rs +0 -246
  152. data/vendor/rb-sys/build/stable_api_config.rs +0 -153
  153. data/vendor/rb-sys/build/version.rs +0 -48
  154. data/vendor/rb-sys/readme.md +0 -36
  155. data/vendor/rb-sys/src/bindings.rs +0 -21
  156. data/vendor/rb-sys/src/hidden.rs +0 -11
  157. data/vendor/rb-sys/src/lib.rs +0 -34
  158. data/vendor/rb-sys/src/macros.rs +0 -371
  159. data/vendor/rb-sys/src/memory.rs +0 -53
  160. data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
  161. data/vendor/rb-sys/src/special_consts.rs +0 -31
  162. data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
  163. data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
  164. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
  165. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
  166. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
  167. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
  168. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
  169. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
  170. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
  171. data/vendor/rb-sys/src/stable_api.rs +0 -261
  172. data/vendor/rb-sys/src/symbol.rs +0 -31
  173. data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
  174. data/vendor/rb-sys/src/utils.rs +0 -89
  175. data/vendor/rb-sys/src/value_type.rs +0 -7
@@ -1,6 +1,6 @@
1
1
  //! CSV and spreadsheet integration tests.
2
2
  //!
3
- //! Tests for CSV and TSV extraction.
3
+ //! Tests for CSV and TSV extraction via Pandoc.
4
4
  //! Validates data extraction, custom delimiters, quoted fields, and edge cases.
5
5
 
6
6
  use kreuzberg::core::config::ExtractionConfig;
@@ -15,13 +15,14 @@ async fn test_csv_basic_extraction() {
15
15
 
16
16
  let csv_content = b"Name,Age,City\nAlice,30,NYC\nBob,25,LA";
17
17
 
18
- let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
19
- Ok(result) => result,
20
- Err(_) => {
21
- println!("Skipping test: CSV extraction not available");
22
- return;
23
- }
24
- };
18
+ let result = extract_bytes(csv_content, "text/csv", &config).await;
19
+
20
+ if result.is_err() {
21
+ println!("Skipping test: Pandoc may not be installed");
22
+ return;
23
+ }
24
+
25
+ let extraction = result.unwrap();
25
26
 
26
27
  assert_eq!(extraction.mime_type, "text/csv");
27
28
  assert!(
@@ -54,13 +55,14 @@ async fn test_csv_with_headers() {
54
55
 
55
56
  let csv_content = b"Product,Price,Quantity\nApple,1.50,100\nBanana,0.75,200\nOrange,2.00,150";
56
57
 
57
- let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
58
- Ok(result) => result,
59
- Err(_) => {
60
- println!("Skipping test: CSV extraction not available");
61
- return;
62
- }
63
- };
58
+ let result = extract_bytes(csv_content, "text/csv", &config).await;
59
+
60
+ if result.is_err() {
61
+ println!("Skipping test: Pandoc may not be installed");
62
+ return;
63
+ }
64
+
65
+ let extraction = result.unwrap();
64
66
 
65
67
  assert!(
66
68
  extraction.chunks.is_none(),
@@ -103,13 +105,14 @@ async fn test_csv_custom_delimiter() {
103
105
 
104
106
  let csv_content = b"Name;Age;City\nAlice;30;NYC\nBob;25;LA";
105
107
 
106
- let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
107
- Ok(result) => result,
108
- Err(_) => {
109
- println!("Skipping test: CSV extraction not available");
110
- return;
111
- }
112
- };
108
+ let result = extract_bytes(csv_content, "text/csv", &config).await;
109
+
110
+ if result.is_err() {
111
+ println!("Skipping test: Pandoc may not be installed");
112
+ return;
113
+ }
114
+
115
+ let extraction = result.unwrap();
113
116
 
114
117
  assert!(
115
118
  extraction.chunks.is_none(),
@@ -135,13 +138,14 @@ async fn test_tsv_file() {
135
138
 
136
139
  let tsv_content = b"Name\tAge\tCity\nAlice\t30\tNYC\nBob\t25\tLA";
137
140
 
138
- let extraction = match extract_bytes(tsv_content, "text/tab-separated-values", &config).await {
139
- Ok(result) => result,
140
- Err(_) => {
141
- println!("Skipping test: TSV extraction not available");
142
- return;
143
- }
144
- };
141
+ let result = extract_bytes(tsv_content, "text/tab-separated-values", &config).await;
142
+
143
+ if result.is_err() {
144
+ println!("Skipping test: Pandoc may not be installed");
145
+ return;
146
+ }
147
+
148
+ let extraction = result.unwrap();
145
149
 
146
150
  assert_eq!(extraction.mime_type, "text/tab-separated-values");
147
151
  assert!(
@@ -171,13 +175,14 @@ async fn test_csv_quoted_fields() {
171
175
  let csv_content =
172
176
  b"Name,Description,Price\n\"Smith, John\",\"Product A, premium\",100\n\"Doe, Jane\",\"Product B, standard\",50";
173
177
 
174
- let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
175
- Ok(result) => result,
176
- Err(_) => {
177
- println!("Skipping test: CSV extraction not available");
178
- return;
179
- }
180
- };
178
+ let result = extract_bytes(csv_content, "text/csv", &config).await;
179
+
180
+ if result.is_err() {
181
+ println!("Skipping test: Pandoc may not be installed");
182
+ return;
183
+ }
184
+
185
+ let extraction = result.unwrap();
181
186
 
182
187
  assert!(
183
188
  extraction.chunks.is_none(),
@@ -207,13 +212,14 @@ async fn test_csv_special_characters() {
207
212
 
208
213
  let csv_content = "Name,City,Emoji\nAlice,Tokyo 東京,🎉\nBob,París,✅\nCarlos,Москва,🌍".as_bytes();
209
214
 
210
- let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
211
- Ok(result) => result,
212
- Err(_) => {
213
- println!("Skipping test: CSV extraction not available");
214
- return;
215
- }
216
- };
215
+ let result = extract_bytes(csv_content, "text/csv", &config).await;
216
+
217
+ if result.is_err() {
218
+ println!("Skipping test: Pandoc may not be installed");
219
+ return;
220
+ }
221
+
222
+ let extraction = result.unwrap();
217
223
 
218
224
  assert!(
219
225
  extraction.chunks.is_none(),
@@ -245,13 +251,14 @@ async fn test_csv_large_file() {
245
251
  csv_content.push_str(&format!("{},Item{},{}.00\n", i, i, i * 10));
246
252
  }
247
253
 
248
- let extraction = match extract_bytes(csv_content.as_bytes(), "text/csv", &config).await {
249
- Ok(result) => result,
250
- Err(_) => {
251
- println!("Skipping test: CSV extraction not available");
252
- return;
253
- }
254
- };
254
+ let result = extract_bytes(csv_content.as_bytes(), "text/csv", &config).await;
255
+
256
+ if result.is_err() {
257
+ println!("Skipping test: Pandoc may not be installed");
258
+ return;
259
+ }
260
+
261
+ let extraction = result.unwrap();
255
262
 
256
263
  assert!(
257
264
  extraction.chunks.is_none(),
@@ -315,13 +322,14 @@ async fn test_csv_headers_only() {
315
322
 
316
323
  let csv_content = b"Name,Age,City";
317
324
 
318
- let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
319
- Ok(result) => result,
320
- Err(_) => {
321
- println!("Skipping test: CSV extraction not available");
322
- return;
323
- }
324
- };
325
+ let result = extract_bytes(csv_content, "text/csv", &config).await;
326
+
327
+ if result.is_err() {
328
+ println!("Skipping test: Pandoc may not be installed");
329
+ return;
330
+ }
331
+
332
+ let extraction = result.unwrap();
325
333
 
326
334
  assert!(
327
335
  extraction.chunks.is_none(),
@@ -346,13 +354,14 @@ async fn test_csv_blank_lines() {
346
354
 
347
355
  let csv_content = b"Name,Age\nAlice,30\n\nBob,25\n\nCarlos,35";
348
356
 
349
- let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
350
- Ok(result) => result,
351
- Err(_) => {
352
- println!("Skipping test: CSV extraction not available");
353
- return;
354
- }
355
- };
357
+ let result = extract_bytes(csv_content, "text/csv", &config).await;
358
+
359
+ if result.is_err() {
360
+ println!("Skipping test: Pandoc may not be installed");
361
+ return;
362
+ }
363
+
364
+ let extraction = result.unwrap();
356
365
 
357
366
  assert!(
358
367
  extraction.chunks.is_none(),
@@ -374,13 +383,14 @@ async fn test_csv_numeric_data() {
374
383
 
375
384
  let csv_content = b"ID,Price,Quantity,Discount\n1,19.99,100,0.15\n2,29.99,50,0.20\n3,9.99,200,0.10";
376
385
 
377
- let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
378
- Ok(result) => result,
379
- Err(_) => {
380
- println!("Skipping test: CSV extraction not available");
381
- return;
382
- }
383
- };
386
+ let result = extract_bytes(csv_content, "text/csv", &config).await;
387
+
388
+ if result.is_err() {
389
+ println!("Skipping test: Pandoc may not be installed");
390
+ return;
391
+ }
392
+
393
+ let extraction = result.unwrap();
384
394
 
385
395
  assert!(
386
396
  extraction.chunks.is_none(),
@@ -2,10 +2,15 @@
2
2
 
3
3
  #![cfg(feature = "office")]
4
4
 
5
- use kreuzberg::{ExtractionConfig, extract_file};
5
+ use kreuzberg::extraction::pandoc::extract_file;
6
6
 
7
7
  #[tokio::test]
8
8
  async fn test_docx_full_metadata_extraction() {
9
+ if kreuzberg::extraction::pandoc::validate_pandoc_version().await.is_err() {
10
+ println!("Skipping test: Pandoc not available");
11
+ return;
12
+ }
13
+
9
14
  let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
10
15
  .parent()
11
16
  .unwrap()
@@ -18,7 +23,7 @@ async fn test_docx_full_metadata_extraction() {
18
23
  return;
19
24
  }
20
25
 
21
- let result = extract_file(&test_file, None, &ExtractionConfig::default())
26
+ let result = extract_file(&test_file, "docx")
22
27
  .await
23
28
  .expect("Should extract DOCX successfully");
24
29
 
@@ -29,66 +34,63 @@ async fn test_docx_full_metadata_extraction() {
29
34
  );
30
35
 
31
36
  assert_eq!(
32
- result.metadata.additional.get("created_by").and_then(|v| v.as_str()),
37
+ result.metadata.get("created_by").and_then(|v| v.as_str()),
33
38
  Some("Christoph Auer"),
34
39
  "Should have correct creator"
35
40
  );
36
41
  assert_eq!(
37
- result.metadata.additional.get("modified_by").and_then(|v| v.as_str()),
42
+ result.metadata.get("modified_by").and_then(|v| v.as_str()),
38
43
  Some("Maxim Lysak"),
39
44
  "Should have correct last modified by"
40
45
  );
41
46
  assert_eq!(
42
- result.metadata.additional.get("created_at").and_then(|v| v.as_str()),
47
+ result.metadata.get("created_at").and_then(|v| v.as_str()),
43
48
  Some("2024-10-09T12:43:00Z"),
44
49
  "Should have correct creation date"
45
50
  );
46
51
  assert_eq!(
47
- result.metadata.additional.get("revision").and_then(|v| v.as_str()),
52
+ result.metadata.get("revision").and_then(|v| v.as_str()),
48
53
  Some("7"),
49
54
  "Should have revision number"
50
55
  );
51
56
 
52
57
  assert_eq!(
53
- result.metadata.additional.get("page_count").and_then(|v| v.as_i64()),
58
+ result.metadata.get("page_count").and_then(|v| v.as_i64()),
54
59
  Some(2),
55
60
  "Should have 2 pages"
56
61
  );
57
62
  assert_eq!(
58
- result.metadata.additional.get("word_count").and_then(|v| v.as_i64()),
63
+ result.metadata.get("word_count").and_then(|v| v.as_i64()),
59
64
  Some(108),
60
65
  "Should have 108 words"
61
66
  );
62
67
  assert_eq!(
63
- result
64
- .metadata
65
- .additional
66
- .get("character_count")
67
- .and_then(|v| v.as_i64()),
68
+ result.metadata.get("character_count").and_then(|v| v.as_i64()),
68
69
  Some(620),
69
70
  "Should have 620 characters"
70
71
  );
71
72
  assert_eq!(
72
- result.metadata.additional.get("line_count").and_then(|v| v.as_i64()),
73
+ result.metadata.get("line_count").and_then(|v| v.as_i64()),
73
74
  Some(5),
74
75
  "Should have 5 lines"
75
76
  );
76
77
  assert_eq!(
77
- result
78
- .metadata
79
- .additional
80
- .get("paragraph_count")
81
- .and_then(|v| v.as_i64()),
78
+ result.metadata.get("paragraph_count").and_then(|v| v.as_i64()),
82
79
  Some(1),
83
80
  "Should have 1 paragraph"
84
81
  );
85
82
 
86
83
  println!("✅ DOCX metadata extraction test passed!");
87
- println!(" Found {} metadata fields", result.metadata.additional.len());
84
+ println!(" Found {} metadata fields", result.metadata.len());
88
85
  }
89
86
 
90
87
  #[tokio::test]
91
88
  async fn test_docx_minimal_metadata_extraction() {
89
+ if kreuzberg::extraction::pandoc::validate_pandoc_version().await.is_err() {
90
+ println!("Skipping test: Pandoc not available");
91
+ return;
92
+ }
93
+
92
94
  let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
93
95
  .parent()
94
96
  .unwrap()
@@ -101,19 +103,19 @@ async fn test_docx_minimal_metadata_extraction() {
101
103
  return;
102
104
  }
103
105
 
104
- let result = extract_file(&test_file, None, &ExtractionConfig::default())
106
+ let result = extract_file(&test_file, "docx")
105
107
  .await
106
108
  .expect("Should extract DOCX successfully");
107
109
 
108
110
  assert!(!result.content.is_empty(), "Content should not be empty");
109
111
 
110
112
  assert_eq!(
111
- result.metadata.additional.get("page_count").and_then(|v| v.as_i64()),
113
+ result.metadata.get("page_count").and_then(|v| v.as_i64()),
112
114
  Some(1),
113
115
  "Should have 1 page"
114
116
  );
115
117
  assert_eq!(
116
- result.metadata.additional.get("word_count").and_then(|v| v.as_i64()),
118
+ result.metadata.get("word_count").and_then(|v| v.as_i64()),
117
119
  Some(520),
118
120
  "Should have 520 words"
119
121
  );