kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +28 -116
- data/README.md +269 -629
- data/Rakefile +0 -9
- data/Steepfile +4 -8
- data/examples/async_patterns.rb +58 -1
- data/ext/kreuzberg_rb/extconf.rb +5 -35
- data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
- data/ext/kreuzberg_rb/native/build.rs +14 -12
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
- data/extconf.rb +6 -38
- data/kreuzberg.gemspec +20 -114
- data/lib/kreuzberg/api_proxy.rb +18 -2
- data/lib/kreuzberg/cache_api.rb +0 -22
- data/lib/kreuzberg/cli.rb +10 -2
- data/lib/kreuzberg/cli_proxy.rb +10 -0
- data/lib/kreuzberg/config.rb +22 -274
- data/lib/kreuzberg/errors.rb +7 -73
- data/lib/kreuzberg/extraction_api.rb +8 -237
- data/lib/kreuzberg/mcp_proxy.rb +11 -2
- data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
- data/lib/kreuzberg/post_processor_protocol.rb +71 -0
- data/lib/kreuzberg/result.rb +33 -151
- data/lib/kreuzberg/setup_lib_path.rb +2 -22
- data/lib/kreuzberg/validator_protocol.rb +73 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +13 -27
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +12 -105
- data/spec/binding/cache_spec.rb +22 -22
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/binding/config_spec.rb +0 -74
- data/spec/binding/config_validation_spec.rb +6 -100
- data/spec/binding/error_handling_spec.rb +97 -283
- data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
- data/spec/binding/plugins/postprocessor_spec.rb +11 -11
- data/spec/binding/plugins/validator_spec.rb +13 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +1 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/fixtures/invalid_config.toml +1 -0
- data/spec/smoke/package_spec.rb +3 -2
- data/spec/spec_helper.rb +3 -1
- data/vendor/kreuzberg/Cargo.toml +67 -192
- data/vendor/kreuzberg/README.md +9 -97
- data/vendor/kreuzberg/build.rs +194 -516
- data/vendor/kreuzberg/src/api/handlers.rs +9 -130
- data/vendor/kreuzberg/src/api/mod.rs +3 -18
- data/vendor/kreuzberg/src/api/server.rs +71 -236
- data/vendor/kreuzberg/src/api/types.rs +7 -43
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/config.rs +23 -905
- data/vendor/kreuzberg/src/core/extractor.rs +106 -403
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +3 -22
- data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
- data/vendor/kreuzberg/src/embeddings.rs +21 -169
- data/vendor/kreuzberg/src/error.rs +2 -2
- data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
- data/vendor/kreuzberg/src/extraction/email.rs +11 -12
- data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
- data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
- data/vendor/kreuzberg/src/extraction/image.rs +14 -138
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
- data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
- data/vendor/kreuzberg/src/extraction/table.rs +1 -2
- data/vendor/kreuzberg/src/extraction/text.rs +10 -18
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
- data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
- data/vendor/kreuzberg/src/extractors/email.rs +9 -37
- data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
- data/vendor/kreuzberg/src/extractors/html.rs +173 -182
- data/vendor/kreuzberg/src/extractors/image.rs +8 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
- data/vendor/kreuzberg/src/extractors/text.rs +7 -30
- data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
- data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
- data/vendor/kreuzberg/src/lib.rs +5 -17
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
- data/vendor/kreuzberg/src/mcp/server.rs +21 -145
- data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
- data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
- data/vendor/kreuzberg/src/pdf/error.rs +1 -93
- data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
- data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
- data/vendor/kreuzberg/src/pdf/table.rs +64 -61
- data/vendor/kreuzberg/src/pdf/text.rs +24 -416
- data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
- data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
- data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/mod.rs +0 -8
- data/vendor/kreuzberg/src/text/quality.rs +15 -28
- data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
- data/vendor/kreuzberg/src/types.rs +67 -907
- data/vendor/kreuzberg/src/utils/mod.rs +0 -14
- data/vendor/kreuzberg/src/utils/quality.rs +3 -12
- data/vendor/kreuzberg/tests/api_tests.rs +0 -506
- data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
- data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
- data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
- data/vendor/kreuzberg/tests/config_features.rs +1 -33
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
- data/vendor/kreuzberg/tests/core_integration.rs +9 -35
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/email_integration.rs +1 -3
- data/vendor/kreuzberg/tests/error_handling.rs +34 -43
- data/vendor/kreuzberg/tests/format_integration.rs +1 -7
- data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
- data/vendor/kreuzberg/tests/image_integration.rs +0 -2
- data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
- data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
- data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
- data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
- data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
- data/vendor/kreuzberg/tests/security_validation.rs +1 -13
- data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
- metadata +25 -171
- data/.rubocop.yml +0 -543
- data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
- data/lib/kreuzberg/error_context.rb +0 -136
- data/lib/kreuzberg/types.rb +0 -170
- data/lib/libpdfium.so +0 -0
- data/spec/binding/async_operations_spec.rb +0 -473
- data/spec/binding/batch_operations_spec.rb +0 -595
- data/spec/binding/batch_spec.rb +0 -359
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/embeddings_spec.rb +0 -816
- data/spec/binding/error_recovery_spec.rb +0 -488
- data/spec/binding/font_config_spec.rb +0 -220
- data/spec/binding/images_spec.rb +0 -738
- data/spec/binding/keywords_extraction_spec.rb +0 -600
- data/spec/binding/metadata_types_spec.rb +0 -1228
- data/spec/binding/pages_extraction_spec.rb +0 -471
- data/spec/binding/tables_spec.rb +0 -641
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -438
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
- data/test/metadata_types_test.rb +0 -959
- data/vendor/Cargo.toml +0 -61
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/formats.rs +0 -235
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
- data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
- data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
- data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
- data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
- data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
- data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
- data/vendor/kreuzberg/src/utils/pool.rs +0 -503
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
- data/vendor/kreuzberg/tests/api_embed.rs +0 -360
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/page_markers.rs +0 -297
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
- data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
- data/vendor/kreuzberg-ffi/build.rs +0 -168
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
- data/vendor/kreuzberg-ffi/src/error.rs +0 -901
- data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
- data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
- data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
- data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
- data/vendor/kreuzberg-ffi/src/result.rs +0 -510
- data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
- data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
- data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
- data/vendor/kreuzberg-ffi/src/types.rs +0 -363
- data/vendor/kreuzberg-ffi/src/util.rs +0 -210
- data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
- data/vendor/kreuzberg-tesseract/LICENSE +0 -22
- data/vendor/kreuzberg-tesseract/README.md +0 -399
- data/vendor/kreuzberg-tesseract/build.rs +0 -1127
- data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
|
@@ -31,7 +31,6 @@
|
|
|
31
31
|
//! # }
|
|
32
32
|
//! ```
|
|
33
33
|
use crate::error::{KreuzbergError, Result};
|
|
34
|
-
use crate::text::utf8_validation;
|
|
35
34
|
use serde::{Deserialize, Serialize};
|
|
36
35
|
use std::collections::HashMap;
|
|
37
36
|
|
|
@@ -240,8 +239,8 @@ fn is_text_field(key: &str, custom_patterns: &[String]) -> bool {
|
|
|
240
239
|
}
|
|
241
240
|
|
|
242
241
|
pub fn parse_yaml(data: &[u8]) -> Result<StructuredDataResult> {
|
|
243
|
-
let yaml_str =
|
|
244
|
-
.map_err(|e| KreuzbergError::parsing(format!("Invalid UTF-8 in YAML: {}", e)))?;
|
|
242
|
+
let yaml_str =
|
|
243
|
+
std::str::from_utf8(data).map_err(|e| KreuzbergError::parsing(format!("Invalid UTF-8 in YAML: {}", e)))?;
|
|
245
244
|
|
|
246
245
|
let value: serde_json::Value = serde_yaml_ng::from_str(yaml_str)
|
|
247
246
|
.map_err(|e| KreuzbergError::parsing(format!("Failed to parse YAML: {}", e)))?;
|
|
@@ -312,8 +311,8 @@ fn extract_from_value(
|
|
|
312
311
|
}
|
|
313
312
|
|
|
314
313
|
pub fn parse_toml(data: &[u8]) -> Result<StructuredDataResult> {
|
|
315
|
-
let toml_str =
|
|
316
|
-
.map_err(|e| KreuzbergError::parsing(format!("Invalid UTF-8 in TOML: {}", e)))?;
|
|
314
|
+
let toml_str =
|
|
315
|
+
std::str::from_utf8(data).map_err(|e| KreuzbergError::parsing(format!("Invalid UTF-8 in TOML: {}", e)))?;
|
|
317
316
|
|
|
318
317
|
let value: toml::Value =
|
|
319
318
|
toml::from_str(toml_str).map_err(|e| KreuzbergError::parsing(format!("Failed to parse TOML: {}", e)))?;
|
|
@@ -60,8 +60,7 @@ fn dataframe_to_markdown(df: &DataFrame) -> Result<String> {
|
|
|
60
60
|
return Ok(String::new());
|
|
61
61
|
}
|
|
62
62
|
|
|
63
|
-
let
|
|
64
|
-
let mut markdown = String::with_capacity(estimated_capacity);
|
|
63
|
+
let mut markdown = String::new();
|
|
65
64
|
|
|
66
65
|
markdown.push_str("| ");
|
|
67
66
|
for col_name in df.get_column_names() {
|
|
@@ -26,7 +26,6 @@ use once_cell::sync::Lazy;
|
|
|
26
26
|
use regex::Regex;
|
|
27
27
|
|
|
28
28
|
use crate::error::Result;
|
|
29
|
-
use crate::text::utf8_validation;
|
|
30
29
|
use crate::types::TextExtractionResult;
|
|
31
30
|
|
|
32
31
|
static MARKDOWN_HEADER: Lazy<Regex> =
|
|
@@ -39,25 +38,18 @@ static CODE_BLOCK_DELIMITER: Lazy<Regex> = Lazy::new(|| {
|
|
|
39
38
|
});
|
|
40
39
|
|
|
41
40
|
pub fn parse_text(text_bytes: &[u8], is_markdown: bool) -> Result<TextExtractionResult> {
|
|
42
|
-
let text
|
|
43
|
-
Ok(s) => std::borrow::Cow::Borrowed(s),
|
|
44
|
-
Err(_) => std::borrow::Cow::Owned(String::from_utf8_lossy(text_bytes).into_owned()),
|
|
45
|
-
};
|
|
41
|
+
let text = String::from_utf8_lossy(text_bytes).into_owned();
|
|
46
42
|
|
|
47
43
|
let mut line_count = 0;
|
|
48
44
|
let mut word_count = 0;
|
|
49
45
|
let character_count = text.len();
|
|
50
46
|
|
|
51
|
-
let
|
|
52
|
-
let
|
|
53
|
-
let
|
|
54
|
-
|
|
55
|
-
let mut headers = Vec::with_capacity(estimated_headers_capacity);
|
|
56
|
-
let mut links = Vec::with_capacity(estimated_links_capacity);
|
|
57
|
-
let mut code_blocks = Vec::with_capacity(estimated_code_blocks_capacity);
|
|
47
|
+
let mut headers = Vec::new();
|
|
48
|
+
let mut links = Vec::new();
|
|
49
|
+
let mut code_blocks = Vec::new();
|
|
58
50
|
let mut in_code_block = false;
|
|
59
|
-
let mut current_code_lang = String::
|
|
60
|
-
let mut current_code = String::
|
|
51
|
+
let mut current_code_lang = String::new();
|
|
52
|
+
let mut current_code = String::new();
|
|
61
53
|
|
|
62
54
|
for line in text.lines() {
|
|
63
55
|
line_count += 1;
|
|
@@ -73,7 +65,7 @@ pub fn parse_text(text_bytes: &[u8], is_markdown: bool) -> Result<TextExtraction
|
|
|
73
65
|
if current_code_lang.is_empty() {
|
|
74
66
|
"plain".to_string()
|
|
75
67
|
} else {
|
|
76
|
-
|
|
68
|
+
current_code_lang.clone()
|
|
77
69
|
},
|
|
78
70
|
current_code.trim_end().to_string(),
|
|
79
71
|
));
|
|
@@ -102,14 +94,14 @@ pub fn parse_text(text_bytes: &[u8], is_markdown: bool) -> Result<TextExtraction
|
|
|
102
94
|
}
|
|
103
95
|
|
|
104
96
|
for caps in MARKDOWN_LINK.captures_iter(line) {
|
|
105
|
-
if let (Some(
|
|
106
|
-
links.push((
|
|
97
|
+
if let (Some(text), Some(url)) = (caps.get(1), caps.get(2)) {
|
|
98
|
+
links.push((text.as_str().to_string(), url.as_str().to_string()));
|
|
107
99
|
}
|
|
108
100
|
}
|
|
109
101
|
}
|
|
110
102
|
|
|
111
103
|
Ok(TextExtractionResult {
|
|
112
|
-
content: text
|
|
104
|
+
content: text,
|
|
113
105
|
line_count,
|
|
114
106
|
word_count,
|
|
115
107
|
character_count,
|
|
@@ -77,7 +77,6 @@ fn build_archive_result(
|
|
|
77
77
|
detected_languages: None,
|
|
78
78
|
chunks: None,
|
|
79
79
|
images: None,
|
|
80
|
-
pages: None,
|
|
81
80
|
}
|
|
82
81
|
}
|
|
83
82
|
|
|
@@ -127,13 +126,6 @@ impl Plugin for ZipExtractor {
|
|
|
127
126
|
|
|
128
127
|
#[async_trait]
|
|
129
128
|
impl DocumentExtractor for ZipExtractor {
|
|
130
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
131
|
-
skip(self, content, _config),
|
|
132
|
-
fields(
|
|
133
|
-
extractor.name = self.name(),
|
|
134
|
-
content.size_bytes = content.len(),
|
|
135
|
-
)
|
|
136
|
-
))]
|
|
137
129
|
async fn extract_bytes(
|
|
138
130
|
&self,
|
|
139
131
|
content: &[u8],
|
|
@@ -205,13 +197,6 @@ impl Plugin for TarExtractor {
|
|
|
205
197
|
|
|
206
198
|
#[async_trait]
|
|
207
199
|
impl DocumentExtractor for TarExtractor {
|
|
208
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
209
|
-
skip(self, content, _config),
|
|
210
|
-
fields(
|
|
211
|
-
extractor.name = self.name(),
|
|
212
|
-
content.size_bytes = content.len(),
|
|
213
|
-
)
|
|
214
|
-
))]
|
|
215
200
|
async fn extract_bytes(
|
|
216
201
|
&self,
|
|
217
202
|
content: &[u8],
|
|
@@ -288,13 +273,6 @@ impl Plugin for SevenZExtractor {
|
|
|
288
273
|
|
|
289
274
|
#[async_trait]
|
|
290
275
|
impl DocumentExtractor for SevenZExtractor {
|
|
291
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
292
|
-
skip(self, content, _config),
|
|
293
|
-
fields(
|
|
294
|
-
extractor.name = self.name(),
|
|
295
|
-
content.size_bytes = content.len(),
|
|
296
|
-
)
|
|
297
|
-
))]
|
|
298
276
|
async fn extract_bytes(
|
|
299
277
|
&self,
|
|
300
278
|
content: &[u8],
|
|
@@ -1,14 +1,12 @@
|
|
|
1
|
-
#![cfg(all(feature = "tokio-runtime", feature = "office"))]
|
|
2
|
-
|
|
3
1
|
//! DOCX extractor using docx-lite for high-performance text extraction.
|
|
4
2
|
//!
|
|
5
3
|
//! Supports: Microsoft Word (.docx)
|
|
6
4
|
|
|
7
5
|
use crate::Result;
|
|
8
6
|
use crate::core::config::ExtractionConfig;
|
|
9
|
-
use crate::extraction::
|
|
7
|
+
use crate::extraction::office_metadata;
|
|
10
8
|
use crate::plugins::{DocumentExtractor, Plugin};
|
|
11
|
-
use crate::types::{ExtractionResult, Metadata,
|
|
9
|
+
use crate::types::{ExtractionResult, Metadata, Table};
|
|
12
10
|
use async_trait::async_trait;
|
|
13
11
|
use std::io::Cursor;
|
|
14
12
|
|
|
@@ -17,6 +15,7 @@ use std::io::Cursor;
|
|
|
17
15
|
/// This extractor provides:
|
|
18
16
|
/// - Fast text extraction via streaming XML parsing (~160 MB/s average)
|
|
19
17
|
/// - Comprehensive metadata extraction (core.xml, app.xml, custom.xml)
|
|
18
|
+
/// - ~400x faster than Pandoc subprocess approach
|
|
20
19
|
pub struct DocxExtractor;
|
|
21
20
|
|
|
22
21
|
impl DocxExtractor {
|
|
@@ -67,6 +66,7 @@ impl Plugin for DocxExtractor {
|
|
|
67
66
|
/// # Returns
|
|
68
67
|
/// * `Table` - Converted table with cells and markdown representation
|
|
69
68
|
fn convert_docx_table_to_table(docx_table: &docx_lite::Table, table_index: usize) -> Table {
|
|
69
|
+
// Extract cells as 2D vector
|
|
70
70
|
let cells: Vec<Vec<String>> = docx_table
|
|
71
71
|
.rows
|
|
72
72
|
.iter()
|
|
@@ -74,6 +74,7 @@ fn convert_docx_table_to_table(docx_table: &docx_lite::Table, table_index: usize
|
|
|
74
74
|
row.cells
|
|
75
75
|
.iter()
|
|
76
76
|
.map(|cell| {
|
|
77
|
+
// Extract text from all paragraphs in the cell
|
|
77
78
|
cell.paragraphs
|
|
78
79
|
.iter()
|
|
79
80
|
.map(|para| para.to_text())
|
|
@@ -86,12 +87,13 @@ fn convert_docx_table_to_table(docx_table: &docx_lite::Table, table_index: usize
|
|
|
86
87
|
})
|
|
87
88
|
.collect();
|
|
88
89
|
|
|
90
|
+
// Generate markdown representation
|
|
89
91
|
let markdown = cells_to_markdown(&cells);
|
|
90
92
|
|
|
91
93
|
Table {
|
|
92
94
|
cells,
|
|
93
95
|
markdown,
|
|
94
|
-
page_number: table_index + 1,
|
|
96
|
+
page_number: table_index + 1, // 1-indexed
|
|
95
97
|
}
|
|
96
98
|
}
|
|
97
99
|
|
|
@@ -102,55 +104,103 @@ fn convert_docx_table_to_table(docx_table: &docx_lite::Table, table_index: usize
|
|
|
102
104
|
///
|
|
103
105
|
/// # Returns
|
|
104
106
|
/// * `String` - Markdown formatted table
|
|
107
|
+
fn cells_to_markdown(cells: &[Vec<String>]) -> String {
|
|
108
|
+
if cells.is_empty() {
|
|
109
|
+
return String::new();
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
let mut markdown = String::new();
|
|
113
|
+
|
|
114
|
+
// Determine number of columns from first row
|
|
115
|
+
let num_cols = cells.first().map(|r| r.len()).unwrap_or(0);
|
|
116
|
+
if num_cols == 0 {
|
|
117
|
+
return String::new();
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
// Header row (first row)
|
|
121
|
+
if let Some(header) = cells.first() {
|
|
122
|
+
markdown.push_str("| ");
|
|
123
|
+
for cell in header {
|
|
124
|
+
// Escape pipe characters in cell content
|
|
125
|
+
let escaped = cell.replace('|', "\\|");
|
|
126
|
+
markdown.push_str(&escaped);
|
|
127
|
+
markdown.push_str(" | ");
|
|
128
|
+
}
|
|
129
|
+
markdown.push('\n');
|
|
130
|
+
|
|
131
|
+
// Separator row
|
|
132
|
+
markdown.push('|');
|
|
133
|
+
for _ in 0..num_cols {
|
|
134
|
+
markdown.push_str("------|");
|
|
135
|
+
}
|
|
136
|
+
markdown.push('\n');
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
// Data rows (skip first row as it's the header)
|
|
140
|
+
for row in cells.iter().skip(1) {
|
|
141
|
+
markdown.push_str("| ");
|
|
142
|
+
for (idx, cell) in row.iter().enumerate() {
|
|
143
|
+
if idx >= num_cols {
|
|
144
|
+
break; // Handle irregular tables
|
|
145
|
+
}
|
|
146
|
+
// Escape pipe characters in cell content
|
|
147
|
+
let escaped = cell.replace('|', "\\|");
|
|
148
|
+
markdown.push_str(&escaped);
|
|
149
|
+
markdown.push_str(" | ");
|
|
150
|
+
}
|
|
151
|
+
// Pad with empty cells if row is shorter than expected
|
|
152
|
+
for _ in row.len()..num_cols {
|
|
153
|
+
markdown.push_str(" | ");
|
|
154
|
+
}
|
|
155
|
+
markdown.push('\n');
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
markdown
|
|
159
|
+
}
|
|
105
160
|
|
|
106
161
|
#[async_trait]
|
|
107
162
|
impl DocumentExtractor for DocxExtractor {
|
|
108
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
109
|
-
skip(self, content, _config),
|
|
110
|
-
fields(
|
|
111
|
-
extractor.name = self.name(),
|
|
112
|
-
content.size_bytes = content.len(),
|
|
113
|
-
)
|
|
114
|
-
))]
|
|
115
163
|
async fn extract_bytes(
|
|
116
164
|
&self,
|
|
117
165
|
content: &[u8],
|
|
118
166
|
mime_type: &str,
|
|
119
167
|
_config: &ExtractionConfig,
|
|
120
168
|
) -> Result<ExtractionResult> {
|
|
121
|
-
|
|
169
|
+
// Parse the DOCX document to extract both text and tables
|
|
170
|
+
let (text, tables) = if crate::core::batch_mode::is_batch_mode() {
|
|
171
|
+
// Batch mode: Use spawn_blocking for parallelism
|
|
122
172
|
let content_owned = content.to_vec();
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
Ok((text, tables, page_boundaries))
|
|
143
|
-
},
|
|
144
|
-
)
|
|
173
|
+
tokio::task::spawn_blocking(move || -> crate::error::Result<(String, Vec<Table>)> {
|
|
174
|
+
// Parse document structure
|
|
175
|
+
let cursor = Cursor::new(&content_owned);
|
|
176
|
+
let doc = docx_lite::parse_document(cursor)
|
|
177
|
+
.map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX parsing failed: {}", e)))?;
|
|
178
|
+
|
|
179
|
+
// Extract text
|
|
180
|
+
let text = doc.extract_text();
|
|
181
|
+
|
|
182
|
+
// Extract tables
|
|
183
|
+
let tables: Vec<Table> = doc
|
|
184
|
+
.tables
|
|
185
|
+
.iter()
|
|
186
|
+
.enumerate()
|
|
187
|
+
.map(|(idx, table)| convert_docx_table_to_table(table, idx))
|
|
188
|
+
.collect();
|
|
189
|
+
|
|
190
|
+
Ok((text, tables))
|
|
191
|
+
})
|
|
145
192
|
.await
|
|
146
193
|
.map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX extraction task failed: {}", e)))??
|
|
147
194
|
} else {
|
|
195
|
+
// Single-file mode: Direct extraction (no spawn overhead)
|
|
148
196
|
let cursor = Cursor::new(content);
|
|
149
197
|
let doc = docx_lite::parse_document(cursor)
|
|
150
198
|
.map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX parsing failed: {}", e)))?;
|
|
151
199
|
|
|
200
|
+
// Extract text
|
|
152
201
|
let text = doc.extract_text();
|
|
153
202
|
|
|
203
|
+
// Extract tables
|
|
154
204
|
let tables: Vec<Table> = doc
|
|
155
205
|
.tables
|
|
156
206
|
.iter()
|
|
@@ -158,16 +208,14 @@ impl DocumentExtractor for DocxExtractor {
|
|
|
158
208
|
.map(|(idx, table)| convert_docx_table_to_table(table, idx))
|
|
159
209
|
.collect();
|
|
160
210
|
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
(text, tables, page_boundaries)
|
|
211
|
+
(text, tables)
|
|
164
212
|
};
|
|
165
213
|
|
|
214
|
+
// Extract metadata using existing office_metadata module
|
|
166
215
|
let mut archive = if crate::core::batch_mode::is_batch_mode() {
|
|
216
|
+
// Batch mode: Use spawn_blocking for parallelism
|
|
167
217
|
let content_owned = content.to_vec();
|
|
168
|
-
let span = tracing::Span::current();
|
|
169
218
|
tokio::task::spawn_blocking(move || -> crate::error::Result<_> {
|
|
170
|
-
let _guard = span.entered();
|
|
171
219
|
let cursor = Cursor::new(content_owned);
|
|
172
220
|
zip::ZipArchive::new(cursor)
|
|
173
221
|
.map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to open ZIP archive: {}", e)))
|
|
@@ -175,6 +223,8 @@ impl DocumentExtractor for DocxExtractor {
|
|
|
175
223
|
.await
|
|
176
224
|
.map_err(|e| crate::error::KreuzbergError::parsing(format!("Task join error: {}", e)))??
|
|
177
225
|
} else {
|
|
226
|
+
// Single-file mode: Direct extraction (no spawn overhead)
|
|
227
|
+
// Note: We still need to clone for ZipArchive type consistency with batch mode
|
|
178
228
|
let content_owned = content.to_vec();
|
|
179
229
|
let cursor = Cursor::new(content_owned);
|
|
180
230
|
zip::ZipArchive::new(cursor)
|
|
@@ -183,6 +233,7 @@ impl DocumentExtractor for DocxExtractor {
|
|
|
183
233
|
|
|
184
234
|
let mut metadata_map = std::collections::HashMap::new();
|
|
185
235
|
|
|
236
|
+
// Extract core properties (title, creator, dates, keywords, etc.)
|
|
186
237
|
if let Ok(core) = office_metadata::extract_core_properties(&mut archive) {
|
|
187
238
|
if let Some(title) = core.title {
|
|
188
239
|
metadata_map.insert("title".to_string(), serde_json::Value::String(title));
|
|
@@ -226,6 +277,7 @@ impl DocumentExtractor for DocxExtractor {
|
|
|
226
277
|
}
|
|
227
278
|
}
|
|
228
279
|
|
|
280
|
+
// Extract app properties (page count, word count, etc.)
|
|
229
281
|
if let Ok(app) = office_metadata::extract_docx_app_properties(&mut archive) {
|
|
230
282
|
if let Some(pages) = app.pages {
|
|
231
283
|
metadata_map.insert("page_count".to_string(), serde_json::Value::Number(pages.into()));
|
|
@@ -262,48 +314,24 @@ impl DocumentExtractor for DocxExtractor {
|
|
|
262
314
|
}
|
|
263
315
|
}
|
|
264
316
|
|
|
317
|
+
// Extract custom properties
|
|
265
318
|
if let Ok(custom) = office_metadata::extract_custom_properties(&mut archive) {
|
|
266
319
|
for (key, value) in custom {
|
|
267
320
|
metadata_map.insert(format!("custom_{}", key), value);
|
|
268
321
|
}
|
|
269
322
|
}
|
|
270
323
|
|
|
271
|
-
let page_structure = if let Some(boundaries) = page_boundaries {
|
|
272
|
-
let total_count = boundaries.len();
|
|
273
|
-
Some(PageStructure {
|
|
274
|
-
total_count,
|
|
275
|
-
unit_type: PageUnitType::Page,
|
|
276
|
-
boundaries: Some(boundaries),
|
|
277
|
-
pages: Some(
|
|
278
|
-
(1..=total_count)
|
|
279
|
-
.map(|page_num| PageInfo {
|
|
280
|
-
number: page_num,
|
|
281
|
-
title: None,
|
|
282
|
-
dimensions: None,
|
|
283
|
-
image_count: None,
|
|
284
|
-
table_count: None,
|
|
285
|
-
hidden: None,
|
|
286
|
-
})
|
|
287
|
-
.collect(),
|
|
288
|
-
),
|
|
289
|
-
})
|
|
290
|
-
} else {
|
|
291
|
-
None
|
|
292
|
-
};
|
|
293
|
-
|
|
294
324
|
Ok(ExtractionResult {
|
|
295
325
|
content: text,
|
|
296
326
|
mime_type: mime_type.to_string(),
|
|
297
327
|
metadata: Metadata {
|
|
298
|
-
pages: page_structure,
|
|
299
328
|
additional: metadata_map,
|
|
300
329
|
..Default::default()
|
|
301
330
|
},
|
|
302
|
-
pages: None,
|
|
303
331
|
tables,
|
|
304
332
|
detected_languages: None,
|
|
305
333
|
chunks: None,
|
|
306
|
-
images:
|
|
334
|
+
images: None,
|
|
307
335
|
})
|
|
308
336
|
}
|
|
309
337
|
|
|
@@ -312,7 +340,7 @@ impl DocumentExtractor for DocxExtractor {
|
|
|
312
340
|
}
|
|
313
341
|
|
|
314
342
|
fn priority(&self) -> i32 {
|
|
315
|
-
50
|
|
343
|
+
50 // Higher priority than Pandoc (40) to take precedence
|
|
316
344
|
}
|
|
317
345
|
}
|
|
318
346
|
|
|
@@ -352,12 +380,61 @@ mod tests {
|
|
|
352
380
|
assert!(extractor.shutdown().is_ok());
|
|
353
381
|
}
|
|
354
382
|
|
|
383
|
+
#[test]
|
|
384
|
+
fn test_cells_to_markdown_basic_table() {
|
|
385
|
+
let cells = vec![
|
|
386
|
+
vec!["Header1".to_string(), "Header2".to_string()],
|
|
387
|
+
vec!["Row1Col1".to_string(), "Row1Col2".to_string()],
|
|
388
|
+
vec!["Row2Col1".to_string(), "Row2Col2".to_string()],
|
|
389
|
+
];
|
|
390
|
+
|
|
391
|
+
let markdown = cells_to_markdown(&cells);
|
|
392
|
+
|
|
393
|
+
assert!(markdown.contains("| Header1 | Header2 |"));
|
|
394
|
+
assert!(markdown.contains("|------|------|"));
|
|
395
|
+
assert!(markdown.contains("| Row1Col1 | Row1Col2 |"));
|
|
396
|
+
assert!(markdown.contains("| Row2Col1 | Row2Col2 |"));
|
|
397
|
+
}
|
|
398
|
+
|
|
399
|
+
#[test]
|
|
400
|
+
fn test_cells_to_markdown_empty() {
|
|
401
|
+
let cells: Vec<Vec<String>> = vec![];
|
|
402
|
+
let markdown = cells_to_markdown(&cells);
|
|
403
|
+
assert_eq!(markdown, "");
|
|
404
|
+
}
|
|
405
|
+
|
|
406
|
+
#[test]
|
|
407
|
+
fn test_cells_to_markdown_escape_pipes() {
|
|
408
|
+
let cells = vec![vec!["Header".to_string()], vec!["Cell with | pipe".to_string()]];
|
|
409
|
+
|
|
410
|
+
let markdown = cells_to_markdown(&cells);
|
|
411
|
+
assert!(markdown.contains("Cell with \\| pipe"));
|
|
412
|
+
}
|
|
413
|
+
|
|
414
|
+
#[test]
|
|
415
|
+
fn test_cells_to_markdown_irregular_rows() {
|
|
416
|
+
let cells = vec![
|
|
417
|
+
vec!["H1".to_string(), "H2".to_string(), "H3".to_string()],
|
|
418
|
+
vec!["R1C1".to_string(), "R1C2".to_string()], // Missing third column
|
|
419
|
+
vec!["R2C1".to_string(), "R2C2".to_string(), "R2C3".to_string()],
|
|
420
|
+
];
|
|
421
|
+
|
|
422
|
+
let markdown = cells_to_markdown(&cells);
|
|
423
|
+
|
|
424
|
+
// Should have 3 columns in header
|
|
425
|
+
assert!(markdown.contains("| H1 | H2 | H3 |"));
|
|
426
|
+
// Should pad short rows
|
|
427
|
+
assert!(markdown.contains("| R1C1 | R1C2 | |"));
|
|
428
|
+
}
|
|
429
|
+
|
|
355
430
|
#[test]
|
|
356
431
|
fn test_convert_docx_table_to_table() {
|
|
357
432
|
use docx_lite::{Paragraph, Run, Table as DocxTable, TableCell, TableRow};
|
|
358
433
|
|
|
434
|
+
// Create a simple docx-lite table
|
|
359
435
|
let mut table = DocxTable::new();
|
|
360
436
|
|
|
437
|
+
// Header row
|
|
361
438
|
let mut header_row = TableRow::default();
|
|
362
439
|
let mut cell1 = TableCell::default();
|
|
363
440
|
let mut para1 = Paragraph::new();
|
|
@@ -373,6 +450,7 @@ mod tests {
|
|
|
373
450
|
|
|
374
451
|
table.rows.push(header_row);
|
|
375
452
|
|
|
453
|
+
// Data row
|
|
376
454
|
let mut data_row = TableRow::default();
|
|
377
455
|
let mut cell3 = TableCell::default();
|
|
378
456
|
let mut para3 = Paragraph::new();
|
|
@@ -388,10 +466,11 @@ mod tests {
|
|
|
388
466
|
|
|
389
467
|
table.rows.push(data_row);
|
|
390
468
|
|
|
469
|
+
// Convert to Kreuzberg Table
|
|
391
470
|
let result = convert_docx_table_to_table(&table, 0);
|
|
392
471
|
|
|
393
|
-
assert_eq!(result.page_number, 1);
|
|
394
|
-
assert_eq!(result.cells.len(), 2);
|
|
472
|
+
assert_eq!(result.page_number, 1); // 0 + 1 = 1 (1-indexed)
|
|
473
|
+
assert_eq!(result.cells.len(), 2); // 2 rows
|
|
395
474
|
assert_eq!(result.cells[0], vec!["Name", "Age"]);
|
|
396
475
|
assert_eq!(result.cells[1], vec!["Alice", "30"]);
|
|
397
476
|
assert!(result.markdown.contains("| Name | Age |"));
|
|
@@ -2,11 +2,9 @@
|
|
|
2
2
|
|
|
3
3
|
use crate::Result;
|
|
4
4
|
use crate::core::config::ExtractionConfig;
|
|
5
|
-
use crate::extractors::SyncExtractor;
|
|
6
5
|
use crate::plugins::{DocumentExtractor, Plugin};
|
|
7
6
|
use crate::types::{EmailMetadata, ExtractionResult, Metadata};
|
|
8
7
|
use async_trait::async_trait;
|
|
9
|
-
#[cfg(feature = "tokio-runtime")]
|
|
10
8
|
use std::path::Path;
|
|
11
9
|
|
|
12
10
|
/// Email message extractor.
|
|
@@ -44,8 +42,14 @@ impl Plugin for EmailExtractor {
|
|
|
44
42
|
}
|
|
45
43
|
}
|
|
46
44
|
|
|
47
|
-
|
|
48
|
-
|
|
45
|
+
#[async_trait]
|
|
46
|
+
impl DocumentExtractor for EmailExtractor {
|
|
47
|
+
async fn extract_bytes(
|
|
48
|
+
&self,
|
|
49
|
+
content: &[u8],
|
|
50
|
+
mime_type: &str,
|
|
51
|
+
_config: &ExtractionConfig,
|
|
52
|
+
) -> Result<ExtractionResult> {
|
|
49
53
|
let email_result = crate::extraction::email::extract_email_content(content, mime_type)?;
|
|
50
54
|
|
|
51
55
|
let text = crate::extraction::email::build_email_text_output(&email_result);
|
|
@@ -77,7 +81,7 @@ impl SyncExtractor for EmailExtractor {
|
|
|
77
81
|
metadata: Metadata {
|
|
78
82
|
format: Some(crate::types::FormatMetadata::Email(email_metadata)),
|
|
79
83
|
subject: email_result.subject.clone(),
|
|
80
|
-
|
|
84
|
+
date: email_result.date.clone(),
|
|
81
85
|
additional,
|
|
82
86
|
..Default::default()
|
|
83
87
|
},
|
|
@@ -85,37 +89,9 @@ impl SyncExtractor for EmailExtractor {
|
|
|
85
89
|
detected_languages: None,
|
|
86
90
|
chunks: None,
|
|
87
91
|
images: None,
|
|
88
|
-
pages: None,
|
|
89
92
|
})
|
|
90
93
|
}
|
|
91
|
-
}
|
|
92
94
|
|
|
93
|
-
#[async_trait]
|
|
94
|
-
impl DocumentExtractor for EmailExtractor {
|
|
95
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
96
|
-
skip(self, content, config),
|
|
97
|
-
fields(
|
|
98
|
-
extractor.name = self.name(),
|
|
99
|
-
content.size_bytes = content.len(),
|
|
100
|
-
)
|
|
101
|
-
))]
|
|
102
|
-
async fn extract_bytes(
|
|
103
|
-
&self,
|
|
104
|
-
content: &[u8],
|
|
105
|
-
mime_type: &str,
|
|
106
|
-
config: &ExtractionConfig,
|
|
107
|
-
) -> Result<ExtractionResult> {
|
|
108
|
-
self.extract_sync(content, mime_type, config)
|
|
109
|
-
}
|
|
110
|
-
|
|
111
|
-
#[cfg(feature = "tokio-runtime")]
|
|
112
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
113
|
-
skip(self, path, config),
|
|
114
|
-
fields(
|
|
115
|
-
extractor.name = self.name(),
|
|
116
|
-
)
|
|
117
|
-
))]
|
|
118
|
-
#[cfg(feature = "tokio-runtime")]
|
|
119
95
|
async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
120
96
|
let bytes = tokio::fs::read(path).await?;
|
|
121
97
|
self.extract_bytes(&bytes, mime_type, config).await
|
|
@@ -128,10 +104,6 @@ impl DocumentExtractor for EmailExtractor {
|
|
|
128
104
|
fn priority(&self) -> i32 {
|
|
129
105
|
50
|
|
130
106
|
}
|
|
131
|
-
|
|
132
|
-
fn as_sync_extractor(&self) -> Option<&dyn crate::extractors::SyncExtractor> {
|
|
133
|
-
Some(self)
|
|
134
|
-
}
|
|
135
107
|
}
|
|
136
108
|
|
|
137
109
|
#[cfg(test)]
|