kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +11 -11
- data/README.md +5 -10
- data/examples/async_patterns.rb +0 -1
- data/ext/kreuzberg_rb/extconf.rb +0 -10
- data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
- data/ext/kreuzberg_rb/native/build.rs +2 -0
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
- data/kreuzberg.gemspec +14 -57
- data/lib/kreuzberg/cache_api.rb +0 -1
- data/lib/kreuzberg/cli.rb +2 -2
- data/lib/kreuzberg/config.rb +2 -9
- data/lib/kreuzberg/errors.rb +7 -75
- data/lib/kreuzberg/extraction_api.rb +0 -1
- data/lib/kreuzberg/setup_lib_path.rb +0 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -21
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +3 -55
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/spec_helper.rb +1 -1
- data/vendor/kreuzberg/Cargo.toml +42 -112
- data/vendor/kreuzberg/README.md +2 -2
- data/vendor/kreuzberg/build.rs +4 -18
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/extractor.rs +81 -202
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +1 -4
- data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
- data/vendor/kreuzberg/src/embeddings.rs +16 -125
- data/vendor/kreuzberg/src/error.rs +1 -1
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
- data/vendor/kreuzberg/src/extraction/image.rs +13 -13
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
- data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
- data/vendor/kreuzberg/src/extractors/email.rs +0 -14
- data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
- data/vendor/kreuzberg/src/extractors/html.rs +154 -137
- data/vendor/kreuzberg/src/extractors/image.rs +4 -7
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
- data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
- data/vendor/kreuzberg/src/extractors/text.rs +5 -23
- data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/lib.rs +1 -4
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
- data/vendor/kreuzberg/src/mcp/server.rs +3 -5
- data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
- data/vendor/kreuzberg/src/pdf/error.rs +1 -1
- data/vendor/kreuzberg/src/pdf/table.rs +44 -17
- data/vendor/kreuzberg/src/pdf/text.rs +3 -0
- data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/types.rs +12 -42
- data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
- data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
- data/vendor/kreuzberg/tests/config_features.rs +0 -18
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
- data/vendor/kreuzberg/tests/core_integration.rs +7 -24
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
- data/vendor/kreuzberg/tests/security_validation.rs +1 -12
- metadata +25 -90
- data/.rubocop.yml +0 -538
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
- data/lib/kreuzberg/error_context.rb +0 -32
- data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
- data/vendor/rb-sys/Cargo.lock +0 -393
- data/vendor/rb-sys/Cargo.toml +0 -70
- data/vendor/rb-sys/Cargo.toml.orig +0 -57
- data/vendor/rb-sys/LICENSE-APACHE +0 -190
- data/vendor/rb-sys/LICENSE-MIT +0 -21
- data/vendor/rb-sys/bin/release.sh +0 -21
- data/vendor/rb-sys/build/features.rs +0 -108
- data/vendor/rb-sys/build/main.rs +0 -246
- data/vendor/rb-sys/build/stable_api_config.rs +0 -153
- data/vendor/rb-sys/build/version.rs +0 -48
- data/vendor/rb-sys/readme.md +0 -36
- data/vendor/rb-sys/src/bindings.rs +0 -21
- data/vendor/rb-sys/src/hidden.rs +0 -11
- data/vendor/rb-sys/src/lib.rs +0 -34
- data/vendor/rb-sys/src/macros.rs +0 -371
- data/vendor/rb-sys/src/memory.rs +0 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
- data/vendor/rb-sys/src/special_consts.rs +0 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
- data/vendor/rb-sys/src/stable_api.rs +0 -261
- data/vendor/rb-sys/src/symbol.rs +0 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
- data/vendor/rb-sys/src/utils.rs +0 -89
- data/vendor/rb-sys/src/value_type.rs +0 -7
|
@@ -30,13 +30,6 @@ use crate::{KreuzbergError, Result};
|
|
|
30
30
|
/// - Validator errors bubble up immediately
|
|
31
31
|
/// - Post-processor errors are caught and recorded in metadata
|
|
32
32
|
/// - System errors (IO, RuntimeError equivalents) always bubble up
|
|
33
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
34
|
-
skip(result, config),
|
|
35
|
-
fields(
|
|
36
|
-
pipeline.stage = "post_processing",
|
|
37
|
-
content.length = result.content.len(),
|
|
38
|
-
)
|
|
39
|
-
))]
|
|
40
33
|
pub async fn run_pipeline(mut result: ExtractionResult, config: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
41
34
|
let pp_config = config.postprocessor.as_ref();
|
|
42
35
|
let postprocessing_enabled = pp_config.is_none_or(|c| c.enabled);
|
|
@@ -234,18 +227,13 @@ mod tests {
|
|
|
234
227
|
use crate::types::Metadata;
|
|
235
228
|
use lazy_static::lazy_static;
|
|
236
229
|
|
|
237
|
-
const VALIDATION_MARKER_KEY: &str = "registry_validation_marker";
|
|
238
|
-
const QUALITY_VALIDATION_MARKER: &str = "quality_validation_test";
|
|
239
|
-
const POSTPROCESSOR_VALIDATION_MARKER: &str = "postprocessor_validation_test";
|
|
240
|
-
const ORDER_VALIDATION_MARKER: &str = "order_validation_test";
|
|
241
|
-
|
|
242
230
|
lazy_static! {
|
|
243
231
|
static ref REGISTRY_TEST_GUARD: std::sync::Mutex<()> = std::sync::Mutex::new(());
|
|
244
232
|
}
|
|
245
233
|
|
|
246
234
|
#[tokio::test]
|
|
247
235
|
async fn test_run_pipeline_basic() {
|
|
248
|
-
let mut result = ExtractionResult {
|
|
236
|
+
let result = ExtractionResult {
|
|
249
237
|
content: "test".to_string(),
|
|
250
238
|
mime_type: "text/plain".to_string(),
|
|
251
239
|
metadata: Metadata::default(),
|
|
@@ -254,10 +242,6 @@ mod tests {
|
|
|
254
242
|
chunks: None,
|
|
255
243
|
images: None,
|
|
256
244
|
};
|
|
257
|
-
result.metadata.additional.insert(
|
|
258
|
-
VALIDATION_MARKER_KEY.to_string(),
|
|
259
|
-
serde_json::json!(ORDER_VALIDATION_MARKER),
|
|
260
|
-
);
|
|
261
245
|
let config = ExtractionConfig::default();
|
|
262
246
|
|
|
263
247
|
let processed = run_pipeline(result, &config).await.unwrap();
|
|
@@ -413,17 +397,9 @@ mod tests {
|
|
|
413
397
|
|
|
414
398
|
#[tokio::test]
|
|
415
399
|
async fn test_pipeline_empty_content() {
|
|
416
|
-
let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
|
|
417
|
-
|
|
418
|
-
{
|
|
419
|
-
let registry = crate::plugins::registry::get_post_processor_registry();
|
|
420
|
-
registry.write().unwrap().shutdown_all().unwrap();
|
|
421
|
-
}
|
|
422
400
|
{
|
|
423
|
-
let
|
|
424
|
-
|
|
425
|
-
}
|
|
426
|
-
|
|
401
|
+
let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
|
|
402
|
+
} // Drop guard before async operations
|
|
427
403
|
let result = ExtractionResult {
|
|
428
404
|
content: String::new(),
|
|
429
405
|
mime_type: "text/plain".to_string(),
|
|
@@ -435,8 +411,6 @@ mod tests {
|
|
|
435
411
|
};
|
|
436
412
|
let config = ExtractionConfig::default();
|
|
437
413
|
|
|
438
|
-
drop(_guard);
|
|
439
|
-
|
|
440
414
|
let processed = run_pipeline(result, &config).await.unwrap();
|
|
441
415
|
assert_eq!(processed.content, "");
|
|
442
416
|
}
|
|
@@ -472,8 +446,6 @@ mod tests {
|
|
|
472
446
|
#[tokio::test]
|
|
473
447
|
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
|
|
474
448
|
async fn test_pipeline_with_keyword_extraction() {
|
|
475
|
-
let _ = crate::keywords::register_keyword_processor();
|
|
476
|
-
|
|
477
449
|
let result = ExtractionResult {
|
|
478
450
|
content: r#"
|
|
479
451
|
Machine learning is a branch of artificial intelligence that focuses on
|
|
@@ -544,18 +516,6 @@ Natural language processing enables computers to understand human language.
|
|
|
544
516
|
#[tokio::test]
|
|
545
517
|
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
|
|
546
518
|
async fn test_pipeline_keyword_extraction_short_content() {
|
|
547
|
-
let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
|
|
548
|
-
crate::plugins::registry::get_validator_registry()
|
|
549
|
-
.write()
|
|
550
|
-
.unwrap()
|
|
551
|
-
.shutdown_all()
|
|
552
|
-
.unwrap();
|
|
553
|
-
crate::plugins::registry::get_post_processor_registry()
|
|
554
|
-
.write()
|
|
555
|
-
.unwrap()
|
|
556
|
-
.shutdown_all()
|
|
557
|
-
.unwrap();
|
|
558
|
-
|
|
559
519
|
let result = ExtractionResult {
|
|
560
520
|
content: "Short text".to_string(),
|
|
561
521
|
mime_type: "text/plain".to_string(),
|
|
@@ -577,8 +537,6 @@ Natural language processing enables computers to understand human language.
|
|
|
577
537
|
..Default::default()
|
|
578
538
|
};
|
|
579
539
|
|
|
580
|
-
drop(_guard);
|
|
581
|
-
|
|
582
540
|
let processed = run_pipeline(result, &config).await.unwrap();
|
|
583
541
|
|
|
584
542
|
assert!(!processed.metadata.additional.contains_key("keywords"));
|
|
@@ -586,6 +544,9 @@ Natural language processing enables computers to understand human language.
|
|
|
586
544
|
|
|
587
545
|
#[tokio::test]
|
|
588
546
|
async fn test_postprocessor_runs_before_validator() {
|
|
547
|
+
{
|
|
548
|
+
let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
|
|
549
|
+
} // Drop guard before async operations
|
|
589
550
|
use crate::plugins::{Plugin, PostProcessor, ProcessingStage, Validator};
|
|
590
551
|
use async_trait::async_trait;
|
|
591
552
|
use std::sync::Arc;
|
|
@@ -640,17 +601,6 @@ Natural language processing enables computers to understand human language.
|
|
|
640
601
|
#[async_trait]
|
|
641
602
|
impl Validator for TestValidator {
|
|
642
603
|
async fn validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
|
|
643
|
-
let should_validate = result
|
|
644
|
-
.metadata
|
|
645
|
-
.additional
|
|
646
|
-
.get(VALIDATION_MARKER_KEY)
|
|
647
|
-
.and_then(|v| v.as_str())
|
|
648
|
-
== Some(POSTPROCESSOR_VALIDATION_MARKER);
|
|
649
|
-
|
|
650
|
-
if !should_validate {
|
|
651
|
-
return Ok(());
|
|
652
|
-
}
|
|
653
|
-
|
|
654
604
|
let processed = result
|
|
655
605
|
.metadata
|
|
656
606
|
.additional
|
|
@@ -669,23 +619,18 @@ Natural language processing enables computers to understand human language.
|
|
|
669
619
|
}
|
|
670
620
|
|
|
671
621
|
let pp_registry = crate::plugins::registry::get_post_processor_registry();
|
|
672
|
-
let val_registry = crate::plugins::registry::get_validator_registry();
|
|
673
|
-
|
|
674
|
-
let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
|
|
675
|
-
pp_registry.write().unwrap().shutdown_all().unwrap();
|
|
676
|
-
val_registry.write().unwrap().shutdown_all().unwrap();
|
|
677
|
-
|
|
678
622
|
{
|
|
679
623
|
let mut registry = pp_registry.write().unwrap();
|
|
680
624
|
registry.register(Arc::new(TestPostProcessor), 0).unwrap();
|
|
681
625
|
}
|
|
682
626
|
|
|
627
|
+
let val_registry = crate::plugins::registry::get_validator_registry();
|
|
683
628
|
{
|
|
684
629
|
let mut registry = val_registry.write().unwrap();
|
|
685
630
|
registry.register(Arc::new(TestValidator)).unwrap();
|
|
686
631
|
}
|
|
687
632
|
|
|
688
|
-
let mut result = ExtractionResult {
|
|
633
|
+
let result = ExtractionResult {
|
|
689
634
|
content: "test".to_string(),
|
|
690
635
|
mime_type: "text/plain".to_string(),
|
|
691
636
|
metadata: Metadata::default(),
|
|
@@ -694,18 +639,18 @@ Natural language processing enables computers to understand human language.
|
|
|
694
639
|
chunks: None,
|
|
695
640
|
images: None,
|
|
696
641
|
};
|
|
697
|
-
result.metadata.additional.insert(
|
|
698
|
-
VALIDATION_MARKER_KEY.to_string(),
|
|
699
|
-
serde_json::json!(POSTPROCESSOR_VALIDATION_MARKER),
|
|
700
|
-
);
|
|
701
642
|
|
|
702
643
|
let config = ExtractionConfig::default();
|
|
703
|
-
drop(_guard);
|
|
704
|
-
|
|
705
644
|
let processed = run_pipeline(result, &config).await;
|
|
706
645
|
|
|
707
|
-
|
|
708
|
-
|
|
646
|
+
{
|
|
647
|
+
let mut registry = pp_registry.write().unwrap();
|
|
648
|
+
registry.remove("test-processor").unwrap();
|
|
649
|
+
}
|
|
650
|
+
{
|
|
651
|
+
let mut registry = val_registry.write().unwrap();
|
|
652
|
+
registry.remove("test-validator").unwrap();
|
|
653
|
+
}
|
|
709
654
|
|
|
710
655
|
assert!(processed.is_ok(), "Validator should have seen post-processor metadata");
|
|
711
656
|
let processed = processed.unwrap();
|
|
@@ -719,7 +664,9 @@ Natural language processing enables computers to understand human language.
|
|
|
719
664
|
#[tokio::test]
|
|
720
665
|
#[cfg(feature = "quality")]
|
|
721
666
|
async fn test_quality_processing_runs_before_validator() {
|
|
722
|
-
|
|
667
|
+
{
|
|
668
|
+
let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
|
|
669
|
+
} // Drop guard before async operations
|
|
723
670
|
use crate::plugins::{Plugin, Validator};
|
|
724
671
|
use async_trait::async_trait;
|
|
725
672
|
use std::sync::Arc;
|
|
@@ -743,17 +690,6 @@ Natural language processing enables computers to understand human language.
|
|
|
743
690
|
#[async_trait]
|
|
744
691
|
impl Validator for QualityValidator {
|
|
745
692
|
async fn validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
|
|
746
|
-
let should_validate = result
|
|
747
|
-
.metadata
|
|
748
|
-
.additional
|
|
749
|
-
.get(VALIDATION_MARKER_KEY)
|
|
750
|
-
.and_then(|v| v.as_str())
|
|
751
|
-
== Some(QUALITY_VALIDATION_MARKER);
|
|
752
|
-
|
|
753
|
-
if !should_validate {
|
|
754
|
-
return Ok(());
|
|
755
|
-
}
|
|
756
|
-
|
|
757
693
|
if !result.metadata.additional.contains_key("quality_score") {
|
|
758
694
|
return Err(crate::KreuzbergError::Validation {
|
|
759
695
|
message: "Quality processing did not run before validator".to_string(),
|
|
@@ -770,7 +706,7 @@ Natural language processing enables computers to understand human language.
|
|
|
770
706
|
registry.register(Arc::new(QualityValidator)).unwrap();
|
|
771
707
|
}
|
|
772
708
|
|
|
773
|
-
let mut result = ExtractionResult {
|
|
709
|
+
let result = ExtractionResult {
|
|
774
710
|
content: "This is meaningful test content for quality scoring.".to_string(),
|
|
775
711
|
mime_type: "text/plain".to_string(),
|
|
776
712
|
metadata: Metadata::default(),
|
|
@@ -779,18 +715,12 @@ Natural language processing enables computers to understand human language.
|
|
|
779
715
|
chunks: None,
|
|
780
716
|
images: None,
|
|
781
717
|
};
|
|
782
|
-
result.metadata.additional.insert(
|
|
783
|
-
VALIDATION_MARKER_KEY.to_string(),
|
|
784
|
-
serde_json::json!(QUALITY_VALIDATION_MARKER),
|
|
785
|
-
);
|
|
786
718
|
|
|
787
719
|
let config = ExtractionConfig {
|
|
788
720
|
enable_quality_processing: true,
|
|
789
721
|
..Default::default()
|
|
790
722
|
};
|
|
791
723
|
|
|
792
|
-
drop(_guard);
|
|
793
|
-
|
|
794
724
|
let processed = run_pipeline(result, &config).await;
|
|
795
725
|
|
|
796
726
|
{
|
|
@@ -803,6 +733,9 @@ Natural language processing enables computers to understand human language.
|
|
|
803
733
|
|
|
804
734
|
#[tokio::test]
|
|
805
735
|
async fn test_multiple_postprocessors_run_before_validator() {
|
|
736
|
+
{
|
|
737
|
+
let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
|
|
738
|
+
} // Drop guard before async operations
|
|
806
739
|
use crate::plugins::{Plugin, PostProcessor, ProcessingStage, Validator};
|
|
807
740
|
use async_trait::async_trait;
|
|
808
741
|
use std::sync::Arc;
|
|
@@ -904,17 +837,6 @@ Natural language processing enables computers to understand human language.
|
|
|
904
837
|
#[async_trait]
|
|
905
838
|
impl Validator for OrderValidator {
|
|
906
839
|
async fn validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
|
|
907
|
-
let should_validate = result
|
|
908
|
-
.metadata
|
|
909
|
-
.additional
|
|
910
|
-
.get(VALIDATION_MARKER_KEY)
|
|
911
|
-
.and_then(|v| v.as_str())
|
|
912
|
-
== Some(ORDER_VALIDATION_MARKER);
|
|
913
|
-
|
|
914
|
-
if !should_validate {
|
|
915
|
-
return Ok(());
|
|
916
|
-
}
|
|
917
|
-
|
|
918
840
|
let order = result
|
|
919
841
|
.metadata
|
|
920
842
|
.additional
|
|
@@ -944,18 +866,13 @@ Natural language processing enables computers to understand human language.
|
|
|
944
866
|
}
|
|
945
867
|
|
|
946
868
|
let pp_registry = crate::plugins::registry::get_post_processor_registry();
|
|
947
|
-
let val_registry = crate::plugins::registry::get_validator_registry();
|
|
948
|
-
let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
|
|
949
|
-
|
|
950
|
-
pp_registry.write().unwrap().shutdown_all().unwrap();
|
|
951
|
-
val_registry.write().unwrap().shutdown_all().unwrap();
|
|
952
|
-
|
|
953
869
|
{
|
|
954
870
|
let mut registry = pp_registry.write().unwrap();
|
|
955
871
|
registry.register(Arc::new(EarlyProcessor), 0).unwrap();
|
|
956
872
|
registry.register(Arc::new(LateProcessor), 0).unwrap();
|
|
957
873
|
}
|
|
958
874
|
|
|
875
|
+
let val_registry = crate::plugins::registry::get_validator_registry();
|
|
959
876
|
{
|
|
960
877
|
let mut registry = val_registry.write().unwrap();
|
|
961
878
|
registry.register(Arc::new(OrderValidator)).unwrap();
|
|
@@ -972,12 +889,17 @@ Natural language processing enables computers to understand human language.
|
|
|
972
889
|
};
|
|
973
890
|
|
|
974
891
|
let config = ExtractionConfig::default();
|
|
975
|
-
drop(_guard);
|
|
976
|
-
|
|
977
892
|
let processed = run_pipeline(result, &config).await;
|
|
978
893
|
|
|
979
|
-
|
|
980
|
-
|
|
894
|
+
{
|
|
895
|
+
let mut registry = pp_registry.write().unwrap();
|
|
896
|
+
registry.remove("early-proc").unwrap();
|
|
897
|
+
registry.remove("late-proc").unwrap();
|
|
898
|
+
}
|
|
899
|
+
{
|
|
900
|
+
let mut registry = val_registry.write().unwrap();
|
|
901
|
+
registry.remove("order-validator").unwrap();
|
|
902
|
+
}
|
|
981
903
|
|
|
982
904
|
assert!(processed.is_ok(), "All processors should run before validator");
|
|
983
905
|
}
|
|
@@ -45,104 +45,9 @@ use std::collections::HashMap;
|
|
|
45
45
|
#[cfg(feature = "embeddings")]
|
|
46
46
|
use lazy_static::lazy_static;
|
|
47
47
|
|
|
48
|
-
/// Wrapper for TextEmbedding that prevents cleanup during process shutdown.
|
|
49
|
-
///
|
|
50
|
-
/// # Problem
|
|
51
|
-
///
|
|
52
|
-
/// When the process terminates, global static objects are dropped. The `TextEmbedding`
|
|
53
|
-
/// objects from fastembed contain ONNX Runtime sessions (via `ort v2.0.0-rc.10`), and
|
|
54
|
-
/// during their `Drop` implementation, ONNX Runtime's C++ destructor tries to acquire
|
|
55
|
-
/// mutexes for cleanup.
|
|
56
|
-
///
|
|
57
|
-
/// At process shutdown time, the C++ runtime may have already begun tearing down
|
|
58
|
-
/// threading infrastructure, causing mutex operations to fail with:
|
|
59
|
-
/// "mutex lock failed: Invalid argument"
|
|
60
|
-
///
|
|
61
|
-
/// This manifests as:
|
|
62
|
-
/// ```text
|
|
63
|
-
/// libc++abi: terminating due to uncaught exception of type std::__1::system_error:
|
|
64
|
-
/// mutex lock failed: Invalid argument
|
|
65
|
-
/// ```
|
|
66
|
-
///
|
|
67
|
-
/// This is a known issue in `ort` (see pykeio/ort#441), fixed in later versions via commit
|
|
68
|
-
/// 317be20 ("fix: let `Environment` drop"), but we're using v2.0.0-rc.10 through fastembed
|
|
69
|
-
/// v5.3.1 which predates the fix.
|
|
70
|
-
///
|
|
71
|
-
/// # Solution
|
|
72
|
-
///
|
|
73
|
-
/// We use `Box::leak` to intentionally leak `TextEmbedding` objects during process
|
|
74
|
-
/// shutdown, preventing their `Drop` implementation from running. This is acceptable because:
|
|
75
|
-
///
|
|
76
|
-
/// 1. The OS will reclaim all process memory anyway
|
|
77
|
-
/// 2. Avoiding the crash is more important than cleanup
|
|
78
|
-
/// 3. This only affects process termination, not runtime behavior
|
|
79
|
-
/// 4. Models are long-lived and would survive until process exit anyway
|
|
80
|
-
/// 5. The memory leak is bounded (one model per unique config)
|
|
81
|
-
///
|
|
82
|
-
/// # Remaining Issue
|
|
83
|
-
///
|
|
84
|
-
/// Even with this fix, you may still see the mutex error during final process cleanup.
|
|
85
|
-
/// This is because `ort` v2.0.0-rc.10 also holds the ONNX Runtime `Environment` as a
|
|
86
|
-
/// static variable, which gets dropped during C++ static destruction after Rust cleanup.
|
|
87
|
-
/// This error occurs *after* all Rust code has finished and can be safely ignored - all
|
|
88
|
-
/// tests pass before the error occurs.
|
|
89
|
-
///
|
|
90
|
-
/// The error will be resolved when fastembed upgrades to ort >= 2.0.0 (post-rc.10) which
|
|
91
|
-
/// contains the proper fix.
|
|
92
|
-
///
|
|
93
|
-
/// # Safety
|
|
94
|
-
///
|
|
95
|
-
/// The leak is contained to process shutdown and does not affect runtime behavior.
|
|
96
|
-
/// All normal usage patterns (creating embeddings, caching models) work identically.
|
|
97
|
-
/// We use static references to the leaked models, which is safe because:
|
|
98
|
-
/// - The pointers are never null (we leak valid Box<TextEmbedding>)
|
|
99
|
-
/// - The models live until process exit
|
|
100
|
-
/// - We never manually deallocate the leaked memory
|
|
101
|
-
/// - Mutex provides interior mutability for the embed() method
|
|
102
|
-
///
|
|
103
|
-
/// Thread-safe wrapper for leaked TextEmbedding that allows interior mutability.
|
|
104
|
-
///
|
|
105
|
-
/// This wrapper holds a raw pointer to a leaked `TextEmbedding` and provides
|
|
106
|
-
/// safe access through the Mutex lock in MODEL_CACHE.
|
|
107
|
-
#[cfg(feature = "embeddings")]
|
|
108
|
-
pub(crate) struct LeakedModel {
|
|
109
|
-
ptr: *mut TextEmbedding,
|
|
110
|
-
}
|
|
111
|
-
|
|
112
|
-
#[cfg(feature = "embeddings")]
|
|
113
|
-
impl LeakedModel {
|
|
114
|
-
fn new(model: TextEmbedding) -> Self {
|
|
115
|
-
Self {
|
|
116
|
-
ptr: Box::into_raw(Box::new(model)),
|
|
117
|
-
}
|
|
118
|
-
}
|
|
119
|
-
|
|
120
|
-
/// Get a mutable reference to the model.
|
|
121
|
-
///
|
|
122
|
-
/// # Safety
|
|
123
|
-
///
|
|
124
|
-
/// This is safe to call only when:
|
|
125
|
-
/// 1. The caller has exclusive access (guaranteed by Mutex in MODEL_CACHE)
|
|
126
|
-
/// 2. The pointer is valid (guaranteed by Box::into_raw and never deallocating)
|
|
127
|
-
#[allow(unsafe_code, clippy::mut_from_ref)]
|
|
128
|
-
unsafe fn get_mut(&self) -> &mut TextEmbedding {
|
|
129
|
-
unsafe { &mut *self.ptr }
|
|
130
|
-
}
|
|
131
|
-
}
|
|
132
|
-
|
|
133
|
-
#[cfg(feature = "embeddings")]
|
|
134
|
-
#[allow(unsafe_code)]
|
|
135
|
-
unsafe impl Send for LeakedModel {}
|
|
136
|
-
#[cfg(feature = "embeddings")]
|
|
137
|
-
#[allow(unsafe_code)]
|
|
138
|
-
unsafe impl Sync for LeakedModel {}
|
|
139
|
-
|
|
140
|
-
#[cfg(feature = "embeddings")]
|
|
141
|
-
type CachedEmbedding = Arc<Mutex<LeakedModel>>;
|
|
142
|
-
|
|
143
48
|
#[cfg(feature = "embeddings")]
|
|
144
49
|
lazy_static! {
|
|
145
|
-
static ref MODEL_CACHE: RwLock<HashMap<String, CachedEmbedding>> = RwLock::new(HashMap::new());
|
|
50
|
+
static ref MODEL_CACHE: RwLock<HashMap<String, Arc<Mutex<TextEmbedding>>>> = RwLock::new(HashMap::new());
|
|
146
51
|
}
|
|
147
52
|
|
|
148
53
|
/// Get or initialize a text embedding model from cache.
|
|
@@ -150,11 +55,10 @@ lazy_static! {
|
|
|
150
55
|
/// This function ensures models are initialized only once and reused across
|
|
151
56
|
/// the application, avoiding redundant downloads and initialization overhead.
|
|
152
57
|
#[cfg(feature = "embeddings")]
|
|
153
|
-
#[allow(private_interfaces)]
|
|
154
58
|
pub fn get_or_init_model(
|
|
155
59
|
model: EmbeddingModel,
|
|
156
60
|
cache_dir: Option<std::path::PathBuf>,
|
|
157
|
-
) -> crate::Result<CachedEmbedding> {
|
|
61
|
+
) -> crate::Result<Arc<Mutex<TextEmbedding>>> {
|
|
158
62
|
let cache_directory = cache_dir.unwrap_or_else(|| {
|
|
159
63
|
let mut path = std::env::current_dir().unwrap_or_else(|_| std::path::PathBuf::from("."));
|
|
160
64
|
path.push(".kreuzberg");
|
|
@@ -165,26 +69,21 @@ pub fn get_or_init_model(
|
|
|
165
69
|
let model_key = format!("{:?}_{}", model, cache_directory.display());
|
|
166
70
|
|
|
167
71
|
{
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
let cache = poison_error.get_ref();
|
|
176
|
-
if let Some(cached_model) = cache.get(&model_key) {
|
|
177
|
-
return Ok(Arc::clone(cached_model));
|
|
178
|
-
}
|
|
179
|
-
}
|
|
72
|
+
let cache = MODEL_CACHE.read().map_err(|e| crate::KreuzbergError::Plugin {
|
|
73
|
+
message: format!("Failed to acquire model cache read lock: {}", e),
|
|
74
|
+
plugin_name: "embeddings".to_string(),
|
|
75
|
+
})?;
|
|
76
|
+
|
|
77
|
+
if let Some(cached_model) = cache.get(&model_key) {
|
|
78
|
+
return Ok(Arc::clone(cached_model));
|
|
180
79
|
}
|
|
181
80
|
}
|
|
182
81
|
|
|
183
82
|
{
|
|
184
|
-
let mut cache =
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
}
|
|
83
|
+
let mut cache = MODEL_CACHE.write().map_err(|e| crate::KreuzbergError::Plugin {
|
|
84
|
+
message: format!("Failed to acquire model cache write lock: {}", e),
|
|
85
|
+
plugin_name: "embeddings".to_string(),
|
|
86
|
+
})?;
|
|
188
87
|
|
|
189
88
|
if let Some(cached_model) = cache.get(&model_key) {
|
|
190
89
|
return Ok(Arc::clone(cached_model));
|
|
@@ -198,8 +97,7 @@ pub fn get_or_init_model(
|
|
|
198
97
|
plugin_name: "embeddings".to_string(),
|
|
199
98
|
})?;
|
|
200
99
|
|
|
201
|
-
let leaked_model = LeakedModel::new(embedding_model);
|
|
202
|
-
let arc_model = Arc::new(Mutex::new(leaked_model));
|
|
100
|
+
let arc_model = Arc::new(Mutex::new(embedding_model));
|
|
203
101
|
cache.insert(model_key, Arc::clone(&arc_model));
|
|
204
102
|
|
|
205
103
|
Ok(arc_model)
|
|
@@ -350,15 +248,12 @@ pub fn generate_embeddings_for_chunks(
|
|
|
350
248
|
let texts: Vec<String> = chunks.iter().map(|chunk| chunk.content.clone()).collect();
|
|
351
249
|
|
|
352
250
|
let embeddings_result = {
|
|
353
|
-
let locked_model = model.lock().map_err(|e| crate::KreuzbergError::Plugin {
|
|
251
|
+
let mut locked_model = model.lock().map_err(|e| crate::KreuzbergError::Plugin {
|
|
354
252
|
message: format!("Failed to acquire model lock: {}", e),
|
|
355
253
|
plugin_name: "embeddings".to_string(),
|
|
356
254
|
})?;
|
|
357
255
|
|
|
358
|
-
|
|
359
|
-
let model_mut = unsafe { locked_model.get_mut() };
|
|
360
|
-
|
|
361
|
-
model_mut
|
|
256
|
+
locked_model
|
|
362
257
|
.embed(texts, Some(config.batch_size))
|
|
363
258
|
.map_err(|e| crate::KreuzbergError::Plugin {
|
|
364
259
|
message: format!("Failed to generate embeddings: {}", e),
|
|
@@ -425,8 +320,4 @@ mod tests {
|
|
|
425
320
|
assert_eq!(quality.chunk_size, 2000);
|
|
426
321
|
assert_eq!(quality.overlap, 200);
|
|
427
322
|
}
|
|
428
|
-
|
|
429
|
-
#[cfg(feature = "embeddings")]
|
|
430
|
-
#[test]
|
|
431
|
-
fn test_lock_poisoning_recovery_semantics() {}
|
|
432
323
|
}
|
|
@@ -60,7 +60,7 @@ pub type Result<T> = std::result::Result<T, KreuzbergError>;
|
|
|
60
60
|
/// - `Cache` - Cache operation errors (non-fatal, can be ignored)
|
|
61
61
|
/// - `ImageProcessing` - Image manipulation errors
|
|
62
62
|
/// - `Serialization` - JSON/MessagePack serialization errors
|
|
63
|
-
/// - `MissingDependency` - Missing optional dependencies (tesseract, etc.)
|
|
63
|
+
/// - `MissingDependency` - Missing optional dependencies (tesseract, pandoc, etc.)
|
|
64
64
|
/// - `Plugin` - Plugin-specific errors
|
|
65
65
|
/// - `LockPoisoned` - Mutex/RwLock poisoning (should not happen in normal operation)
|
|
66
66
|
/// - `UnsupportedFormat` - Unsupported MIME type or file format
|
|
@@ -16,7 +16,7 @@ use crate::error::{KreuzbergError, Result};
|
|
|
16
16
|
///
|
|
17
17
|
/// # Performance
|
|
18
18
|
/// docx-lite uses streaming XML parsing for minimal memory overhead and high throughput
|
|
19
|
-
/// (~160 MB/s average).
|
|
19
|
+
/// (~160 MB/s average). It eliminates subprocess overhead compared to Pandoc (~400x faster).
|
|
20
20
|
pub fn extract_text(bytes: &[u8]) -> Result<String> {
|
|
21
21
|
docx_lite::extract_text_from_bytes(bytes)
|
|
22
22
|
.map_err(|e| KreuzbergError::parsing(format!("DOCX text extraction failed: {}", e)))
|
|
@@ -39,7 +39,7 @@ pub fn extract_image_metadata(bytes: &[u8]) -> Result<ImageMetadata> {
|
|
|
39
39
|
|
|
40
40
|
let width = image.width();
|
|
41
41
|
let height = image.height();
|
|
42
|
-
let format_str = format!("{:?}", format)
|
|
42
|
+
let format_str = format!("{:?}", format);
|
|
43
43
|
|
|
44
44
|
let exif_data = extract_exif_data(bytes);
|
|
45
45
|
|
|
@@ -123,7 +123,7 @@ mod tests {
|
|
|
123
123
|
let metadata = result.unwrap();
|
|
124
124
|
assert_eq!(metadata.width, 100);
|
|
125
125
|
assert_eq!(metadata.height, 80);
|
|
126
|
-
assert_eq!(metadata.format, "
|
|
126
|
+
assert_eq!(metadata.format, "Png");
|
|
127
127
|
}
|
|
128
128
|
|
|
129
129
|
#[test]
|
|
@@ -135,7 +135,7 @@ mod tests {
|
|
|
135
135
|
let metadata = result.unwrap();
|
|
136
136
|
assert_eq!(metadata.width, 200);
|
|
137
137
|
assert_eq!(metadata.height, 150);
|
|
138
|
-
assert_eq!(metadata.format, "
|
|
138
|
+
assert_eq!(metadata.format, "Jpeg");
|
|
139
139
|
}
|
|
140
140
|
|
|
141
141
|
#[test]
|
|
@@ -147,7 +147,7 @@ mod tests {
|
|
|
147
147
|
let metadata = result.unwrap();
|
|
148
148
|
assert_eq!(metadata.width, 120);
|
|
149
149
|
assert_eq!(metadata.height, 90);
|
|
150
|
-
assert_eq!(metadata.format, "
|
|
150
|
+
assert_eq!(metadata.format, "WebP");
|
|
151
151
|
}
|
|
152
152
|
|
|
153
153
|
#[test]
|
|
@@ -159,7 +159,7 @@ mod tests {
|
|
|
159
159
|
let metadata = result.unwrap();
|
|
160
160
|
assert_eq!(metadata.width, 50);
|
|
161
161
|
assert_eq!(metadata.height, 50);
|
|
162
|
-
assert_eq!(metadata.format, "
|
|
162
|
+
assert_eq!(metadata.format, "Bmp");
|
|
163
163
|
}
|
|
164
164
|
|
|
165
165
|
#[test]
|
|
@@ -171,7 +171,7 @@ mod tests {
|
|
|
171
171
|
let metadata = result.unwrap();
|
|
172
172
|
assert_eq!(metadata.width, 180);
|
|
173
173
|
assert_eq!(metadata.height, 120);
|
|
174
|
-
assert_eq!(metadata.format, "
|
|
174
|
+
assert_eq!(metadata.format, "Tiff");
|
|
175
175
|
}
|
|
176
176
|
|
|
177
177
|
#[test]
|
|
@@ -183,7 +183,7 @@ mod tests {
|
|
|
183
183
|
let metadata = result.unwrap();
|
|
184
184
|
assert_eq!(metadata.width, 64);
|
|
185
185
|
assert_eq!(metadata.height, 64);
|
|
186
|
-
assert_eq!(metadata.format, "
|
|
186
|
+
assert_eq!(metadata.format, "Gif");
|
|
187
187
|
}
|
|
188
188
|
|
|
189
189
|
#[test]
|
|
@@ -217,8 +217,8 @@ mod tests {
|
|
|
217
217
|
let png_metadata = extract_image_metadata(&png_bytes).unwrap();
|
|
218
218
|
let jpeg_metadata = extract_image_metadata(&jpeg_bytes).unwrap();
|
|
219
219
|
|
|
220
|
-
assert_eq!(png_metadata.format, "
|
|
221
|
-
assert_eq!(jpeg_metadata.format, "
|
|
220
|
+
assert_eq!(png_metadata.format, "Png");
|
|
221
|
+
assert_eq!(jpeg_metadata.format, "Jpeg");
|
|
222
222
|
}
|
|
223
223
|
|
|
224
224
|
#[test]
|
|
@@ -284,7 +284,7 @@ mod tests {
|
|
|
284
284
|
let metadata = result.unwrap();
|
|
285
285
|
assert_eq!(metadata.width, 1);
|
|
286
286
|
assert_eq!(metadata.height, 1);
|
|
287
|
-
assert_eq!(metadata.format, "
|
|
287
|
+
assert_eq!(metadata.format, "Png");
|
|
288
288
|
}
|
|
289
289
|
|
|
290
290
|
#[test]
|
|
@@ -361,8 +361,8 @@ mod tests {
|
|
|
361
361
|
let jpeg_meta = extract_image_metadata(&jpeg_bytes).unwrap();
|
|
362
362
|
let webp_meta = extract_image_metadata(&webp_bytes).unwrap();
|
|
363
363
|
|
|
364
|
-
assert_eq!(png_meta.format, "
|
|
365
|
-
assert_eq!(jpeg_meta.format, "
|
|
366
|
-
assert_eq!(webp_meta.format, "
|
|
364
|
+
assert_eq!(png_meta.format, "Png");
|
|
365
|
+
assert_eq!(jpeg_meta.format, "Jpeg");
|
|
366
|
+
assert_eq!(webp_meta.format, "WebP");
|
|
367
367
|
}
|
|
368
368
|
}
|
|
@@ -255,6 +255,7 @@ pub async fn convert_office_doc(
|
|
|
255
255
|
let stderr = String::from_utf8_lossy(&output.stderr);
|
|
256
256
|
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
257
257
|
|
|
258
|
+
// Build detailed error message with both stdout and stderr
|
|
258
259
|
let mut error_details = format!(
|
|
259
260
|
"LibreOffice process failed with return code {}",
|
|
260
261
|
output.status.code().unwrap_or(-1)
|
|
@@ -25,6 +25,9 @@ pub mod libreoffice;
|
|
|
25
25
|
#[cfg(feature = "office")]
|
|
26
26
|
pub mod office_metadata;
|
|
27
27
|
|
|
28
|
+
#[cfg(feature = "office")]
|
|
29
|
+
pub mod pandoc;
|
|
30
|
+
|
|
28
31
|
#[cfg(feature = "office")]
|
|
29
32
|
pub mod pptx;
|
|
30
33
|
|
|
@@ -34,9 +37,6 @@ pub mod table;
|
|
|
34
37
|
#[cfg(feature = "xml")]
|
|
35
38
|
pub mod xml;
|
|
36
39
|
|
|
37
|
-
#[cfg(any(feature = "office", feature = "html"))]
|
|
38
|
-
pub mod markdown;
|
|
39
|
-
|
|
40
40
|
pub use structured::{JsonExtractionConfig, StructuredDataResult, parse_json, parse_toml, parse_yaml};
|
|
41
41
|
pub use text::parse_text;
|
|
42
42
|
|
|
@@ -63,9 +63,8 @@ pub use libreoffice::{check_libreoffice_available, convert_doc_to_docx, convert_
|
|
|
63
63
|
|
|
64
64
|
#[cfg(feature = "office")]
|
|
65
65
|
pub use office_metadata::{
|
|
66
|
-
CoreProperties, CustomProperties, DocxAppProperties,
|
|
67
|
-
|
|
68
|
-
extract_pptx_app_properties, extract_xlsx_app_properties,
|
|
66
|
+
CoreProperties, CustomProperties, DocxAppProperties, PptxAppProperties, XlsxAppProperties, extract_core_properties,
|
|
67
|
+
extract_custom_properties, extract_docx_app_properties, extract_pptx_app_properties, extract_xlsx_app_properties,
|
|
69
68
|
};
|
|
70
69
|
|
|
71
70
|
#[cfg(feature = "office")]
|
|
@@ -76,6 +75,3 @@ pub use table::table_from_arrow_to_markdown;
|
|
|
76
75
|
|
|
77
76
|
#[cfg(feature = "xml")]
|
|
78
77
|
pub use xml::parse_xml;
|
|
79
|
-
|
|
80
|
-
#[cfg(any(feature = "office", feature = "html"))]
|
|
81
|
-
pub use markdown::cells_to_markdown;
|
|
@@ -35,7 +35,6 @@
|
|
|
35
35
|
pub mod app_properties;
|
|
36
36
|
pub mod core_properties;
|
|
37
37
|
pub mod custom_properties;
|
|
38
|
-
pub mod odt_properties;
|
|
39
38
|
|
|
40
39
|
pub use app_properties::{
|
|
41
40
|
DocxAppProperties, PptxAppProperties, XlsxAppProperties, extract_docx_app_properties, extract_pptx_app_properties,
|
|
@@ -43,7 +42,6 @@ pub use app_properties::{
|
|
|
43
42
|
};
|
|
44
43
|
pub use core_properties::{CoreProperties, extract_core_properties};
|
|
45
44
|
pub use custom_properties::{CustomProperties, extract_custom_properties};
|
|
46
|
-
pub use odt_properties::{OdtProperties, extract_odt_properties};
|
|
47
45
|
|
|
48
46
|
use roxmltree::Node;
|
|
49
47
|
|