kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +28 -116
- data/README.md +269 -629
- data/Rakefile +0 -9
- data/Steepfile +4 -8
- data/examples/async_patterns.rb +58 -1
- data/ext/kreuzberg_rb/extconf.rb +5 -35
- data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
- data/ext/kreuzberg_rb/native/build.rs +14 -12
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
- data/extconf.rb +6 -38
- data/kreuzberg.gemspec +20 -114
- data/lib/kreuzberg/api_proxy.rb +18 -2
- data/lib/kreuzberg/cache_api.rb +0 -22
- data/lib/kreuzberg/cli.rb +10 -2
- data/lib/kreuzberg/cli_proxy.rb +10 -0
- data/lib/kreuzberg/config.rb +22 -274
- data/lib/kreuzberg/errors.rb +7 -73
- data/lib/kreuzberg/extraction_api.rb +8 -237
- data/lib/kreuzberg/mcp_proxy.rb +11 -2
- data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
- data/lib/kreuzberg/post_processor_protocol.rb +71 -0
- data/lib/kreuzberg/result.rb +33 -151
- data/lib/kreuzberg/setup_lib_path.rb +2 -22
- data/lib/kreuzberg/validator_protocol.rb +73 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +13 -27
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +12 -105
- data/spec/binding/cache_spec.rb +22 -22
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/binding/config_spec.rb +0 -74
- data/spec/binding/config_validation_spec.rb +6 -100
- data/spec/binding/error_handling_spec.rb +97 -283
- data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
- data/spec/binding/plugins/postprocessor_spec.rb +11 -11
- data/spec/binding/plugins/validator_spec.rb +13 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +1 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/fixtures/invalid_config.toml +1 -0
- data/spec/smoke/package_spec.rb +3 -2
- data/spec/spec_helper.rb +3 -1
- data/vendor/kreuzberg/Cargo.toml +67 -192
- data/vendor/kreuzberg/README.md +9 -97
- data/vendor/kreuzberg/build.rs +194 -516
- data/vendor/kreuzberg/src/api/handlers.rs +9 -130
- data/vendor/kreuzberg/src/api/mod.rs +3 -18
- data/vendor/kreuzberg/src/api/server.rs +71 -236
- data/vendor/kreuzberg/src/api/types.rs +7 -43
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/config.rs +23 -905
- data/vendor/kreuzberg/src/core/extractor.rs +106 -403
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +3 -22
- data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
- data/vendor/kreuzberg/src/embeddings.rs +21 -169
- data/vendor/kreuzberg/src/error.rs +2 -2
- data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
- data/vendor/kreuzberg/src/extraction/email.rs +11 -12
- data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
- data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
- data/vendor/kreuzberg/src/extraction/image.rs +14 -138
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
- data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
- data/vendor/kreuzberg/src/extraction/table.rs +1 -2
- data/vendor/kreuzberg/src/extraction/text.rs +10 -18
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
- data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
- data/vendor/kreuzberg/src/extractors/email.rs +9 -37
- data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
- data/vendor/kreuzberg/src/extractors/html.rs +173 -182
- data/vendor/kreuzberg/src/extractors/image.rs +8 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
- data/vendor/kreuzberg/src/extractors/text.rs +7 -30
- data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
- data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
- data/vendor/kreuzberg/src/lib.rs +5 -17
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
- data/vendor/kreuzberg/src/mcp/server.rs +21 -145
- data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
- data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
- data/vendor/kreuzberg/src/pdf/error.rs +1 -93
- data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
- data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
- data/vendor/kreuzberg/src/pdf/table.rs +64 -61
- data/vendor/kreuzberg/src/pdf/text.rs +24 -416
- data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
- data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
- data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/mod.rs +0 -8
- data/vendor/kreuzberg/src/text/quality.rs +15 -28
- data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
- data/vendor/kreuzberg/src/types.rs +67 -907
- data/vendor/kreuzberg/src/utils/mod.rs +0 -14
- data/vendor/kreuzberg/src/utils/quality.rs +3 -12
- data/vendor/kreuzberg/tests/api_tests.rs +0 -506
- data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
- data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
- data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
- data/vendor/kreuzberg/tests/config_features.rs +1 -33
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
- data/vendor/kreuzberg/tests/core_integration.rs +9 -35
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/email_integration.rs +1 -3
- data/vendor/kreuzberg/tests/error_handling.rs +34 -43
- data/vendor/kreuzberg/tests/format_integration.rs +1 -7
- data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
- data/vendor/kreuzberg/tests/image_integration.rs +0 -2
- data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
- data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
- data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
- data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
- data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
- data/vendor/kreuzberg/tests/security_validation.rs +1 -13
- data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
- metadata +25 -171
- data/.rubocop.yml +0 -543
- data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
- data/lib/kreuzberg/error_context.rb +0 -136
- data/lib/kreuzberg/types.rb +0 -170
- data/lib/libpdfium.so +0 -0
- data/spec/binding/async_operations_spec.rb +0 -473
- data/spec/binding/batch_operations_spec.rb +0 -595
- data/spec/binding/batch_spec.rb +0 -359
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/embeddings_spec.rb +0 -816
- data/spec/binding/error_recovery_spec.rb +0 -488
- data/spec/binding/font_config_spec.rb +0 -220
- data/spec/binding/images_spec.rb +0 -738
- data/spec/binding/keywords_extraction_spec.rb +0 -600
- data/spec/binding/metadata_types_spec.rb +0 -1228
- data/spec/binding/pages_extraction_spec.rb +0 -471
- data/spec/binding/tables_spec.rb +0 -641
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -438
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
- data/test/metadata_types_test.rb +0 -959
- data/vendor/Cargo.toml +0 -61
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/formats.rs +0 -235
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
- data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
- data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
- data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
- data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
- data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
- data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
- data/vendor/kreuzberg/src/utils/pool.rs +0 -503
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
- data/vendor/kreuzberg/tests/api_embed.rs +0 -360
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/page_markers.rs +0 -297
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
- data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
- data/vendor/kreuzberg-ffi/build.rs +0 -168
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
- data/vendor/kreuzberg-ffi/src/error.rs +0 -901
- data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
- data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
- data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
- data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
- data/vendor/kreuzberg-ffi/src/result.rs +0 -510
- data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
- data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
- data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
- data/vendor/kreuzberg-ffi/src/types.rs +0 -363
- data/vendor/kreuzberg-ffi/src/util.rs +0 -210
- data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
- data/vendor/kreuzberg-tesseract/LICENSE +0 -22
- data/vendor/kreuzberg-tesseract/README.md +0 -399
- data/vendor/kreuzberg-tesseract/build.rs +0 -1127
- data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
|
@@ -2,16 +2,12 @@
|
|
|
2
2
|
|
|
3
3
|
use crate::Result;
|
|
4
4
|
use crate::core::config::ExtractionConfig;
|
|
5
|
-
use crate::extractors::SyncExtractor;
|
|
6
5
|
use crate::plugins::{DocumentExtractor, Plugin};
|
|
7
|
-
use crate::text::utf8_validation;
|
|
8
6
|
use crate::types::{ExtractionResult, Metadata, Table};
|
|
9
7
|
use async_trait::async_trait;
|
|
10
|
-
|
|
8
|
+
use scraper::{Html, Selector};
|
|
11
9
|
use std::path::Path;
|
|
12
10
|
|
|
13
|
-
// NOTE: scraper dependency has been removed in favor of html-to-markdown-rs
|
|
14
|
-
|
|
15
11
|
/// HTML document extractor using html-to-markdown.
|
|
16
12
|
pub struct HtmlExtractor;
|
|
17
13
|
|
|
@@ -27,152 +23,134 @@ impl HtmlExtractor {
|
|
|
27
23
|
}
|
|
28
24
|
}
|
|
29
25
|
|
|
30
|
-
/// Extract all tables from
|
|
31
|
-
///
|
|
32
|
-
/// Parses markdown pipe-delimited format to extract table structures.
|
|
33
|
-
/// This function accepts already-converted markdown to enable reuse of
|
|
34
|
-
/// a single HTML-to-markdown conversion pass, improving performance by
|
|
35
|
-
/// eliminating duplicate conversions.
|
|
26
|
+
/// Extract all tables from HTML content.
|
|
36
27
|
///
|
|
37
|
-
///
|
|
38
|
-
///
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
}
|
|
28
|
+
/// Parses HTML to find `<table>` elements and extracts their structure
|
|
29
|
+
/// into `Table` objects with cells and markdown representation.
|
|
30
|
+
fn extract_html_tables(html: &str) -> Result<Vec<Table>> {
|
|
31
|
+
let document = Html::parse_document(html);
|
|
32
|
+
let table_selector = Selector::parse("table")
|
|
33
|
+
.map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to parse table selector: {}", e)))?;
|
|
34
|
+
let row_selector = Selector::parse("tr")
|
|
35
|
+
.map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to parse row selector: {}", e)))?;
|
|
36
|
+
let header_selector = Selector::parse("th")
|
|
37
|
+
.map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to parse header selector: {}", e)))?;
|
|
38
|
+
let cell_selector = Selector::parse("td")
|
|
39
|
+
.map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to parse cell selector: {}", e)))?;
|
|
45
40
|
|
|
46
|
-
/// Parse markdown tables from HTML-converted markdown.
|
|
47
|
-
///
|
|
48
|
-
/// Extracts table data from markdown pipe-delimited format.
|
|
49
|
-
/// This maintains the existing Table structure API.
|
|
50
|
-
fn parse_markdown_tables(markdown: &str) -> Vec<Table> {
|
|
51
41
|
let mut tables = Vec::new();
|
|
52
|
-
let mut table_index = 0;
|
|
53
|
-
let lines: Vec<&str> = markdown.lines().collect();
|
|
54
|
-
let mut i = 0;
|
|
55
|
-
|
|
56
|
-
while i < lines.len() {
|
|
57
|
-
if lines[i].trim_start().starts_with('|')
|
|
58
|
-
&& let Some((cells, end_idx)) = extract_markdown_table(&lines, i)
|
|
59
|
-
&& !cells.is_empty()
|
|
60
|
-
{
|
|
61
|
-
let markdown_table = reconstruct_markdown_table(&cells);
|
|
62
|
-
tables.push(Table {
|
|
63
|
-
cells,
|
|
64
|
-
markdown: markdown_table,
|
|
65
|
-
page_number: table_index + 1,
|
|
66
|
-
});
|
|
67
|
-
table_index += 1;
|
|
68
|
-
i = end_idx;
|
|
69
|
-
continue;
|
|
70
|
-
}
|
|
71
|
-
i += 1;
|
|
72
|
-
}
|
|
73
42
|
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
43
|
+
for (table_index, table_elem) in document.select(&table_selector).enumerate() {
|
|
44
|
+
let mut cells: Vec<Vec<String>> = Vec::new();
|
|
45
|
+
|
|
46
|
+
for row in table_elem.select(&row_selector) {
|
|
47
|
+
let mut row_cells = Vec::new();
|
|
48
|
+
|
|
49
|
+
// Try headers first (th elements)
|
|
50
|
+
let headers: Vec<_> = row.select(&header_selector).collect();
|
|
51
|
+
if !headers.is_empty() {
|
|
52
|
+
for header in headers {
|
|
53
|
+
let text = header
|
|
54
|
+
.text()
|
|
55
|
+
.collect::<Vec<_>>()
|
|
56
|
+
.join(" ")
|
|
57
|
+
.split_whitespace()
|
|
58
|
+
.collect::<Vec<_>>()
|
|
59
|
+
.join(" ");
|
|
60
|
+
row_cells.push(text);
|
|
61
|
+
}
|
|
62
|
+
} else {
|
|
63
|
+
// Use data cells (td elements)
|
|
64
|
+
for cell in row.select(&cell_selector) {
|
|
65
|
+
let text = cell
|
|
66
|
+
.text()
|
|
67
|
+
.collect::<Vec<_>>()
|
|
68
|
+
.join(" ")
|
|
69
|
+
.split_whitespace()
|
|
70
|
+
.collect::<Vec<_>>()
|
|
71
|
+
.join(" ");
|
|
72
|
+
row_cells.push(text);
|
|
73
|
+
}
|
|
74
|
+
}
|
|
96
75
|
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
i += 1;
|
|
76
|
+
if !row_cells.is_empty() {
|
|
77
|
+
cells.push(row_cells);
|
|
78
|
+
}
|
|
101
79
|
}
|
|
102
|
-
}
|
|
103
80
|
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
i += 1;
|
|
81
|
+
// Only create a table if it has content
|
|
82
|
+
if !cells.is_empty() {
|
|
83
|
+
let markdown = cells_to_markdown(&cells);
|
|
84
|
+
tables.push(Table {
|
|
85
|
+
cells,
|
|
86
|
+
markdown,
|
|
87
|
+
page_number: table_index + 1, // 1-indexed
|
|
88
|
+
});
|
|
113
89
|
}
|
|
114
90
|
}
|
|
115
91
|
|
|
116
|
-
|
|
117
|
-
}
|
|
118
|
-
|
|
119
|
-
/// Parse a single markdown table row into cell contents.
|
|
120
|
-
fn parse_markdown_table_row(line: &str) -> Option<Vec<String>> {
|
|
121
|
-
let trimmed = line.trim_start();
|
|
122
|
-
|
|
123
|
-
if !trimmed.starts_with('|') || !trimmed.contains('|') {
|
|
124
|
-
return None;
|
|
125
|
-
}
|
|
126
|
-
|
|
127
|
-
let cells: Vec<String> = trimmed
|
|
128
|
-
.split('|')
|
|
129
|
-
.skip(1)
|
|
130
|
-
.map(|cell| cell.trim().to_string())
|
|
131
|
-
.filter(|cell| !cell.is_empty())
|
|
132
|
-
.collect();
|
|
133
|
-
|
|
134
|
-
if cells.is_empty() { None } else { Some(cells) }
|
|
135
|
-
}
|
|
136
|
-
|
|
137
|
-
/// Check if a line is a markdown table separator.
|
|
138
|
-
fn is_markdown_table_separator(line: &str) -> bool {
|
|
139
|
-
let trimmed = line.trim_start();
|
|
140
|
-
if !trimmed.starts_with('|') {
|
|
141
|
-
return false;
|
|
142
|
-
}
|
|
143
|
-
|
|
144
|
-
trimmed
|
|
145
|
-
.split('|')
|
|
146
|
-
.all(|cell| cell.trim().chars().all(|c| c == '-' || c == ':' || c.is_whitespace()))
|
|
92
|
+
Ok(tables)
|
|
147
93
|
}
|
|
148
94
|
|
|
149
|
-
///
|
|
95
|
+
/// Convert table cells to markdown format.
|
|
96
|
+
///
|
|
97
|
+
/// Reuses the same logic as DOCX extractor for consistency.
|
|
98
|
+
/// First row is treated as header, remaining rows as data.
|
|
150
99
|
///
|
|
151
|
-
///
|
|
152
|
-
|
|
100
|
+
/// # Arguments
|
|
101
|
+
/// * `cells` - 2D vector of cell strings (rows × columns)
|
|
102
|
+
///
|
|
103
|
+
/// # Returns
|
|
104
|
+
/// * `String` - Markdown formatted table
|
|
105
|
+
fn cells_to_markdown(cells: &[Vec<String>]) -> String {
|
|
153
106
|
if cells.is_empty() {
|
|
154
107
|
return String::new();
|
|
155
108
|
}
|
|
156
109
|
|
|
157
110
|
let mut markdown = String::new();
|
|
158
111
|
|
|
159
|
-
|
|
112
|
+
// Determine number of columns from first row
|
|
113
|
+
let num_cols = cells.first().map(|r| r.len()).unwrap_or(0);
|
|
114
|
+
if num_cols == 0 {
|
|
115
|
+
return String::new();
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
// Header row (first row)
|
|
119
|
+
if let Some(header) = cells.first() {
|
|
120
|
+
markdown.push_str("| ");
|
|
121
|
+
for cell in header {
|
|
122
|
+
// Escape pipe characters in cell content
|
|
123
|
+
let escaped = cell.replace('|', "\\|");
|
|
124
|
+
markdown.push_str(&escaped);
|
|
125
|
+
markdown.push_str(" | ");
|
|
126
|
+
}
|
|
127
|
+
markdown.push('\n');
|
|
128
|
+
|
|
129
|
+
// Separator row
|
|
160
130
|
markdown.push('|');
|
|
161
|
-
for
|
|
162
|
-
markdown.
|
|
163
|
-
markdown.push_str(cell);
|
|
164
|
-
markdown.push(' ');
|
|
165
|
-
markdown.push('|');
|
|
131
|
+
for _ in 0..num_cols {
|
|
132
|
+
markdown.push_str("------|");
|
|
166
133
|
}
|
|
167
134
|
markdown.push('\n');
|
|
135
|
+
}
|
|
168
136
|
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
137
|
+
// Data rows (skip first row as it's the header)
|
|
138
|
+
for row in cells.iter().skip(1) {
|
|
139
|
+
markdown.push_str("| ");
|
|
140
|
+
for (idx, cell) in row.iter().enumerate() {
|
|
141
|
+
if idx >= num_cols {
|
|
142
|
+
break; // Handle irregular tables
|
|
173
143
|
}
|
|
174
|
-
|
|
144
|
+
// Escape pipe characters in cell content
|
|
145
|
+
let escaped = cell.replace('|', "\\|");
|
|
146
|
+
markdown.push_str(&escaped);
|
|
147
|
+
markdown.push_str(" | ");
|
|
175
148
|
}
|
|
149
|
+
// Pad with empty cells if row is shorter than expected
|
|
150
|
+
for _ in row.len()..num_cols {
|
|
151
|
+
markdown.push_str(" | ");
|
|
152
|
+
}
|
|
153
|
+
markdown.push('\n');
|
|
176
154
|
}
|
|
177
155
|
|
|
178
156
|
markdown
|
|
@@ -196,18 +174,24 @@ impl Plugin for HtmlExtractor {
|
|
|
196
174
|
}
|
|
197
175
|
}
|
|
198
176
|
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
177
|
+
#[async_trait]
|
|
178
|
+
impl DocumentExtractor for HtmlExtractor {
|
|
179
|
+
async fn extract_bytes(
|
|
180
|
+
&self,
|
|
181
|
+
content: &[u8],
|
|
182
|
+
mime_type: &str,
|
|
183
|
+
config: &ExtractionConfig,
|
|
184
|
+
) -> Result<ExtractionResult> {
|
|
185
|
+
let html = std::str::from_utf8(content)
|
|
202
186
|
.map(|s| s.to_string())
|
|
203
187
|
.unwrap_or_else(|_| String::from_utf8_lossy(content).to_string());
|
|
204
188
|
|
|
205
|
-
|
|
206
|
-
|
|
189
|
+
// Extract tables from HTML
|
|
190
|
+
let tables = extract_html_tables(&html)?;
|
|
207
191
|
|
|
208
|
-
let
|
|
192
|
+
let markdown = crate::extraction::html::convert_html_to_markdown(&html, config.html_options.clone())?;
|
|
209
193
|
|
|
210
|
-
let content_without_frontmatter = markdown
|
|
194
|
+
let (html_metadata, content_without_frontmatter) = crate::extraction::html::parse_html_metadata(&markdown)?;
|
|
211
195
|
|
|
212
196
|
Ok(ExtractionResult {
|
|
213
197
|
content: content_without_frontmatter,
|
|
@@ -216,41 +200,13 @@ impl SyncExtractor for HtmlExtractor {
|
|
|
216
200
|
format: html_metadata.map(|m| crate::types::FormatMetadata::Html(Box::new(m))),
|
|
217
201
|
..Default::default()
|
|
218
202
|
},
|
|
219
|
-
pages: None,
|
|
220
203
|
tables,
|
|
221
204
|
detected_languages: None,
|
|
222
205
|
chunks: None,
|
|
223
206
|
images: None,
|
|
224
207
|
})
|
|
225
208
|
}
|
|
226
|
-
}
|
|
227
209
|
|
|
228
|
-
#[async_trait]
|
|
229
|
-
impl DocumentExtractor for HtmlExtractor {
|
|
230
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
231
|
-
skip(self, content, config),
|
|
232
|
-
fields(
|
|
233
|
-
extractor.name = self.name(),
|
|
234
|
-
content.size_bytes = content.len(),
|
|
235
|
-
)
|
|
236
|
-
))]
|
|
237
|
-
async fn extract_bytes(
|
|
238
|
-
&self,
|
|
239
|
-
content: &[u8],
|
|
240
|
-
mime_type: &str,
|
|
241
|
-
config: &ExtractionConfig,
|
|
242
|
-
) -> Result<ExtractionResult> {
|
|
243
|
-
self.extract_sync(content, mime_type, config)
|
|
244
|
-
}
|
|
245
|
-
|
|
246
|
-
#[cfg(feature = "tokio-runtime")]
|
|
247
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
248
|
-
skip(self, path, config),
|
|
249
|
-
fields(
|
|
250
|
-
extractor.name = self.name(),
|
|
251
|
-
)
|
|
252
|
-
))]
|
|
253
|
-
#[cfg(feature = "tokio-runtime")]
|
|
254
210
|
async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
255
211
|
let bytes = tokio::fs::read(path).await?;
|
|
256
212
|
self.extract_bytes(&bytes, mime_type, config).await
|
|
@@ -263,21 +219,12 @@ impl DocumentExtractor for HtmlExtractor {
|
|
|
263
219
|
fn priority(&self) -> i32 {
|
|
264
220
|
50
|
|
265
221
|
}
|
|
266
|
-
|
|
267
|
-
fn as_sync_extractor(&self) -> Option<&dyn crate::extractors::SyncExtractor> {
|
|
268
|
-
Some(self)
|
|
269
|
-
}
|
|
270
222
|
}
|
|
271
223
|
|
|
272
224
|
#[cfg(test)]
|
|
273
225
|
mod tests {
|
|
274
226
|
use super::*;
|
|
275
227
|
|
|
276
|
-
/// Helper function to convert HTML to markdown for testing
|
|
277
|
-
fn html_to_markdown_for_test(html: &str) -> String {
|
|
278
|
-
crate::extraction::html::convert_html_to_markdown(html, None).unwrap()
|
|
279
|
-
}
|
|
280
|
-
|
|
281
228
|
#[test]
|
|
282
229
|
fn test_html_extractor_plugin_interface() {
|
|
283
230
|
let extractor = HtmlExtractor::new();
|
|
@@ -305,8 +252,7 @@ mod tests {
|
|
|
305
252
|
</table>
|
|
306
253
|
"#;
|
|
307
254
|
|
|
308
|
-
let
|
|
309
|
-
let tables = extract_html_tables(&markdown).unwrap();
|
|
255
|
+
let tables = extract_html_tables(html).unwrap();
|
|
310
256
|
assert_eq!(tables.len(), 1);
|
|
311
257
|
|
|
312
258
|
let table = &tables[0];
|
|
@@ -316,6 +262,7 @@ mod tests {
|
|
|
316
262
|
assert_eq!(table.cells[2], vec!["Row2Col1", "Row2Col2"]);
|
|
317
263
|
assert_eq!(table.page_number, 1);
|
|
318
264
|
|
|
265
|
+
// Check markdown format
|
|
319
266
|
assert!(table.markdown.contains("| Header1 | Header2 |"));
|
|
320
267
|
assert!(table.markdown.contains("|------|------|"));
|
|
321
268
|
assert!(table.markdown.contains("| Row1Col1 | Row1Col2 |"));
|
|
@@ -335,8 +282,7 @@ mod tests {
|
|
|
335
282
|
</table>
|
|
336
283
|
"#;
|
|
337
284
|
|
|
338
|
-
let
|
|
339
|
-
let tables = extract_html_tables(&markdown).unwrap();
|
|
285
|
+
let tables = extract_html_tables(html).unwrap();
|
|
340
286
|
assert_eq!(tables.len(), 2);
|
|
341
287
|
assert_eq!(tables[0].page_number, 1);
|
|
342
288
|
assert_eq!(tables[1].page_number, 2);
|
|
@@ -351,8 +297,7 @@ mod tests {
|
|
|
351
297
|
</table>
|
|
352
298
|
"#;
|
|
353
299
|
|
|
354
|
-
let
|
|
355
|
-
let tables = extract_html_tables(&markdown).unwrap();
|
|
300
|
+
let tables = extract_html_tables(html).unwrap();
|
|
356
301
|
assert_eq!(tables.len(), 1);
|
|
357
302
|
|
|
358
303
|
let table = &tables[0];
|
|
@@ -364,8 +309,7 @@ mod tests {
|
|
|
364
309
|
#[test]
|
|
365
310
|
fn test_extract_html_tables_empty() {
|
|
366
311
|
let html = "<p>No tables here</p>";
|
|
367
|
-
let
|
|
368
|
-
let tables = extract_html_tables(&markdown).unwrap();
|
|
312
|
+
let tables = extract_html_tables(html).unwrap();
|
|
369
313
|
assert_eq!(tables.len(), 0);
|
|
370
314
|
}
|
|
371
315
|
|
|
@@ -378,13 +322,60 @@ mod tests {
|
|
|
378
322
|
</table>
|
|
379
323
|
"#;
|
|
380
324
|
|
|
381
|
-
let
|
|
382
|
-
let tables = extract_html_tables(&markdown).unwrap();
|
|
325
|
+
let tables = extract_html_tables(html).unwrap();
|
|
383
326
|
assert_eq!(tables.len(), 1);
|
|
384
327
|
|
|
385
328
|
let table = &tables[0];
|
|
386
|
-
|
|
387
|
-
assert_eq!(table.cells[
|
|
329
|
+
// Whitespace is normalized during text extraction
|
|
330
|
+
assert_eq!(table.cells[0][0], "Header Bold");
|
|
331
|
+
assert_eq!(table.cells[1][0], "Data with emphasis");
|
|
332
|
+
}
|
|
333
|
+
|
|
334
|
+
#[test]
|
|
335
|
+
fn test_cells_to_markdown_basic() {
|
|
336
|
+
let cells = vec![
|
|
337
|
+
vec!["Header1".to_string(), "Header2".to_string()],
|
|
338
|
+
vec!["Row1Col1".to_string(), "Row1Col2".to_string()],
|
|
339
|
+
vec!["Row2Col1".to_string(), "Row2Col2".to_string()],
|
|
340
|
+
];
|
|
341
|
+
|
|
342
|
+
let markdown = cells_to_markdown(&cells);
|
|
343
|
+
|
|
344
|
+
assert!(markdown.contains("| Header1 | Header2 |"));
|
|
345
|
+
assert!(markdown.contains("|------|------|"));
|
|
346
|
+
assert!(markdown.contains("| Row1Col1 | Row1Col2 |"));
|
|
347
|
+
assert!(markdown.contains("| Row2Col1 | Row2Col2 |"));
|
|
348
|
+
}
|
|
349
|
+
|
|
350
|
+
#[test]
|
|
351
|
+
fn test_cells_to_markdown_empty() {
|
|
352
|
+
let cells: Vec<Vec<String>> = vec![];
|
|
353
|
+
let markdown = cells_to_markdown(&cells);
|
|
354
|
+
assert_eq!(markdown, "");
|
|
355
|
+
}
|
|
356
|
+
|
|
357
|
+
#[test]
|
|
358
|
+
fn test_cells_to_markdown_escape_pipes() {
|
|
359
|
+
let cells = vec![vec!["Header".to_string()], vec!["Cell with | pipe".to_string()]];
|
|
360
|
+
|
|
361
|
+
let markdown = cells_to_markdown(&cells);
|
|
362
|
+
assert!(markdown.contains("Cell with \\| pipe"));
|
|
363
|
+
}
|
|
364
|
+
|
|
365
|
+
#[test]
|
|
366
|
+
fn test_cells_to_markdown_irregular_rows() {
|
|
367
|
+
let cells = vec![
|
|
368
|
+
vec!["H1".to_string(), "H2".to_string(), "H3".to_string()],
|
|
369
|
+
vec!["R1C1".to_string(), "R1C2".to_string()], // Missing third column
|
|
370
|
+
vec!["R2C1".to_string(), "R2C2".to_string(), "R2C3".to_string()],
|
|
371
|
+
];
|
|
372
|
+
|
|
373
|
+
let markdown = cells_to_markdown(&cells);
|
|
374
|
+
|
|
375
|
+
// Should have 3 columns in header
|
|
376
|
+
assert!(markdown.contains("| H1 | H2 | H3 |"));
|
|
377
|
+
// Should pad short rows
|
|
378
|
+
assert!(markdown.contains("| R1C1 | R1C2 | |"));
|
|
388
379
|
}
|
|
389
380
|
|
|
390
381
|
#[tokio::test]
|
|
@@ -20,14 +20,9 @@ impl ImageExtractor {
|
|
|
20
20
|
Self
|
|
21
21
|
}
|
|
22
22
|
|
|
23
|
-
/// Extract text from image using OCR
|
|
23
|
+
/// Extract text from image using OCR.
|
|
24
24
|
#[cfg(feature = "ocr")]
|
|
25
|
-
async fn extract_with_ocr(
|
|
26
|
-
&self,
|
|
27
|
-
content: &[u8],
|
|
28
|
-
mime_type: &str,
|
|
29
|
-
config: &ExtractionConfig,
|
|
30
|
-
) -> Result<ExtractionResult> {
|
|
25
|
+
async fn extract_with_ocr(&self, content: &[u8], config: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
31
26
|
use crate::plugins::registry::get_ocr_backend_registry;
|
|
32
27
|
|
|
33
28
|
let ocr_config = config.ocr.as_ref().ok_or_else(|| crate::KreuzbergError::Parsing {
|
|
@@ -44,21 +39,8 @@ impl ImageExtractor {
|
|
|
44
39
|
registry.get(&ocr_config.backend)?
|
|
45
40
|
};
|
|
46
41
|
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
let ocr_text = ocr_result.content.clone();
|
|
50
|
-
let ocr_extraction_result = crate::extraction::image::extract_text_from_image_with_ocr(
|
|
51
|
-
content,
|
|
52
|
-
mime_type,
|
|
53
|
-
ocr_text,
|
|
54
|
-
config.pages.as_ref(),
|
|
55
|
-
)?;
|
|
56
|
-
|
|
57
|
-
let mut result = ocr_result;
|
|
58
|
-
result.content = ocr_extraction_result.content;
|
|
59
|
-
result.pages = ocr_extraction_result.page_contents;
|
|
60
|
-
|
|
61
|
-
Ok(result)
|
|
42
|
+
// Process image using the backend - returns full ExtractionResult with tables/metadata
|
|
43
|
+
backend.process_image(content, ocr_config).await
|
|
62
44
|
}
|
|
63
45
|
}
|
|
64
46
|
|
|
@@ -96,13 +78,6 @@ impl Plugin for ImageExtractor {
|
|
|
96
78
|
|
|
97
79
|
#[async_trait]
|
|
98
80
|
impl DocumentExtractor for ImageExtractor {
|
|
99
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
100
|
-
skip(self, content, config),
|
|
101
|
-
fields(
|
|
102
|
-
extractor.name = self.name(),
|
|
103
|
-
content.size_bytes = content.len(),
|
|
104
|
-
)
|
|
105
|
-
))]
|
|
106
81
|
async fn extract_bytes(
|
|
107
82
|
&self,
|
|
108
83
|
content: &[u8],
|
|
@@ -118,11 +93,13 @@ impl DocumentExtractor for ImageExtractor {
|
|
|
118
93
|
exif: extraction_metadata.exif_data,
|
|
119
94
|
};
|
|
120
95
|
|
|
96
|
+
// If OCR is enabled, use OCR result (which includes tables and OCR-specific metadata)
|
|
121
97
|
if config.ocr.is_some() {
|
|
122
98
|
#[cfg(feature = "ocr")]
|
|
123
99
|
{
|
|
124
|
-
let mut ocr_result = self.extract_with_ocr(content,
|
|
100
|
+
let mut ocr_result = self.extract_with_ocr(content, config).await?;
|
|
125
101
|
|
|
102
|
+
// Add image metadata to the OCR result
|
|
126
103
|
ocr_result.metadata.format = Some(crate::types::FormatMetadata::Image(image_metadata));
|
|
127
104
|
ocr_result.mime_type = mime_type.to_string();
|
|
128
105
|
|
|
@@ -142,7 +119,6 @@ impl DocumentExtractor for ImageExtractor {
|
|
|
142
119
|
format: Some(crate::types::FormatMetadata::Image(image_metadata)),
|
|
143
120
|
..Default::default()
|
|
144
121
|
},
|
|
145
|
-
pages: None,
|
|
146
122
|
tables: vec![],
|
|
147
123
|
detected_languages: None,
|
|
148
124
|
chunks: None,
|
|
@@ -151,6 +127,7 @@ impl DocumentExtractor for ImageExtractor {
|
|
|
151
127
|
}
|
|
152
128
|
}
|
|
153
129
|
|
|
130
|
+
// No OCR - just return image dimensions
|
|
154
131
|
Ok(ExtractionResult {
|
|
155
132
|
content: format!(
|
|
156
133
|
"Image: {} {}x{}",
|
|
@@ -161,7 +138,6 @@ impl DocumentExtractor for ImageExtractor {
|
|
|
161
138
|
format: Some(crate::types::FormatMetadata::Image(image_metadata)),
|
|
162
139
|
..Default::default()
|
|
163
140
|
},
|
|
164
|
-
pages: None,
|
|
165
141
|
tables: vec![],
|
|
166
142
|
detected_languages: None,
|
|
167
143
|
chunks: None,
|