kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +28 -116
- data/README.md +269 -629
- data/Rakefile +0 -9
- data/Steepfile +4 -8
- data/examples/async_patterns.rb +58 -1
- data/ext/kreuzberg_rb/extconf.rb +5 -35
- data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
- data/ext/kreuzberg_rb/native/build.rs +14 -12
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
- data/extconf.rb +6 -38
- data/kreuzberg.gemspec +20 -114
- data/lib/kreuzberg/api_proxy.rb +18 -2
- data/lib/kreuzberg/cache_api.rb +0 -22
- data/lib/kreuzberg/cli.rb +10 -2
- data/lib/kreuzberg/cli_proxy.rb +10 -0
- data/lib/kreuzberg/config.rb +22 -274
- data/lib/kreuzberg/errors.rb +7 -73
- data/lib/kreuzberg/extraction_api.rb +8 -237
- data/lib/kreuzberg/mcp_proxy.rb +11 -2
- data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
- data/lib/kreuzberg/post_processor_protocol.rb +71 -0
- data/lib/kreuzberg/result.rb +33 -151
- data/lib/kreuzberg/setup_lib_path.rb +2 -22
- data/lib/kreuzberg/validator_protocol.rb +73 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +13 -27
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +12 -105
- data/spec/binding/cache_spec.rb +22 -22
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/binding/config_spec.rb +0 -74
- data/spec/binding/config_validation_spec.rb +6 -100
- data/spec/binding/error_handling_spec.rb +97 -283
- data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
- data/spec/binding/plugins/postprocessor_spec.rb +11 -11
- data/spec/binding/plugins/validator_spec.rb +13 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +1 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/fixtures/invalid_config.toml +1 -0
- data/spec/smoke/package_spec.rb +3 -2
- data/spec/spec_helper.rb +3 -1
- data/vendor/kreuzberg/Cargo.toml +67 -192
- data/vendor/kreuzberg/README.md +9 -97
- data/vendor/kreuzberg/build.rs +194 -516
- data/vendor/kreuzberg/src/api/handlers.rs +9 -130
- data/vendor/kreuzberg/src/api/mod.rs +3 -18
- data/vendor/kreuzberg/src/api/server.rs +71 -236
- data/vendor/kreuzberg/src/api/types.rs +7 -43
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/config.rs +23 -905
- data/vendor/kreuzberg/src/core/extractor.rs +106 -403
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +3 -22
- data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
- data/vendor/kreuzberg/src/embeddings.rs +21 -169
- data/vendor/kreuzberg/src/error.rs +2 -2
- data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
- data/vendor/kreuzberg/src/extraction/email.rs +11 -12
- data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
- data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
- data/vendor/kreuzberg/src/extraction/image.rs +14 -138
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
- data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
- data/vendor/kreuzberg/src/extraction/table.rs +1 -2
- data/vendor/kreuzberg/src/extraction/text.rs +10 -18
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
- data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
- data/vendor/kreuzberg/src/extractors/email.rs +9 -37
- data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
- data/vendor/kreuzberg/src/extractors/html.rs +173 -182
- data/vendor/kreuzberg/src/extractors/image.rs +8 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
- data/vendor/kreuzberg/src/extractors/text.rs +7 -30
- data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
- data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
- data/vendor/kreuzberg/src/lib.rs +5 -17
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
- data/vendor/kreuzberg/src/mcp/server.rs +21 -145
- data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
- data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
- data/vendor/kreuzberg/src/pdf/error.rs +1 -93
- data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
- data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
- data/vendor/kreuzberg/src/pdf/table.rs +64 -61
- data/vendor/kreuzberg/src/pdf/text.rs +24 -416
- data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
- data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
- data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/mod.rs +0 -8
- data/vendor/kreuzberg/src/text/quality.rs +15 -28
- data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
- data/vendor/kreuzberg/src/types.rs +67 -907
- data/vendor/kreuzberg/src/utils/mod.rs +0 -14
- data/vendor/kreuzberg/src/utils/quality.rs +3 -12
- data/vendor/kreuzberg/tests/api_tests.rs +0 -506
- data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
- data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
- data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
- data/vendor/kreuzberg/tests/config_features.rs +1 -33
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
- data/vendor/kreuzberg/tests/core_integration.rs +9 -35
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/email_integration.rs +1 -3
- data/vendor/kreuzberg/tests/error_handling.rs +34 -43
- data/vendor/kreuzberg/tests/format_integration.rs +1 -7
- data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
- data/vendor/kreuzberg/tests/image_integration.rs +0 -2
- data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
- data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
- data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
- data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
- data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
- data/vendor/kreuzberg/tests/security_validation.rs +1 -13
- data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
- metadata +25 -171
- data/.rubocop.yml +0 -543
- data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
- data/lib/kreuzberg/error_context.rb +0 -136
- data/lib/kreuzberg/types.rb +0 -170
- data/lib/libpdfium.so +0 -0
- data/spec/binding/async_operations_spec.rb +0 -473
- data/spec/binding/batch_operations_spec.rb +0 -595
- data/spec/binding/batch_spec.rb +0 -359
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/embeddings_spec.rb +0 -816
- data/spec/binding/error_recovery_spec.rb +0 -488
- data/spec/binding/font_config_spec.rb +0 -220
- data/spec/binding/images_spec.rb +0 -738
- data/spec/binding/keywords_extraction_spec.rb +0 -600
- data/spec/binding/metadata_types_spec.rb +0 -1228
- data/spec/binding/pages_extraction_spec.rb +0 -471
- data/spec/binding/tables_spec.rb +0 -641
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -438
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
- data/test/metadata_types_test.rb +0 -959
- data/vendor/Cargo.toml +0 -61
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/formats.rs +0 -235
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
- data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
- data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
- data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
- data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
- data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
- data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
- data/vendor/kreuzberg/src/utils/pool.rs +0 -503
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
- data/vendor/kreuzberg/tests/api_embed.rs +0 -360
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/page_markers.rs +0 -297
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
- data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
- data/vendor/kreuzberg-ffi/build.rs +0 -168
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
- data/vendor/kreuzberg-ffi/src/error.rs +0 -901
- data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
- data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
- data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
- data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
- data/vendor/kreuzberg-ffi/src/result.rs +0 -510
- data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
- data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
- data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
- data/vendor/kreuzberg-ffi/src/types.rs +0 -363
- data/vendor/kreuzberg-ffi/src/util.rs +0 -210
- data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
- data/vendor/kreuzberg-tesseract/LICENSE +0 -22
- data/vendor/kreuzberg-tesseract/README.md +0 -399
- data/vendor/kreuzberg-tesseract/build.rs +0 -1127
- data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
use serde::{Deserialize, Serialize};
|
|
2
|
-
use std::collections::
|
|
3
|
-
use std::sync::Arc;
|
|
2
|
+
use std::collections::HashMap;
|
|
4
3
|
|
|
5
4
|
#[cfg(feature = "pdf")]
|
|
6
5
|
use crate::pdf::metadata::PdfMetadata;
|
|
@@ -8,137 +7,6 @@ use crate::pdf::metadata::PdfMetadata;
|
|
|
8
7
|
// ============================================================================
|
|
9
8
|
// ============================================================================
|
|
10
9
|
|
|
11
|
-
/// Module providing transparent serde support for Arc<T>.
|
|
12
|
-
///
|
|
13
|
-
/// Allows Arc-wrapped types to serialize/deserialize as if unwrapped,
|
|
14
|
-
/// maintaining exact JSON format while preserving memory efficiency benefits.
|
|
15
|
-
///
|
|
16
|
-
/// # Arc Sharing Semantics
|
|
17
|
-
///
|
|
18
|
-
/// **Important**: Arc sharing semantics are **NOT** preserved across serialization.
|
|
19
|
-
/// When deserializing, each Arc is independently created with `Arc::new()`.
|
|
20
|
-
/// This means that if two Arcs referenced the same data before serialization,
|
|
21
|
-
/// they will be separate Arcs after deserialization.
|
|
22
|
-
///
|
|
23
|
-
/// Example:
|
|
24
|
-
/// ```ignore
|
|
25
|
-
/// let shared = Arc::new(Table { /* ... */ });
|
|
26
|
-
/// let tables = vec![Arc::clone(&shared), Arc::clone(&shared)];
|
|
27
|
-
/// // Both in-memory Arcs point to the same Table
|
|
28
|
-
///
|
|
29
|
-
/// let json = serde_json::to_string(&tables)?;
|
|
30
|
-
/// let deserialized: Vec<Arc<Table>> = serde_json::from_str(&json)?;
|
|
31
|
-
/// // deserialized[0] and deserialized[1] are now independent Arcs,
|
|
32
|
-
/// // even though they contain identical data
|
|
33
|
-
/// ```
|
|
34
|
-
///
|
|
35
|
-
/// This design choice maintains:
|
|
36
|
-
/// - Exact JSON format compatibility (no sharing metadata in JSON)
|
|
37
|
-
/// - Predictable deserialization behavior
|
|
38
|
-
/// - Zero additional serialization overhead
|
|
39
|
-
///
|
|
40
|
-
/// If in-memory sharing is required, callers must implement custom sharing logic
|
|
41
|
-
/// or use a different data structure (like a HashMap of deduplicated values).
|
|
42
|
-
#[allow(dead_code)]
|
|
43
|
-
mod serde_arc {
|
|
44
|
-
use serde::{Deserialize, Deserializer, Serializer};
|
|
45
|
-
use std::sync::Arc;
|
|
46
|
-
|
|
47
|
-
/// Serialize an Arc<T> by serializing the inner value directly.
|
|
48
|
-
///
|
|
49
|
-
/// This makes Arc<T> serialize identically to T, maintaining API compatibility.
|
|
50
|
-
/// The outer Arc wrapper is transparent during serialization.
|
|
51
|
-
pub fn serialize<S, T>(arc_value: &Arc<T>, serializer: S) -> Result<S::Ok, S::Error>
|
|
52
|
-
where
|
|
53
|
-
S: Serializer,
|
|
54
|
-
T: serde::Serialize,
|
|
55
|
-
{
|
|
56
|
-
(**arc_value).serialize(serializer)
|
|
57
|
-
}
|
|
58
|
-
|
|
59
|
-
/// Deserialize a T and wrap it in Arc.
|
|
60
|
-
///
|
|
61
|
-
/// This makes Arc<T> deserialize from the same format as T.
|
|
62
|
-
/// Each Arc is independently created during deserialization;
|
|
63
|
-
/// Arc sharing from before serialization is NOT preserved.
|
|
64
|
-
pub fn deserialize<'de, D, T>(deserializer: D) -> Result<Arc<T>, D::Error>
|
|
65
|
-
where
|
|
66
|
-
D: Deserializer<'de>,
|
|
67
|
-
T: Deserialize<'de>,
|
|
68
|
-
{
|
|
69
|
-
T::deserialize(deserializer).map(Arc::new)
|
|
70
|
-
}
|
|
71
|
-
}
|
|
72
|
-
|
|
73
|
-
/// Module for serializing Vec<Arc<T>> with transparent Arc handling.
|
|
74
|
-
///
|
|
75
|
-
/// Serializes a Vec<Arc<T>> as Vec<T> for compatibility, while preserving
|
|
76
|
-
/// Arc semantics for memory efficiency.
|
|
77
|
-
///
|
|
78
|
-
/// # Arc Sharing Semantics
|
|
79
|
-
///
|
|
80
|
-
/// **Important**: Arc sharing semantics are **NOT** preserved across serialization.
|
|
81
|
-
/// When deserializing, each element's Arc is independently created with `Arc::new()`.
|
|
82
|
-
/// This is important for `PageContent` where tables/images may be shared across pages.
|
|
83
|
-
///
|
|
84
|
-
/// Example with shared tables:
|
|
85
|
-
/// ```ignore
|
|
86
|
-
/// let shared_table = Arc::new(Table { /* ... */ });
|
|
87
|
-
/// let page_contents = vec![
|
|
88
|
-
/// PageContent { tables: vec![Arc::clone(&shared_table)], ... },
|
|
89
|
-
/// PageContent { tables: vec![Arc::clone(&shared_table)], ... },
|
|
90
|
-
/// ];
|
|
91
|
-
/// // In-memory: both pages' tables point to the same Arc
|
|
92
|
-
///
|
|
93
|
-
/// let json = serde_json::to_string(&page_contents)?;
|
|
94
|
-
/// let deserialized = serde_json::from_str::<Vec<PageContent>>(&json)?;
|
|
95
|
-
/// // After deserialization: each page has independent Arc instances,
|
|
96
|
-
/// // even though the table data is identical
|
|
97
|
-
/// ```
|
|
98
|
-
///
|
|
99
|
-
/// Design rationale:
|
|
100
|
-
/// - JSON has no mechanism to represent shared references
|
|
101
|
-
/// - Preserving sharing would require complex metadata and deduplication
|
|
102
|
-
/// - Current approach is simple, predictable, and maintains compatibility
|
|
103
|
-
/// - In-memory sharing (via Arc) is an implementation detail for the Rust side
|
|
104
|
-
///
|
|
105
|
-
/// If in-memory sharing is required after deserialization, implement custom
|
|
106
|
-
/// deduplication logic using hashing or content comparison.
|
|
107
|
-
mod serde_vec_arc {
|
|
108
|
-
use serde::{Deserialize, Deserializer, Serializer};
|
|
109
|
-
use std::sync::Arc;
|
|
110
|
-
|
|
111
|
-
/// Serialize Vec<Arc<T>> by serializing each T directly.
|
|
112
|
-
///
|
|
113
|
-
/// Each element is unwrapped from its Arc and serialized independently.
|
|
114
|
-
/// No sharing metadata is included in the serialized output.
|
|
115
|
-
pub fn serialize<S, T>(vec: &[Arc<T>], serializer: S) -> Result<S::Ok, S::Error>
|
|
116
|
-
where
|
|
117
|
-
S: Serializer,
|
|
118
|
-
T: serde::Serialize,
|
|
119
|
-
{
|
|
120
|
-
use serde::ser::SerializeSeq;
|
|
121
|
-
let mut seq = serializer.serialize_seq(Some(vec.len()))?;
|
|
122
|
-
for arc_item in vec {
|
|
123
|
-
seq.serialize_element(&**arc_item)?;
|
|
124
|
-
}
|
|
125
|
-
seq.end()
|
|
126
|
-
}
|
|
127
|
-
|
|
128
|
-
/// Deserialize Vec<T> and wrap each element in Arc.
|
|
129
|
-
///
|
|
130
|
-
/// Each element is independently wrapped in a new Arc.
|
|
131
|
-
/// Sharing relationships from before serialization are lost.
|
|
132
|
-
pub fn deserialize<'de, D, T>(deserializer: D) -> Result<Vec<Arc<T>>, D::Error>
|
|
133
|
-
where
|
|
134
|
-
D: Deserializer<'de>,
|
|
135
|
-
T: Deserialize<'de>,
|
|
136
|
-
{
|
|
137
|
-
let vec: Vec<T> = Deserialize::deserialize(deserializer)?;
|
|
138
|
-
Ok(vec.into_iter().map(Arc::new).collect())
|
|
139
|
-
}
|
|
140
|
-
}
|
|
141
|
-
|
|
142
10
|
/// General extraction result used by the core extraction API.
|
|
143
11
|
///
|
|
144
12
|
/// This is the main result type returned by all extraction functions.
|
|
@@ -166,13 +34,6 @@ pub struct ExtractionResult {
|
|
|
166
34
|
/// Each image may optionally contain a nested `ocr_result` if OCR was performed.
|
|
167
35
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
168
36
|
pub images: Option<Vec<ExtractedImage>>,
|
|
169
|
-
|
|
170
|
-
/// Per-page content when page extraction is enabled.
|
|
171
|
-
///
|
|
172
|
-
/// When page extraction is configured, the document is split into per-page content
|
|
173
|
-
/// with tables and images mapped to their respective pages.
|
|
174
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
175
|
-
pub pages: Option<Vec<PageContent>>,
|
|
176
37
|
}
|
|
177
38
|
|
|
178
39
|
/// Format-specific metadata (discriminated union).
|
|
@@ -201,45 +62,17 @@ pub enum FormatMetadata {
|
|
|
201
62
|
/// via a discriminated union, and additional custom fields from postprocessors.
|
|
202
63
|
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
|
|
203
64
|
pub struct Metadata {
|
|
204
|
-
///
|
|
205
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
206
|
-
pub title: Option<String>,
|
|
207
|
-
|
|
208
|
-
/// Document subject or description
|
|
209
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
210
|
-
pub subject: Option<String>,
|
|
211
|
-
|
|
212
|
-
/// Primary author(s) - always Vec for consistency
|
|
213
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
214
|
-
pub authors: Option<Vec<String>>,
|
|
215
|
-
|
|
216
|
-
/// Keywords/tags - always Vec for consistency
|
|
217
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
218
|
-
pub keywords: Option<Vec<String>>,
|
|
219
|
-
|
|
220
|
-
/// Primary language (ISO 639 code)
|
|
65
|
+
/// Language of the document (ISO 639 code)
|
|
221
66
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
222
67
|
pub language: Option<String>,
|
|
223
68
|
|
|
224
|
-
///
|
|
225
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
226
|
-
pub created_at: Option<String>,
|
|
227
|
-
|
|
228
|
-
/// Last modification timestamp (ISO 8601 format)
|
|
69
|
+
/// Document date (format varies by source)
|
|
229
70
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
230
|
-
pub
|
|
231
|
-
|
|
232
|
-
/// User who created the document
|
|
233
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
234
|
-
pub created_by: Option<String>,
|
|
235
|
-
|
|
236
|
-
/// User who last modified the document
|
|
237
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
238
|
-
pub modified_by: Option<String>,
|
|
71
|
+
pub date: Option<String>,
|
|
239
72
|
|
|
240
|
-
///
|
|
73
|
+
/// Document subject/description
|
|
241
74
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
242
|
-
pub
|
|
75
|
+
pub subject: Option<String>,
|
|
243
76
|
|
|
244
77
|
/// Format-specific metadata (discriminated union)
|
|
245
78
|
///
|
|
@@ -269,177 +102,6 @@ pub struct Metadata {
|
|
|
269
102
|
pub additional: HashMap<String, serde_json::Value>,
|
|
270
103
|
}
|
|
271
104
|
|
|
272
|
-
/// Unified page structure for documents.
|
|
273
|
-
///
|
|
274
|
-
/// Supports different page types (PDF pages, PPTX slides, Excel sheets)
|
|
275
|
-
/// with character offset boundaries for chunk-to-page mapping.
|
|
276
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
277
|
-
pub struct PageStructure {
|
|
278
|
-
/// Total number of pages/slides/sheets
|
|
279
|
-
pub total_count: usize,
|
|
280
|
-
|
|
281
|
-
/// Type of paginated unit
|
|
282
|
-
pub unit_type: PageUnitType,
|
|
283
|
-
|
|
284
|
-
/// Character offset boundaries for each page
|
|
285
|
-
///
|
|
286
|
-
/// Maps character ranges in the extracted content to page numbers.
|
|
287
|
-
/// Used for chunk page range calculation.
|
|
288
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
289
|
-
pub boundaries: Option<Vec<PageBoundary>>,
|
|
290
|
-
|
|
291
|
-
/// Detailed per-page metadata (optional, only when needed)
|
|
292
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
293
|
-
pub pages: Option<Vec<PageInfo>>,
|
|
294
|
-
}
|
|
295
|
-
|
|
296
|
-
/// Type of paginated unit in a document.
|
|
297
|
-
///
|
|
298
|
-
/// Distinguishes between different types of "pages" (PDF pages, presentation slides, spreadsheet sheets).
|
|
299
|
-
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
|
300
|
-
#[serde(rename_all = "snake_case")]
|
|
301
|
-
pub enum PageUnitType {
|
|
302
|
-
/// Standard document pages (PDF, DOCX, images)
|
|
303
|
-
Page,
|
|
304
|
-
/// Presentation slides (PPTX, ODP)
|
|
305
|
-
Slide,
|
|
306
|
-
/// Spreadsheet sheets (XLSX, ODS)
|
|
307
|
-
Sheet,
|
|
308
|
-
}
|
|
309
|
-
|
|
310
|
-
/// Byte offset boundary for a page.
|
|
311
|
-
///
|
|
312
|
-
/// Tracks where a specific page's content starts and ends in the main content string,
|
|
313
|
-
/// enabling mapping from byte positions to page numbers. Offsets are guaranteed to be
|
|
314
|
-
/// at valid UTF-8 character boundaries when using standard String methods (push_str, push, etc.).
|
|
315
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
316
|
-
pub struct PageBoundary {
|
|
317
|
-
/// Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive)
|
|
318
|
-
pub byte_start: usize,
|
|
319
|
-
/// Byte offset where this page ends in the content string (UTF-8 valid boundary, exclusive)
|
|
320
|
-
pub byte_end: usize,
|
|
321
|
-
/// Page number (1-indexed)
|
|
322
|
-
pub page_number: usize,
|
|
323
|
-
}
|
|
324
|
-
|
|
325
|
-
/// Metadata for individual page/slide/sheet.
|
|
326
|
-
///
|
|
327
|
-
/// Captures per-page information including dimensions, content counts,
|
|
328
|
-
/// and visibility state (for presentations).
|
|
329
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
330
|
-
pub struct PageInfo {
|
|
331
|
-
/// Page number (1-indexed)
|
|
332
|
-
pub number: usize,
|
|
333
|
-
|
|
334
|
-
/// Page title (usually for presentations)
|
|
335
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
336
|
-
pub title: Option<String>,
|
|
337
|
-
|
|
338
|
-
/// Dimensions in points (PDF) or pixels (images): (width, height)
|
|
339
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
340
|
-
pub dimensions: Option<(f64, f64)>,
|
|
341
|
-
|
|
342
|
-
/// Number of images on this page
|
|
343
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
344
|
-
pub image_count: Option<usize>,
|
|
345
|
-
|
|
346
|
-
/// Number of tables on this page
|
|
347
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
348
|
-
pub table_count: Option<usize>,
|
|
349
|
-
|
|
350
|
-
/// Whether this page is hidden (e.g., in presentations)
|
|
351
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
352
|
-
pub hidden: Option<bool>,
|
|
353
|
-
}
|
|
354
|
-
|
|
355
|
-
/// Content for a single page/slide.
|
|
356
|
-
///
|
|
357
|
-
/// When page extraction is enabled, documents are split into per-page content
|
|
358
|
-
/// with associated tables and images mapped to each page.
|
|
359
|
-
///
|
|
360
|
-
/// # Performance
|
|
361
|
-
///
|
|
362
|
-
/// Uses Arc-wrapped tables and images for memory efficiency:
|
|
363
|
-
/// - `Vec<Arc<Table>>` enables zero-copy sharing of table data
|
|
364
|
-
/// - `Vec<Arc<ExtractedImage>>` enables zero-copy sharing of image data
|
|
365
|
-
/// - Maintains exact JSON compatibility via custom Serialize/Deserialize
|
|
366
|
-
///
|
|
367
|
-
/// This reduces memory overhead for documents with shared tables/images
|
|
368
|
-
/// by avoiding redundant copies during serialization.
|
|
369
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
370
|
-
pub struct PageContent {
|
|
371
|
-
/// Page number (1-indexed)
|
|
372
|
-
pub page_number: usize,
|
|
373
|
-
|
|
374
|
-
/// Text content for this page
|
|
375
|
-
pub content: String,
|
|
376
|
-
|
|
377
|
-
/// Tables found on this page (uses Arc for memory efficiency)
|
|
378
|
-
///
|
|
379
|
-
/// Serializes as Vec<Table> for JSON compatibility while maintaining
|
|
380
|
-
/// Arc semantics in-memory for zero-copy sharing.
|
|
381
|
-
#[serde(skip_serializing_if = "Vec::is_empty", default, with = "serde_vec_arc")]
|
|
382
|
-
pub tables: Vec<Arc<Table>>,
|
|
383
|
-
|
|
384
|
-
/// Images found on this page (uses Arc for memory efficiency)
|
|
385
|
-
///
|
|
386
|
-
/// Serializes as Vec<ExtractedImage> for JSON compatibility while maintaining
|
|
387
|
-
/// Arc semantics in-memory for zero-copy sharing.
|
|
388
|
-
#[serde(skip_serializing_if = "Vec::is_empty", default, with = "serde_vec_arc")]
|
|
389
|
-
pub images: Vec<Arc<ExtractedImage>>,
|
|
390
|
-
|
|
391
|
-
/// Hierarchy information for the page (when hierarchy extraction is enabled)
|
|
392
|
-
///
|
|
393
|
-
/// Contains text hierarchy levels (H1-H6) extracted from the page content.
|
|
394
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
395
|
-
pub hierarchy: Option<PageHierarchy>,
|
|
396
|
-
}
|
|
397
|
-
|
|
398
|
-
/// Page hierarchy structure containing heading levels and block information.
|
|
399
|
-
///
|
|
400
|
-
/// Used when PDF text hierarchy extraction is enabled. Contains hierarchical
|
|
401
|
-
/// blocks with heading levels (H1-H6) for semantic document structure.
|
|
402
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
403
|
-
pub struct PageHierarchy {
|
|
404
|
-
/// Number of hierarchy blocks on this page
|
|
405
|
-
pub block_count: usize,
|
|
406
|
-
|
|
407
|
-
/// Hierarchical blocks with heading levels
|
|
408
|
-
#[serde(skip_serializing_if = "Vec::is_empty", default)]
|
|
409
|
-
pub blocks: Vec<HierarchicalBlock>,
|
|
410
|
-
}
|
|
411
|
-
|
|
412
|
-
/// A text block with hierarchy level assignment.
|
|
413
|
-
///
|
|
414
|
-
/// Represents a block of text with semantic heading information extracted from
|
|
415
|
-
/// font size clustering and hierarchical analysis.
|
|
416
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
417
|
-
pub struct HierarchicalBlock {
|
|
418
|
-
/// The text content of this block
|
|
419
|
-
pub text: String,
|
|
420
|
-
|
|
421
|
-
/// The font size of the text in this block
|
|
422
|
-
pub font_size: f32,
|
|
423
|
-
|
|
424
|
-
/// The hierarchy level of this block (H1-H6 or Body)
|
|
425
|
-
///
|
|
426
|
-
/// Levels correspond to HTML heading tags:
|
|
427
|
-
/// - "h1": Top-level heading
|
|
428
|
-
/// - "h2": Secondary heading
|
|
429
|
-
/// - "h3": Tertiary heading
|
|
430
|
-
/// - "h4": Quaternary heading
|
|
431
|
-
/// - "h5": Quinary heading
|
|
432
|
-
/// - "h6": Senary heading
|
|
433
|
-
/// - "body": Body text (no heading level)
|
|
434
|
-
pub level: String,
|
|
435
|
-
|
|
436
|
-
/// Bounding box information for the block
|
|
437
|
-
///
|
|
438
|
-
/// Contains coordinates as (left, top, right, bottom) in PDF units.
|
|
439
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
440
|
-
pub bbox: Option<(f32, f32, f32, f32)>,
|
|
441
|
-
}
|
|
442
|
-
|
|
443
105
|
/// Excel/spreadsheet metadata.
|
|
444
106
|
///
|
|
445
107
|
/// Contains information about sheets in Excel, LibreOffice Calc, and other
|
|
@@ -551,308 +213,73 @@ pub struct TextMetadata {
|
|
|
551
213
|
pub code_blocks: Option<Vec<(String, String)>>,
|
|
552
214
|
}
|
|
553
215
|
|
|
554
|
-
/// Text direction enumeration for HTML documents.
|
|
555
|
-
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
|
|
556
|
-
#[serde(rename_all = "lowercase")]
|
|
557
|
-
pub enum TextDirection {
|
|
558
|
-
/// Left-to-right text direction
|
|
559
|
-
#[serde(rename = "ltr")]
|
|
560
|
-
LeftToRight,
|
|
561
|
-
/// Right-to-left text direction
|
|
562
|
-
#[serde(rename = "rtl")]
|
|
563
|
-
RightToLeft,
|
|
564
|
-
/// Automatic text direction detection
|
|
565
|
-
#[serde(rename = "auto")]
|
|
566
|
-
Auto,
|
|
567
|
-
}
|
|
568
|
-
|
|
569
|
-
/// Header/heading element metadata.
|
|
570
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
571
|
-
pub struct HeaderMetadata {
|
|
572
|
-
/// Header level: 1 (h1) through 6 (h6)
|
|
573
|
-
pub level: u8,
|
|
574
|
-
/// Normalized text content of the header
|
|
575
|
-
pub text: String,
|
|
576
|
-
/// HTML id attribute if present
|
|
577
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
578
|
-
pub id: Option<String>,
|
|
579
|
-
/// Document tree depth at the header element
|
|
580
|
-
pub depth: usize,
|
|
581
|
-
/// Byte offset in original HTML document
|
|
582
|
-
pub html_offset: usize,
|
|
583
|
-
}
|
|
584
|
-
|
|
585
|
-
/// Link element metadata.
|
|
586
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
587
|
-
pub struct LinkMetadata {
|
|
588
|
-
/// The href URL value
|
|
589
|
-
pub href: String,
|
|
590
|
-
/// Link text content (normalized)
|
|
591
|
-
pub text: String,
|
|
592
|
-
/// Optional title attribute
|
|
593
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
594
|
-
pub title: Option<String>,
|
|
595
|
-
/// Link type classification
|
|
596
|
-
pub link_type: LinkType,
|
|
597
|
-
/// Rel attribute values
|
|
598
|
-
pub rel: Vec<String>,
|
|
599
|
-
/// Additional attributes as key-value pairs
|
|
600
|
-
pub attributes: HashMap<String, String>,
|
|
601
|
-
}
|
|
602
|
-
|
|
603
|
-
/// Link type classification.
|
|
604
|
-
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
|
|
605
|
-
#[serde(rename_all = "lowercase")]
|
|
606
|
-
pub enum LinkType {
|
|
607
|
-
/// Anchor link (#section)
|
|
608
|
-
Anchor,
|
|
609
|
-
/// Internal link (same domain)
|
|
610
|
-
Internal,
|
|
611
|
-
/// External link (different domain)
|
|
612
|
-
External,
|
|
613
|
-
/// Email link (mailto:)
|
|
614
|
-
Email,
|
|
615
|
-
/// Phone link (tel:)
|
|
616
|
-
Phone,
|
|
617
|
-
/// Other link type
|
|
618
|
-
Other,
|
|
619
|
-
}
|
|
620
|
-
|
|
621
|
-
/// Image element metadata.
|
|
622
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
623
|
-
pub struct ImageMetadataType {
|
|
624
|
-
/// Image source (URL, data URI, or SVG content)
|
|
625
|
-
pub src: String,
|
|
626
|
-
/// Alternative text from alt attribute
|
|
627
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
628
|
-
pub alt: Option<String>,
|
|
629
|
-
/// Title attribute
|
|
630
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
631
|
-
pub title: Option<String>,
|
|
632
|
-
/// Image dimensions as (width, height) if available
|
|
633
|
-
pub dimensions: Option<(u32, u32)>,
|
|
634
|
-
/// Image type classification
|
|
635
|
-
pub image_type: ImageType,
|
|
636
|
-
/// Additional attributes as key-value pairs
|
|
637
|
-
pub attributes: HashMap<String, String>,
|
|
638
|
-
}
|
|
639
|
-
|
|
640
|
-
/// Image type classification.
|
|
641
|
-
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
|
|
642
|
-
#[serde(rename_all = "lowercase")]
|
|
643
|
-
pub enum ImageType {
|
|
644
|
-
/// Data URI image
|
|
645
|
-
#[serde(rename = "data-uri")]
|
|
646
|
-
DataUri,
|
|
647
|
-
/// Inline SVG
|
|
648
|
-
#[serde(rename = "inline-svg")]
|
|
649
|
-
InlineSvg,
|
|
650
|
-
/// External image URL
|
|
651
|
-
External,
|
|
652
|
-
/// Relative path image
|
|
653
|
-
Relative,
|
|
654
|
-
}
|
|
655
|
-
|
|
656
|
-
/// Structured data (Schema.org, microdata, RDFa) block.
|
|
657
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
658
|
-
pub struct StructuredData {
|
|
659
|
-
/// Type of structured data
|
|
660
|
-
pub data_type: StructuredDataType,
|
|
661
|
-
/// Raw JSON string representation
|
|
662
|
-
pub raw_json: String,
|
|
663
|
-
/// Schema type if detectable (e.g., "Article", "Event", "Product")
|
|
664
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
665
|
-
pub schema_type: Option<String>,
|
|
666
|
-
}
|
|
667
|
-
|
|
668
|
-
/// Structured data type classification.
|
|
669
|
-
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
|
|
670
|
-
#[serde(rename_all = "lowercase")]
|
|
671
|
-
pub enum StructuredDataType {
|
|
672
|
-
/// JSON-LD structured data
|
|
673
|
-
#[serde(rename = "json-ld")]
|
|
674
|
-
JsonLd,
|
|
675
|
-
/// Microdata
|
|
676
|
-
Microdata,
|
|
677
|
-
/// RDFa
|
|
678
|
-
#[serde(rename = "rdfa")]
|
|
679
|
-
RDFa,
|
|
680
|
-
}
|
|
681
|
-
|
|
682
216
|
/// HTML metadata extracted from HTML documents.
|
|
683
217
|
///
|
|
684
|
-
/// Includes
|
|
685
|
-
/// and extracted structural elements (headers, links, images, structured data).
|
|
218
|
+
/// Includes meta tags, Open Graph data, Twitter Card metadata, and link relations.
|
|
686
219
|
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
|
|
687
220
|
pub struct HtmlMetadata {
|
|
688
|
-
/// Document title from `<title>` tag
|
|
689
221
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
690
222
|
pub title: Option<String>,
|
|
691
223
|
|
|
692
|
-
/// Document description from `<meta name="description">` tag
|
|
693
224
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
694
225
|
pub description: Option<String>,
|
|
695
226
|
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
pub keywords: Vec<String>,
|
|
227
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
228
|
+
pub keywords: Option<String>,
|
|
699
229
|
|
|
700
|
-
/// Document author from `<meta name="author">` tag
|
|
701
230
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
702
231
|
pub author: Option<String>,
|
|
703
232
|
|
|
704
|
-
/// Canonical URL from `<link rel="canonical">` tag
|
|
705
233
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
706
|
-
pub
|
|
234
|
+
pub canonical: Option<String>,
|
|
707
235
|
|
|
708
|
-
/// Base URL from `<base href="">` tag for resolving relative URLs
|
|
709
236
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
710
237
|
pub base_href: Option<String>,
|
|
711
238
|
|
|
712
|
-
/// Document language from `lang` attribute
|
|
713
239
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
714
|
-
pub
|
|
240
|
+
pub og_title: Option<String>,
|
|
715
241
|
|
|
716
|
-
/// Document text direction from `dir` attribute
|
|
717
242
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
718
|
-
pub
|
|
243
|
+
pub og_description: Option<String>,
|
|
719
244
|
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
#[serde(default)]
|
|
723
|
-
pub open_graph: BTreeMap<String, String>,
|
|
245
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
246
|
+
pub og_image: Option<String>,
|
|
724
247
|
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
#[serde(default)]
|
|
728
|
-
pub twitter_card: BTreeMap<String, String>,
|
|
248
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
249
|
+
pub og_url: Option<String>,
|
|
729
250
|
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
#[serde(default)]
|
|
733
|
-
pub meta_tags: BTreeMap<String, String>,
|
|
251
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
252
|
+
pub og_type: Option<String>,
|
|
734
253
|
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
pub headers: Vec<HeaderMetadata>,
|
|
254
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
255
|
+
pub og_site_name: Option<String>,
|
|
738
256
|
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
pub links: Vec<LinkMetadata>,
|
|
257
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
258
|
+
pub twitter_card: Option<String>,
|
|
742
259
|
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
pub images: Vec<ImageMetadataType>,
|
|
260
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
261
|
+
pub twitter_title: Option<String>,
|
|
746
262
|
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
pub structured_data: Vec<StructuredData>,
|
|
750
|
-
}
|
|
263
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
264
|
+
pub twitter_description: Option<String>,
|
|
751
265
|
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
pub fn is_empty(&self) -> bool {
|
|
755
|
-
self.title.is_none()
|
|
756
|
-
&& self.description.is_none()
|
|
757
|
-
&& self.keywords.is_empty()
|
|
758
|
-
&& self.author.is_none()
|
|
759
|
-
&& self.canonical_url.is_none()
|
|
760
|
-
&& self.base_href.is_none()
|
|
761
|
-
&& self.language.is_none()
|
|
762
|
-
&& self.text_direction.is_none()
|
|
763
|
-
&& self.open_graph.is_empty()
|
|
764
|
-
&& self.twitter_card.is_empty()
|
|
765
|
-
&& self.meta_tags.is_empty()
|
|
766
|
-
&& self.headers.is_empty()
|
|
767
|
-
&& self.links.is_empty()
|
|
768
|
-
&& self.images.is_empty()
|
|
769
|
-
&& self.structured_data.is_empty()
|
|
770
|
-
}
|
|
771
|
-
}
|
|
266
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
267
|
+
pub twitter_image: Option<String>,
|
|
772
268
|
|
|
773
|
-
#[
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
canonical_url: metadata.document.canonical_url,
|
|
788
|
-
base_href: metadata.document.base_href,
|
|
789
|
-
language: metadata.document.language,
|
|
790
|
-
text_direction: text_dir,
|
|
791
|
-
open_graph: metadata.document.open_graph,
|
|
792
|
-
twitter_card: metadata.document.twitter_card,
|
|
793
|
-
meta_tags: metadata.document.meta_tags,
|
|
794
|
-
headers: metadata
|
|
795
|
-
.headers
|
|
796
|
-
.into_iter()
|
|
797
|
-
.map(|h| HeaderMetadata {
|
|
798
|
-
level: h.level,
|
|
799
|
-
text: h.text,
|
|
800
|
-
id: h.id,
|
|
801
|
-
depth: h.depth,
|
|
802
|
-
html_offset: h.html_offset,
|
|
803
|
-
})
|
|
804
|
-
.collect(),
|
|
805
|
-
links: metadata
|
|
806
|
-
.links
|
|
807
|
-
.into_iter()
|
|
808
|
-
.map(|l| LinkMetadata {
|
|
809
|
-
href: l.href,
|
|
810
|
-
text: l.text,
|
|
811
|
-
title: l.title,
|
|
812
|
-
link_type: match l.link_type {
|
|
813
|
-
html_to_markdown_rs::LinkType::Anchor => LinkType::Anchor,
|
|
814
|
-
html_to_markdown_rs::LinkType::Internal => LinkType::Internal,
|
|
815
|
-
html_to_markdown_rs::LinkType::External => LinkType::External,
|
|
816
|
-
html_to_markdown_rs::LinkType::Email => LinkType::Email,
|
|
817
|
-
html_to_markdown_rs::LinkType::Phone => LinkType::Phone,
|
|
818
|
-
html_to_markdown_rs::LinkType::Other => LinkType::Other,
|
|
819
|
-
},
|
|
820
|
-
rel: l.rel,
|
|
821
|
-
attributes: l.attributes.into_iter().collect(),
|
|
822
|
-
})
|
|
823
|
-
.collect(),
|
|
824
|
-
images: metadata
|
|
825
|
-
.images
|
|
826
|
-
.into_iter()
|
|
827
|
-
.map(|img| ImageMetadataType {
|
|
828
|
-
src: img.src,
|
|
829
|
-
alt: img.alt,
|
|
830
|
-
title: img.title,
|
|
831
|
-
dimensions: img.dimensions,
|
|
832
|
-
image_type: match img.image_type {
|
|
833
|
-
html_to_markdown_rs::ImageType::DataUri => ImageType::DataUri,
|
|
834
|
-
html_to_markdown_rs::ImageType::InlineSvg => ImageType::InlineSvg,
|
|
835
|
-
html_to_markdown_rs::ImageType::External => ImageType::External,
|
|
836
|
-
html_to_markdown_rs::ImageType::Relative => ImageType::Relative,
|
|
837
|
-
},
|
|
838
|
-
attributes: img.attributes.into_iter().collect(),
|
|
839
|
-
})
|
|
840
|
-
.collect(),
|
|
841
|
-
structured_data: metadata
|
|
842
|
-
.structured_data
|
|
843
|
-
.into_iter()
|
|
844
|
-
.map(|sd| StructuredData {
|
|
845
|
-
data_type: match sd.data_type {
|
|
846
|
-
html_to_markdown_rs::StructuredDataType::JsonLd => StructuredDataType::JsonLd,
|
|
847
|
-
html_to_markdown_rs::StructuredDataType::Microdata => StructuredDataType::Microdata,
|
|
848
|
-
html_to_markdown_rs::StructuredDataType::RDFa => StructuredDataType::RDFa,
|
|
849
|
-
},
|
|
850
|
-
raw_json: sd.raw_json,
|
|
851
|
-
schema_type: sd.schema_type,
|
|
852
|
-
})
|
|
853
|
-
.collect(),
|
|
854
|
-
}
|
|
855
|
-
}
|
|
269
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
270
|
+
pub twitter_site: Option<String>,
|
|
271
|
+
|
|
272
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
273
|
+
pub twitter_creator: Option<String>,
|
|
274
|
+
|
|
275
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
276
|
+
pub link_author: Option<String>,
|
|
277
|
+
|
|
278
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
279
|
+
pub link_license: Option<String>,
|
|
280
|
+
|
|
281
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
282
|
+
pub link_alternate: Option<String>,
|
|
856
283
|
}
|
|
857
284
|
|
|
858
285
|
/// OCR processing metadata.
|
|
@@ -921,11 +348,11 @@ pub struct Chunk {
|
|
|
921
348
|
/// Metadata about a chunk's position in the original document.
|
|
922
349
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
923
350
|
pub struct ChunkMetadata {
|
|
924
|
-
///
|
|
925
|
-
pub
|
|
351
|
+
/// Character offset where this chunk starts in the original text.
|
|
352
|
+
pub char_start: usize,
|
|
926
353
|
|
|
927
|
-
///
|
|
928
|
-
pub
|
|
354
|
+
/// Character offset where this chunk ends in the original text.
|
|
355
|
+
pub char_end: usize,
|
|
929
356
|
|
|
930
357
|
/// Number of tokens in this chunk (if available).
|
|
931
358
|
///
|
|
@@ -938,18 +365,6 @@ pub struct ChunkMetadata {
|
|
|
938
365
|
|
|
939
366
|
/// Total number of chunks in the document.
|
|
940
367
|
pub total_chunks: usize,
|
|
941
|
-
|
|
942
|
-
/// First page number this chunk spans (1-indexed).
|
|
943
|
-
///
|
|
944
|
-
/// Only populated when page tracking is enabled in extraction configuration.
|
|
945
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
946
|
-
pub first_page: Option<usize>,
|
|
947
|
-
|
|
948
|
-
/// Last page number this chunk spans (1-indexed, equal to first_page for single-page chunks).
|
|
949
|
-
///
|
|
950
|
-
/// Only populated when page tracking is enabled in extraction configuration.
|
|
951
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
952
|
-
pub last_page: Option<usize>,
|
|
953
368
|
}
|
|
954
369
|
|
|
955
370
|
/// Extracted image from a document.
|
|
@@ -1032,11 +447,6 @@ pub struct ExcelSheet {
|
|
|
1032
447
|
pub col_count: usize,
|
|
1033
448
|
/// Total number of non-empty cells
|
|
1034
449
|
pub cell_count: usize,
|
|
1035
|
-
/// Pre-extracted table cells (2D vector of cell values)
|
|
1036
|
-
/// Populated during markdown generation to avoid re-parsing markdown.
|
|
1037
|
-
/// None for empty sheets.
|
|
1038
|
-
#[serde(skip)]
|
|
1039
|
-
pub table_cells: Option<Vec<Vec<String>>>,
|
|
1040
450
|
}
|
|
1041
451
|
|
|
1042
452
|
/// XML extraction result.
|
|
@@ -1095,22 +505,22 @@ pub struct PptxExtractionResult {
|
|
|
1095
505
|
pub table_count: usize,
|
|
1096
506
|
/// Extracted images from the presentation
|
|
1097
507
|
pub images: Vec<ExtractedImage>,
|
|
1098
|
-
/// Slide structure with boundaries (when page tracking is enabled)
|
|
1099
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
1100
|
-
pub page_structure: Option<PageStructure>,
|
|
1101
|
-
/// Per-slide content (when page tracking is enabled)
|
|
1102
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
1103
|
-
pub page_contents: Option<Vec<PageContent>>,
|
|
1104
508
|
}
|
|
1105
509
|
|
|
1106
510
|
/// PowerPoint presentation metadata.
|
|
1107
511
|
///
|
|
1108
|
-
/// Contains
|
|
1109
|
-
/// are now in the base `Metadata` struct.
|
|
512
|
+
/// Contains document-level metadata extracted from the PPTX file.
|
|
1110
513
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
1111
514
|
pub struct PptxMetadata {
|
|
515
|
+
/// Presentation title
|
|
516
|
+
pub title: Option<String>,
|
|
517
|
+
/// Author name
|
|
518
|
+
pub author: Option<String>,
|
|
519
|
+
/// Description/comments
|
|
520
|
+
pub description: Option<String>,
|
|
521
|
+
/// Summary text
|
|
522
|
+
pub summary: Option<String>,
|
|
1112
523
|
/// List of fonts used in the presentation
|
|
1113
|
-
#[serde(skip_serializing_if = "Vec::is_empty", default)]
|
|
1114
524
|
pub fonts: Vec<String>,
|
|
1115
525
|
}
|
|
1116
526
|
|
|
@@ -1434,6 +844,18 @@ pub struct CacheStats {
|
|
|
1434
844
|
pub newest_file_age_days: f64,
|
|
1435
845
|
}
|
|
1436
846
|
|
|
847
|
+
/// Pandoc extraction result.
|
|
848
|
+
///
|
|
849
|
+
/// Result of extracting content from a document using Pandoc,
|
|
850
|
+
/// including text and any metadata Pandoc was able to extract.
|
|
851
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
852
|
+
pub struct PandocExtractionResult {
|
|
853
|
+
/// Extracted text content
|
|
854
|
+
pub content: String,
|
|
855
|
+
/// Metadata extracted by Pandoc (varies by format)
|
|
856
|
+
pub metadata: HashMap<String, serde_json::Value>,
|
|
857
|
+
}
|
|
858
|
+
|
|
1437
859
|
/// LibreOffice conversion result.
|
|
1438
860
|
///
|
|
1439
861
|
/// Result of converting a legacy office document (e.g., .doc, .ppt)
|
|
@@ -1449,265 +871,3 @@ pub struct LibreOfficeConversionResult {
|
|
|
1449
871
|
/// Target MIME type after conversion
|
|
1450
872
|
pub target_mime: String,
|
|
1451
873
|
}
|
|
1452
|
-
|
|
1453
|
-
#[cfg(test)]
|
|
1454
|
-
mod tests {
|
|
1455
|
-
use super::*;
|
|
1456
|
-
|
|
1457
|
-
#[test]
|
|
1458
|
-
fn test_metadata_serialization_with_format() {
|
|
1459
|
-
let mut metadata = Metadata {
|
|
1460
|
-
format: Some(FormatMetadata::Text(TextMetadata {
|
|
1461
|
-
line_count: 1,
|
|
1462
|
-
word_count: 2,
|
|
1463
|
-
character_count: 13,
|
|
1464
|
-
headers: None,
|
|
1465
|
-
links: None,
|
|
1466
|
-
code_blocks: None,
|
|
1467
|
-
})),
|
|
1468
|
-
..Default::default()
|
|
1469
|
-
};
|
|
1470
|
-
|
|
1471
|
-
metadata
|
|
1472
|
-
.additional
|
|
1473
|
-
.insert("quality_score".to_string(), serde_json::json!(1.0));
|
|
1474
|
-
|
|
1475
|
-
let json = serde_json::to_value(&metadata).unwrap();
|
|
1476
|
-
println!("Serialized metadata: {}", serde_json::to_string_pretty(&json).unwrap());
|
|
1477
|
-
|
|
1478
|
-
assert!(
|
|
1479
|
-
json.get("format_type").is_some(),
|
|
1480
|
-
"format_type should be present in serialized JSON"
|
|
1481
|
-
);
|
|
1482
|
-
assert_eq!(json.get("format_type").unwrap(), "text");
|
|
1483
|
-
|
|
1484
|
-
assert_eq!(json.get("line_count").unwrap(), 1);
|
|
1485
|
-
assert_eq!(json.get("word_count").unwrap(), 2);
|
|
1486
|
-
assert_eq!(json.get("character_count").unwrap(), 13);
|
|
1487
|
-
|
|
1488
|
-
assert_eq!(json.get("quality_score").unwrap(), 1.0);
|
|
1489
|
-
}
|
|
1490
|
-
|
|
1491
|
-
#[test]
|
|
1492
|
-
fn test_arc_table_serialization_format() {
|
|
1493
|
-
let table = Table {
|
|
1494
|
-
cells: vec![vec!["A".to_string(), "B".to_string()]],
|
|
1495
|
-
markdown: "| A | B |\n|---|---|\n".to_string(),
|
|
1496
|
-
page_number: 1,
|
|
1497
|
-
};
|
|
1498
|
-
|
|
1499
|
-
let json = serde_json::to_value(&table).unwrap();
|
|
1500
|
-
|
|
1501
|
-
assert_eq!(json.get("cells").unwrap()[0][0], "A");
|
|
1502
|
-
assert_eq!(json.get("markdown").unwrap(), "| A | B |\n|---|---|\n");
|
|
1503
|
-
assert_eq!(json.get("page_number").unwrap(), 1);
|
|
1504
|
-
}
|
|
1505
|
-
|
|
1506
|
-
#[test]
|
|
1507
|
-
fn test_arc_table_roundtrip() {
|
|
1508
|
-
let original = Table {
|
|
1509
|
-
cells: vec![
|
|
1510
|
-
vec!["X".to_string(), "Y".to_string()],
|
|
1511
|
-
vec!["1".to_string(), "2".to_string()],
|
|
1512
|
-
],
|
|
1513
|
-
markdown: "| X | Y |\n|---|---|\n| 1 | 2 |\n".to_string(),
|
|
1514
|
-
page_number: 5,
|
|
1515
|
-
};
|
|
1516
|
-
|
|
1517
|
-
let json = serde_json::to_string(&original).unwrap();
|
|
1518
|
-
let deserialized: Table = serde_json::from_str(&json).unwrap();
|
|
1519
|
-
|
|
1520
|
-
assert_eq!(deserialized.cells, original.cells);
|
|
1521
|
-
assert_eq!(deserialized.markdown, original.markdown);
|
|
1522
|
-
assert_eq!(deserialized.page_number, original.page_number);
|
|
1523
|
-
}
|
|
1524
|
-
|
|
1525
|
-
#[test]
|
|
1526
|
-
fn test_arc_sharing_preserved_before_serialization() {
|
|
1527
|
-
let shared_table = Arc::new(Table {
|
|
1528
|
-
cells: vec![vec!["shared".to_string()]],
|
|
1529
|
-
markdown: "| shared |".to_string(),
|
|
1530
|
-
page_number: 1,
|
|
1531
|
-
});
|
|
1532
|
-
|
|
1533
|
-
let tables_before = [Arc::clone(&shared_table), Arc::clone(&shared_table)].to_vec();
|
|
1534
|
-
assert_eq!(Arc::strong_count(&tables_before[0]), 3);
|
|
1535
|
-
assert_eq!(Arc::strong_count(&tables_before[1]), 3);
|
|
1536
|
-
assert!(Arc::ptr_eq(&tables_before[0], &tables_before[1]));
|
|
1537
|
-
}
|
|
1538
|
-
|
|
1539
|
-
#[test]
|
|
1540
|
-
fn test_vec_arc_table_serialization_format() {
|
|
1541
|
-
let tables = vec![
|
|
1542
|
-
Table {
|
|
1543
|
-
cells: vec![vec!["A".to_string()]],
|
|
1544
|
-
markdown: "| A |".to_string(),
|
|
1545
|
-
page_number: 1,
|
|
1546
|
-
},
|
|
1547
|
-
Table {
|
|
1548
|
-
cells: vec![vec!["B".to_string()]],
|
|
1549
|
-
markdown: "| B |".to_string(),
|
|
1550
|
-
page_number: 2,
|
|
1551
|
-
},
|
|
1552
|
-
];
|
|
1553
|
-
|
|
1554
|
-
let json = serde_json::to_string(&tables).unwrap();
|
|
1555
|
-
let parsed: serde_json::Value = serde_json::from_str(&json).unwrap();
|
|
1556
|
-
|
|
1557
|
-
assert!(parsed.is_array());
|
|
1558
|
-
assert_eq!(parsed.as_array().unwrap().len(), 2);
|
|
1559
|
-
assert_eq!(parsed[0]["cells"][0][0], "A");
|
|
1560
|
-
assert_eq!(parsed[1]["cells"][0][0], "B");
|
|
1561
|
-
}
|
|
1562
|
-
|
|
1563
|
-
#[test]
|
|
1564
|
-
fn test_page_content_arc_tables_roundtrip() {
|
|
1565
|
-
let page = PageContent {
|
|
1566
|
-
page_number: 3,
|
|
1567
|
-
content: "Page 3 content".to_string(),
|
|
1568
|
-
tables: vec![
|
|
1569
|
-
Arc::new(Table {
|
|
1570
|
-
cells: vec![vec!["Table1".to_string()]],
|
|
1571
|
-
markdown: "| Table1 |".to_string(),
|
|
1572
|
-
page_number: 3,
|
|
1573
|
-
}),
|
|
1574
|
-
Arc::new(Table {
|
|
1575
|
-
cells: vec![vec!["Table2".to_string()]],
|
|
1576
|
-
markdown: "| Table2 |".to_string(),
|
|
1577
|
-
page_number: 3,
|
|
1578
|
-
}),
|
|
1579
|
-
],
|
|
1580
|
-
images: Vec::new(),
|
|
1581
|
-
hierarchy: None,
|
|
1582
|
-
};
|
|
1583
|
-
|
|
1584
|
-
let json = serde_json::to_string(&page).unwrap();
|
|
1585
|
-
let deserialized: PageContent = serde_json::from_str(&json).unwrap();
|
|
1586
|
-
|
|
1587
|
-
assert_eq!(deserialized.page_number, 3);
|
|
1588
|
-
assert_eq!(deserialized.content, "Page 3 content");
|
|
1589
|
-
assert_eq!(deserialized.tables.len(), 2);
|
|
1590
|
-
assert_eq!(deserialized.tables[0].cells[0][0], "Table1");
|
|
1591
|
-
assert_eq!(deserialized.tables[1].cells[0][0], "Table2");
|
|
1592
|
-
}
|
|
1593
|
-
|
|
1594
|
-
#[test]
|
|
1595
|
-
fn test_page_content_arc_images_roundtrip() {
|
|
1596
|
-
let image1 = Arc::new(ExtractedImage {
|
|
1597
|
-
data: vec![0xFF, 0xD8, 0xFF],
|
|
1598
|
-
format: "jpeg".to_string(),
|
|
1599
|
-
image_index: 0,
|
|
1600
|
-
page_number: Some(1),
|
|
1601
|
-
width: Some(100),
|
|
1602
|
-
height: Some(200),
|
|
1603
|
-
colorspace: Some("RGB".to_string()),
|
|
1604
|
-
bits_per_component: Some(8),
|
|
1605
|
-
is_mask: false,
|
|
1606
|
-
description: Some("Image 1".to_string()),
|
|
1607
|
-
ocr_result: None,
|
|
1608
|
-
});
|
|
1609
|
-
|
|
1610
|
-
let image2 = Arc::new(ExtractedImage {
|
|
1611
|
-
data: vec![0x89, 0x50, 0x4E],
|
|
1612
|
-
format: "png".to_string(),
|
|
1613
|
-
image_index: 1,
|
|
1614
|
-
page_number: Some(1),
|
|
1615
|
-
width: Some(300),
|
|
1616
|
-
height: Some(400),
|
|
1617
|
-
colorspace: Some("RGBA".to_string()),
|
|
1618
|
-
bits_per_component: Some(8),
|
|
1619
|
-
is_mask: false,
|
|
1620
|
-
description: Some("Image 2".to_string()),
|
|
1621
|
-
ocr_result: None,
|
|
1622
|
-
});
|
|
1623
|
-
|
|
1624
|
-
let page = PageContent {
|
|
1625
|
-
page_number: 1,
|
|
1626
|
-
content: "Page with images".to_string(),
|
|
1627
|
-
tables: Vec::new(),
|
|
1628
|
-
images: vec![image1, image2],
|
|
1629
|
-
hierarchy: None,
|
|
1630
|
-
};
|
|
1631
|
-
|
|
1632
|
-
let json = serde_json::to_string(&page).unwrap();
|
|
1633
|
-
let deserialized: PageContent = serde_json::from_str(&json).unwrap();
|
|
1634
|
-
|
|
1635
|
-
assert_eq!(deserialized.images.len(), 2);
|
|
1636
|
-
assert_eq!(deserialized.images[0].format, "jpeg");
|
|
1637
|
-
assert_eq!(deserialized.images[0].width, Some(100));
|
|
1638
|
-
assert_eq!(deserialized.images[1].format, "png");
|
|
1639
|
-
assert_eq!(deserialized.images[1].height, Some(400));
|
|
1640
|
-
}
|
|
1641
|
-
|
|
1642
|
-
#[test]
|
|
1643
|
-
fn test_arc_sharing_loss_with_page_content() {
|
|
1644
|
-
let shared_table = Arc::new(Table {
|
|
1645
|
-
cells: vec![vec!["shared across pages".to_string()]],
|
|
1646
|
-
markdown: "| shared across pages |".to_string(),
|
|
1647
|
-
page_number: 0,
|
|
1648
|
-
});
|
|
1649
|
-
|
|
1650
|
-
let page1 = PageContent {
|
|
1651
|
-
page_number: 1,
|
|
1652
|
-
content: "Page 1".to_string(),
|
|
1653
|
-
tables: vec![Arc::clone(&shared_table)],
|
|
1654
|
-
images: Vec::new(),
|
|
1655
|
-
hierarchy: None,
|
|
1656
|
-
};
|
|
1657
|
-
|
|
1658
|
-
let page2 = PageContent {
|
|
1659
|
-
page_number: 2,
|
|
1660
|
-
content: "Page 2".to_string(),
|
|
1661
|
-
tables: vec![Arc::clone(&shared_table)],
|
|
1662
|
-
images: Vec::new(),
|
|
1663
|
-
hierarchy: None,
|
|
1664
|
-
};
|
|
1665
|
-
|
|
1666
|
-
assert!(Arc::ptr_eq(&page1.tables[0], &page2.tables[0]));
|
|
1667
|
-
|
|
1668
|
-
let pages = vec![page1, page2];
|
|
1669
|
-
let json = serde_json::to_string(&pages).unwrap();
|
|
1670
|
-
let deserialized: Vec<PageContent> = serde_json::from_str(&json).unwrap();
|
|
1671
|
-
|
|
1672
|
-
assert_eq!(deserialized.len(), 2);
|
|
1673
|
-
assert_eq!(deserialized[0].tables[0].cells, deserialized[1].tables[0].cells);
|
|
1674
|
-
assert!(!Arc::ptr_eq(&deserialized[0].tables[0], &deserialized[1].tables[0]));
|
|
1675
|
-
}
|
|
1676
|
-
|
|
1677
|
-
#[test]
|
|
1678
|
-
fn test_empty_page_content_arcs() {
|
|
1679
|
-
let page = PageContent {
|
|
1680
|
-
page_number: 5,
|
|
1681
|
-
content: "No tables or images".to_string(),
|
|
1682
|
-
tables: Vec::new(),
|
|
1683
|
-
images: Vec::new(),
|
|
1684
|
-
hierarchy: None,
|
|
1685
|
-
};
|
|
1686
|
-
|
|
1687
|
-
let json = serde_json::to_string(&page).unwrap();
|
|
1688
|
-
let deserialized: PageContent = serde_json::from_str(&json).unwrap();
|
|
1689
|
-
|
|
1690
|
-
assert_eq!(deserialized.page_number, 5);
|
|
1691
|
-
assert_eq!(deserialized.tables.len(), 0);
|
|
1692
|
-
assert_eq!(deserialized.images.len(), 0);
|
|
1693
|
-
}
|
|
1694
|
-
|
|
1695
|
-
#[test]
|
|
1696
|
-
fn test_serde_vec_arc_module_behavior() {
|
|
1697
|
-
let table1 = Table {
|
|
1698
|
-
cells: vec![vec!["A".to_string()]],
|
|
1699
|
-
markdown: "| A |".to_string(),
|
|
1700
|
-
page_number: 1,
|
|
1701
|
-
};
|
|
1702
|
-
|
|
1703
|
-
let table2 = Table {
|
|
1704
|
-
cells: vec![vec!["B".to_string()]],
|
|
1705
|
-
markdown: "| B |".to_string(),
|
|
1706
|
-
page_number: 2,
|
|
1707
|
-
};
|
|
1708
|
-
|
|
1709
|
-
let json = serde_json::to_string(&vec![table1, table2]).unwrap();
|
|
1710
|
-
assert!(json.contains("\"A\""));
|
|
1711
|
-
assert!(json.contains("\"B\""));
|
|
1712
|
-
}
|
|
1713
|
-
}
|