RubyGems - kreuzberg - Versions diffs - 4.0.0.pre.rc.29 → 4.0.0.rc1 - Mend

kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (321)

checksums.yaml +4 -4
data/.gitignore +0 -6
data/.rubocop.yaml +534 -1
data/Gemfile +2 -1
data/Gemfile.lock +28 -116
data/README.md +269 -629
data/Rakefile +0 -9
data/Steepfile +4 -8
data/examples/async_patterns.rb +58 -1
data/ext/kreuzberg_rb/extconf.rb +5 -35
data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
data/ext/kreuzberg_rb/native/build.rs +14 -12
data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
data/ext/kreuzberg_rb/native/include/strings.h +2 -2
data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
data/extconf.rb +6 -38
data/kreuzberg.gemspec +20 -114
data/lib/kreuzberg/api_proxy.rb +18 -2
data/lib/kreuzberg/cache_api.rb +0 -22
data/lib/kreuzberg/cli.rb +10 -2
data/lib/kreuzberg/cli_proxy.rb +10 -0
data/lib/kreuzberg/config.rb +22 -274
data/lib/kreuzberg/errors.rb +7 -73
data/lib/kreuzberg/extraction_api.rb +8 -237
data/lib/kreuzberg/mcp_proxy.rb +11 -2
data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
data/lib/kreuzberg/post_processor_protocol.rb +71 -0
data/lib/kreuzberg/result.rb +33 -151
data/lib/kreuzberg/setup_lib_path.rb +2 -22
data/lib/kreuzberg/validator_protocol.rb +73 -0
data/lib/kreuzberg/version.rb +1 -1
data/lib/kreuzberg.rb +13 -27
data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
data/sig/kreuzberg.rbs +12 -105
data/spec/binding/cache_spec.rb +22 -22
data/spec/binding/cli_proxy_spec.rb +4 -2
data/spec/binding/cli_spec.rb +11 -12
data/spec/binding/config_spec.rb +0 -74
data/spec/binding/config_validation_spec.rb +6 -100
data/spec/binding/error_handling_spec.rb +97 -283
data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
data/spec/binding/plugins/postprocessor_spec.rb +11 -11
data/spec/binding/plugins/validator_spec.rb +13 -12
data/spec/examples.txt +104 -0
data/spec/fixtures/config.toml +1 -0
data/spec/fixtures/config.yaml +1 -0
data/spec/fixtures/invalid_config.toml +1 -0
data/spec/smoke/package_spec.rb +3 -2
data/spec/spec_helper.rb +3 -1
data/vendor/kreuzberg/Cargo.toml +67 -192
data/vendor/kreuzberg/README.md +9 -97
data/vendor/kreuzberg/build.rs +194 -516
data/vendor/kreuzberg/src/api/handlers.rs +9 -130
data/vendor/kreuzberg/src/api/mod.rs +3 -18
data/vendor/kreuzberg/src/api/server.rs +71 -236
data/vendor/kreuzberg/src/api/types.rs +7 -43
data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
data/vendor/kreuzberg/src/cache/mod.rs +3 -27
data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
data/vendor/kreuzberg/src/core/config.rs +23 -905
data/vendor/kreuzberg/src/core/extractor.rs +106 -403
data/vendor/kreuzberg/src/core/io.rs +2 -4
data/vendor/kreuzberg/src/core/mime.rs +12 -2
data/vendor/kreuzberg/src/core/mod.rs +3 -22
data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
data/vendor/kreuzberg/src/embeddings.rs +21 -169
data/vendor/kreuzberg/src/error.rs +2 -2
data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
data/vendor/kreuzberg/src/extraction/email.rs +11 -12
data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
data/vendor/kreuzberg/src/extraction/image.rs +14 -138
data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
data/vendor/kreuzberg/src/extraction/table.rs +1 -2
data/vendor/kreuzberg/src/extraction/text.rs +10 -18
data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
data/vendor/kreuzberg/src/extractors/email.rs +9 -37
data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
data/vendor/kreuzberg/src/extractors/html.rs +173 -182
data/vendor/kreuzberg/src/extractors/image.rs +8 -32
data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
data/vendor/kreuzberg/src/extractors/text.rs +7 -30
data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
data/vendor/kreuzberg/src/lib.rs +5 -17
data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
data/vendor/kreuzberg/src/mcp/server.rs +21 -145
data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
data/vendor/kreuzberg/src/pdf/error.rs +1 -93
data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
data/vendor/kreuzberg/src/pdf/table.rs +64 -61
data/vendor/kreuzberg/src/pdf/text.rs +24 -416
data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
data/vendor/kreuzberg/src/text/mod.rs +0 -8
data/vendor/kreuzberg/src/text/quality.rs +15 -28
data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
data/vendor/kreuzberg/src/types.rs +67 -907
data/vendor/kreuzberg/src/utils/mod.rs +0 -14
data/vendor/kreuzberg/src/utils/quality.rs +3 -12
data/vendor/kreuzberg/tests/api_tests.rs +0 -506
data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
data/vendor/kreuzberg/tests/config_features.rs +1 -33
data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
data/vendor/kreuzberg/tests/core_integration.rs +9 -35
data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
data/vendor/kreuzberg/tests/email_integration.rs +1 -3
data/vendor/kreuzberg/tests/error_handling.rs +34 -43
data/vendor/kreuzberg/tests/format_integration.rs +1 -7
data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
data/vendor/kreuzberg/tests/image_integration.rs +0 -2
data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
data/vendor/kreuzberg/tests/security_validation.rs +1 -13
data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
metadata +25 -171
data/.rubocop.yml +0 -543
data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
data/lib/kreuzberg/error_context.rb +0 -136
data/lib/kreuzberg/types.rb +0 -170
data/lib/libpdfium.so +0 -0
data/spec/binding/async_operations_spec.rb +0 -473
data/spec/binding/batch_operations_spec.rb +0 -595
data/spec/binding/batch_spec.rb +0 -359
data/spec/binding/config_result_spec.rb +0 -377
data/spec/binding/embeddings_spec.rb +0 -816
data/spec/binding/error_recovery_spec.rb +0 -488
data/spec/binding/font_config_spec.rb +0 -220
data/spec/binding/images_spec.rb +0 -738
data/spec/binding/keywords_extraction_spec.rb +0 -600
data/spec/binding/metadata_types_spec.rb +0 -1228
data/spec/binding/pages_extraction_spec.rb +0 -471
data/spec/binding/tables_spec.rb +0 -641
data/spec/unit/config/chunking_config_spec.rb +0 -213
data/spec/unit/config/embedding_config_spec.rb +0 -343
data/spec/unit/config/extraction_config_spec.rb +0 -438
data/spec/unit/config/font_config_spec.rb +0 -285
data/spec/unit/config/hierarchy_config_spec.rb +0 -314
data/spec/unit/config/image_extraction_config_spec.rb +0 -209
data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
data/spec/unit/config/keyword_config_spec.rb +0 -229
data/spec/unit/config/language_detection_config_spec.rb +0 -258
data/spec/unit/config/ocr_config_spec.rb +0 -171
data/spec/unit/config/page_config_spec.rb +0 -221
data/spec/unit/config/pdf_config_spec.rb +0 -267
data/spec/unit/config/postprocessor_config_spec.rb +0 -290
data/spec/unit/config/tesseract_config_spec.rb +0 -181
data/spec/unit/config/token_reduction_config_spec.rb +0 -251
data/test/metadata_types_test.rb +0 -959
data/vendor/Cargo.toml +0 -61
data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
data/vendor/kreuzberg/src/core/formats.rs +0 -235
data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
data/vendor/kreuzberg/src/extractors/security.rs +0 -484
data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
data/vendor/kreuzberg/src/panic_context.rs +0 -154
data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
data/vendor/kreuzberg/src/utils/pool.rs +0 -503
data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
data/vendor/kreuzberg/tests/api_embed.rs +0 -360
data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
data/vendor/kreuzberg/tests/page_markers.rs +0 -297
data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
data/vendor/kreuzberg-ffi/README.md +0 -851
data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
data/vendor/kreuzberg-ffi/build.rs +0 -168
data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
data/vendor/kreuzberg-ffi/src/error.rs +0 -901
data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
data/vendor/kreuzberg-ffi/src/result.rs +0 -510
data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
data/vendor/kreuzberg-ffi/src/types.rs +0 -363
data/vendor/kreuzberg-ffi/src/util.rs +0 -210
data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
data/vendor/kreuzberg-tesseract/LICENSE +0 -22
data/vendor/kreuzberg-tesseract/README.md +0 -399
data/vendor/kreuzberg-tesseract/build.rs +0 -1127
data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211

data/vendor/kreuzberg/src/types.rs CHANGED

@@ -1,6 +1,5 @@
 use serde::{Deserialize, Serialize};
-use std::collections::{BTreeMap, HashMap};
-use std::sync::Arc;
+use std::collections::HashMap;
 #[cfg(feature = "pdf")]
 use crate::pdf::metadata::PdfMetadata;
@@ -8,137 +7,6 @@ use crate::pdf::metadata::PdfMetadata;
 // ============================================================================
 // ============================================================================
-/// Module providing transparent serde support for Arc<T>.
-///
-/// Allows Arc-wrapped types to serialize/deserialize as if unwrapped,
-/// maintaining exact JSON format while preserving memory efficiency benefits.
-///
-/// # Arc Sharing Semantics
-///
-/// **Important**: Arc sharing semantics are **NOT** preserved across serialization.
-/// When deserializing, each Arc is independently created with `Arc::new()`.
-/// This means that if two Arcs referenced the same data before serialization,
-/// they will be separate Arcs after deserialization.
-///
-/// Example:
-/// ```ignore
-/// let shared = Arc::new(Table { /* ... */ });
-/// let tables = vec![Arc::clone(&shared), Arc::clone(&shared)];
-/// // Both in-memory Arcs point to the same Table
-///
-/// let json = serde_json::to_string(&tables)?;
-/// let deserialized: Vec<Arc<Table>> = serde_json::from_str(&json)?;
-/// // deserialized[0] and deserialized[1] are now independent Arcs,
-/// // even though they contain identical data
-/// ```
-///
-/// This design choice maintains:
-/// - Exact JSON format compatibility (no sharing metadata in JSON)
-/// - Predictable deserialization behavior
-/// - Zero additional serialization overhead
-///
-/// If in-memory sharing is required, callers must implement custom sharing logic
-/// or use a different data structure (like a HashMap of deduplicated values).
-#[allow(dead_code)]
-mod serde_arc {
-    use serde::{Deserialize, Deserializer, Serializer};
-    use std::sync::Arc;
-    /// Serialize an Arc<T> by serializing the inner value directly.
-    ///
-    /// This makes Arc<T> serialize identically to T, maintaining API compatibility.
-    /// The outer Arc wrapper is transparent during serialization.
-    pub fn serialize<S, T>(arc_value: &Arc<T>, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: Serializer,
-        T: serde::Serialize,
-    {
-        (**arc_value).serialize(serializer)
-    }
-    /// Deserialize a T and wrap it in Arc.
-    ///
-    /// This makes Arc<T> deserialize from the same format as T.
-    /// Each Arc is independently created during deserialization;
-    /// Arc sharing from before serialization is NOT preserved.
-    pub fn deserialize<'de, D, T>(deserializer: D) -> Result<Arc<T>, D::Error>
-    where
-        D: Deserializer<'de>,
-        T: Deserialize<'de>,
-    {
-        T::deserialize(deserializer).map(Arc::new)
-    }
-}
-/// Module for serializing Vec<Arc<T>> with transparent Arc handling.
-///
-/// Serializes a Vec<Arc<T>> as Vec<T> for compatibility, while preserving
-/// Arc semantics for memory efficiency.
-///
-/// # Arc Sharing Semantics
-///
-/// **Important**: Arc sharing semantics are **NOT** preserved across serialization.
-/// When deserializing, each element's Arc is independently created with `Arc::new()`.
-/// This is important for `PageContent` where tables/images may be shared across pages.
-///
-/// Example with shared tables:
-/// ```ignore
-/// let shared_table = Arc::new(Table { /* ... */ });
-/// let page_contents = vec![
-///     PageContent { tables: vec![Arc::clone(&shared_table)], ... },
-///     PageContent { tables: vec![Arc::clone(&shared_table)], ... },
-/// ];
-/// // In-memory: both pages' tables point to the same Arc
-///
-/// let json = serde_json::to_string(&page_contents)?;
-/// let deserialized = serde_json::from_str::<Vec<PageContent>>(&json)?;
-/// // After deserialization: each page has independent Arc instances,
-/// // even though the table data is identical
-/// ```
-///
-/// Design rationale:
-/// - JSON has no mechanism to represent shared references
-/// - Preserving sharing would require complex metadata and deduplication
-/// - Current approach is simple, predictable, and maintains compatibility
-/// - In-memory sharing (via Arc) is an implementation detail for the Rust side
-///
-/// If in-memory sharing is required after deserialization, implement custom
-/// deduplication logic using hashing or content comparison.
-mod serde_vec_arc {
-    use serde::{Deserialize, Deserializer, Serializer};
-    use std::sync::Arc;
-    /// Serialize Vec<Arc<T>> by serializing each T directly.
-    ///
-    /// Each element is unwrapped from its Arc and serialized independently.
-    /// No sharing metadata is included in the serialized output.
-    pub fn serialize<S, T>(vec: &[Arc<T>], serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: Serializer,
-        T: serde::Serialize,
-    {
-        use serde::ser::SerializeSeq;
-        let mut seq = serializer.serialize_seq(Some(vec.len()))?;
-        for arc_item in vec {
-            seq.serialize_element(&**arc_item)?;
-        }
-        seq.end()
-    }
-    /// Deserialize Vec<T> and wrap each element in Arc.
-    ///
-    /// Each element is independently wrapped in a new Arc.
-    /// Sharing relationships from before serialization are lost.
-    pub fn deserialize<'de, D, T>(deserializer: D) -> Result<Vec<Arc<T>>, D::Error>
-    where
-        D: Deserializer<'de>,
-        T: Deserialize<'de>,
-    {
-        let vec: Vec<T> = Deserialize::deserialize(deserializer)?;
-        Ok(vec.into_iter().map(Arc::new).collect())
-    }
-}
 /// General extraction result used by the core extraction API.
 ///
 /// This is the main result type returned by all extraction functions.
@@ -166,13 +34,6 @@ pub struct ExtractionResult {
     /// Each image may optionally contain a nested `ocr_result` if OCR was performed.
     #[serde(skip_serializing_if = "Option::is_none")]
     pub images: Option<Vec<ExtractedImage>>,
-    /// Per-page content when page extraction is enabled.
-    ///
-    /// When page extraction is configured, the document is split into per-page content
-    /// with tables and images mapped to their respective pages.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub pages: Option<Vec<PageContent>>,
 }
 /// Format-specific metadata (discriminated union).
@@ -201,45 +62,17 @@ pub enum FormatMetadata {
 /// via a discriminated union, and additional custom fields from postprocessors.
 #[derive(Debug, Clone, Serialize, Deserialize, Default)]
 pub struct Metadata {
-    /// Document title
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub title: Option<String>,
-    /// Document subject or description
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub subject: Option<String>,
-    /// Primary author(s) - always Vec for consistency
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub authors: Option<Vec<String>>,
-    /// Keywords/tags - always Vec for consistency
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub keywords: Option<Vec<String>>,
-    /// Primary language (ISO 639 code)
+    /// Language of the document (ISO 639 code)
     #[serde(skip_serializing_if = "Option::is_none")]
     pub language: Option<String>,
-    /// Creation timestamp (ISO 8601 format)
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub created_at: Option<String>,
-    /// Last modification timestamp (ISO 8601 format)
+    /// Document date (format varies by source)
     #[serde(skip_serializing_if = "Option::is_none")]
-    pub modified_at: Option<String>,
-    /// User who created the document
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub created_by: Option<String>,
-    /// User who last modified the document
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub modified_by: Option<String>,
+    pub date: Option<String>,
-    /// Page/slide/sheet structure with boundaries
+    /// Document subject/description
     #[serde(skip_serializing_if = "Option::is_none")]
-    pub pages: Option<PageStructure>,
+    pub subject: Option<String>,
     /// Format-specific metadata (discriminated union)
     ///
@@ -269,177 +102,6 @@ pub struct Metadata {
     pub additional: HashMap<String, serde_json::Value>,
 }
-/// Unified page structure for documents.
-///
-/// Supports different page types (PDF pages, PPTX slides, Excel sheets)
-/// with character offset boundaries for chunk-to-page mapping.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct PageStructure {
-    /// Total number of pages/slides/sheets
-    pub total_count: usize,
-    /// Type of paginated unit
-    pub unit_type: PageUnitType,
-    /// Character offset boundaries for each page
-    ///
-    /// Maps character ranges in the extracted content to page numbers.
-    /// Used for chunk page range calculation.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub boundaries: Option<Vec<PageBoundary>>,
-    /// Detailed per-page metadata (optional, only when needed)
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub pages: Option<Vec<PageInfo>>,
-}
-/// Type of paginated unit in a document.
-///
-/// Distinguishes between different types of "pages" (PDF pages, presentation slides, spreadsheet sheets).
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
-#[serde(rename_all = "snake_case")]
-pub enum PageUnitType {
-    /// Standard document pages (PDF, DOCX, images)
-    Page,
-    /// Presentation slides (PPTX, ODP)
-    Slide,
-    /// Spreadsheet sheets (XLSX, ODS)
-    Sheet,
-}
-/// Byte offset boundary for a page.
-///
-/// Tracks where a specific page's content starts and ends in the main content string,
-/// enabling mapping from byte positions to page numbers. Offsets are guaranteed to be
-/// at valid UTF-8 character boundaries when using standard String methods (push_str, push, etc.).
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct PageBoundary {
-    /// Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive)
-    pub byte_start: usize,
-    /// Byte offset where this page ends in the content string (UTF-8 valid boundary, exclusive)
-    pub byte_end: usize,
-    /// Page number (1-indexed)
-    pub page_number: usize,
-}
-/// Metadata for individual page/slide/sheet.
-///
-/// Captures per-page information including dimensions, content counts,
-/// and visibility state (for presentations).
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct PageInfo {
-    /// Page number (1-indexed)
-    pub number: usize,
-    /// Page title (usually for presentations)
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub title: Option<String>,
-    /// Dimensions in points (PDF) or pixels (images): (width, height)
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub dimensions: Option<(f64, f64)>,
-    /// Number of images on this page
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub image_count: Option<usize>,
-    /// Number of tables on this page
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub table_count: Option<usize>,
-    /// Whether this page is hidden (e.g., in presentations)
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub hidden: Option<bool>,
-}
-/// Content for a single page/slide.
-///
-/// When page extraction is enabled, documents are split into per-page content
-/// with associated tables and images mapped to each page.
-///
-/// # Performance
-///
-/// Uses Arc-wrapped tables and images for memory efficiency:
-/// - `Vec<Arc<Table>>` enables zero-copy sharing of table data
-/// - `Vec<Arc<ExtractedImage>>` enables zero-copy sharing of image data
-/// - Maintains exact JSON compatibility via custom Serialize/Deserialize
-///
-/// This reduces memory overhead for documents with shared tables/images
-/// by avoiding redundant copies during serialization.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct PageContent {
-    /// Page number (1-indexed)
-    pub page_number: usize,
-    /// Text content for this page
-    pub content: String,
-    /// Tables found on this page (uses Arc for memory efficiency)
-    ///
-    /// Serializes as Vec<Table> for JSON compatibility while maintaining
-    /// Arc semantics in-memory for zero-copy sharing.
-    #[serde(skip_serializing_if = "Vec::is_empty", default, with = "serde_vec_arc")]
-    pub tables: Vec<Arc<Table>>,
-    /// Images found on this page (uses Arc for memory efficiency)
-    ///
-    /// Serializes as Vec<ExtractedImage> for JSON compatibility while maintaining
-    /// Arc semantics in-memory for zero-copy sharing.
-    #[serde(skip_serializing_if = "Vec::is_empty", default, with = "serde_vec_arc")]
-    pub images: Vec<Arc<ExtractedImage>>,
-    /// Hierarchy information for the page (when hierarchy extraction is enabled)
-    ///
-    /// Contains text hierarchy levels (H1-H6) extracted from the page content.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub hierarchy: Option<PageHierarchy>,
-}
-/// Page hierarchy structure containing heading levels and block information.
-///
-/// Used when PDF text hierarchy extraction is enabled. Contains hierarchical
-/// blocks with heading levels (H1-H6) for semantic document structure.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct PageHierarchy {
-    /// Number of hierarchy blocks on this page
-    pub block_count: usize,
-    /// Hierarchical blocks with heading levels
-    #[serde(skip_serializing_if = "Vec::is_empty", default)]
-    pub blocks: Vec<HierarchicalBlock>,
-}
-/// A text block with hierarchy level assignment.
-///
-/// Represents a block of text with semantic heading information extracted from
-/// font size clustering and hierarchical analysis.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct HierarchicalBlock {
-    /// The text content of this block
-    pub text: String,
-    /// The font size of the text in this block
-    pub font_size: f32,
-    /// The hierarchy level of this block (H1-H6 or Body)
-    ///
-    /// Levels correspond to HTML heading tags:
-    /// - "h1": Top-level heading
-    /// - "h2": Secondary heading
-    /// - "h3": Tertiary heading
-    /// - "h4": Quaternary heading
-    /// - "h5": Quinary heading
-    /// - "h6": Senary heading
-    /// - "body": Body text (no heading level)
-    pub level: String,
-    /// Bounding box information for the block
-    ///
-    /// Contains coordinates as (left, top, right, bottom) in PDF units.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub bbox: Option<(f32, f32, f32, f32)>,
-}
 /// Excel/spreadsheet metadata.
 ///
 /// Contains information about sheets in Excel, LibreOffice Calc, and other
@@ -551,308 +213,73 @@ pub struct TextMetadata {
     pub code_blocks: Option<Vec<(String, String)>>,
 }
-/// Text direction enumeration for HTML documents.
-#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
-#[serde(rename_all = "lowercase")]
-pub enum TextDirection {
-    /// Left-to-right text direction
-    #[serde(rename = "ltr")]
-    LeftToRight,
-    /// Right-to-left text direction
-    #[serde(rename = "rtl")]
-    RightToLeft,
-    /// Automatic text direction detection
-    #[serde(rename = "auto")]
-    Auto,
-}
-/// Header/heading element metadata.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct HeaderMetadata {
-    /// Header level: 1 (h1) through 6 (h6)
-    pub level: u8,
-    /// Normalized text content of the header
-    pub text: String,
-    /// HTML id attribute if present
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub id: Option<String>,
-    /// Document tree depth at the header element
-    pub depth: usize,
-    /// Byte offset in original HTML document
-    pub html_offset: usize,
-}
-/// Link element metadata.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct LinkMetadata {
-    /// The href URL value
-    pub href: String,
-    /// Link text content (normalized)
-    pub text: String,
-    /// Optional title attribute
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub title: Option<String>,
-    /// Link type classification
-    pub link_type: LinkType,
-    /// Rel attribute values
-    pub rel: Vec<String>,
-    /// Additional attributes as key-value pairs
-    pub attributes: HashMap<String, String>,
-}
-/// Link type classification.
-#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
-#[serde(rename_all = "lowercase")]
-pub enum LinkType {
-    /// Anchor link (#section)
-    Anchor,
-    /// Internal link (same domain)
-    Internal,
-    /// External link (different domain)
-    External,
-    /// Email link (mailto:)
-    Email,
-    /// Phone link (tel:)
-    Phone,
-    /// Other link type
-    Other,
-}
-/// Image element metadata.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct ImageMetadataType {
-    /// Image source (URL, data URI, or SVG content)
-    pub src: String,
-    /// Alternative text from alt attribute
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub alt: Option<String>,
-    /// Title attribute
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub title: Option<String>,
-    /// Image dimensions as (width, height) if available
-    pub dimensions: Option<(u32, u32)>,
-    /// Image type classification
-    pub image_type: ImageType,
-    /// Additional attributes as key-value pairs
-    pub attributes: HashMap<String, String>,
-}
-/// Image type classification.
-#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
-#[serde(rename_all = "lowercase")]
-pub enum ImageType {
-    /// Data URI image
-    #[serde(rename = "data-uri")]
-    DataUri,
-    /// Inline SVG
-    #[serde(rename = "inline-svg")]
-    InlineSvg,
-    /// External image URL
-    External,
-    /// Relative path image
-    Relative,
-}
-/// Structured data (Schema.org, microdata, RDFa) block.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct StructuredData {
-    /// Type of structured data
-    pub data_type: StructuredDataType,
-    /// Raw JSON string representation
-    pub raw_json: String,
-    /// Schema type if detectable (e.g., "Article", "Event", "Product")
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub schema_type: Option<String>,
-}
-/// Structured data type classification.
-#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
-#[serde(rename_all = "lowercase")]
-pub enum StructuredDataType {
-    /// JSON-LD structured data
-    #[serde(rename = "json-ld")]
-    JsonLd,
-    /// Microdata
-    Microdata,
-    /// RDFa
-    #[serde(rename = "rdfa")]
-    RDFa,
-}
 /// HTML metadata extracted from HTML documents.
 ///
-/// Includes document-level metadata, Open Graph data, Twitter Card metadata,
-/// and extracted structural elements (headers, links, images, structured data).
+/// Includes meta tags, Open Graph data, Twitter Card metadata, and link relations.
 #[derive(Debug, Clone, Serialize, Deserialize, Default)]
 pub struct HtmlMetadata {
-    /// Document title from `<title>` tag
     #[serde(skip_serializing_if = "Option::is_none")]
     pub title: Option<String>,
-    /// Document description from `<meta name="description">` tag
     #[serde(skip_serializing_if = "Option::is_none")]
     pub description: Option<String>,
-    /// Document keywords from `<meta name="keywords">` tag, split on commas
-    #[serde(default)]
-    pub keywords: Vec<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub keywords: Option<String>,
-    /// Document author from `<meta name="author">` tag
     #[serde(skip_serializing_if = "Option::is_none")]
     pub author: Option<String>,
-    /// Canonical URL from `<link rel="canonical">` tag
     #[serde(skip_serializing_if = "Option::is_none")]
-    pub canonical_url: Option<String>,
+    pub canonical: Option<String>,
-    /// Base URL from `<base href="">` tag for resolving relative URLs
     #[serde(skip_serializing_if = "Option::is_none")]
     pub base_href: Option<String>,
-    /// Document language from `lang` attribute
     #[serde(skip_serializing_if = "Option::is_none")]
-    pub language: Option<String>,
+    pub og_title: Option<String>,
-    /// Document text direction from `dir` attribute
     #[serde(skip_serializing_if = "Option::is_none")]
-    pub text_direction: Option<TextDirection>,
+    pub og_description: Option<String>,
-    /// Open Graph metadata (og:* properties) for social media
-    /// Keys like "title", "description", "image", "url", etc.
-    #[serde(default)]
-    pub open_graph: BTreeMap<String, String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub og_image: Option<String>,
-    /// Twitter Card metadata (twitter:* properties)
-    /// Keys like "card", "site", "creator", "title", "description", "image", etc.
-    #[serde(default)]
-    pub twitter_card: BTreeMap<String, String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub og_url: Option<String>,
-    /// Additional meta tags not covered by specific fields
-    /// Keys are meta name/property attributes, values are content
-    #[serde(default)]
-    pub meta_tags: BTreeMap<String, String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub og_type: Option<String>,
-    /// Extracted header elements with hierarchy
-    #[serde(default)]
-    pub headers: Vec<HeaderMetadata>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub og_site_name: Option<String>,
-    /// Extracted hyperlinks with type classification
-    #[serde(default)]
-    pub links: Vec<LinkMetadata>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub twitter_card: Option<String>,
-    /// Extracted images with source and dimensions
-    #[serde(default)]
-    pub images: Vec<ImageMetadataType>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub twitter_title: Option<String>,
-    /// Extracted structured data blocks
-    #[serde(default)]
-    pub structured_data: Vec<StructuredData>,
-}
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub twitter_description: Option<String>,
-impl HtmlMetadata {
-    /// Check if metadata is empty (no meaningful content extracted).
-    pub fn is_empty(&self) -> bool {
-        self.title.is_none()
-            && self.description.is_none()
-            && self.keywords.is_empty()
-            && self.author.is_none()
-            && self.canonical_url.is_none()
-            && self.base_href.is_none()
-            && self.language.is_none()
-            && self.text_direction.is_none()
-            && self.open_graph.is_empty()
-            && self.twitter_card.is_empty()
-            && self.meta_tags.is_empty()
-            && self.headers.is_empty()
-            && self.links.is_empty()
-            && self.images.is_empty()
-            && self.structured_data.is_empty()
-    }
-}
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub twitter_image: Option<String>,
-#[cfg(feature = "html")]
-impl From<html_to_markdown_rs::ExtendedMetadata> for HtmlMetadata {
-    fn from(metadata: html_to_markdown_rs::ExtendedMetadata) -> Self {
-        let text_dir = metadata.document.text_direction.map(|td| match td {
-            html_to_markdown_rs::TextDirection::LeftToRight => TextDirection::LeftToRight,
-            html_to_markdown_rs::TextDirection::RightToLeft => TextDirection::RightToLeft,
-            html_to_markdown_rs::TextDirection::Auto => TextDirection::Auto,
-        });
-        HtmlMetadata {
-            title: metadata.document.title,
-            description: metadata.document.description,
-            keywords: metadata.document.keywords,
-            author: metadata.document.author,
-            canonical_url: metadata.document.canonical_url,
-            base_href: metadata.document.base_href,
-            language: metadata.document.language,
-            text_direction: text_dir,
-            open_graph: metadata.document.open_graph,
-            twitter_card: metadata.document.twitter_card,
-            meta_tags: metadata.document.meta_tags,
-            headers: metadata
-                .headers
-                .into_iter()
-                .map(|h| HeaderMetadata {
-                    level: h.level,
-                    text: h.text,
-                    id: h.id,
-                    depth: h.depth,
-                    html_offset: h.html_offset,
-                })
-                .collect(),
-            links: metadata
-                .links
-                .into_iter()
-                .map(|l| LinkMetadata {
-                    href: l.href,
-                    text: l.text,
-                    title: l.title,
-                    link_type: match l.link_type {
-                        html_to_markdown_rs::LinkType::Anchor => LinkType::Anchor,
-                        html_to_markdown_rs::LinkType::Internal => LinkType::Internal,
-                        html_to_markdown_rs::LinkType::External => LinkType::External,
-                        html_to_markdown_rs::LinkType::Email => LinkType::Email,
-                        html_to_markdown_rs::LinkType::Phone => LinkType::Phone,
-                        html_to_markdown_rs::LinkType::Other => LinkType::Other,
-                    },
-                    rel: l.rel,
-                    attributes: l.attributes.into_iter().collect(),
-                })
-                .collect(),
-            images: metadata
-                .images
-                .into_iter()
-                .map(|img| ImageMetadataType {
-                    src: img.src,
-                    alt: img.alt,
-                    title: img.title,
-                    dimensions: img.dimensions,
-                    image_type: match img.image_type {
-                        html_to_markdown_rs::ImageType::DataUri => ImageType::DataUri,
-                        html_to_markdown_rs::ImageType::InlineSvg => ImageType::InlineSvg,
-                        html_to_markdown_rs::ImageType::External => ImageType::External,
-                        html_to_markdown_rs::ImageType::Relative => ImageType::Relative,
-                    },
-                    attributes: img.attributes.into_iter().collect(),
-                })
-                .collect(),
-            structured_data: metadata
-                .structured_data
-                .into_iter()
-                .map(|sd| StructuredData {
-                    data_type: match sd.data_type {
-                        html_to_markdown_rs::StructuredDataType::JsonLd => StructuredDataType::JsonLd,
-                        html_to_markdown_rs::StructuredDataType::Microdata => StructuredDataType::Microdata,
-                        html_to_markdown_rs::StructuredDataType::RDFa => StructuredDataType::RDFa,
-                    },
-                    raw_json: sd.raw_json,
-                    schema_type: sd.schema_type,
-                })
-                .collect(),
-        }
-    }
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub twitter_site: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub twitter_creator: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub link_author: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub link_license: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub link_alternate: Option<String>,
 }
 /// OCR processing metadata.
@@ -921,11 +348,11 @@ pub struct Chunk {
 /// Metadata about a chunk's position in the original document.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct ChunkMetadata {
-    /// Byte offset where this chunk starts in the original text (UTF-8 valid boundary).
-    pub byte_start: usize,
+    /// Character offset where this chunk starts in the original text.
+    pub char_start: usize,
-    /// Byte offset where this chunk ends in the original text (UTF-8 valid boundary).
-    pub byte_end: usize,
+    /// Character offset where this chunk ends in the original text.
+    pub char_end: usize,
     /// Number of tokens in this chunk (if available).
     ///
@@ -938,18 +365,6 @@ pub struct ChunkMetadata {
     /// Total number of chunks in the document.
     pub total_chunks: usize,
-    /// First page number this chunk spans (1-indexed).
-    ///
-    /// Only populated when page tracking is enabled in extraction configuration.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub first_page: Option<usize>,
-    /// Last page number this chunk spans (1-indexed, equal to first_page for single-page chunks).
-    ///
-    /// Only populated when page tracking is enabled in extraction configuration.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub last_page: Option<usize>,
 }
 /// Extracted image from a document.
@@ -1032,11 +447,6 @@ pub struct ExcelSheet {
     pub col_count: usize,
     /// Total number of non-empty cells
     pub cell_count: usize,
-    /// Pre-extracted table cells (2D vector of cell values)
-    /// Populated during markdown generation to avoid re-parsing markdown.
-    /// None for empty sheets.
-    #[serde(skip)]
-    pub table_cells: Option<Vec<Vec<String>>>,
 }
 /// XML extraction result.
@@ -1095,22 +505,22 @@ pub struct PptxExtractionResult {
     pub table_count: usize,
     /// Extracted images from the presentation
     pub images: Vec<ExtractedImage>,
-    /// Slide structure with boundaries (when page tracking is enabled)
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub page_structure: Option<PageStructure>,
-    /// Per-slide content (when page tracking is enabled)
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub page_contents: Option<Vec<PageContent>>,
 }
 /// PowerPoint presentation metadata.
 ///
-/// Contains PPTX-specific metadata. Common fields like title, author, and description
-/// are now in the base `Metadata` struct.
+/// Contains document-level metadata extracted from the PPTX file.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct PptxMetadata {
+    /// Presentation title
+    pub title: Option<String>,
+    /// Author name
+    pub author: Option<String>,
+    /// Description/comments
+    pub description: Option<String>,
+    /// Summary text
+    pub summary: Option<String>,
     /// List of fonts used in the presentation
-    #[serde(skip_serializing_if = "Vec::is_empty", default)]
     pub fonts: Vec<String>,
 }
@@ -1434,6 +844,18 @@ pub struct CacheStats {
     pub newest_file_age_days: f64,
 }
+/// Pandoc extraction result.
+///
+/// Result of extracting content from a document using Pandoc,
+/// including text and any metadata Pandoc was able to extract.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PandocExtractionResult {
+    /// Extracted text content
+    pub content: String,
+    /// Metadata extracted by Pandoc (varies by format)
+    pub metadata: HashMap<String, serde_json::Value>,
+}
 /// LibreOffice conversion result.
 ///
 /// Result of converting a legacy office document (e.g., .doc, .ppt)
@@ -1449,265 +871,3 @@ pub struct LibreOfficeConversionResult {
     /// Target MIME type after conversion
     pub target_mime: String,
 }
-#[cfg(test)]
-mod tests {
-    use super::*;
-    #[test]
-    fn test_metadata_serialization_with_format() {
-        let mut metadata = Metadata {
-            format: Some(FormatMetadata::Text(TextMetadata {
-                line_count: 1,
-                word_count: 2,
-                character_count: 13,
-                headers: None,
-                links: None,
-                code_blocks: None,
-            })),
-            ..Default::default()
-        };
-        metadata
-            .additional
-            .insert("quality_score".to_string(), serde_json::json!(1.0));
-        let json = serde_json::to_value(&metadata).unwrap();
-        println!("Serialized metadata: {}", serde_json::to_string_pretty(&json).unwrap());
-        assert!(
-            json.get("format_type").is_some(),
-            "format_type should be present in serialized JSON"
-        );
-        assert_eq!(json.get("format_type").unwrap(), "text");
-        assert_eq!(json.get("line_count").unwrap(), 1);
-        assert_eq!(json.get("word_count").unwrap(), 2);
-        assert_eq!(json.get("character_count").unwrap(), 13);
-        assert_eq!(json.get("quality_score").unwrap(), 1.0);
-    }
-    #[test]
-    fn test_arc_table_serialization_format() {
-        let table = Table {
-            cells: vec![vec!["A".to_string(), "B".to_string()]],
-            markdown: "| A | B |\n|---|---|\n".to_string(),
-            page_number: 1,
-        };
-        let json = serde_json::to_value(&table).unwrap();
-        assert_eq!(json.get("cells").unwrap()[0][0], "A");
-        assert_eq!(json.get("markdown").unwrap(), "| A | B |\n|---|---|\n");
-        assert_eq!(json.get("page_number").unwrap(), 1);
-    }
-    #[test]
-    fn test_arc_table_roundtrip() {
-        let original = Table {
-            cells: vec![
-                vec!["X".to_string(), "Y".to_string()],
-                vec!["1".to_string(), "2".to_string()],
-            ],
-            markdown: "| X | Y |\n|---|---|\n| 1 | 2 |\n".to_string(),
-            page_number: 5,
-        };
-        let json = serde_json::to_string(&original).unwrap();
-        let deserialized: Table = serde_json::from_str(&json).unwrap();
-        assert_eq!(deserialized.cells, original.cells);
-        assert_eq!(deserialized.markdown, original.markdown);
-        assert_eq!(deserialized.page_number, original.page_number);
-    }
-    #[test]
-    fn test_arc_sharing_preserved_before_serialization() {
-        let shared_table = Arc::new(Table {
-            cells: vec![vec!["shared".to_string()]],
-            markdown: "| shared |".to_string(),
-            page_number: 1,
-        });
-        let tables_before = [Arc::clone(&shared_table), Arc::clone(&shared_table)].to_vec();
-        assert_eq!(Arc::strong_count(&tables_before[0]), 3);
-        assert_eq!(Arc::strong_count(&tables_before[1]), 3);
-        assert!(Arc::ptr_eq(&tables_before[0], &tables_before[1]));
-    }
-    #[test]
-    fn test_vec_arc_table_serialization_format() {
-        let tables = vec![
-            Table {
-                cells: vec![vec!["A".to_string()]],
-                markdown: "| A |".to_string(),
-                page_number: 1,
-            },
-            Table {
-                cells: vec![vec!["B".to_string()]],
-                markdown: "| B |".to_string(),
-                page_number: 2,
-            },
-        ];
-        let json = serde_json::to_string(&tables).unwrap();
-        let parsed: serde_json::Value = serde_json::from_str(&json).unwrap();
-        assert!(parsed.is_array());
-        assert_eq!(parsed.as_array().unwrap().len(), 2);
-        assert_eq!(parsed[0]["cells"][0][0], "A");
-        assert_eq!(parsed[1]["cells"][0][0], "B");
-    }
-    #[test]
-    fn test_page_content_arc_tables_roundtrip() {
-        let page = PageContent {
-            page_number: 3,
-            content: "Page 3 content".to_string(),
-            tables: vec![
-                Arc::new(Table {
-                    cells: vec![vec!["Table1".to_string()]],
-                    markdown: "| Table1 |".to_string(),
-                    page_number: 3,
-                }),
-                Arc::new(Table {
-                    cells: vec![vec!["Table2".to_string()]],
-                    markdown: "| Table2 |".to_string(),
-                    page_number: 3,
-                }),
-            ],
-            images: Vec::new(),
-            hierarchy: None,
-        };
-        let json = serde_json::to_string(&page).unwrap();
-        let deserialized: PageContent = serde_json::from_str(&json).unwrap();
-        assert_eq!(deserialized.page_number, 3);
-        assert_eq!(deserialized.content, "Page 3 content");
-        assert_eq!(deserialized.tables.len(), 2);
-        assert_eq!(deserialized.tables[0].cells[0][0], "Table1");
-        assert_eq!(deserialized.tables[1].cells[0][0], "Table2");
-    }
-    #[test]
-    fn test_page_content_arc_images_roundtrip() {
-        let image1 = Arc::new(ExtractedImage {
-            data: vec![0xFF, 0xD8, 0xFF],
-            format: "jpeg".to_string(),
-            image_index: 0,
-            page_number: Some(1),
-            width: Some(100),
-            height: Some(200),
-            colorspace: Some("RGB".to_string()),
-            bits_per_component: Some(8),
-            is_mask: false,
-            description: Some("Image 1".to_string()),
-            ocr_result: None,
-        });
-        let image2 = Arc::new(ExtractedImage {
-            data: vec![0x89, 0x50, 0x4E],
-            format: "png".to_string(),
-            image_index: 1,
-            page_number: Some(1),
-            width: Some(300),
-            height: Some(400),
-            colorspace: Some("RGBA".to_string()),
-            bits_per_component: Some(8),
-            is_mask: false,
-            description: Some("Image 2".to_string()),
-            ocr_result: None,
-        });
-        let page = PageContent {
-            page_number: 1,
-            content: "Page with images".to_string(),
-            tables: Vec::new(),
-            images: vec![image1, image2],
-            hierarchy: None,
-        };
-        let json = serde_json::to_string(&page).unwrap();
-        let deserialized: PageContent = serde_json::from_str(&json).unwrap();
-        assert_eq!(deserialized.images.len(), 2);
-        assert_eq!(deserialized.images[0].format, "jpeg");
-        assert_eq!(deserialized.images[0].width, Some(100));
-        assert_eq!(deserialized.images[1].format, "png");
-        assert_eq!(deserialized.images[1].height, Some(400));
-    }
-    #[test]
-    fn test_arc_sharing_loss_with_page_content() {
-        let shared_table = Arc::new(Table {
-            cells: vec![vec!["shared across pages".to_string()]],
-            markdown: "| shared across pages |".to_string(),
-            page_number: 0,
-        });
-        let page1 = PageContent {
-            page_number: 1,
-            content: "Page 1".to_string(),
-            tables: vec![Arc::clone(&shared_table)],
-            images: Vec::new(),
-            hierarchy: None,
-        };
-        let page2 = PageContent {
-            page_number: 2,
-            content: "Page 2".to_string(),
-            tables: vec![Arc::clone(&shared_table)],
-            images: Vec::new(),
-            hierarchy: None,
-        };
-        assert!(Arc::ptr_eq(&page1.tables[0], &page2.tables[0]));
-        let pages = vec![page1, page2];
-        let json = serde_json::to_string(&pages).unwrap();
-        let deserialized: Vec<PageContent> = serde_json::from_str(&json).unwrap();
-        assert_eq!(deserialized.len(), 2);
-        assert_eq!(deserialized[0].tables[0].cells, deserialized[1].tables[0].cells);
-        assert!(!Arc::ptr_eq(&deserialized[0].tables[0], &deserialized[1].tables[0]));
-    }
-    #[test]
-    fn test_empty_page_content_arcs() {
-        let page = PageContent {
-            page_number: 5,
-            content: "No tables or images".to_string(),
-            tables: Vec::new(),
-            images: Vec::new(),
-            hierarchy: None,
-        };
-        let json = serde_json::to_string(&page).unwrap();
-        let deserialized: PageContent = serde_json::from_str(&json).unwrap();
-        assert_eq!(deserialized.page_number, 5);
-        assert_eq!(deserialized.tables.len(), 0);
-        assert_eq!(deserialized.images.len(), 0);
-    }
-    #[test]
-    fn test_serde_vec_arc_module_behavior() {
-        let table1 = Table {
-            cells: vec![vec!["A".to_string()]],
-            markdown: "| A |".to_string(),
-            page_number: 1,
-        };
-        let table2 = Table {
-            cells: vec![vec!["B".to_string()]],
-            markdown: "| B |".to_string(),
-            page_number: 2,
-        };
-        let json = serde_json::to_string(&vec![table1, table2]).unwrap();
-        assert!(json.contains("\"A\""));
-        assert!(json.contains("\"B\""));
-    }
-}