RubyGems - kreuzberg - Versions diffs - 4.0.0.rc1 → 4.0.0.rc2 - Mend

kreuzberg 4.0.0.rc1 → 4.0.0.rc2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (342) hide show

checksums.yaml +4 -4
data/.gitignore +14 -8
data/.rspec +3 -3
data/.rubocop.yaml +1 -534
data/.rubocop.yml +538 -0
data/Gemfile +8 -9
data/Gemfile.lock +9 -109
data/README.md +426 -421
data/Rakefile +25 -25
data/Steepfile +47 -47
data/examples/async_patterns.rb +341 -340
data/ext/kreuzberg_rb/extconf.rb +45 -35
data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
data/ext/kreuzberg_rb/native/Cargo.toml +44 -36
data/ext/kreuzberg_rb/native/README.md +425 -425
data/ext/kreuzberg_rb/native/build.rs +15 -17
data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
data/ext/kreuzberg_rb/native/include/strings.h +20 -20
data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
data/ext/kreuzberg_rb/native/src/lib.rs +2998 -2939
data/extconf.rb +28 -28
data/kreuzberg.gemspec +148 -105
data/lib/kreuzberg/api_proxy.rb +142 -142
data/lib/kreuzberg/cache_api.rb +46 -45
data/lib/kreuzberg/cli.rb +55 -55
data/lib/kreuzberg/cli_proxy.rb +127 -127
data/lib/kreuzberg/config.rb +691 -684
data/lib/kreuzberg/error_context.rb +32 -0
data/lib/kreuzberg/errors.rb +118 -50
data/lib/kreuzberg/extraction_api.rb +85 -84
data/lib/kreuzberg/mcp_proxy.rb +186 -186
data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
data/lib/kreuzberg/post_processor_protocol.rb +86 -86
data/lib/kreuzberg/result.rb +216 -216
data/lib/kreuzberg/setup_lib_path.rb +80 -79
data/lib/kreuzberg/validator_protocol.rb +89 -89
data/lib/kreuzberg/version.rb +5 -5
data/lib/kreuzberg.rb +103 -82
data/sig/kreuzberg/internal.rbs +184 -184
data/sig/kreuzberg.rbs +520 -468
data/spec/binding/cache_spec.rb +227 -227
data/spec/binding/cli_proxy_spec.rb +85 -87
data/spec/binding/cli_spec.rb +55 -54
data/spec/binding/config_spec.rb +345 -345
data/spec/binding/config_validation_spec.rb +283 -283
data/spec/binding/error_handling_spec.rb +213 -213
data/spec/binding/errors_spec.rb +66 -66
data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
data/spec/binding/plugins/postprocessor_spec.rb +269 -269
data/spec/binding/plugins/validator_spec.rb +274 -274
data/spec/fixtures/config.toml +39 -39
data/spec/fixtures/config.yaml +41 -42
data/spec/fixtures/invalid_config.toml +4 -4
data/spec/smoke/package_spec.rb +178 -178
data/spec/spec_helper.rb +42 -42
data/vendor/kreuzberg/Cargo.toml +204 -134
data/vendor/kreuzberg/README.md +175 -175
data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
data/vendor/kreuzberg/build.rs +474 -460
data/vendor/kreuzberg/src/api/error.rs +81 -81
data/vendor/kreuzberg/src/api/handlers.rs +199 -199
data/vendor/kreuzberg/src/api/mod.rs +79 -79
data/vendor/kreuzberg/src/api/server.rs +353 -353
data/vendor/kreuzberg/src/api/types.rs +170 -170
data/vendor/kreuzberg/src/cache/mod.rs +1167 -1143
data/vendor/kreuzberg/src/chunking/mod.rs +677 -677
data/vendor/kreuzberg/src/core/batch_mode.rs +95 -35
data/vendor/kreuzberg/src/core/config.rs +1032 -1032
data/vendor/kreuzberg/src/core/extractor.rs +1024 -903
data/vendor/kreuzberg/src/core/io.rs +329 -327
data/vendor/kreuzberg/src/core/mime.rs +605 -615
data/vendor/kreuzberg/src/core/mod.rs +45 -42
data/vendor/kreuzberg/src/core/pipeline.rs +984 -906
data/vendor/kreuzberg/src/embeddings.rs +432 -323
data/vendor/kreuzberg/src/error.rs +431 -431
data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
data/vendor/kreuzberg/src/extraction/docx.rs +40 -40
data/vendor/kreuzberg/src/extraction/email.rs +854 -854
data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
data/vendor/kreuzberg/src/extraction/html.rs +553 -553
data/vendor/kreuzberg/src/extraction/image.rs +368 -368
data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -564
data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
data/vendor/kreuzberg/src/extraction/mod.rs +81 -77
data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -128
data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -3000
data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
data/vendor/kreuzberg/src/extraction/table.rs +328 -328
data/vendor/kreuzberg/src/extraction/text.rs +269 -269
data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
data/vendor/kreuzberg/src/extractors/archive.rs +446 -425
data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
data/vendor/kreuzberg/src/extractors/docx.rs +367 -479
data/vendor/kreuzberg/src/extractors/email.rs +143 -129
data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
data/vendor/kreuzberg/src/extractors/excel.rs +343 -344
data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
data/vendor/kreuzberg/src/extractors/html.rs +393 -410
data/vendor/kreuzberg/src/extractors/image.rs +198 -195
data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
data/vendor/kreuzberg/src/extractors/mod.rs +365 -268
data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
data/vendor/kreuzberg/src/extractors/pdf.rs +493 -496
data/vendor/kreuzberg/src/extractors/pptx.rs +248 -234
data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
data/vendor/kreuzberg/src/extractors/security.rs +484 -0
data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
data/vendor/kreuzberg/src/extractors/structured.rs +140 -126
data/vendor/kreuzberg/src/extractors/text.rs +260 -242
data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
data/vendor/kreuzberg/src/extractors/xml.rs +135 -128
data/vendor/kreuzberg/src/image/dpi.rs +164 -164
data/vendor/kreuzberg/src/image/mod.rs +6 -6
data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
data/vendor/kreuzberg/src/image/resize.rs +89 -89
data/vendor/kreuzberg/src/keywords/config.rs +154 -154
data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
data/vendor/kreuzberg/src/keywords/processor.rs +267 -267
data/vendor/kreuzberg/src/keywords/rake.rs +293 -294
data/vendor/kreuzberg/src/keywords/types.rs +68 -68
data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
data/vendor/kreuzberg/src/language_detection/mod.rs +942 -942
data/vendor/kreuzberg/src/lib.rs +105 -102
data/vendor/kreuzberg/src/mcp/mod.rs +32 -32
data/vendor/kreuzberg/src/mcp/server.rs +1968 -1966
data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
data/vendor/kreuzberg/src/ocr/error.rs +37 -37
data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
data/vendor/kreuzberg/src/ocr/processor.rs +863 -847
data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -450
data/vendor/kreuzberg/src/ocr/types.rs +393 -393
data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
data/vendor/kreuzberg/src/panic_context.rs +154 -0
data/vendor/kreuzberg/src/pdf/error.rs +122 -122
data/vendor/kreuzberg/src/pdf/images.rs +139 -139
data/vendor/kreuzberg/src/pdf/metadata.rs +346 -346
data/vendor/kreuzberg/src/pdf/mod.rs +50 -50
data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
data/vendor/kreuzberg/src/pdf/table.rs +393 -420
data/vendor/kreuzberg/src/pdf/text.rs +158 -161
data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -1010
data/vendor/kreuzberg/src/plugins/mod.rs +209 -209
data/vendor/kreuzberg/src/plugins/ocr.rs +620 -629
data/vendor/kreuzberg/src/plugins/processor.rs +642 -641
data/vendor/kreuzberg/src/plugins/registry.rs +1337 -1324
data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
data/vendor/kreuzberg/src/plugins/validator.rs +956 -955
data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
data/vendor/kreuzberg/src/text/mod.rs +19 -19
data/vendor/kreuzberg/src/text/quality.rs +697 -697
data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
data/vendor/kreuzberg/src/types.rs +903 -873
data/vendor/kreuzberg/src/utils/mod.rs +17 -17
data/vendor/kreuzberg/src/utils/quality.rs +959 -959
data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
data/vendor/kreuzberg/tests/api_tests.rs +966 -966
data/vendor/kreuzberg/tests/archive_integration.rs +543 -543
data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -542
data/vendor/kreuzberg/tests/batch_processing.rs +316 -304
data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -509
data/vendor/kreuzberg/tests/config_features.rs +598 -580
data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -439
data/vendor/kreuzberg/tests/core_integration.rs +510 -493
data/vendor/kreuzberg/tests/csv_integration.rs +414 -424
data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -124
data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
data/vendor/kreuzberg/tests/email_integration.rs +325 -325
data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
data/vendor/kreuzberg/tests/error_handling.rs +393 -393
data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
data/vendor/kreuzberg/tests/format_integration.rs +159 -159
data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
data/vendor/kreuzberg/tests/image_integration.rs +253 -253
data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
data/vendor/kreuzberg/tests/mime_detection.rs +428 -428
data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -510
data/vendor/kreuzberg/tests/ocr_errors.rs +676 -676
data/vendor/kreuzberg/tests/ocr_quality.rs +627 -627
data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
data/vendor/kreuzberg/tests/pdf_integration.rs +43 -43
data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -1412
data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -771
data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -561
data/vendor/kreuzberg/tests/plugin_system.rs +921 -921
data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -607
data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
data/vendor/kreuzberg/tests/security_validation.rs +415 -404
data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
data/vendor/kreuzberg/tests/test_fastembed.rs +609 -609
data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
data/vendor/rb-sys/.cargo-ok +1 -0
data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
data/vendor/rb-sys/Cargo.lock +393 -0
data/vendor/rb-sys/Cargo.toml +70 -0
data/vendor/rb-sys/Cargo.toml.orig +57 -0
data/vendor/rb-sys/LICENSE-APACHE +190 -0
data/vendor/rb-sys/LICENSE-MIT +21 -0
data/vendor/rb-sys/bin/release.sh +21 -0
data/vendor/rb-sys/build/features.rs +108 -0
data/vendor/rb-sys/build/main.rs +246 -0
data/vendor/rb-sys/build/stable_api_config.rs +153 -0
data/vendor/rb-sys/build/version.rs +48 -0
data/vendor/rb-sys/readme.md +36 -0
data/vendor/rb-sys/src/bindings.rs +21 -0
data/vendor/rb-sys/src/hidden.rs +11 -0
data/vendor/rb-sys/src/lib.rs +34 -0
data/vendor/rb-sys/src/macros.rs +371 -0
data/vendor/rb-sys/src/memory.rs +53 -0
data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
data/vendor/rb-sys/src/special_consts.rs +31 -0
data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
data/vendor/rb-sys/src/stable_api.rs +261 -0
data/vendor/rb-sys/src/symbol.rs +31 -0
data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
data/vendor/rb-sys/src/utils.rs +89 -0
data/vendor/rb-sys/src/value_type.rs +7 -0
metadata +90 -95
data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
data/spec/examples.txt +0 -104
data/vendor/kreuzberg/src/bin/profile_extract.rs +0 -455
data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +0 -275
data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +0 -178
data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +0 -491
data/vendor/kreuzberg/src/extraction/pandoc/server.rs +0 -496
data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +0 -1188
data/vendor/kreuzberg/src/extraction/pandoc/version.rs +0 -162
data/vendor/kreuzberg/src/extractors/pandoc.rs +0 -201
data/vendor/kreuzberg/tests/chunking_offset_demo.rs +0 -92
data/vendor/kreuzberg/tests/pandoc_integration.rs +0 -503

data/vendor/kreuzberg/tests/typst_behavioral_tests.rs ADDED Viewed

@@ -0,0 +1,1259 @@
+#![allow(clippy::len_zero, clippy::unnecessary_get_then_check, clippy::single_match)]
+//! Comprehensive behavioral tests for Typst extractor against Pandoc baselines.
+//!
+//! These tests expose the critical bugs found in code review:
+//! 1. 62% heading loss bug - only matches single `=` headings
+//! 2. Blockquotes not implemented
+//! 3. Display math not extracted
+//! 4. Nested table brackets cause corruption
+//! 5. Empty headings output (just `= ` with no text)
+//! 6. Regex failures silently lose metadata
+//!
+//! The tests are designed to FAIL initially, exposing real bugs that need fixing.
+//! They compare extracted output against Pandoc baseline outputs for behavioral parity.
+use kreuzberg::core::config::ExtractionConfig;
+use kreuzberg::core::extractor::extract_bytes;
+use std::{fs, path::PathBuf};
+/// Directory containing the Typst fixtures, resolved relative to this crate's manifest directory.
+fn typst_doc_root() -> PathBuf {
+    let mut root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+    root.push("../../test_documents/typst");
+    root
+}
+/// Load a test document from the test_documents/typst directory.
+///
+/// # Panics
+/// Panics when the fixture cannot be read. The message names the requested file,
+/// the resolved path and the underlying I/O error so a missing fixture is easy to diagnose.
+fn load_test_document(filename: &str) -> Vec<u8> {
+    let path = typst_doc_root().join(filename);
+    fs::read(&path).unwrap_or_else(|err| {
+        panic!(
+            "Failed to read test document: {} ({}): {}",
+            filename,
+            path.display(),
+            err
+        )
+    })
+}
+/// Load the Pandoc baseline output (`<filename_base>_pandoc_baseline.txt`) for comparison.
+///
+/// # Panics
+/// Panics when the baseline cannot be read as UTF-8 text. The message names the
+/// requested base name, the resolved path and the underlying I/O error.
+fn load_pandoc_baseline(filename_base: &str) -> String {
+    let path = typst_doc_root().join(format!("{filename_base}_pandoc_baseline.txt"));
+    fs::read_to_string(&path).unwrap_or_else(|err| {
+        panic!(
+            "Failed to read baseline: {} ({}): {}",
+            filename_base,
+            path.display(),
+            err
+        )
+    })
+}
+/// Load the Pandoc metadata JSON (`<filename_base>_pandoc_meta.json`) as raw text for comparison.
+///
+/// # Panics
+/// Panics when the metadata file cannot be read as UTF-8 text. The message names the
+/// requested base name, the resolved path and the underlying I/O error.
+fn load_pandoc_metadata(filename_base: &str) -> String {
+    let path = typst_doc_root().join(format!("{filename_base}_pandoc_meta.json"));
+    fs::read_to_string(&path).unwrap_or_else(|err| {
+        panic!(
+            "Failed to read metadata: {} ({}): {}",
+            filename_base,
+            path.display(),
+            err
+        )
+    })
+}
+/// Count the lines that open a heading of exactly `level` (`=` is level 1, `==` is level 2, ...).
+///
+/// A line is counted when, once leading whitespace is stripped, it begins with
+/// `level` equals signs immediately followed by a single space.
+fn count_heading_level(content: &str, level: usize) -> usize {
+    let mut marker = "=".repeat(level);
+    marker.push(' ');
+    let mut total = 0;
+    for line in content.lines() {
+        if line.trim_start().starts_with(marker.as_str()) {
+            total += 1;
+        }
+    }
+    total
+}
+/// Extract all heading lines from `content`.
+///
+/// A heading line is any line whose first non-whitespace character is `=`.
+/// Matching lines are returned unmodified (leading whitespace included), in document order.
+fn extract_all_headings(content: &str) -> Vec<String> {
+    // A line that starts with '=' can never start with "#set", so no extra
+    // directive filter is needed here.
+    content
+        .lines()
+        .filter(|line| line.trim_start().starts_with('='))
+        .map(String::from)
+        .collect()
+}
+/// Count the lines that consist of a Typst directive rather than content.
+///
+/// After trimming surrounding whitespace, a line counts when it starts with
+/// `#set `, `#let ` or `#import ` (keyword followed by a space).
+fn count_directive_lines(content: &str) -> usize {
+    const DIRECTIVE_PREFIXES: [&str; 3] = ["#set ", "#let ", "#import "];
+    content
+        .lines()
+        .map(str::trim)
+        .filter(|line| DIRECTIVE_PREFIXES.iter().any(|prefix| line.starts_with(*prefix)))
+        .count()
+}
+/// Count empty headings: lines made up solely of a heading marker (`=` through
+/// `======`) with no heading text.
+///
+/// Surrounding whitespace is ignored on both sides, so the marker followed only by
+/// spaces (for example `= `) is counted as well as the bare marker `=`.
+fn count_empty_headings(content: &str) -> usize {
+    content
+        .lines()
+        .filter(|line| {
+            // Trim both ends: a marker with trailing whitespace is still an empty heading.
+            let marker = line.trim();
+            (1..=6).contains(&marker.len()) && marker.bytes().all(|b| b == b'=')
+        })
+        .count()
+}
+/// Collect the text found between headings as a list of content blocks.
+///
+/// Lines before the first heading are ignored. Within a block, lines that are
+/// empty after stripping leading whitespace are skipped; the remaining lines are
+/// joined with newlines and the finished block is trimmed. Headings with no
+/// content after them produce no block.
+fn extract_content_blocks(content: &str) -> Vec<String> {
+    /// Move the accumulated text (if any) into `blocks` and reset the accumulator.
+    fn flush(pending: &mut String, blocks: &mut Vec<String>) {
+        if !pending.is_empty() {
+            blocks.push(pending.trim().to_string());
+            pending.clear();
+        }
+    }
+
+    let mut blocks: Vec<String> = Vec::new();
+    let mut pending = String::new();
+    let mut seen_heading = false;
+    for line in content.lines() {
+        let body = line.trim_start();
+        if body.starts_with('=') {
+            // A heading closes the block collected so far and opens a new one.
+            flush(&mut pending, &mut blocks);
+            seen_heading = true;
+            continue;
+        }
+        if seen_heading && !body.is_empty() {
+            pending.push_str(line);
+            pending.push('\n');
+        }
+    }
+    flush(&mut pending, &mut blocks);
+    blocks
+}
+/// Report whether `extracted` is within `tolerance_percent` of `baseline` by byte length.
+///
+/// The ratio `extracted.len() / baseline.len()` must lie in the inclusive range
+/// `1 ± tolerance_percent / 100`. An empty baseline only matches an empty extraction.
+fn content_parity_check(extracted: &str, baseline: &str, tolerance_percent: f64) -> bool {
+    if baseline.is_empty() {
+        return extracted.is_empty();
+    }
+    let slack = tolerance_percent / 100.0;
+    let size_ratio = (extracted.len() as f64) / (baseline.len() as f64);
+    ((1.0 - slack)..=(1.0 + slack)).contains(&size_ratio)
+}
+// CRITICAL BUG TESTS - These expose the 45+ issues
+/// TEST 1: CRITICAL - heading levels must not be lost.
+///
+/// Extracts `headings.typ` and checks that the output contains at least six
+/// heading lines and exactly one heading for each of the levels 1 through 6.
+///
+/// Targets the defect the authors call the "62% heading loss bug": an extractor
+/// that matches only single-`=` headings and skips `==`, `===` and deeper levels.
+/// Per the module notes, this test is expected to fail until that is fixed.
+#[tokio::test]
+async fn test_typst_all_heading_levels_not_lost() {
+    let document = load_test_document("headings.typ");
+    // Loaded only so that a missing baseline fixture aborts the test; its text is not compared.
+    let _baseline = load_pandoc_baseline("headings");
+    let extraction_config = ExtractionConfig::default();
+    let extraction = extract_bytes(&document, "application/x-typst", &extraction_config)
+        .await
+        .expect("Extraction failed");
+    let heading_total = extract_all_headings(&extraction.content).len();
+    assert!(
+        heading_total >= 6,
+        "CRITICAL BUG: Only extracted {} headings, should have extracted 6+ heading levels. \
+         This is the 62% heading loss bug - extractor only matches '=' but skips '==', '===', etc.",
+        heading_total
+    );
+    let mut level = 1;
+    while level <= 6 {
+        let occurrences = count_heading_level(&extraction.content, level);
+        assert_eq!(
+            occurrences, 1,
+            "Heading level {} should appear exactly once (found {}). \
+             Missing heading levels cause data loss in hierarchical documents.",
+            level, occurrences
+        );
+        level += 1;
+    }
+}
+/// TEST 2: Display math must survive extraction.
+///
+/// Extracts `advanced.typ` and checks two things:
+/// 1. If the Pandoc baseline shows signs of math (`²`, `Display math` or `x^2`),
+///    the extracted content must contain `$`, `Display` or `²`.
+/// 2. The extracted content must contain `^2`, `²`, or all of `x`, `y` and `r`
+///    (a loose check for the Pythagorean expression in the fixture).
+///
+/// Note: Typst writes display math as `$ ... $` (dollar signs padded with
+/// whitespace), not `$$...$$`. Per the module notes, the extractor dropped such
+/// blocks at the time of writing, so this test is expected to fail until fixed.
+#[tokio::test]
+async fn test_typst_display_math_preserved() {
+    let content = load_test_document("advanced.typ");
+    let baseline = load_pandoc_baseline("advanced");
+    let config = ExtractionConfig::default();
+    let result = extract_bytes(&content, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    let has_display_math_in_baseline =
+        baseline.contains("²") || baseline.contains("Display math") || baseline.contains("x^2");
+    if has_display_math_in_baseline {
+        let our_has_math =
+            result.content.contains("$") || result.content.contains("Display") || result.content.contains("²");
+        assert!(
+            our_has_math,
+            "Display math should be extracted. Pandoc preserves mathematical notation, \
+             but extractor drops it entirely. This breaks scientific/academic documents."
+        );
+    }
+    // `&&` binds tighter than `||`; the parentheses make that grouping explicit.
+    let has_pythagorean = result.content.contains("^2")
+        || result.content.contains("²")
+        || (result.content.contains("x") && result.content.contains("y") && result.content.contains("r"));
+    assert!(
+        has_pythagorean,
+        "Pythagorean theorem expression should be present. Display math is being dropped."
+    );
+}
+/// TEST 3: No empty headings in the output.
+///
+/// Extracts `headings.typ` and checks that the output contains no marker-only
+/// heading lines, and that every heading line still has text once its `=` marker
+/// is removed.
+///
+/// Targets the defect described in the module notes: a heading whose text is
+/// missing or malformed being emitted as a bare marker such as "= ".
+#[tokio::test]
+async fn test_typst_no_empty_headings_output() {
+    let document = load_test_document("headings.typ");
+    let extraction_config = ExtractionConfig::default();
+    let extraction = extract_bytes(&document, "application/x-typst", &extraction_config)
+        .await
+        .expect("Extraction failed");
+    let blank_heading_count = count_empty_headings(&extraction.content);
+    assert_eq!(
+        blank_heading_count, 0,
+        "Found {} empty heading lines (just '=' with no text). \
+         Extractor outputs malformed headings like '= ' with no text, \
+         corrupting the document structure.",
+        blank_heading_count
+    );
+    let headings = extract_all_headings(&extraction.content);
+    for trimmed in headings.iter().map(|heading| heading.trim_start()) {
+        // Whatever follows the run of '=' characters is the heading text.
+        let heading_text = trimmed.trim_start_matches('=').trim();
+        assert!(
+            !heading_text.is_empty(),
+            "Heading '{}' has no text after marker. Should not output empty headings.",
+            trimmed
+        );
+    }
+}
+/// TEST 4: Metadata extraction must be complete.
+///
+/// Extracts `metadata.typ` and checks that `result.metadata.additional` holds a
+/// `title`, an `author` and a `keywords` entry whose string form is non-empty.
+///
+/// Targets the defect described in the module notes: metadata regexes that fail
+/// to match and silently yield nothing instead of reporting the failure.
+#[tokio::test]
+async fn test_typst_metadata_extraction_completeness() {
+    let document = load_test_document("metadata.typ");
+    // Loaded only so that a missing metadata fixture aborts the test; its text is not compared.
+    let _baseline_meta = load_pandoc_metadata("metadata");
+    let extraction_config = ExtractionConfig::default();
+    let extraction = extract_bytes(&document, "application/x-typst", &extraction_config)
+        .await
+        .expect("Extraction failed");
+    // True when `key` is present and its string rendering is non-empty.
+    let has_non_empty = |key: &str| {
+        extraction
+            .metadata
+            .additional
+            .get(key)
+            .map_or(false, |value| !value.to_string().is_empty())
+    };
+    let has_title = has_non_empty("title");
+    let has_author = has_non_empty("author");
+    let has_keywords = has_non_empty("keywords");
+    assert!(
+        has_title,
+        "Title metadata should be extracted. Regex pattern matching fails silently \
+         and metadata is lost with no error reporting."
+    );
+    assert!(
+        has_author,
+        "Author metadata should be extracted. Some metadata formats fail silently."
+    );
+    assert!(
+        has_keywords,
+        "Keywords should be extracted. Regex failures cause silent data loss."
+    );
+}
+/// TEST 5: Tables with nested brackets must not be corrupted.
+///
+/// Extracts `advanced.typ`. The checks run only when the Pandoc baseline contains
+/// both `Name` and `Alice`; in that case the extracted content must contain
+/// `Name`, `Alice` and `Age`, and must not contain the sequence `[[`.
+///
+/// Targets the defect described in the module notes: naive bracket counting that
+/// mangles cells such as `[Name [full]]`.
+#[tokio::test]
+async fn test_typst_tables_with_nested_brackets_not_corrupted() {
+    let document = load_test_document("advanced.typ");
+    let baseline = load_pandoc_baseline("advanced");
+    let extraction_config = ExtractionConfig::default();
+    let extraction = extract_bytes(&document, "application/x-typst", &extraction_config)
+        .await
+        .expect("Extraction failed");
+    // Without a table in the baseline there is nothing to compare against.
+    if !(baseline.contains("Name") && baseline.contains("Alice")) {
+        return;
+    }
+    let text = &extraction.content;
+    let table_content_extracted = ["Name", "Alice", "Age"].iter().all(|cell| text.contains(*cell));
+    assert!(
+        table_content_extracted,
+        "Table content should be extracted correctly. Nested brackets cause corruption \
+         and table cells are malformed."
+    );
+    let corrupted_brackets = text.matches("[[").count();
+    assert_eq!(
+        corrupted_brackets, 0,
+        "Found corrupted bracket sequences [[. Table extraction with nested brackets \
+         produces malformed output."
+    );
+}
+/// TEST 6: Content volume parity with Pandoc.
+///
+/// For each listed fixture, extracts `<name>.typ` and requires the byte length of
+/// the extracted content to be within the given percentage of the Pandoc baseline
+/// (`simple`: 30%, `headings`: 20%). A large gap in either direction suggests
+/// lost content or injected noise.
+#[tokio::test]
+async fn test_typst_content_volume_parity_with_pandoc() {
+    for (doc_name, tolerance) in [("simple", 30.0), ("headings", 20.0)] {
+        let source_file = format!("{}.typ", doc_name);
+        let document = load_test_document(&source_file);
+        let baseline = load_pandoc_baseline(doc_name);
+        let extraction_config = ExtractionConfig::default();
+        let extraction = match extract_bytes(&document, "application/x-typst", &extraction_config).await {
+            Ok(extraction) => extraction,
+            Err(_) => panic!("Extraction failed for {}", doc_name),
+        };
+        let is_within_tolerance = content_parity_check(&extraction.content, &baseline, tolerance);
+        assert!(
+            is_within_tolerance,
+            "Content volume parity failed for {}: \
+             Baseline: {} bytes, Extracted: {} bytes ({}% tolerance allowed). \
+             Data loss indicates missing extraction features or formatting issues.",
+            doc_name,
+            baseline.len(),
+            extraction.content.len(),
+            tolerance
+        );
+    }
+}
+/// TEST 7: Blockquote content is extracted
+///
+/// Feeds an inline `#quote[...]` block to the extractor and checks that
+/// the quoted text shows up in the output.
+///
+/// Expected: Blockquote content should be extracted
+/// Current behavior: Feature not implemented
+/// WILL FAIL: Exposing missing blockquote support
+#[tokio::test]
+async fn test_typst_blockquote_handling() {
+    let quote_source = b"#quote[
+        This is a blockquote.
+        It should be extracted.
+    ]";
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(quote_source, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    let text = &extraction.content;
+    // Either marker is accepted as evidence that the quote body survived.
+    let quote_survived = ["blockquote", "This is a blockquote"]
+        .iter()
+        .any(|&needle| text.contains(needle));
+    assert!(
+        quote_survived,
+        "Blockquote content should be extracted. Blockquotes are not implemented \
+         in the extractor, causing complete loss of quoted content."
+    );
+}
+/// TEST 8: Inline code preservation
+///
+/// Extracts `advanced.typ` and accepts the output if it contains a
+/// backtick, or if it contains the word "code" while the Pandoc baseline
+/// contains the literal `` `code` `` span.
+///
+/// Expected: Inline code preserved with backticks or clearly marked
+/// Current behavior: May be corrupted
+/// WILL FAIL: If inline code is not preserved
+#[tokio::test]
+async fn test_typst_inline_code_preserved() {
+    let source = load_test_document("advanced.typ");
+    let baseline = load_pandoc_baseline("advanced");
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(&source, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    let text = &extraction.content;
+    let keeps_backticks = text.contains("`");
+    let keeps_code_word = text.contains("code") && baseline.contains("`code`");
+    assert!(
+        keeps_backticks || keeps_code_word,
+        "Inline code should be preserved with backticks or clearly marked."
+    );
+}
+/// TEST 9: Inline math extraction
+///
+/// Only asserts when the Pandoc baseline itself contains `$` or
+/// "equation"; in that case the extracted text must contain `$`, "sqrt"
+/// or "equation".
+///
+/// Expected: Inline math formulas preserved
+/// Current behavior: May be dropped
+/// WILL FAIL: If inline math is lost
+#[tokio::test]
+async fn test_typst_inline_math_preserved() {
+    let source = load_test_document("advanced.typ");
+    let baseline = load_pandoc_baseline("advanced");
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(&source, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    // Nothing to verify when the baseline shows no math at all.
+    if !baseline.contains("$") && !baseline.contains("equation") {
+        return;
+    }
+    let text = &extraction.content;
+    let math_survived = ["$", "sqrt", "equation"].iter().any(|&needle| text.contains(needle));
+    assert!(
+        math_survived,
+        "Inline math should be extracted. Mathematical formulas are being dropped."
+    );
+}
+/// TEST 10: Figures and captions
+///
+/// Runs the extractor on an inline `#figure(...)` with a caption. The test
+/// only requires extraction to succeed; whether caption/figure text was
+/// found is reported in the printed diagnostics rather than asserted,
+/// because the feature may be unimplemented.
+///
+/// Expected: Figure content and captions extracted
+/// Current behavior: May be unimplemented
+#[tokio::test]
+async fn test_typst_figures_and_captions() {
+    let test_content = b"#figure(
+        image(\"example.png\"),
+        caption: [This is a figure caption]
+    )";
+    let config = ExtractionConfig::default();
+    let result = extract_bytes(test_content, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    // Informational only: surfaced in the output instead of being discarded.
+    let has_caption = result.content.contains("caption") || result.content.contains("figure");
+    println!(
+        "Figure extraction result (feature may be unimplemented): {:?} [caption/figure text found: {}]",
+        result.content, has_caption
+    );
+}
+/// TEST 11: Citation/reference handling
+///
+/// Runs the extractor on text containing an `@smith2020` citation, a
+/// "References" heading and a `#bibliography()` call. The test only
+/// requires extraction to succeed; whether citation text was found is
+/// reported in the printed diagnostics rather than asserted.
+///
+/// Expected: Citation markers and text preserved
+/// Current behavior: May be dropped
+#[tokio::test]
+async fn test_typst_citations_preserved() {
+    let test_content = b"Here is a citation @smith2020.
+= References
+#bibliography()";
+    let config = ExtractionConfig::default();
+    let result = extract_bytes(test_content, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    // Informational only: surfaced in the output instead of being discarded.
+    let has_citation = result.content.contains("@smith2020")
+        || result.content.contains("smith")
+        || result.content.contains("References");
+    println!(
+        "Citation handling (may be limited): {:?} [citation text found: {}]",
+        result.content, has_citation
+    );
+}
+/// TEST 12: Link extraction and formatting
+///
+/// Extracts `advanced.typ` and accepts the output if it mentions
+/// "example", "link" or "https".
+///
+/// Expected: Links in markdown format [text](url)
+/// Current behavior: May lose URL or text
+#[tokio::test]
+async fn test_typst_link_extraction() {
+    let source = load_test_document("advanced.typ");
+    // Result unused; the call is kept so the baseline fixture is still loaded.
+    let _baseline = load_pandoc_baseline("advanced");
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(&source, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    let text = &extraction.content;
+    let link_survived = ["example", "link", "https"].iter().any(|&needle| text.contains(needle));
+    assert!(
+        link_survived,
+        "Link content should be extracted. Links may be completely dropped."
+    );
+}
+/// TEST 13: Unordered list extraction
+///
+/// Extracts `simple.typ` and accepts the output if it contains a list
+/// marker (`-` or `+`) or one of the words "First", "Second", "item".
+///
+/// Expected: All list items extracted and normalized
+/// Current behavior: May lose some items
+#[tokio::test]
+async fn test_typst_list_extraction() {
+    let source = load_test_document("simple.typ");
+    // Result unused; the call is kept so the baseline fixture is still loaded.
+    let _baseline = load_pandoc_baseline("simple");
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(&source, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    let text = &extraction.content;
+    let marker_found = ["-", "+"].iter().any(|&marker| text.contains(marker));
+    let item_text_found = ["First", "Second", "item"].iter().any(|&word| text.contains(word));
+    assert!(
+        marker_found || item_text_found,
+        "List items should be extracted with markers or content preserved."
+    );
+}
+/// TEST 14: Code block extraction
+///
+/// Extracts `advanced.typ` and accepts the output if it contains a code
+/// fence or one of "def", "fibonacci", "python".
+///
+/// Expected: Code blocks with language markers preserved
+/// Current behavior: May be malformed
+#[tokio::test]
+async fn test_typst_code_block_extraction() {
+    let source = load_test_document("advanced.typ");
+    // Result unused; the call is kept so the baseline fixture is still loaded.
+    let _baseline = load_pandoc_baseline("advanced");
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(&source, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    let text = &extraction.content;
+    let code_found = ["```", "def", "fibonacci", "python"]
+        .iter()
+        .any(|&needle| text.contains(needle));
+    assert!(code_found, "Code blocks should be extracted with language specifiers.");
+}
+/// TEST 15: Bold and italic formatting
+///
+/// Extracts `advanced.typ` and requires both a `*` and a `_` somewhere in
+/// the output.
+///
+/// Expected: Bold (*text*) and italic (_text_) markers present
+/// Current behavior: May be lost
+#[tokio::test]
+async fn test_typst_emphasis_formatting() {
+    let source = load_test_document("advanced.typ");
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(&source, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    let text = &extraction.content;
+    let both_markers_present = ["*", "_"].iter().all(|&marker| text.contains(marker));
+    assert!(
+        both_markers_present,
+        "Bold and italic formatting markers should be preserved."
+    );
+}
+/// TEST 16: Complex nested formatting
+///
+/// Feeds `*bold with _nested italic_*` to the extractor. The output passes
+/// if it keeps a `*` or `_` marker, or at least both words "bold" and
+/// "italic".
+///
+/// Expected: Nested formatting preserved or flattened consistently
+/// Current behavior: May be malformed
+#[tokio::test]
+async fn test_typst_nested_formatting() {
+    let nested_source = b"This is *bold with _nested italic_* text.";
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(nested_source, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    let text = &extraction.content;
+    let marker_kept = text.contains("*") || text.contains("_");
+    let words_kept = text.contains("bold") && text.contains("italic");
+    assert!(
+        marker_kept || words_kept,
+        "Nested formatting should be preserved or flattened consistently."
+    );
+}
+/// TEST 17: Multiple paragraph handling
+///
+/// Extracts `advanced.typ` and requires at least five lines that are not
+/// blank after trimming.
+///
+/// Expected: Paragraph structure maintained
+/// Current behavior: May merge or lose paragraphs
+#[tokio::test]
+async fn test_typst_multiple_paragraphs() {
+    let source = load_test_document("advanced.typ");
+    // Result unused; the call is kept so the baseline fixture is still loaded.
+    let _baseline = load_pandoc_baseline("advanced");
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(&source, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    let content_line_count = extraction
+        .content
+        .lines()
+        .filter(|line| !line.trim().is_empty())
+        .count();
+    assert!(
+        content_line_count >= 5,
+        "Multiple paragraphs should be preserved. Found {} content lines.",
+        content_line_count
+    );
+}
+/// TEST 18: Heading-content association
+///
+/// Splits the extracted text of `advanced.typ` with
+/// `extract_content_blocks` and requires at least one block, none of them
+/// empty.
+///
+/// Expected: Each heading followed by its content
+/// Current behavior: May be scrambled
+#[tokio::test]
+async fn test_typst_heading_content_association() {
+    let content = load_test_document("advanced.typ");
+    let config = ExtractionConfig::default();
+    let result = extract_bytes(&content, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    let blocks = extract_content_blocks(&result.content);
+    // `is_empty()` states the intent directly (clippy::len_zero).
+    assert!(!blocks.is_empty(), "Content blocks should be associated with headings.");
+    for block in &blocks {
+        assert!(!block.is_empty(), "Content blocks should not be empty.");
+    }
+}
+/// TEST 19: Whitespace normalization
+///
+/// Counts non-overlapping occurrences of three consecutive newlines in
+/// the extracted text of `advanced.typ`.
+///
+/// NOTE(review): the threshold tolerates a single occurrence even though
+/// the failure message reads as if none were allowed — confirm intent.
+///
+/// Expected: Single blank lines between sections
+/// Current behavior: May have excessive whitespace
+#[tokio::test]
+async fn test_typst_whitespace_handling() {
+    let source = load_test_document("advanced.typ");
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(&source, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    let triple_newline_hits = extraction.content.matches("\n\n\n").count();
+    assert!(
+        triple_newline_hits <= 1,
+        "Should not have excessive blank lines (triple newlines). \
+         Found {} instances of triple newlines.",
+        triple_newline_hits
+    );
+}
+/// TEST 20: Minimal document handling
+///
+/// Extracts `minimal.typ` and requires non-empty output.
+///
+/// Expected: Basic content and structure
+/// Current behavior: May fail or lose content
+#[tokio::test]
+async fn test_typst_minimal_document() {
+    let content = load_test_document("minimal.typ");
+    // Result unused; the call is kept so the baseline fixture is still loaded.
+    let _baseline = load_pandoc_baseline("minimal");
+    let config = ExtractionConfig::default();
+    let result = extract_bytes(&content, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    // A second `len() > 0` assertion used to follow; it tested exactly the
+    // same condition and could never fail once this one passed.
+    assert!(
+        !result.content.is_empty(),
+        "Even minimal documents should extract some content."
+    );
+}
+/// TEST 21: No directive pollution
+///
+/// Runs `count_directive_lines` over the extracted text of `advanced.typ`
+/// and requires the count to be zero.
+///
+/// Expected: Clean extracted content without directives
+/// Current behavior: May include directives
+#[tokio::test]
+async fn test_typst_no_directive_pollution() {
+    let source = load_test_document("advanced.typ");
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(&source, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    let leaked_directives = count_directive_lines(&extraction.content);
+    assert_eq!(
+        leaked_directives, 0,
+        "Extracted content should not contain directives (#set, #let, etc). \
+         Found {} directive lines polluting the output.",
+        leaked_directives
+    );
+}
+/// TEST 22: Metadata field completeness
+///
+/// Extracts `advanced.typ` and requires a "title" and an "author" entry in
+/// `metadata.additional` plus a populated `metadata.date`.
+///
+/// Expected: Title, author, date, keywords all extracted
+/// Current behavior: Some fields missing
+#[tokio::test]
+async fn test_typst_metadata_field_completeness() {
+    let source = load_test_document("advanced.typ");
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(&source, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    let metadata = &extraction.metadata;
+    let title_present = metadata.additional.get("title").is_some();
+    let author_present = metadata.additional.get("author").is_some();
+    let date_present = metadata.date.is_some();
+    let all_present = title_present && author_present && date_present;
+    assert!(
+        all_present,
+        "All metadata fields should be extracted. \
+         Title: {}, Author: {}, Date: {}",
+        title_present,
+        author_present,
+        date_present
+    );
+}
+/// TEST 23: Special character handling
+///
+/// Feeds accented text to the extractor and accepts the output if at
+/// least one of the accented words survives unchanged.
+///
+/// Expected: Special characters like ü, é, etc. preserved
+/// Current behavior: May be corrupted
+#[tokio::test]
+async fn test_typst_special_character_preservation() {
+    let accented_source = "Café with naïve français".as_bytes();
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(accented_source, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    let text = &extraction.content;
+    let accent_survived = ["Café", "naïve", "français"]
+        .iter()
+        .any(|&word| text.contains(word));
+    assert!(
+        accent_survived,
+        "Special characters should be preserved in extraction."
+    );
+}
+/// TEST 24: Very long heading handling
+///
+/// Feeds a single long level-1 heading to the extractor and checks that
+/// the phrase "very long heading" is present in the output.
+///
+/// Expected: Full heading text preserved regardless of length
+/// Current behavior: May truncate
+#[tokio::test]
+async fn test_typst_long_heading_handling() {
+    let heading_source = b"= This is a very long heading that should be completely preserved without any truncation or corruption whatsoever";
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(heading_source, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    assert!(
+        extraction.content.contains("very long heading"),
+        "Long headings should not be truncated."
+    );
+}
+/// TEST 25: Edge case - Empty heading recovery
+///
+/// Feeds headings without text followed by regular content. A returned
+/// error is tolerated; on success the regular content must be present.
+///
+/// Expected: Graceful handling without crashes
+/// Current behavior: May panic or produce empty output
+#[tokio::test]
+async fn test_typst_empty_heading_edge_case() {
+    let empty_heading_source = b"= \n\n== \nContent here";
+    let config = ExtractionConfig::default();
+    let outcome = extract_bytes(empty_heading_source, "application/x-typst", &config).await;
+    // An `Err` is deliberately accepted; only a successful run is inspected.
+    if let Ok(extraction) = outcome {
+        assert!(
+            extraction.content.contains("Content"),
+            "Should extract regular content even if some headings are empty."
+        );
+    }
+}
+/// TEST 26: Regression - Basic heading extraction
+///
+/// The output must contain the literal `= Main Heading` and the word
+/// "Content".
+#[tokio::test]
+async fn test_typst_basic_heading_regression() {
+    let source = b"= Main Heading\n\nContent here";
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(source, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    let text = &extraction.content;
+    assert!(
+        text.contains("= Main Heading"),
+        "Basic level-1 heading should be extracted."
+    );
+    assert!(text.contains("Content"), "Content should be extracted.");
+}
+/// TEST 27: Regression - Level 2 heading extraction
+///
+/// The output must contain the literal `== Subsection`.
+#[tokio::test]
+async fn test_typst_level2_heading_regression() {
+    let source = b"= Main\n\n== Subsection\n\nMore content";
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(source, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    let subsection_found = extraction.content.contains("== Subsection");
+    assert!(subsection_found, "Level 2 headings must be extracted.");
+}
+/// TEST 28: Regression - Basic metadata
+///
+/// A `#set document(title: ..., author: ...)` directive must yield "title"
+/// and "author" entries in `metadata.additional`.
+#[tokio::test]
+async fn test_typst_basic_metadata_regression() {
+    let source = b"#set document(title: \"Test\", author: \"Me\")\n\n= Heading";
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(source, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    let extras = &extraction.metadata.additional;
+    let title_present = extras.get("title").is_some();
+    assert!(title_present, "Title metadata must be extracted.");
+    let author_present = extras.get("author").is_some();
+    assert!(author_present, "Author metadata must be extracted.");
+}
+/// TEST 29: Regression - Bold formatting
+///
+/// The input marks `bold text` as bold. The output passes if it keeps the
+/// marked span verbatim or, failing that, still contains the word "bold".
+#[tokio::test]
+async fn test_typst_bold_regression() {
+    let test_content = b"This is *bold text* here";
+    let config = ExtractionConfig::default();
+    let result = extract_bytes(test_content, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    // The verbatim span is `*bold text*`; the earlier literal `*bold*` could
+    // not match a faithful rendering of this input. The plain-word fallback
+    // is kept so the set of accepted outputs is unchanged.
+    assert!(
+        result.content.contains("*bold text*") || result.content.contains("bold"),
+        "Bold text should be preserved."
+    );
+}
+/// TEST 30: Regression - Inline code
+///
+/// The output must contain a backtick and the word "println".
+#[tokio::test]
+async fn test_typst_inline_code_regression() {
+    let source = b"Use `println!(\"hello\")` in Rust";
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(source, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    let text = &extraction.content;
+    let code_kept = ["`", "println"].iter().all(|&needle| text.contains(needle));
+    assert!(code_kept, "Inline code should be preserved with backticks.");
+}
+/// TEST 31: Regression - Code blocks
+///
+/// The output must contain a triple-backtick fence and the code body
+/// `fn main`.
+#[tokio::test]
+async fn test_typst_codeblock_regression() {
+    let source = b"```rust\nfn main() {}\n```";
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(source, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    let text = &extraction.content;
+    let fence_kept = text.contains("```");
+    assert!(fence_kept, "Code block delimiters should be preserved.");
+    let body_kept = text.contains("fn main");
+    assert!(body_kept, "Code block content should be preserved.");
+}
+/// TEST 32: Regression - List extraction
+///
+/// All three item texts must be present in the output, whichever marker
+/// (`-` or `+`) introduced them.
+#[tokio::test]
+async fn test_typst_list_regression() {
+    let source = b"- Item 1\n+ Item 2\n- Item 3";
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(source, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    let text = &extraction.content;
+    let every_item_found = ["Item 1", "Item 2", "Item 3"]
+        .iter()
+        .all(|&item| text.contains(item));
+    assert!(every_item_found, "All list items should be extracted.");
+}
+/// TEST 33: Regression - Math preservation
+///
+/// The output must contain a `$` delimiter together with "mc" or "E".
+#[tokio::test]
+async fn test_typst_math_regression() {
+    let source = b"Formula: $E = mc^2$";
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(source, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    let text = &extraction.content;
+    let delimiter_kept = text.contains("$");
+    let symbol_kept = text.contains("mc") || text.contains("E");
+    assert!(delimiter_kept && symbol_kept, "Math formulas should be preserved.");
+}
+/// TEST 34: Regression - Link extraction
+///
+/// The output must contain the link text "example" or the host
+/// "example.com".
+#[tokio::test]
+async fn test_typst_link_regression() {
+    let source = b"Visit #link(\"https://example.com\")[example]";
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(source, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    let text = &extraction.content;
+    let link_kept = ["example", "example.com"].iter().any(|&needle| text.contains(needle));
+    assert!(link_kept, "Link text or URL should be preserved.");
+}
+/// TEST 35: Regression - Table basic extraction
+///
+/// The output must contain the cell text "A" or the marker "TABLE".
+#[tokio::test]
+async fn test_typst_table_regression() {
+    let source = b"#table(columns: 2, [A], [B], [1], [2])";
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(source, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    let text = &extraction.content;
+    let table_kept = ["A", "TABLE"].iter().any(|&needle| text.contains(needle));
+    assert!(table_kept, "Table content should be extracted.");
+}
+/// TEST 36: Large document handling
+///
+/// Generates 50 heading/paragraph sections and requires that
+/// `extract_all_headings` finds at least 40 headings in the output.
+#[tokio::test]
+async fn test_typst_large_document_stress() {
+    let generated: String = (1..=50)
+        .map(|section| format!("= Heading {}\n\nContent for section {}.\n\n", section, section))
+        .collect();
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(generated.as_bytes(), "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    let found_headings = extract_all_headings(&extraction.content).len();
+    assert!(
+        found_headings >= 40,
+        "Large documents should extract all headings. Found {} of 50.",
+        found_headings
+    );
+}
+/// TEST 37: Deep nesting stress test
+///
+/// Generates one heading per level 1 through 6, each followed by a
+/// paragraph, and requires `count_heading_level` to report at least one
+/// heading for every level.
+#[tokio::test]
+async fn test_typst_deep_nesting_stress() {
+    let generated: String = (1..=6)
+        .map(|depth| {
+            format!(
+                "{} Level {} Heading\n\nContent at level {}.\n\n",
+                "=".repeat(depth),
+                depth,
+                depth
+            )
+        })
+        .collect();
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(generated.as_bytes(), "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    for level in 1..=6 {
+        let found = count_heading_level(&extraction.content, level);
+        assert!(
+            found >= 1,
+            "Level {} heading should be extracted in deep nesting test.",
+            level
+        );
+    }
+}
+/// TEST 38: Mixed formatting stress
+///
+/// For each of bold, italic, code and math, the output must keep either
+/// the markup character or the plain word that was marked up.
+#[tokio::test]
+async fn test_typst_mixed_formatting_stress() {
+    let source = b"This text has *bold*, _italic_, `code`, and $math$ all mixed together!";
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(source, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    let text = &extraction.content;
+    // (markup character, plain-word fallback)
+    let expectations = [("*", "bold"), ("_", "italic"), ("`", "code"), ("$", "math")];
+    let everything_kept = expectations
+        .iter()
+        .all(|&(marker, word)| text.contains(marker) || text.contains(word));
+    assert!(everything_kept, "All mixed formatting should be preserved.");
+}
+/// TEST 39: Unicode stress test
+///
+/// Feeds CJK, Arabic, emoji and Greek text to the extractor.
+///
+/// NOTE(review): the assertion only looks for the ASCII word "Unicode", so
+/// it shows that extraction succeeds on such input, not that the non-ASCII
+/// characters survive.
+#[tokio::test]
+async fn test_typst_unicode_stress() {
+    let source = "= Unicode Heading 中文 العربية\n\nContent with emojis: 🎉🚀💯\n\nGreek: α β γ δ ε ζ".as_bytes();
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(source, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    let marker_found = extraction.content.contains("Unicode");
+    assert!(marker_found, "Unicode content should be preserved.");
+}
+/// TEST 40: Pathological whitespace
+///
+/// Long runs of blank lines between a heading and paragraphs must not
+/// prevent "Heading" and "Content" from appearing in the output.
+#[tokio::test]
+async fn test_typst_pathological_whitespace() {
+    let source = b"= Heading\n\n\n\n\n\nContent with excessive blank lines\n\n\n\n\nMore content";
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(source, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    let text = &extraction.content;
+    let both_found = ["Heading", "Content"].iter().all(|&word| text.contains(word));
+    assert!(both_found, "Should extract content even with excessive whitespace.");
+}
+/// TEST 41: Full document comparison - simple.typ
+///
+/// Requires more than 50 bytes of extracted text and more than two
+/// headings as reported by `extract_all_headings`.
+#[tokio::test]
+async fn test_typst_full_simple_document_comparison() {
+    let source = load_test_document("simple.typ");
+    // Result unused; the call is kept so the baseline fixture is still loaded.
+    let _baseline = load_pandoc_baseline("simple");
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(&source, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    let extracted_bytes = extraction.content.len();
+    assert!(
+        extracted_bytes > 50,
+        "simple.typ should extract substantial content"
+    );
+    let found_headings = extract_all_headings(&extraction.content).len();
+    assert!(found_headings > 2, "simple.typ should have multiple sections");
+}
+/// TEST 42: Full document comparison - advanced.typ
+///
+/// Requires more than 100 bytes of extracted text and at least five
+/// headings as reported by `extract_all_headings`.
+#[tokio::test]
+async fn test_typst_full_advanced_document_comparison() {
+    let source = load_test_document("advanced.typ");
+    // Result unused; the call is kept so the baseline fixture is still loaded.
+    let _baseline = load_pandoc_baseline("advanced");
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(&source, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    let extracted_bytes = extraction.content.len();
+    assert!(
+        extracted_bytes > 100,
+        "advanced.typ should extract comprehensive content"
+    );
+    let found_headings = extract_all_headings(&extraction.content).len();
+    assert!(found_headings >= 5, "advanced.typ should preserve heading structure");
+}
+/// TEST 43: MIME type consistency
+///
+/// `application/x-typst` must extract non-empty content. `text/x-typst`
+/// is optional: if the extractor accepts it, the content must be
+/// non-empty; if it is rejected, a note is printed and the test passes.
+#[tokio::test]
+async fn test_typst_mime_type_consistency() {
+    let content = load_test_document("simple.typ");
+    let config = ExtractionConfig::default();
+    let result_primary = extract_bytes(&content, "application/x-typst", &config)
+        .await
+        .expect("Primary MIME type should work");
+    // `is_empty()` states the intent directly (clippy::len_zero).
+    assert!(
+        !result_primary.content.is_empty(),
+        "Primary MIME type should extract content"
+    );
+    match extract_bytes(&content, "text/x-typst", &config).await {
+        Ok(result) => {
+            assert!(
+                !result.content.is_empty(),
+                "Alternative MIME type should extract content if supported"
+            );
+        }
+        // The error value is not inspected; rejection is acceptable here.
+        Err(_) => {
+            println!("Note: text/x-typst is not currently supported (may be added in future)");
+        }
+    }
+}
+/// TEST 44: Config parameter impact
+///
+/// With the default configuration, `simple.typ` must yield non-empty
+/// content and the result must report the MIME type it was called with.
+#[tokio::test]
+async fn test_typst_config_parameter_handling() {
+    let source = load_test_document("simple.typ");
+    let default_config = ExtractionConfig::default();
+    let extraction = extract_bytes(&source, "application/x-typst", &default_config)
+        .await
+        .expect("Extraction failed");
+    let produced_output = !extraction.content.is_empty();
+    assert!(produced_output, "Extraction with default config should work");
+    assert_eq!(extraction.mime_type, "application/x-typst", "MIME type should be preserved");
+}
+/// TEST 45: Comparative heading analysis
+///
+/// Prints the Pandoc baseline, the extracted text and every heading found
+/// by `extract_all_headings` for `headings.typ`, then requires at least
+/// six headings. The printed report is there to help pin down the scope
+/// of the heading loss bug.
+#[tokio::test]
+async fn test_typst_heading_loss_bug_analysis() {
+    let source = load_test_document("headings.typ");
+    let baseline = load_pandoc_baseline("headings");
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(&source, "application/x-typst", &config)
+        .await
+        .expect("Extraction failed");
+    println!("\n===== HEADING EXTRACTION ANALYSIS =====");
+    println!("Baseline content:\n{}", baseline);
+    println!("\nExtracted content:\n{}", extraction.content);
+    let headings = extract_all_headings(&extraction.content);
+    let heading_total = headings.len();
+    println!("\nExtracted headings: {}", heading_total);
+    // Number the report from 1.
+    for (position, heading) in (1..).zip(headings.iter()) {
+        println!("  {}: {}", position, heading);
+    }
+    assert!(
+        heading_total >= 6,
+        "BUG CONFIRMED: Heading loss detected. \
+         Expected 6 headings (1-6 levels), found {}. \
+         This is the 62% heading loss bug - only single '=' is matched, \
+         all '==' and higher are skipped entirely.",
+        heading_total
+    );
+}