kreuzberg 4.0.0.rc1 → 4.0.0.rc2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -8
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -534
- data/.rubocop.yml +538 -0
- data/Gemfile +8 -9
- data/Gemfile.lock +9 -109
- data/README.md +426 -421
- data/Rakefile +25 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -340
- data/ext/kreuzberg_rb/extconf.rb +45 -35
- data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
- data/ext/kreuzberg_rb/native/Cargo.toml +44 -36
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -17
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +2998 -2939
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +148 -105
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +46 -45
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +691 -684
- data/lib/kreuzberg/error_context.rb +32 -0
- data/lib/kreuzberg/errors.rb +118 -50
- data/lib/kreuzberg/extraction_api.rb +85 -84
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +216 -216
- data/lib/kreuzberg/setup_lib_path.rb +80 -79
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +103 -82
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +520 -468
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -87
- data/spec/binding/cli_spec.rb +55 -54
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -42
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/kreuzberg/Cargo.toml +204 -134
- data/vendor/kreuzberg/README.md +175 -175
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
- data/vendor/kreuzberg/build.rs +474 -460
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +79 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1143
- data/vendor/kreuzberg/src/chunking/mod.rs +677 -677
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -35
- data/vendor/kreuzberg/src/core/config.rs +1032 -1032
- data/vendor/kreuzberg/src/core/extractor.rs +1024 -903
- data/vendor/kreuzberg/src/core/io.rs +329 -327
- data/vendor/kreuzberg/src/core/mime.rs +605 -615
- data/vendor/kreuzberg/src/core/mod.rs +45 -42
- data/vendor/kreuzberg/src/core/pipeline.rs +984 -906
- data/vendor/kreuzberg/src/embeddings.rs +432 -323
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +40 -40
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +553 -553
- data/vendor/kreuzberg/src/extraction/image.rs +368 -368
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -564
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -77
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -128
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -3000
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +446 -425
- data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +367 -479
- data/vendor/kreuzberg/src/extractors/email.rs +143 -129
- data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +343 -344
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
- data/vendor/kreuzberg/src/extractors/html.rs +393 -410
- data/vendor/kreuzberg/src/extractors/image.rs +198 -195
- data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
- data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +365 -268
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
- data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +493 -496
- data/vendor/kreuzberg/src/extractors/pptx.rs +248 -234
- data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
- data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
- data/vendor/kreuzberg/src/extractors/security.rs +484 -0
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +140 -126
- data/vendor/kreuzberg/src/extractors/text.rs +260 -242
- data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +135 -128
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +267 -267
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -294
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +942 -942
- data/vendor/kreuzberg/src/lib.rs +105 -102
- data/vendor/kreuzberg/src/mcp/mod.rs +32 -32
- data/vendor/kreuzberg/src/mcp/server.rs +1968 -1966
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -847
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -450
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -0
- data/vendor/kreuzberg/src/pdf/error.rs +122 -122
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +346 -346
- data/vendor/kreuzberg/src/pdf/mod.rs +50 -50
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
- data/vendor/kreuzberg/src/pdf/table.rs +393 -420
- data/vendor/kreuzberg/src/pdf/text.rs +158 -161
- data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -1010
- data/vendor/kreuzberg/src/plugins/mod.rs +209 -209
- data/vendor/kreuzberg/src/plugins/ocr.rs +620 -629
- data/vendor/kreuzberg/src/plugins/processor.rs +642 -641
- data/vendor/kreuzberg/src/plugins/registry.rs +1337 -1324
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +956 -955
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +19 -19
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +903 -873
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +543 -543
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -542
- data/vendor/kreuzberg/tests/batch_processing.rs +316 -304
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -509
- data/vendor/kreuzberg/tests/config_features.rs +598 -580
- data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -439
- data/vendor/kreuzberg/tests/core_integration.rs +510 -493
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -424
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -124
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
- data/vendor/kreuzberg/tests/email_integration.rs +325 -325
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
- data/vendor/kreuzberg/tests/error_handling.rs +393 -393
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
- data/vendor/kreuzberg/tests/format_integration.rs +159 -159
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
- data/vendor/kreuzberg/tests/image_integration.rs +253 -253
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +428 -428
- data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -510
- data/vendor/kreuzberg/tests/ocr_errors.rs +676 -676
- data/vendor/kreuzberg/tests/ocr_quality.rs +627 -627
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +43 -43
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -1412
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -771
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -561
- data/vendor/kreuzberg/tests/plugin_system.rs +921 -921
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -607
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
- data/vendor/kreuzberg/tests/security_validation.rs +415 -404
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +609 -609
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/rb-sys/.cargo-ok +1 -0
- data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
- data/vendor/rb-sys/Cargo.lock +393 -0
- data/vendor/rb-sys/Cargo.toml +70 -0
- data/vendor/rb-sys/Cargo.toml.orig +57 -0
- data/vendor/rb-sys/LICENSE-APACHE +190 -0
- data/vendor/rb-sys/LICENSE-MIT +21 -0
- data/vendor/rb-sys/bin/release.sh +21 -0
- data/vendor/rb-sys/build/features.rs +108 -0
- data/vendor/rb-sys/build/main.rs +246 -0
- data/vendor/rb-sys/build/stable_api_config.rs +153 -0
- data/vendor/rb-sys/build/version.rs +48 -0
- data/vendor/rb-sys/readme.md +36 -0
- data/vendor/rb-sys/src/bindings.rs +21 -0
- data/vendor/rb-sys/src/hidden.rs +11 -0
- data/vendor/rb-sys/src/lib.rs +34 -0
- data/vendor/rb-sys/src/macros.rs +371 -0
- data/vendor/rb-sys/src/memory.rs +53 -0
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
- data/vendor/rb-sys/src/special_consts.rs +31 -0
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
- data/vendor/rb-sys/src/stable_api.rs +261 -0
- data/vendor/rb-sys/src/symbol.rs +31 -0
- data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
- data/vendor/rb-sys/src/utils.rs +89 -0
- data/vendor/rb-sys/src/value_type.rs +7 -0
- metadata +90 -95
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/spec/examples.txt +0 -104
- data/vendor/kreuzberg/src/bin/profile_extract.rs +0 -455
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +0 -275
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +0 -178
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +0 -491
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +0 -496
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +0 -1188
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +0 -162
- data/vendor/kreuzberg/src/extractors/pandoc.rs +0 -201
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +0 -92
- data/vendor/kreuzberg/tests/pandoc_integration.rs +0 -503
|
@@ -0,0 +1,498 @@
|
|
|
1
|
+
//! Comprehensive tests for DocBook extractor supporting both 4.x and 5.x versions.
|
|
2
|
+
|
|
3
|
+
use kreuzberg::core::config::ExtractionConfig;
|
|
4
|
+
use kreuzberg::plugins::{DocumentExtractor, Plugin};
|
|
5
|
+
use std::path::PathBuf;
|
|
6
|
+
|
|
7
|
+
/// Helper to get absolute path to test documents
|
|
8
|
+
fn test_file_path(filename: &str) -> PathBuf {
|
|
9
|
+
let manifest_dir = env!("CARGO_MANIFEST_DIR");
|
|
10
|
+
PathBuf::from(manifest_dir)
|
|
11
|
+
.parent()
|
|
12
|
+
.unwrap()
|
|
13
|
+
.parent()
|
|
14
|
+
.unwrap()
|
|
15
|
+
.join("test_documents")
|
|
16
|
+
.join("docbook")
|
|
17
|
+
.join(filename)
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
/// DocBook 4.x extractor test helper
|
|
21
|
+
async fn extract_docbook4_file(filename: &str) -> kreuzberg::Result<kreuzberg::types::ExtractionResult> {
|
|
22
|
+
let extractor = kreuzberg::extractors::DocbookExtractor::new();
|
|
23
|
+
let path = test_file_path(filename);
|
|
24
|
+
let config = ExtractionConfig::default();
|
|
25
|
+
extractor.extract_file(&path, "application/docbook+xml", &config).await
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
/// DocBook 5.x extractor test helper
|
|
29
|
+
async fn extract_docbook5_file(filename: &str) -> kreuzberg::Result<kreuzberg::types::ExtractionResult> {
|
|
30
|
+
let extractor = kreuzberg::extractors::DocbookExtractor::new();
|
|
31
|
+
let path = test_file_path(filename);
|
|
32
|
+
let config = ExtractionConfig::default();
|
|
33
|
+
extractor.extract_file(&path, "application/docbook+xml", &config).await
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
/// Helper to extract bytes directly
|
|
37
|
+
async fn extract_docbook_bytes(
|
|
38
|
+
content: &[u8],
|
|
39
|
+
mime_type: &str,
|
|
40
|
+
) -> kreuzberg::Result<kreuzberg::types::ExtractionResult> {
|
|
41
|
+
let extractor = kreuzberg::extractors::DocbookExtractor::new();
|
|
42
|
+
let config = ExtractionConfig::default();
|
|
43
|
+
extractor.extract_bytes(content, mime_type, &config).await
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
/// Checks the `Plugin` surface of the DocBook extractor: its reported name and
/// that `initialize` and `shutdown` both return `Ok`.
#[test]
fn test_docbook_extractor_plugin_interface() {
    let docbook = kreuzberg::extractors::DocbookExtractor::new();

    assert_eq!(docbook.name(), "docbook-extractor");

    let init_outcome = docbook.initialize();
    assert!(init_outcome.is_ok());

    let shutdown_outcome = docbook.shutdown();
    assert!(shutdown_outcome.is_ok());
}
|
|
53
|
+
|
|
54
|
+
/// Checks that the DocBook extractor advertises both
/// `application/docbook+xml` and `text/docbook`.
#[test]
fn test_docbook_extractor_supported_mime_types() {
    let docbook = kreuzberg::extractors::DocbookExtractor::new();
    let advertised = docbook.supported_mime_types();

    let has_xml_mime = advertised.contains(&"application/docbook+xml");
    let has_text_mime = advertised.contains(&"text/docbook");

    assert!(has_xml_mime);
    assert!(has_text_mime);
}
|
|
61
|
+
|
|
62
|
+
/// Checks that the DocBook extractor reports a priority of 50.
#[test]
fn test_docbook_extractor_priority() {
    let reported = kreuzberg::extractors::DocbookExtractor::new().priority();
    assert_eq!(reported, 50);
}
|
|
67
|
+
|
|
68
|
+
#[tokio::test]
|
|
69
|
+
async fn test_docbook4_chapter_extraction() {
|
|
70
|
+
let result = extract_docbook4_file("docbook-chapter.docbook").await;
|
|
71
|
+
assert!(result.is_ok(), "Failed to extract DocBook 4 chapter");
|
|
72
|
+
|
|
73
|
+
let result = result.unwrap();
|
|
74
|
+
assert!(!result.content.is_empty(), "Content should not be empty");
|
|
75
|
+
assert!(
|
|
76
|
+
result.content.contains("Test Chapter"),
|
|
77
|
+
"Content should contain chapter title"
|
|
78
|
+
);
|
|
79
|
+
assert!(
|
|
80
|
+
result.content.contains("Like a Sect1"),
|
|
81
|
+
"Content should contain section titles"
|
|
82
|
+
);
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
#[tokio::test]
|
|
86
|
+
async fn test_docbook5_reader_extraction() {
|
|
87
|
+
let result = extract_docbook5_file("docbook-reader.docbook").await;
|
|
88
|
+
assert!(result.is_ok(), "Failed to extract DocBook 5 file");
|
|
89
|
+
|
|
90
|
+
let result = result.unwrap();
|
|
91
|
+
assert!(!result.content.is_empty(), "Content should not be empty");
|
|
92
|
+
assert!(
|
|
93
|
+
result.content.contains("Pandoc Test Suite"),
|
|
94
|
+
"Content should contain article title"
|
|
95
|
+
);
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
#[tokio::test]
|
|
99
|
+
async fn test_docbook_xref_extraction() {
|
|
100
|
+
let result = extract_docbook4_file("docbook-xref.docbook").await;
|
|
101
|
+
assert!(result.is_ok(), "Failed to extract DocBook with xref elements");
|
|
102
|
+
|
|
103
|
+
let result = result.unwrap();
|
|
104
|
+
assert!(!result.content.is_empty(), "Content should not be empty");
|
|
105
|
+
assert!(
|
|
106
|
+
result.content.contains("An Example Book"),
|
|
107
|
+
"Content should contain book title"
|
|
108
|
+
);
|
|
109
|
+
assert!(
|
|
110
|
+
result.content.contains("XRef Samples"),
|
|
111
|
+
"Content should contain xref chapter"
|
|
112
|
+
);
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
#[tokio::test]
|
|
116
|
+
async fn test_docbook_tables_extraction() {
|
|
117
|
+
let result = extract_docbook4_file("tables.docbook4").await;
|
|
118
|
+
assert!(result.is_ok(), "Failed to extract DocBook with tables");
|
|
119
|
+
|
|
120
|
+
let result = result.unwrap();
|
|
121
|
+
assert!(!result.content.is_empty(), "Content should not be empty");
|
|
122
|
+
assert!(!result.tables.is_empty(), "Should extract tables from DocBook");
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
#[tokio::test]
|
|
126
|
+
async fn test_docbook5_tables_extraction() {
|
|
127
|
+
let result = extract_docbook5_file("tables.docbook5").await;
|
|
128
|
+
assert!(result.is_ok(), "Failed to extract DocBook 5 with tables");
|
|
129
|
+
|
|
130
|
+
let result = result.unwrap();
|
|
131
|
+
assert!(!result.content.is_empty(), "Content should not be empty");
|
|
132
|
+
assert!(!result.tables.is_empty(), "Should extract tables from DocBook 5");
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
#[tokio::test]
|
|
136
|
+
async fn test_docbook_metadata_extraction() {
|
|
137
|
+
let result = extract_docbook5_file("docbook-reader.docbook").await;
|
|
138
|
+
assert!(result.is_ok());
|
|
139
|
+
|
|
140
|
+
let result = result.unwrap();
|
|
141
|
+
assert!(!result.content.is_empty());
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
#[tokio::test]
|
|
145
|
+
async fn test_docbook_section_hierarchy() {
|
|
146
|
+
let result = extract_docbook4_file("docbook-chapter.docbook").await;
|
|
147
|
+
assert!(result.is_ok());
|
|
148
|
+
|
|
149
|
+
let result = result.unwrap();
|
|
150
|
+
let content = &result.content;
|
|
151
|
+
|
|
152
|
+
assert!(content.contains("Like a Sect1"));
|
|
153
|
+
assert!(content.contains("Like a Sect2"));
|
|
154
|
+
assert!(content.contains("Like a Sect3"));
|
|
155
|
+
assert!(content.contains("Like a Sect4"));
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
#[tokio::test]
|
|
159
|
+
async fn test_docbook_paragraph_extraction() {
|
|
160
|
+
let result = extract_docbook4_file("docbook-chapter.docbook").await;
|
|
161
|
+
assert!(result.is_ok());
|
|
162
|
+
|
|
163
|
+
let result = result.unwrap();
|
|
164
|
+
assert!(
|
|
165
|
+
result.content.contains("This chapter uses recursive sections"),
|
|
166
|
+
"Should extract paragraph content"
|
|
167
|
+
);
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
#[tokio::test]
|
|
171
|
+
async fn test_docbook_paragraph_content() {
|
|
172
|
+
let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
|
|
173
|
+
<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
|
|
174
|
+
"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
|
|
175
|
+
<article>
|
|
176
|
+
<title>Test Article</title>
|
|
177
|
+
<para>This is a test paragraph.</para>
|
|
178
|
+
<para>This is another paragraph with <emphasis>emphasized</emphasis> text.</para>
|
|
179
|
+
</article>"#;
|
|
180
|
+
|
|
181
|
+
let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
|
|
182
|
+
assert!(result.is_ok());
|
|
183
|
+
|
|
184
|
+
let result = result.unwrap();
|
|
185
|
+
assert!(result.content.contains("Test Article"));
|
|
186
|
+
assert!(result.content.contains("This is a test paragraph"));
|
|
187
|
+
assert!(result.content.contains("another paragraph"));
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
#[tokio::test]
|
|
191
|
+
async fn test_docbook_code_block_extraction() {
|
|
192
|
+
let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
|
|
193
|
+
<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
|
|
194
|
+
"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
|
|
195
|
+
<article>
|
|
196
|
+
<para>Here is code:</para>
|
|
197
|
+
<programlisting>
|
|
198
|
+
def hello():
|
|
199
|
+
print("world")
|
|
200
|
+
</programlisting>
|
|
201
|
+
</article>"#;
|
|
202
|
+
|
|
203
|
+
let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
|
|
204
|
+
assert!(result.is_ok());
|
|
205
|
+
|
|
206
|
+
let result = result.unwrap();
|
|
207
|
+
assert!(result.content.contains("def hello"));
|
|
208
|
+
assert!(result.content.contains("print"));
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
#[tokio::test]
|
|
212
|
+
async fn test_docbook_mixed_content() {
|
|
213
|
+
let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
|
|
214
|
+
<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
|
|
215
|
+
"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
|
|
216
|
+
<book>
|
|
217
|
+
<title>Test Book</title>
|
|
218
|
+
<chapter>
|
|
219
|
+
<title>Chapter 1</title>
|
|
220
|
+
<section>
|
|
221
|
+
<title>Section 1.1</title>
|
|
222
|
+
<para>Paragraph in section.</para>
|
|
223
|
+
</section>
|
|
224
|
+
</chapter>
|
|
225
|
+
</book>"#;
|
|
226
|
+
|
|
227
|
+
let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
|
|
228
|
+
assert!(result.is_ok());
|
|
229
|
+
|
|
230
|
+
let result = result.unwrap();
|
|
231
|
+
assert!(result.content.contains("Test Book"));
|
|
232
|
+
assert!(result.content.contains("Chapter 1"));
|
|
233
|
+
assert!(result.content.contains("Section 1.1"));
|
|
234
|
+
assert!(result.content.contains("Paragraph in section"));
|
|
235
|
+
}
|
|
236
|
+
|
|
237
|
+
#[tokio::test]
|
|
238
|
+
async fn test_docbook_namespaced_5x_parsing() {
|
|
239
|
+
let docbook5 = r#"<?xml version="1.0" encoding="UTF-8"?>
|
|
240
|
+
<article xmlns="http://docbook.org/ns/docbook">
|
|
241
|
+
<info>
|
|
242
|
+
<title>DocBook 5 Article</title>
|
|
243
|
+
<author>
|
|
244
|
+
<personname>
|
|
245
|
+
<firstname>John</firstname>
|
|
246
|
+
<surname>Doe</surname>
|
|
247
|
+
</personname>
|
|
248
|
+
</author>
|
|
249
|
+
<date>2024-01-01</date>
|
|
250
|
+
</info>
|
|
251
|
+
<section>
|
|
252
|
+
<title>Introduction</title>
|
|
253
|
+
<para>Welcome to DocBook 5.</para>
|
|
254
|
+
</section>
|
|
255
|
+
</article>"#;
|
|
256
|
+
|
|
257
|
+
let result = extract_docbook_bytes(docbook5.as_bytes(), "application/docbook+xml").await;
|
|
258
|
+
assert!(result.is_ok());
|
|
259
|
+
|
|
260
|
+
let result = result.unwrap();
|
|
261
|
+
assert!(result.content.contains("DocBook 5 Article"));
|
|
262
|
+
assert!(result.content.contains("Welcome to DocBook 5"));
|
|
263
|
+
}
|
|
264
|
+
|
|
265
|
+
#[tokio::test]
|
|
266
|
+
async fn test_docbook_link_handling() {
|
|
267
|
+
let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
|
|
268
|
+
<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
|
|
269
|
+
"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
|
|
270
|
+
<article>
|
|
271
|
+
<title>Links Test</title>
|
|
272
|
+
<para>See <link xlink:href="http://example.com">example site</link>.</para>
|
|
273
|
+
</article>"#;
|
|
274
|
+
|
|
275
|
+
let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
|
|
276
|
+
assert!(result.is_ok());
|
|
277
|
+
|
|
278
|
+
let result = result.unwrap();
|
|
279
|
+
assert!(result.content.contains("example"));
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
#[tokio::test]
|
|
283
|
+
async fn test_docbook_mime_type_detection() {
|
|
284
|
+
let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
|
|
285
|
+
<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
|
|
286
|
+
"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
|
|
287
|
+
<article>
|
|
288
|
+
<title>Test</title>
|
|
289
|
+
</article>"#;
|
|
290
|
+
|
|
291
|
+
let result1 = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
|
|
292
|
+
assert!(result1.is_ok());
|
|
293
|
+
|
|
294
|
+
let result2 = extract_docbook_bytes(docbook.as_bytes(), "text/docbook").await;
|
|
295
|
+
assert!(result2.is_ok());
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
#[tokio::test]
|
|
299
|
+
async fn test_docbook_empty_sections() {
|
|
300
|
+
let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
|
|
301
|
+
<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
|
|
302
|
+
"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
|
|
303
|
+
<article>
|
|
304
|
+
<title>Empty Sections</title>
|
|
305
|
+
<section>
|
|
306
|
+
<title>Empty Section</title>
|
|
307
|
+
</section>
|
|
308
|
+
<section>
|
|
309
|
+
<title>Section with Content</title>
|
|
310
|
+
<para>Content here</para>
|
|
311
|
+
</section>
|
|
312
|
+
</article>"#;
|
|
313
|
+
|
|
314
|
+
let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
|
|
315
|
+
assert!(result.is_ok());
|
|
316
|
+
|
|
317
|
+
let result = result.unwrap();
|
|
318
|
+
assert!(result.content.contains("Empty Section"));
|
|
319
|
+
assert!(result.content.contains("Section with Content"));
|
|
320
|
+
assert!(result.content.contains("Content here"));
|
|
321
|
+
}
|
|
322
|
+
|
|
323
|
+
#[tokio::test]
|
|
324
|
+
async fn test_docbook_itemized_list() {
|
|
325
|
+
let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
|
|
326
|
+
<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
|
|
327
|
+
"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
|
|
328
|
+
<article>
|
|
329
|
+
<title>List Test</title>
|
|
330
|
+
<itemizedlist>
|
|
331
|
+
<listitem>
|
|
332
|
+
<para>First item</para>
|
|
333
|
+
</listitem>
|
|
334
|
+
<listitem>
|
|
335
|
+
<para>Second item</para>
|
|
336
|
+
</listitem>
|
|
337
|
+
<listitem>
|
|
338
|
+
<para>Third item</para>
|
|
339
|
+
</listitem>
|
|
340
|
+
</itemizedlist>
|
|
341
|
+
</article>"#;
|
|
342
|
+
|
|
343
|
+
let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
|
|
344
|
+
assert!(result.is_ok());
|
|
345
|
+
|
|
346
|
+
let result = result.unwrap();
|
|
347
|
+
assert!(result.content.contains("First item"));
|
|
348
|
+
assert!(result.content.contains("Second item"));
|
|
349
|
+
assert!(result.content.contains("Third item"));
|
|
350
|
+
assert!(result.content.contains("- "), "Should contain bullet points");
|
|
351
|
+
}
|
|
352
|
+
|
|
353
|
+
#[tokio::test]
|
|
354
|
+
async fn test_docbook_ordered_list() {
|
|
355
|
+
let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
|
|
356
|
+
<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
|
|
357
|
+
"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
|
|
358
|
+
<article>
|
|
359
|
+
<title>Ordered List Test</title>
|
|
360
|
+
<orderedlist>
|
|
361
|
+
<listitem>
|
|
362
|
+
<para>First step</para>
|
|
363
|
+
</listitem>
|
|
364
|
+
<listitem>
|
|
365
|
+
<para>Second step</para>
|
|
366
|
+
</listitem>
|
|
367
|
+
<listitem>
|
|
368
|
+
<para>Third step</para>
|
|
369
|
+
</listitem>
|
|
370
|
+
</orderedlist>
|
|
371
|
+
</article>"#;
|
|
372
|
+
|
|
373
|
+
let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
|
|
374
|
+
assert!(result.is_ok());
|
|
375
|
+
|
|
376
|
+
let result = result.unwrap();
|
|
377
|
+
assert!(result.content.contains("First step"));
|
|
378
|
+
assert!(result.content.contains("Second step"));
|
|
379
|
+
assert!(result.content.contains("Third step"));
|
|
380
|
+
assert!(result.content.contains("1. "), "Should contain numbered list");
|
|
381
|
+
}
|
|
382
|
+
|
|
383
|
+
#[tokio::test]
|
|
384
|
+
async fn test_docbook_blockquote() {
|
|
385
|
+
let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
|
|
386
|
+
<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
|
|
387
|
+
"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
|
|
388
|
+
<article>
|
|
389
|
+
<title>Blockquote Test</title>
|
|
390
|
+
<blockquote>
|
|
391
|
+
<para>This is a quoted passage.</para>
|
|
392
|
+
</blockquote>
|
|
393
|
+
</article>"#;
|
|
394
|
+
|
|
395
|
+
let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
|
|
396
|
+
assert!(result.is_ok());
|
|
397
|
+
|
|
398
|
+
let result = result.unwrap();
|
|
399
|
+
assert!(result.content.contains("quoted passage"));
|
|
400
|
+
assert!(result.content.contains("> "), "Should contain blockquote marker");
|
|
401
|
+
}
|
|
402
|
+
|
|
403
|
+
#[tokio::test]
|
|
404
|
+
async fn test_docbook_figure() {
|
|
405
|
+
let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
|
|
406
|
+
<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
|
|
407
|
+
"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
|
|
408
|
+
<article>
|
|
409
|
+
<title>Figure Test</title>
|
|
410
|
+
<figure>
|
|
411
|
+
<title>Sample Figure</title>
|
|
412
|
+
<para>This is a figure description.</para>
|
|
413
|
+
</figure>
|
|
414
|
+
</article>"#;
|
|
415
|
+
|
|
416
|
+
let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
|
|
417
|
+
assert!(result.is_ok());
|
|
418
|
+
|
|
419
|
+
let result = result.unwrap();
|
|
420
|
+
assert!(result.content.contains("Figure"));
|
|
421
|
+
}
|
|
422
|
+
|
|
423
|
+
#[tokio::test]
|
|
424
|
+
async fn test_docbook_footnote() {
|
|
425
|
+
let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
|
|
426
|
+
<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
|
|
427
|
+
"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
|
|
428
|
+
<article>
|
|
429
|
+
<title>Footnote Test</title>
|
|
430
|
+
<para>Here is some text with a footnote<footnote><para>This is the footnote content</para></footnote>.</para>
|
|
431
|
+
</article>"#;
|
|
432
|
+
|
|
433
|
+
let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
|
|
434
|
+
assert!(result.is_ok());
|
|
435
|
+
|
|
436
|
+
let result = result.unwrap();
|
|
437
|
+
assert!(result.content.contains("text with a footnote"));
|
|
438
|
+
assert!(result.content.contains("footnote content"));
|
|
439
|
+
}
|
|
440
|
+
|
|
441
|
+
#[tokio::test]
|
|
442
|
+
async fn test_docbook_mixed_content_with_lists() {
|
|
443
|
+
let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
|
|
444
|
+
<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
|
|
445
|
+
"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
|
|
446
|
+
<article>
|
|
447
|
+
<title>Mixed Content</title>
|
|
448
|
+
<para>Introduction paragraph.</para>
|
|
449
|
+
<itemizedlist>
|
|
450
|
+
<listitem>
|
|
451
|
+
<para>List item 1</para>
|
|
452
|
+
</listitem>
|
|
453
|
+
<listitem>
|
|
454
|
+
<para>List item 2</para>
|
|
455
|
+
</listitem>
|
|
456
|
+
</itemizedlist>
|
|
457
|
+
<para>Conclusion paragraph.</para>
|
|
458
|
+
<programlisting>
|
|
459
|
+
code example
|
|
460
|
+
</programlisting>
|
|
461
|
+
</article>"#;
|
|
462
|
+
|
|
463
|
+
let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
|
|
464
|
+
assert!(result.is_ok());
|
|
465
|
+
|
|
466
|
+
let result = result.unwrap();
|
|
467
|
+
assert!(result.content.contains("Introduction paragraph"));
|
|
468
|
+
assert!(result.content.contains("List item 1"));
|
|
469
|
+
assert!(result.content.contains("List item 2"));
|
|
470
|
+
assert!(result.content.contains("Conclusion paragraph"));
|
|
471
|
+
assert!(result.content.contains("code example"));
|
|
472
|
+
}
|
|
473
|
+
|
|
474
|
+
#[tokio::test]
|
|
475
|
+
async fn test_docbook_namespaced_lists() {
|
|
476
|
+
let docbook5 = r#"<?xml version="1.0" encoding="UTF-8"?>
|
|
477
|
+
<article xmlns="http://docbook.org/ns/docbook">
|
|
478
|
+
<info>
|
|
479
|
+
<title>Lists in DocBook 5</title>
|
|
480
|
+
</info>
|
|
481
|
+
<itemizedlist>
|
|
482
|
+
<listitem>
|
|
483
|
+
<para>Namespaced item 1</para>
|
|
484
|
+
</listitem>
|
|
485
|
+
<listitem>
|
|
486
|
+
<para>Namespaced item 2</para>
|
|
487
|
+
</listitem>
|
|
488
|
+
</itemizedlist>
|
|
489
|
+
</article>"#;
|
|
490
|
+
|
|
491
|
+
let result = extract_docbook_bytes(docbook5.as_bytes(), "application/docbook+xml").await;
|
|
492
|
+
assert!(result.is_ok());
|
|
493
|
+
|
|
494
|
+
let result = result.unwrap();
|
|
495
|
+
assert!(result.content.contains("Namespaced item 1"));
|
|
496
|
+
assert!(result.content.contains("Namespaced item 2"));
|
|
497
|
+
assert!(result.content.contains("- "));
|
|
498
|
+
}
|