kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -34,7 +34,7 @@ mod html_table_tests {
|
|
|
34
34
|
</table>
|
|
35
35
|
"#;
|
|
36
36
|
|
|
37
|
-
let result = convert_html_to_markdown(html, None);
|
|
37
|
+
let result = convert_html_to_markdown(html, None, None);
|
|
38
38
|
assert!(result.is_ok(), "HTML to markdown conversion should succeed");
|
|
39
39
|
|
|
40
40
|
let markdown = result.unwrap();
|
|
@@ -76,7 +76,7 @@ mod html_table_tests {
|
|
|
76
76
|
</table>
|
|
77
77
|
"#;
|
|
78
78
|
|
|
79
|
-
let result = convert_html_to_markdown(html, None);
|
|
79
|
+
let result = convert_html_to_markdown(html, None, None);
|
|
80
80
|
assert!(result.is_ok(), "Should convert to markdown");
|
|
81
81
|
|
|
82
82
|
let markdown = result.unwrap();
|
|
@@ -140,7 +140,7 @@ mod html_table_tests {
|
|
|
140
140
|
</table>
|
|
141
141
|
"#;
|
|
142
142
|
|
|
143
|
-
let result = convert_html_to_markdown(html, None);
|
|
143
|
+
let result = convert_html_to_markdown(html, None, None);
|
|
144
144
|
assert!(result.is_ok(), "Should convert complex table");
|
|
145
145
|
|
|
146
146
|
let markdown = result.unwrap();
|
|
@@ -191,7 +191,7 @@ mod html_table_tests {
|
|
|
191
191
|
</table>
|
|
192
192
|
"#;
|
|
193
193
|
|
|
194
|
-
let result = convert_html_to_markdown(html, None);
|
|
194
|
+
let result = convert_html_to_markdown(html, None, None);
|
|
195
195
|
assert!(result.is_ok(), "Should handle merged cell table");
|
|
196
196
|
|
|
197
197
|
let markdown = result.unwrap();
|
|
@@ -245,7 +245,7 @@ mod html_table_tests {
|
|
|
245
245
|
</table>
|
|
246
246
|
"#;
|
|
247
247
|
|
|
248
|
-
let result = convert_html_to_markdown(html, None);
|
|
248
|
+
let result = convert_html_to_markdown(html, None, None);
|
|
249
249
|
assert!(result.is_ok(), "Should handle multiple tables");
|
|
250
250
|
|
|
251
251
|
let markdown = result.unwrap();
|
|
@@ -300,7 +300,7 @@ mod html_table_tests {
|
|
|
300
300
|
</table>
|
|
301
301
|
"#;
|
|
302
302
|
|
|
303
|
-
let result = convert_html_to_markdown(html, None);
|
|
303
|
+
let result = convert_html_to_markdown(html, None, None);
|
|
304
304
|
assert!(result.is_ok(), "Should handle mixed header cells");
|
|
305
305
|
|
|
306
306
|
let markdown = result.unwrap();
|
|
@@ -346,7 +346,7 @@ mod html_table_tests {
|
|
|
346
346
|
</table>
|
|
347
347
|
"#;
|
|
348
348
|
|
|
349
|
-
let result = convert_html_to_markdown(html, None);
|
|
349
|
+
let result = convert_html_to_markdown(html, None, None);
|
|
350
350
|
assert!(result.is_ok(), "Should handle table with caption");
|
|
351
351
|
|
|
352
352
|
let markdown = result.unwrap();
|
|
@@ -382,7 +382,7 @@ mod html_table_tests {
|
|
|
382
382
|
fn test_simple_flat_table() {
|
|
383
383
|
let html = r#"<table><tr><td>A</td><td>B</td></tr><tr><td>C</td><td>D</td></tr></table>"#;
|
|
384
384
|
|
|
385
|
-
let result = convert_html_to_markdown(html, None);
|
|
385
|
+
let result = convert_html_to_markdown(html, None, None);
|
|
386
386
|
assert!(result.is_ok(), "Should handle flat table");
|
|
387
387
|
|
|
388
388
|
let markdown = result.unwrap();
|
|
@@ -418,7 +418,7 @@ mod html_table_tests {
|
|
|
418
418
|
</table>
|
|
419
419
|
"#;
|
|
420
420
|
|
|
421
|
-
let result = convert_html_to_markdown(html, None);
|
|
421
|
+
let result = convert_html_to_markdown(html, None, None);
|
|
422
422
|
assert!(result.is_ok(), "Should handle empty cells");
|
|
423
423
|
|
|
424
424
|
let markdown = result.unwrap();
|
|
@@ -456,7 +456,7 @@ mod html_table_tests {
|
|
|
456
456
|
</table>
|
|
457
457
|
"#;
|
|
458
458
|
|
|
459
|
-
let result = convert_html_to_markdown(html, None);
|
|
459
|
+
let result = convert_html_to_markdown(html, None, None);
|
|
460
460
|
assert!(result.is_ok(), "Should handle numeric table");
|
|
461
461
|
|
|
462
462
|
let markdown = result.unwrap();
|
|
@@ -499,7 +499,7 @@ mod html_table_tests {
|
|
|
499
499
|
</table>
|
|
500
500
|
"#;
|
|
501
501
|
|
|
502
|
-
let result = convert_html_to_markdown(html, None);
|
|
502
|
+
let result = convert_html_to_markdown(html, None, None);
|
|
503
503
|
assert!(result.is_ok(), "Should handle unicode characters");
|
|
504
504
|
|
|
505
505
|
let markdown = result.unwrap();
|
|
@@ -32,6 +32,7 @@ fn test_ocr_language_english() {
|
|
|
32
32
|
backend: "tesseract".to_string(),
|
|
33
33
|
language: "eng".to_string(),
|
|
34
34
|
tesseract_config: None,
|
|
35
|
+
output_format: None,
|
|
35
36
|
}),
|
|
36
37
|
force_ocr: false,
|
|
37
38
|
..Default::default()
|
|
@@ -57,6 +58,7 @@ fn test_ocr_language_german() {
|
|
|
57
58
|
backend: "tesseract".to_string(),
|
|
58
59
|
language: "deu".to_string(),
|
|
59
60
|
tesseract_config: None,
|
|
61
|
+
output_format: None,
|
|
60
62
|
}),
|
|
61
63
|
force_ocr: false,
|
|
62
64
|
..Default::default()
|
|
@@ -95,6 +97,7 @@ fn test_ocr_language_multiple() {
|
|
|
95
97
|
backend: "tesseract".to_string(),
|
|
96
98
|
language: "eng+kor".to_string(),
|
|
97
99
|
tesseract_config: None,
|
|
100
|
+
output_format: None,
|
|
98
101
|
}),
|
|
99
102
|
force_ocr: false,
|
|
100
103
|
..Default::default()
|
|
@@ -136,6 +139,7 @@ fn test_ocr_psm_auto() {
|
|
|
136
139
|
psm: 3,
|
|
137
140
|
..Default::default()
|
|
138
141
|
}),
|
|
142
|
+
output_format: None,
|
|
139
143
|
}),
|
|
140
144
|
force_ocr: false,
|
|
141
145
|
..Default::default()
|
|
@@ -164,6 +168,7 @@ fn test_ocr_psm_single_block() {
|
|
|
164
168
|
psm: 6,
|
|
165
169
|
..Default::default()
|
|
166
170
|
}),
|
|
171
|
+
output_format: None,
|
|
167
172
|
}),
|
|
168
173
|
force_ocr: false,
|
|
169
174
|
..Default::default()
|
|
@@ -192,6 +197,7 @@ fn test_ocr_psm_single_line() {
|
|
|
192
197
|
psm: 7,
|
|
193
198
|
..Default::default()
|
|
194
199
|
}),
|
|
200
|
+
output_format: None,
|
|
195
201
|
}),
|
|
196
202
|
force_ocr: false,
|
|
197
203
|
..Default::default()
|
|
@@ -218,6 +224,7 @@ fn test_force_ocr_on_text_pdf() {
|
|
|
218
224
|
backend: "tesseract".to_string(),
|
|
219
225
|
language: "eng".to_string(),
|
|
220
226
|
tesseract_config: None,
|
|
227
|
+
output_format: None,
|
|
221
228
|
}),
|
|
222
229
|
force_ocr: true,
|
|
223
230
|
..Default::default()
|
|
@@ -248,6 +255,7 @@ fn test_force_ocr_disabled() {
|
|
|
248
255
|
backend: "tesseract".to_string(),
|
|
249
256
|
language: "eng".to_string(),
|
|
250
257
|
tesseract_config: None,
|
|
258
|
+
output_format: None,
|
|
251
259
|
}),
|
|
252
260
|
force_ocr: false,
|
|
253
261
|
..Default::default()
|
|
@@ -283,6 +291,7 @@ fn test_table_detection_enabled() {
|
|
|
283
291
|
table_row_threshold_ratio: 0.5,
|
|
284
292
|
..Default::default()
|
|
285
293
|
}),
|
|
294
|
+
output_format: None,
|
|
286
295
|
}),
|
|
287
296
|
force_ocr: false,
|
|
288
297
|
..Default::default()
|
|
@@ -311,6 +320,7 @@ fn test_table_detection_disabled() {
|
|
|
311
320
|
enable_table_detection: false,
|
|
312
321
|
..Default::default()
|
|
313
322
|
}),
|
|
323
|
+
output_format: None,
|
|
314
324
|
}),
|
|
315
325
|
force_ocr: false,
|
|
316
326
|
..Default::default()
|
|
@@ -339,6 +349,7 @@ fn test_language_model_ngram_configuration() {
|
|
|
339
349
|
language_model_ngram_on: true,
|
|
340
350
|
..Default::default()
|
|
341
351
|
}),
|
|
352
|
+
output_format: None,
|
|
342
353
|
}),
|
|
343
354
|
force_ocr: false,
|
|
344
355
|
..Default::default()
|
|
@@ -368,6 +379,7 @@ fn test_dictionary_correction_enabled() {
|
|
|
368
379
|
tessedit_enable_dict_correction: true,
|
|
369
380
|
..Default::default()
|
|
370
381
|
}),
|
|
382
|
+
output_format: None,
|
|
371
383
|
}),
|
|
372
384
|
force_ocr: false,
|
|
373
385
|
..Default::default()
|
|
@@ -397,6 +409,7 @@ fn test_character_whitelist() {
|
|
|
397
409
|
tessedit_char_whitelist: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ".to_string(),
|
|
398
410
|
..Default::default()
|
|
399
411
|
}),
|
|
412
|
+
output_format: None,
|
|
400
413
|
}),
|
|
401
414
|
force_ocr: false,
|
|
402
415
|
..Default::default()
|
|
@@ -425,6 +438,7 @@ fn test_ocr_cache_enabled() {
|
|
|
425
438
|
use_cache: true,
|
|
426
439
|
..Default::default()
|
|
427
440
|
}),
|
|
441
|
+
output_format: None,
|
|
428
442
|
}),
|
|
429
443
|
force_ocr: false,
|
|
430
444
|
use_cache: true,
|
|
@@ -464,6 +478,7 @@ fn test_ocr_cache_disabled() {
|
|
|
464
478
|
use_cache: false,
|
|
465
479
|
..Default::default()
|
|
466
480
|
}),
|
|
481
|
+
output_format: None,
|
|
467
482
|
}),
|
|
468
483
|
force_ocr: false,
|
|
469
484
|
use_cache: false,
|
|
@@ -498,6 +513,7 @@ fn test_complex_configuration_combination() {
|
|
|
498
513
|
use_cache: true,
|
|
499
514
|
..Default::default()
|
|
500
515
|
}),
|
|
516
|
+
output_format: None,
|
|
501
517
|
}),
|
|
502
518
|
force_ocr: false,
|
|
503
519
|
use_cache: true,
|
|
@@ -34,6 +34,7 @@ fn test_ocr_invalid_language_code() {
|
|
|
34
34
|
backend: "tesseract".to_string(),
|
|
35
35
|
language: "invalid_lang_99999".to_string(),
|
|
36
36
|
tesseract_config: None,
|
|
37
|
+
output_format: None,
|
|
37
38
|
}),
|
|
38
39
|
force_ocr: false,
|
|
39
40
|
..Default::default()
|
|
@@ -74,6 +75,7 @@ fn test_ocr_invalid_psm_mode() {
|
|
|
74
75
|
psm: 999,
|
|
75
76
|
..Default::default()
|
|
76
77
|
}),
|
|
78
|
+
output_format: None,
|
|
77
79
|
}),
|
|
78
80
|
force_ocr: false,
|
|
79
81
|
..Default::default()
|
|
@@ -112,6 +114,7 @@ fn test_ocr_invalid_backend_name() {
|
|
|
112
114
|
backend: "nonexistent_ocr_backend_xyz".to_string(),
|
|
113
115
|
language: "eng".to_string(),
|
|
114
116
|
tesseract_config: None,
|
|
117
|
+
output_format: None,
|
|
115
118
|
}),
|
|
116
119
|
force_ocr: false,
|
|
117
120
|
..Default::default()
|
|
@@ -147,6 +150,7 @@ fn test_ocr_corrupted_image_data() {
|
|
|
147
150
|
backend: "tesseract".to_string(),
|
|
148
151
|
language: "eng".to_string(),
|
|
149
152
|
tesseract_config: None,
|
|
153
|
+
output_format: None,
|
|
150
154
|
}),
|
|
151
155
|
force_ocr: true,
|
|
152
156
|
..Default::default()
|
|
@@ -177,6 +181,7 @@ fn test_ocr_empty_image() {
|
|
|
177
181
|
backend: "tesseract".to_string(),
|
|
178
182
|
language: "eng".to_string(),
|
|
179
183
|
tesseract_config: None,
|
|
184
|
+
output_format: None,
|
|
180
185
|
}),
|
|
181
186
|
force_ocr: true,
|
|
182
187
|
..Default::default()
|
|
@@ -207,6 +212,7 @@ fn test_ocr_non_image_data() {
|
|
|
207
212
|
backend: "tesseract".to_string(),
|
|
208
213
|
language: "eng".to_string(),
|
|
209
214
|
tesseract_config: None,
|
|
215
|
+
output_format: None,
|
|
210
216
|
}),
|
|
211
217
|
force_ocr: true,
|
|
212
218
|
..Default::default()
|
|
@@ -245,6 +251,7 @@ fn test_ocr_extreme_table_threshold() {
|
|
|
245
251
|
table_row_threshold_ratio: 10.0,
|
|
246
252
|
..Default::default()
|
|
247
253
|
}),
|
|
254
|
+
output_format: None,
|
|
248
255
|
}),
|
|
249
256
|
force_ocr: false,
|
|
250
257
|
..Default::default()
|
|
@@ -281,6 +288,7 @@ fn test_ocr_negative_psm() {
|
|
|
281
288
|
psm: -5,
|
|
282
289
|
..Default::default()
|
|
283
290
|
}),
|
|
291
|
+
output_format: None,
|
|
284
292
|
}),
|
|
285
293
|
force_ocr: false,
|
|
286
294
|
..Default::default()
|
|
@@ -313,6 +321,7 @@ fn test_ocr_empty_whitelist() {
|
|
|
313
321
|
tessedit_char_whitelist: "".to_string(),
|
|
314
322
|
..Default::default()
|
|
315
323
|
}),
|
|
324
|
+
output_format: None,
|
|
316
325
|
}),
|
|
317
326
|
force_ocr: false,
|
|
318
327
|
..Default::default()
|
|
@@ -349,6 +358,7 @@ fn test_ocr_conflicting_whitelist_blacklist() {
|
|
|
349
358
|
tessedit_char_blacklist: "abc".to_string(),
|
|
350
359
|
..Default::default()
|
|
351
360
|
}),
|
|
361
|
+
output_format: None,
|
|
352
362
|
}),
|
|
353
363
|
force_ocr: false,
|
|
354
364
|
..Default::default()
|
|
@@ -381,6 +391,7 @@ fn test_ocr_empty_language() {
|
|
|
381
391
|
backend: "tesseract".to_string(),
|
|
382
392
|
language: "".to_string(),
|
|
383
393
|
tesseract_config: None,
|
|
394
|
+
output_format: None,
|
|
384
395
|
}),
|
|
385
396
|
force_ocr: false,
|
|
386
397
|
..Default::default()
|
|
@@ -413,6 +424,7 @@ fn test_ocr_malformed_multi_language() {
|
|
|
413
424
|
backend: "tesseract".to_string(),
|
|
414
425
|
language: "eng++deu++fra".to_string(),
|
|
415
426
|
tesseract_config: None,
|
|
427
|
+
output_format: None,
|
|
416
428
|
}),
|
|
417
429
|
force_ocr: false,
|
|
418
430
|
..Default::default()
|
|
@@ -446,6 +458,7 @@ fn test_ocr_cache_disabled_then_enabled() {
|
|
|
446
458
|
use_cache: false,
|
|
447
459
|
..Default::default()
|
|
448
460
|
}),
|
|
461
|
+
output_format: None,
|
|
449
462
|
}),
|
|
450
463
|
force_ocr: false,
|
|
451
464
|
use_cache: false,
|
|
@@ -466,6 +479,7 @@ fn test_ocr_cache_disabled_then_enabled() {
|
|
|
466
479
|
use_cache: true,
|
|
467
480
|
..Default::default()
|
|
468
481
|
}),
|
|
482
|
+
output_format: None,
|
|
469
483
|
}),
|
|
470
484
|
force_ocr: false,
|
|
471
485
|
use_cache: true,
|
|
@@ -497,6 +511,7 @@ fn test_ocr_concurrent_same_file() {
|
|
|
497
511
|
backend: "tesseract".to_string(),
|
|
498
512
|
language: "eng".to_string(),
|
|
499
513
|
tesseract_config: None,
|
|
514
|
+
output_format: None,
|
|
500
515
|
}),
|
|
501
516
|
force_ocr: false,
|
|
502
517
|
use_cache: true,
|
|
@@ -563,6 +578,7 @@ fn test_ocr_concurrent_different_files() {
|
|
|
563
578
|
backend: "tesseract".to_string(),
|
|
564
579
|
language: "eng".to_string(),
|
|
565
580
|
tesseract_config: None,
|
|
581
|
+
output_format: None,
|
|
566
582
|
}),
|
|
567
583
|
force_ocr: false,
|
|
568
584
|
use_cache: true,
|
|
@@ -632,6 +648,7 @@ fn test_ocr_with_preprocessing_extreme_dpi() {
|
|
|
632
648
|
}),
|
|
633
649
|
..Default::default()
|
|
634
650
|
}),
|
|
651
|
+
output_format: None,
|
|
635
652
|
}),
|
|
636
653
|
force_ocr: false,
|
|
637
654
|
..Default::default()
|
|
@@ -677,6 +694,7 @@ fn test_ocr_with_invalid_binarization_method() {
|
|
|
677
694
|
}),
|
|
678
695
|
..Default::default()
|
|
679
696
|
}),
|
|
697
|
+
output_format: None,
|
|
680
698
|
}),
|
|
681
699
|
force_ocr: false,
|
|
682
700
|
..Default::default()
|
|
@@ -167,6 +167,7 @@ fn test_ocr_quality_simple_text_high_accuracy() {
|
|
|
167
167
|
backend: "tesseract".to_string(),
|
|
168
168
|
language: "eng".to_string(),
|
|
169
169
|
tesseract_config: None,
|
|
170
|
+
output_format: None,
|
|
170
171
|
}),
|
|
171
172
|
force_ocr: true,
|
|
172
173
|
..Default::default()
|
|
@@ -241,6 +242,7 @@ fn test_ocr_quality_numeric_accuracy() {
|
|
|
241
242
|
backend: "tesseract".to_string(),
|
|
242
243
|
language: "eng".to_string(),
|
|
243
244
|
tesseract_config: None,
|
|
245
|
+
output_format: None,
|
|
244
246
|
}),
|
|
245
247
|
force_ocr: true,
|
|
246
248
|
..Default::default()
|
|
@@ -306,6 +308,7 @@ fn test_ocr_quality_layout_preservation() {
|
|
|
306
308
|
backend: "tesseract".to_string(),
|
|
307
309
|
language: "eng".to_string(),
|
|
308
310
|
tesseract_config: None,
|
|
311
|
+
output_format: None,
|
|
309
312
|
}),
|
|
310
313
|
force_ocr: true,
|
|
311
314
|
..Default::default()
|
|
@@ -365,6 +368,7 @@ fn test_ocr_quality_technical_document() {
|
|
|
365
368
|
backend: "tesseract".to_string(),
|
|
366
369
|
language: "eng".to_string(),
|
|
367
370
|
tesseract_config: None,
|
|
371
|
+
output_format: None,
|
|
368
372
|
}),
|
|
369
373
|
force_ocr: true,
|
|
370
374
|
..Default::default()
|
|
@@ -409,6 +413,7 @@ fn test_ocr_consistency_across_runs() {
|
|
|
409
413
|
backend: "tesseract".to_string(),
|
|
410
414
|
language: "eng".to_string(),
|
|
411
415
|
tesseract_config: None,
|
|
416
|
+
output_format: None,
|
|
412
417
|
}),
|
|
413
418
|
force_ocr: true,
|
|
414
419
|
use_cache: false,
|
|
@@ -474,6 +479,7 @@ fn test_ocr_consistency_with_different_psm() {
|
|
|
474
479
|
psm: 3,
|
|
475
480
|
..Default::default()
|
|
476
481
|
}),
|
|
482
|
+
output_format: None,
|
|
477
483
|
}),
|
|
478
484
|
force_ocr: true,
|
|
479
485
|
..Default::default()
|
|
@@ -487,6 +493,7 @@ fn test_ocr_consistency_with_different_psm() {
|
|
|
487
493
|
psm: 6,
|
|
488
494
|
..Default::default()
|
|
489
495
|
}),
|
|
496
|
+
output_format: None,
|
|
490
497
|
}),
|
|
491
498
|
force_ocr: true,
|
|
492
499
|
..Default::default()
|
|
@@ -557,6 +564,7 @@ fn test_ocr_quality_multi_page_consistency() {
|
|
|
557
564
|
backend: "tesseract".to_string(),
|
|
558
565
|
language: "eng".to_string(),
|
|
559
566
|
tesseract_config: None,
|
|
567
|
+
output_format: None,
|
|
560
568
|
}),
|
|
561
569
|
force_ocr: true,
|
|
562
570
|
..Default::default()
|
|
@@ -608,6 +616,7 @@ fn test_ocr_quality_with_tables() {
|
|
|
608
616
|
table_min_confidence: 0.5,
|
|
609
617
|
..Default::default()
|
|
610
618
|
}),
|
|
619
|
+
output_format: None,
|
|
611
620
|
}),
|
|
612
621
|
force_ocr: true,
|
|
613
622
|
..Default::default()
|
|
@@ -141,6 +141,8 @@ async fn test_pipeline_empty_no_processors() {
|
|
|
141
141
|
chunks: None,
|
|
142
142
|
images: None,
|
|
143
143
|
pages: None,
|
|
144
|
+
djot_content: None,
|
|
145
|
+
elements: None,
|
|
144
146
|
};
|
|
145
147
|
let config = ExtractionConfig::default();
|
|
146
148
|
|
|
@@ -186,6 +188,8 @@ async fn test_pipeline_single_processor_per_stage() {
|
|
|
186
188
|
chunks: None,
|
|
187
189
|
images: None,
|
|
188
190
|
pages: None,
|
|
191
|
+
djot_content: None,
|
|
192
|
+
elements: None,
|
|
189
193
|
};
|
|
190
194
|
let config = ExtractionConfig::default();
|
|
191
195
|
|
|
@@ -231,6 +235,8 @@ async fn test_pipeline_multiple_processors_per_stage() {
|
|
|
231
235
|
chunks: None,
|
|
232
236
|
images: None,
|
|
233
237
|
pages: None,
|
|
238
|
+
djot_content: None,
|
|
239
|
+
elements: None,
|
|
234
240
|
};
|
|
235
241
|
let config = ExtractionConfig::default();
|
|
236
242
|
|
|
@@ -267,6 +273,8 @@ async fn test_pipeline_all_stages_enabled() {
|
|
|
267
273
|
chunks: None,
|
|
268
274
|
images: None,
|
|
269
275
|
pages: None,
|
|
276
|
+
djot_content: None,
|
|
277
|
+
elements: None,
|
|
270
278
|
};
|
|
271
279
|
let config = ExtractionConfig::default();
|
|
272
280
|
|
|
@@ -301,6 +309,8 @@ async fn test_pipeline_postprocessing_disabled() {
|
|
|
301
309
|
chunks: None,
|
|
302
310
|
images: None,
|
|
303
311
|
pages: None,
|
|
312
|
+
djot_content: None,
|
|
313
|
+
elements: None,
|
|
304
314
|
};
|
|
305
315
|
let config = ExtractionConfig {
|
|
306
316
|
postprocessor: Some(PostProcessorConfig {
|
|
@@ -350,6 +360,8 @@ async fn test_pipeline_early_stage_runs_first() {
|
|
|
350
360
|
chunks: None,
|
|
351
361
|
images: None,
|
|
352
362
|
pages: None,
|
|
363
|
+
djot_content: None,
|
|
364
|
+
elements: None,
|
|
353
365
|
};
|
|
354
366
|
let config = ExtractionConfig::default();
|
|
355
367
|
|
|
@@ -390,6 +402,8 @@ async fn test_pipeline_middle_stage_runs_second() {
|
|
|
390
402
|
chunks: None,
|
|
391
403
|
images: None,
|
|
392
404
|
pages: None,
|
|
405
|
+
djot_content: None,
|
|
406
|
+
elements: None,
|
|
393
407
|
};
|
|
394
408
|
let config = ExtractionConfig::default();
|
|
395
409
|
|
|
@@ -426,6 +440,8 @@ async fn test_pipeline_late_stage_runs_last() {
|
|
|
426
440
|
chunks: None,
|
|
427
441
|
images: None,
|
|
428
442
|
pages: None,
|
|
443
|
+
djot_content: None,
|
|
444
|
+
elements: None,
|
|
429
445
|
};
|
|
430
446
|
let config = ExtractionConfig::default();
|
|
431
447
|
|
|
@@ -462,6 +478,8 @@ async fn test_pipeline_within_stage_priority_order() {
|
|
|
462
478
|
chunks: None,
|
|
463
479
|
images: None,
|
|
464
480
|
pages: None,
|
|
481
|
+
djot_content: None,
|
|
482
|
+
elements: None,
|
|
465
483
|
};
|
|
466
484
|
let config = ExtractionConfig::default();
|
|
467
485
|
|
|
@@ -527,6 +545,8 @@ async fn test_pipeline_cross_stage_data_flow() {
|
|
|
527
545
|
chunks: None,
|
|
528
546
|
images: None,
|
|
529
547
|
pages: None,
|
|
548
|
+
djot_content: None,
|
|
549
|
+
elements: None,
|
|
530
550
|
};
|
|
531
551
|
let config = ExtractionConfig::default();
|
|
532
552
|
|
|
@@ -584,6 +604,8 @@ async fn test_pipeline_early_stage_error_recorded() {
|
|
|
584
604
|
chunks: None,
|
|
585
605
|
images: None,
|
|
586
606
|
pages: None,
|
|
607
|
+
djot_content: None,
|
|
608
|
+
elements: None,
|
|
587
609
|
};
|
|
588
610
|
let config = ExtractionConfig::default();
|
|
589
611
|
|
|
@@ -626,6 +648,8 @@ async fn test_pipeline_middle_stage_error_propagation() {
|
|
|
626
648
|
chunks: None,
|
|
627
649
|
images: None,
|
|
628
650
|
pages: None,
|
|
651
|
+
djot_content: None,
|
|
652
|
+
elements: None,
|
|
629
653
|
};
|
|
630
654
|
let config = ExtractionConfig::default();
|
|
631
655
|
|
|
@@ -698,6 +722,8 @@ async fn test_pipeline_late_stage_error_doesnt_affect_earlier_stages() {
|
|
|
698
722
|
chunks: None,
|
|
699
723
|
images: None,
|
|
700
724
|
pages: None,
|
|
725
|
+
djot_content: None,
|
|
726
|
+
elements: None,
|
|
701
727
|
};
|
|
702
728
|
let config = ExtractionConfig::default();
|
|
703
729
|
|
|
@@ -786,6 +812,8 @@ async fn test_pipeline_processor_error_doesnt_stop_other_processors() {
|
|
|
786
812
|
chunks: None,
|
|
787
813
|
images: None,
|
|
788
814
|
pages: None,
|
|
815
|
+
djot_content: None,
|
|
816
|
+
elements: None,
|
|
789
817
|
};
|
|
790
818
|
let config = ExtractionConfig::default();
|
|
791
819
|
|
|
@@ -864,6 +892,8 @@ async fn test_pipeline_multiple_processor_errors() {
|
|
|
864
892
|
chunks: None,
|
|
865
893
|
images: None,
|
|
866
894
|
pages: None,
|
|
895
|
+
djot_content: None,
|
|
896
|
+
elements: None,
|
|
867
897
|
};
|
|
868
898
|
let config = ExtractionConfig::default();
|
|
869
899
|
|
|
@@ -906,6 +936,8 @@ async fn test_pipeline_error_context_preservation() {
|
|
|
906
936
|
chunks: None,
|
|
907
937
|
images: None,
|
|
908
938
|
pages: None,
|
|
939
|
+
djot_content: None,
|
|
940
|
+
elements: None,
|
|
909
941
|
};
|
|
910
942
|
let config = ExtractionConfig::default();
|
|
911
943
|
|
|
@@ -978,6 +1010,8 @@ async fn test_pipeline_metadata_added_in_early_visible_in_middle() {
|
|
|
978
1010
|
chunks: None,
|
|
979
1011
|
images: None,
|
|
980
1012
|
pages: None,
|
|
1013
|
+
djot_content: None,
|
|
1014
|
+
elements: None,
|
|
981
1015
|
};
|
|
982
1016
|
let config = ExtractionConfig::default();
|
|
983
1017
|
|
|
@@ -1049,6 +1083,8 @@ async fn test_pipeline_content_modified_in_middle_visible_in_late() {
|
|
|
1049
1083
|
chunks: None,
|
|
1050
1084
|
images: None,
|
|
1051
1085
|
pages: None,
|
|
1086
|
+
djot_content: None,
|
|
1087
|
+
elements: None,
|
|
1052
1088
|
};
|
|
1053
1089
|
let config = ExtractionConfig::default();
|
|
1054
1090
|
|
|
@@ -1118,6 +1154,8 @@ async fn test_pipeline_multiple_processors_modifying_same_metadata() {
|
|
|
1118
1154
|
chunks: None,
|
|
1119
1155
|
images: None,
|
|
1120
1156
|
pages: None,
|
|
1157
|
+
djot_content: None,
|
|
1158
|
+
elements: None,
|
|
1121
1159
|
};
|
|
1122
1160
|
let config = ExtractionConfig::default();
|
|
1123
1161
|
|
|
@@ -1206,6 +1244,8 @@ async fn test_pipeline_processors_reading_previous_output() {
|
|
|
1206
1244
|
chunks: None,
|
|
1207
1245
|
images: None,
|
|
1208
1246
|
pages: None,
|
|
1247
|
+
djot_content: None,
|
|
1248
|
+
elements: None,
|
|
1209
1249
|
};
|
|
1210
1250
|
let config = ExtractionConfig::default();
|
|
1211
1251
|
|
|
@@ -1261,6 +1301,8 @@ async fn test_pipeline_large_content_modification() {
|
|
|
1261
1301
|
chunks: None,
|
|
1262
1302
|
images: None,
|
|
1263
1303
|
pages: None,
|
|
1304
|
+
djot_content: None,
|
|
1305
|
+
elements: None,
|
|
1264
1306
|
};
|
|
1265
1307
|
let config = ExtractionConfig::default();
|
|
1266
1308
|
|
|
@@ -1297,6 +1339,8 @@ async fn test_pipeline_enabled_processors_whitelist() {
|
|
|
1297
1339
|
chunks: None,
|
|
1298
1340
|
images: None,
|
|
1299
1341
|
pages: None,
|
|
1342
|
+
djot_content: None,
|
|
1343
|
+
elements: None,
|
|
1300
1344
|
};
|
|
1301
1345
|
let config = ExtractionConfig {
|
|
1302
1346
|
postprocessor: Some(PostProcessorConfig {
|
|
@@ -1344,6 +1388,8 @@ async fn test_pipeline_disabled_processors_blacklist() {
|
|
|
1344
1388
|
chunks: None,
|
|
1345
1389
|
images: None,
|
|
1346
1390
|
pages: None,
|
|
1391
|
+
djot_content: None,
|
|
1392
|
+
elements: None,
|
|
1347
1393
|
};
|
|
1348
1394
|
let config = ExtractionConfig {
|
|
1349
1395
|
postprocessor: Some(PostProcessorConfig {
|
|
@@ -1391,6 +1437,8 @@ async fn test_pipeline_no_filtering_runs_all() {
|
|
|
1391
1437
|
chunks: None,
|
|
1392
1438
|
images: None,
|
|
1393
1439
|
pages: None,
|
|
1440
|
+
djot_content: None,
|
|
1441
|
+
elements: None,
|
|
1394
1442
|
};
|
|
1395
1443
|
let config = ExtractionConfig::default();
|
|
1396
1444
|
|
|
@@ -1429,6 +1477,8 @@ async fn test_pipeline_empty_whitelist_runs_none() {
|
|
|
1429
1477
|
chunks: None,
|
|
1430
1478
|
images: None,
|
|
1431
1479
|
pages: None,
|
|
1480
|
+
djot_content: None,
|
|
1481
|
+
elements: None,
|
|
1432
1482
|
};
|
|
1433
1483
|
let config = ExtractionConfig {
|
|
1434
1484
|
postprocessor: Some(PostProcessorConfig {
|