kreuzberg 4.0.8 → 4.1.0
This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,313 @@
|
|
|
1
|
+
//! Integration tests for the /chunk API endpoint.
|
|
2
|
+
|
|
3
|
+
#![cfg(feature = "api")]
|
|
4
|
+
|
|
5
|
+
use axum::{
|
|
6
|
+
body::Body,
|
|
7
|
+
http::{Request, StatusCode},
|
|
8
|
+
};
|
|
9
|
+
use serde_json::json;
|
|
10
|
+
use tower::ServiceExt;
|
|
11
|
+
|
|
12
|
+
use kreuzberg::{ExtractionConfig, api::create_router};
|
|
13
|
+
|
|
14
|
+
#[tokio::test]
|
|
15
|
+
async fn test_chunk_basic() {
|
|
16
|
+
let app = create_router(ExtractionConfig::default());
|
|
17
|
+
let response = app
|
|
18
|
+
.oneshot(
|
|
19
|
+
Request::builder()
|
|
20
|
+
.uri("/chunk")
|
|
21
|
+
.method("POST")
|
|
22
|
+
.header("content-type", "application/json")
|
|
23
|
+
.body(Body::from(
|
|
24
|
+
json!({
|
|
25
|
+
"text": "Short text. More text here. Even more content to chunk."
|
|
26
|
+
})
|
|
27
|
+
.to_string(),
|
|
28
|
+
))
|
|
29
|
+
.unwrap(),
|
|
30
|
+
)
|
|
31
|
+
.await
|
|
32
|
+
.unwrap();
|
|
33
|
+
|
|
34
|
+
assert_eq!(response.status(), StatusCode::OK);
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
#[tokio::test]
|
|
38
|
+
async fn test_chunk_empty_text_returns_400() {
|
|
39
|
+
let app = create_router(ExtractionConfig::default());
|
|
40
|
+
let response = app
|
|
41
|
+
.oneshot(
|
|
42
|
+
Request::builder()
|
|
43
|
+
.uri("/chunk")
|
|
44
|
+
.method("POST")
|
|
45
|
+
.header("content-type", "application/json")
|
|
46
|
+
.body(Body::from(json!({"text": ""}).to_string()))
|
|
47
|
+
.unwrap(),
|
|
48
|
+
)
|
|
49
|
+
.await
|
|
50
|
+
.unwrap();
|
|
51
|
+
|
|
52
|
+
assert_eq!(response.status(), StatusCode::BAD_REQUEST);
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
#[tokio::test]
|
|
56
|
+
async fn test_chunk_markdown_strategy() {
|
|
57
|
+
let app = create_router(ExtractionConfig::default());
|
|
58
|
+
let response = app
|
|
59
|
+
.oneshot(
|
|
60
|
+
Request::builder()
|
|
61
|
+
.uri("/chunk")
|
|
62
|
+
.method("POST")
|
|
63
|
+
.header("content-type", "application/json")
|
|
64
|
+
.body(Body::from(
|
|
65
|
+
json!({
|
|
66
|
+
"text": "# Heading\n\nParagraph text here.",
|
|
67
|
+
"chunker_type": "markdown"
|
|
68
|
+
})
|
|
69
|
+
.to_string(),
|
|
70
|
+
))
|
|
71
|
+
.unwrap(),
|
|
72
|
+
)
|
|
73
|
+
.await
|
|
74
|
+
.unwrap();
|
|
75
|
+
|
|
76
|
+
assert_eq!(response.status(), StatusCode::OK);
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
#[tokio::test]
|
|
80
|
+
async fn test_chunk_response_structure() {
|
|
81
|
+
use kreuzberg::api::ChunkResponse;
|
|
82
|
+
|
|
83
|
+
let app = create_router(ExtractionConfig::default());
|
|
84
|
+
let response = app
|
|
85
|
+
.oneshot(
|
|
86
|
+
Request::builder()
|
|
87
|
+
.uri("/chunk")
|
|
88
|
+
.method("POST")
|
|
89
|
+
.header("content-type", "application/json")
|
|
90
|
+
.body(Body::from(
|
|
91
|
+
json!({
|
|
92
|
+
"text": "This is a test. Another sentence here. And one more sentence to ensure we get chunks.",
|
|
93
|
+
"config": {
|
|
94
|
+
"max_characters": 50,
|
|
95
|
+
"overlap": 10,
|
|
96
|
+
"trim": true
|
|
97
|
+
},
|
|
98
|
+
"chunker_type": "text"
|
|
99
|
+
})
|
|
100
|
+
.to_string(),
|
|
101
|
+
))
|
|
102
|
+
.unwrap(),
|
|
103
|
+
)
|
|
104
|
+
.await
|
|
105
|
+
.unwrap();
|
|
106
|
+
|
|
107
|
+
assert_eq!(response.status(), StatusCode::OK);
|
|
108
|
+
|
|
109
|
+
let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
|
|
110
|
+
let chunk_response: ChunkResponse = serde_json::from_slice(&body).unwrap();
|
|
111
|
+
|
|
112
|
+
// Verify response structure
|
|
113
|
+
assert!(chunk_response.chunk_count > 0);
|
|
114
|
+
assert_eq!(chunk_response.chunks.len(), chunk_response.chunk_count);
|
|
115
|
+
assert_eq!(chunk_response.chunker_type, "text");
|
|
116
|
+
assert_eq!(chunk_response.config.max_characters, 50);
|
|
117
|
+
assert_eq!(chunk_response.config.overlap, 10);
|
|
118
|
+
assert!(chunk_response.config.trim);
|
|
119
|
+
assert!(chunk_response.input_size_bytes > 0);
|
|
120
|
+
|
|
121
|
+
// Verify chunk metadata
|
|
122
|
+
for (idx, chunk) in chunk_response.chunks.iter().enumerate() {
|
|
123
|
+
assert!(!chunk.content.is_empty());
|
|
124
|
+
assert_eq!(chunk.chunk_index, idx);
|
|
125
|
+
assert_eq!(chunk.total_chunks, chunk_response.chunk_count);
|
|
126
|
+
assert!(chunk.byte_end > chunk.byte_start);
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
#[tokio::test]
|
|
131
|
+
async fn test_chunk_invalid_strategy_returns_400() {
|
|
132
|
+
let app = create_router(ExtractionConfig::default());
|
|
133
|
+
let response = app
|
|
134
|
+
.oneshot(
|
|
135
|
+
Request::builder()
|
|
136
|
+
.uri("/chunk")
|
|
137
|
+
.method("POST")
|
|
138
|
+
.header("content-type", "application/json")
|
|
139
|
+
.body(Body::from(
|
|
140
|
+
json!({
|
|
141
|
+
"text": "Test text",
|
|
142
|
+
"chunker_type": "invalid_type"
|
|
143
|
+
})
|
|
144
|
+
.to_string(),
|
|
145
|
+
))
|
|
146
|
+
.unwrap(),
|
|
147
|
+
)
|
|
148
|
+
.await
|
|
149
|
+
.unwrap();
|
|
150
|
+
|
|
151
|
+
assert_eq!(response.status(), StatusCode::BAD_REQUEST);
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
#[tokio::test]
|
|
155
|
+
async fn test_chunk_with_defaults() {
|
|
156
|
+
use kreuzberg::api::ChunkResponse;
|
|
157
|
+
|
|
158
|
+
let app = create_router(ExtractionConfig::default());
|
|
159
|
+
let response = app
|
|
160
|
+
.oneshot(
|
|
161
|
+
Request::builder()
|
|
162
|
+
.uri("/chunk")
|
|
163
|
+
.method("POST")
|
|
164
|
+
.header("content-type", "application/json")
|
|
165
|
+
.body(Body::from(
|
|
166
|
+
json!({
|
|
167
|
+
"text": "This is a test sentence. Another sentence here."
|
|
168
|
+
})
|
|
169
|
+
.to_string(),
|
|
170
|
+
))
|
|
171
|
+
.unwrap(),
|
|
172
|
+
)
|
|
173
|
+
.await
|
|
174
|
+
.unwrap();
|
|
175
|
+
|
|
176
|
+
assert_eq!(response.status(), StatusCode::OK);
|
|
177
|
+
|
|
178
|
+
let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
|
|
179
|
+
let chunk_response: ChunkResponse = serde_json::from_slice(&body).unwrap();
|
|
180
|
+
|
|
181
|
+
// Verify defaults are applied
|
|
182
|
+
assert_eq!(chunk_response.config.max_characters, 2000);
|
|
183
|
+
assert_eq!(chunk_response.config.overlap, 100);
|
|
184
|
+
assert!(chunk_response.config.trim);
|
|
185
|
+
assert_eq!(chunk_response.chunker_type, "text");
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
#[tokio::test]
|
|
189
|
+
async fn test_chunk_malformed_json_returns_400() {
|
|
190
|
+
let app = create_router(ExtractionConfig::default());
|
|
191
|
+
let response = app
|
|
192
|
+
.oneshot(
|
|
193
|
+
Request::builder()
|
|
194
|
+
.uri("/chunk")
|
|
195
|
+
.method("POST")
|
|
196
|
+
.header("content-type", "application/json")
|
|
197
|
+
.body(Body::from("{invalid json}"))
|
|
198
|
+
.unwrap(),
|
|
199
|
+
)
|
|
200
|
+
.await
|
|
201
|
+
.unwrap();
|
|
202
|
+
|
|
203
|
+
assert_eq!(response.status(), StatusCode::BAD_REQUEST);
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
#[tokio::test]
|
|
207
|
+
async fn test_chunk_case_insensitive_chunker_type() {
|
|
208
|
+
use kreuzberg::api::ChunkResponse;
|
|
209
|
+
|
|
210
|
+
let app = create_router(ExtractionConfig::default());
|
|
211
|
+
let response = app
|
|
212
|
+
.oneshot(
|
|
213
|
+
Request::builder()
|
|
214
|
+
.uri("/chunk")
|
|
215
|
+
.method("POST")
|
|
216
|
+
.header("content-type", "application/json")
|
|
217
|
+
.body(Body::from(
|
|
218
|
+
json!({
|
|
219
|
+
"text": "# Title\n\nContent here.",
|
|
220
|
+
"chunker_type": "MARKDOWN"
|
|
221
|
+
})
|
|
222
|
+
.to_string(),
|
|
223
|
+
))
|
|
224
|
+
.unwrap(),
|
|
225
|
+
)
|
|
226
|
+
.await
|
|
227
|
+
.unwrap();
|
|
228
|
+
|
|
229
|
+
assert_eq!(response.status(), StatusCode::OK);
|
|
230
|
+
|
|
231
|
+
let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
|
|
232
|
+
let chunk_response: ChunkResponse = serde_json::from_slice(&body).unwrap();
|
|
233
|
+
|
|
234
|
+
// Verify it's normalized to lowercase
|
|
235
|
+
assert_eq!(chunk_response.chunker_type, "markdown");
|
|
236
|
+
}
|
|
237
|
+
|
|
238
|
+
#[tokio::test]
|
|
239
|
+
async fn test_chunk_long_text() {
|
|
240
|
+
use kreuzberg::api::ChunkResponse;
|
|
241
|
+
|
|
242
|
+
let app = create_router(ExtractionConfig::default());
|
|
243
|
+
let long_text = "Lorem ipsum dolor sit amet. ".repeat(200);
|
|
244
|
+
|
|
245
|
+
let response = app
|
|
246
|
+
.oneshot(
|
|
247
|
+
Request::builder()
|
|
248
|
+
.uri("/chunk")
|
|
249
|
+
.method("POST")
|
|
250
|
+
.header("content-type", "application/json")
|
|
251
|
+
.body(Body::from(
|
|
252
|
+
json!({
|
|
253
|
+
"text": long_text,
|
|
254
|
+
"config": {
|
|
255
|
+
"max_characters": 500,
|
|
256
|
+
"overlap": 50
|
|
257
|
+
}
|
|
258
|
+
})
|
|
259
|
+
.to_string(),
|
|
260
|
+
))
|
|
261
|
+
.unwrap(),
|
|
262
|
+
)
|
|
263
|
+
.await
|
|
264
|
+
.unwrap();
|
|
265
|
+
|
|
266
|
+
assert_eq!(response.status(), StatusCode::OK);
|
|
267
|
+
|
|
268
|
+
let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
|
|
269
|
+
let chunk_response: ChunkResponse = serde_json::from_slice(&body).unwrap();
|
|
270
|
+
|
|
271
|
+
// Should have multiple chunks
|
|
272
|
+
assert!(chunk_response.chunk_count > 1);
|
|
273
|
+
assert_eq!(chunk_response.chunks.len(), chunk_response.chunk_count);
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
#[tokio::test]
|
|
277
|
+
async fn test_chunk_custom_config() {
|
|
278
|
+
use kreuzberg::api::ChunkResponse;
|
|
279
|
+
|
|
280
|
+
let app = create_router(ExtractionConfig::default());
|
|
281
|
+
let response = app
|
|
282
|
+
.oneshot(
|
|
283
|
+
Request::builder()
|
|
284
|
+
.uri("/chunk")
|
|
285
|
+
.method("POST")
|
|
286
|
+
.header("content-type", "application/json")
|
|
287
|
+
.body(Body::from(
|
|
288
|
+
json!({
|
|
289
|
+
"text": "Test sentence one. Test sentence two. Test sentence three.",
|
|
290
|
+
"config": {
|
|
291
|
+
"max_characters": 30,
|
|
292
|
+
"overlap": 5,
|
|
293
|
+
"trim": false
|
|
294
|
+
},
|
|
295
|
+
"chunker_type": "text"
|
|
296
|
+
})
|
|
297
|
+
.to_string(),
|
|
298
|
+
))
|
|
299
|
+
.unwrap(),
|
|
300
|
+
)
|
|
301
|
+
.await
|
|
302
|
+
.unwrap();
|
|
303
|
+
|
|
304
|
+
assert_eq!(response.status(), StatusCode::OK);
|
|
305
|
+
|
|
306
|
+
let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
|
|
307
|
+
let chunk_response: ChunkResponse = serde_json::from_slice(&body).unwrap();
|
|
308
|
+
|
|
309
|
+
// Verify custom config was applied
|
|
310
|
+
assert_eq!(chunk_response.config.max_characters, 30);
|
|
311
|
+
assert_eq!(chunk_response.config.overlap, 5);
|
|
312
|
+
assert!(!chunk_response.config.trim);
|
|
313
|
+
}
|
|
@@ -84,9 +84,8 @@ async fn test_embed_with_custom_config() {
|
|
|
84
84
|
"texts": ["Test embedding with custom config"],
|
|
85
85
|
"config": {
|
|
86
86
|
"model": {
|
|
87
|
-
"
|
|
88
|
-
|
|
89
|
-
}
|
|
87
|
+
"type": "preset",
|
|
88
|
+
"name": "fast"
|
|
90
89
|
},
|
|
91
90
|
"batch_size": 32
|
|
92
91
|
}
|
|
@@ -295,9 +294,8 @@ async fn test_embed_different_presets() {
|
|
|
295
294
|
"texts": ["Test text"],
|
|
296
295
|
"config": {
|
|
297
296
|
"model": {
|
|
298
|
-
"
|
|
299
|
-
|
|
300
|
-
}
|
|
297
|
+
"type": "preset",
|
|
298
|
+
"name": "fast"
|
|
301
299
|
}
|
|
302
300
|
}
|
|
303
301
|
});
|
|
@@ -327,9 +325,8 @@ async fn test_embed_different_presets() {
|
|
|
327
325
|
"texts": ["Test text"],
|
|
328
326
|
"config": {
|
|
329
327
|
"model": {
|
|
330
|
-
"
|
|
331
|
-
|
|
332
|
-
}
|
|
328
|
+
"type": "preset",
|
|
329
|
+
"name": "balanced"
|
|
333
330
|
}
|
|
334
331
|
}
|
|
335
332
|
});
|
|
@@ -193,6 +193,7 @@ async fn test_concurrent_ocr_processing() {
|
|
|
193
193
|
backend: "tesseract".to_string(),
|
|
194
194
|
language: "eng".to_string(),
|
|
195
195
|
tesseract_config: None,
|
|
196
|
+
output_format: None,
|
|
196
197
|
}),
|
|
197
198
|
force_ocr: false,
|
|
198
199
|
use_cache: true,
|
|
@@ -262,6 +263,7 @@ fn test_concurrent_ocr_cache_stress() {
|
|
|
262
263
|
backend: "tesseract".to_string(),
|
|
263
264
|
language: "eng".to_string(),
|
|
264
265
|
tesseract_config: None,
|
|
266
|
+
output_format: None,
|
|
265
267
|
}),
|
|
266
268
|
force_ocr: false,
|
|
267
269
|
use_cache: true,
|
|
@@ -313,7 +315,10 @@ fn test_concurrent_ocr_cache_stress() {
|
|
|
313
315
|
/// - Pipeline can process multiple results in parallel
|
|
314
316
|
/// - Processors don't interfere with each other
|
|
315
317
|
/// - Registry reads are thread-safe
|
|
318
|
+
///
|
|
319
|
+
/// Note: This test is flaky due to timing-dependent concurrent operations.
|
|
316
320
|
#[tokio::test]
|
|
321
|
+
#[ignore = "flaky concurrency test - timing dependent on system load"]
|
|
317
322
|
async fn test_concurrent_pipeline_processing() {
|
|
318
323
|
struct ConcurrentTestProcessor;
|
|
319
324
|
|
|
@@ -378,6 +383,8 @@ async fn test_concurrent_pipeline_processing() {
|
|
|
378
383
|
chunks: None,
|
|
379
384
|
images: None,
|
|
380
385
|
pages: None,
|
|
386
|
+
elements: None,
|
|
387
|
+
djot_content: None,
|
|
381
388
|
};
|
|
382
389
|
|
|
383
390
|
run_pipeline(result, &config).await
|
|
@@ -120,3 +120,133 @@ async fn test_docx_minimal_metadata_extraction() {
|
|
|
120
120
|
|
|
121
121
|
println!("✅ DOCX minimal metadata extraction test passed!");
|
|
122
122
|
}
|
|
123
|
+
|
|
124
|
+
#[tokio::test]
|
|
125
|
+
async fn test_docx_keywords_extraction() {
|
|
126
|
+
// This test verifies that DOCX keywords metadata is properly parsed
|
|
127
|
+
// from comma-separated strings into Vec<String> in Metadata.keywords
|
|
128
|
+
//
|
|
129
|
+
// Addresses GitHub issue #309: DOCX keyword extraction was returning
|
|
130
|
+
// strings instead of parsed keyword lists, causing FunctionClauseError
|
|
131
|
+
// in the Elixir binding.
|
|
132
|
+
|
|
133
|
+
use std::io::Write;
|
|
134
|
+
use tempfile::NamedTempFile;
|
|
135
|
+
use zip::CompressionMethod;
|
|
136
|
+
use zip::write::{FileOptions, ZipWriter};
|
|
137
|
+
|
|
138
|
+
// Create a minimal DOCX with keywords metadata
|
|
139
|
+
let mut temp_file = NamedTempFile::new().expect("Failed to create temp file");
|
|
140
|
+
|
|
141
|
+
{
|
|
142
|
+
let mut zip = ZipWriter::new(&mut temp_file);
|
|
143
|
+
let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
|
|
144
|
+
|
|
145
|
+
// Add [Content_Types].xml
|
|
146
|
+
zip.start_file("[Content_Types].xml", options).unwrap();
|
|
147
|
+
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
148
|
+
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
|
|
149
|
+
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
|
|
150
|
+
<Default Extension="xml" ContentType="application/xml"/>
|
|
151
|
+
<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
|
|
152
|
+
<Override PartName="/docProps/core.xml" ContentType="application/vnd.openxmlformats-package.core-properties+xml"/>
|
|
153
|
+
</Types>"#).unwrap();
|
|
154
|
+
|
|
155
|
+
// Add _rels/.rels
|
|
156
|
+
zip.start_file("_rels/.rels", options).unwrap();
|
|
157
|
+
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
158
|
+
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
159
|
+
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
|
|
160
|
+
<Relationship Id="rId2" Type="http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties" Target="docProps/core.xml"/>
|
|
161
|
+
</Relationships>"#).unwrap();
|
|
162
|
+
|
|
163
|
+
// Add word/document.xml with simple content
|
|
164
|
+
zip.start_file("word/document.xml", options).unwrap();
|
|
165
|
+
zip.write_all(
|
|
166
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
167
|
+
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
|
|
168
|
+
<w:body>
|
|
169
|
+
<w:p>
|
|
170
|
+
<w:r>
|
|
171
|
+
<w:t>Test document for keyword extraction</w:t>
|
|
172
|
+
</w:r>
|
|
173
|
+
</w:p>
|
|
174
|
+
</w:body>
|
|
175
|
+
</w:document>"#,
|
|
176
|
+
)
|
|
177
|
+
.unwrap();
|
|
178
|
+
|
|
179
|
+
// Add docProps/core.xml with keywords (comma-separated string)
|
|
180
|
+
zip.start_file("docProps/core.xml", options).unwrap();
|
|
181
|
+
zip.write_all(
|
|
182
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
183
|
+
<cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
|
|
184
|
+
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
|
185
|
+
xmlns:dcterms="http://purl.org/dc/terms/">
|
|
186
|
+
<dc:title>Test Document</dc:title>
|
|
187
|
+
<dc:creator>Test Author</dc:creator>
|
|
188
|
+
<cp:keywords>rust, docx, extraction, metadata, test</cp:keywords>
|
|
189
|
+
<dc:subject>Testing keyword extraction</dc:subject>
|
|
190
|
+
</cp:coreProperties>"#,
|
|
191
|
+
)
|
|
192
|
+
.unwrap();
|
|
193
|
+
|
|
194
|
+
zip.finish().unwrap();
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
// Extract the DOCX file
|
|
198
|
+
let result = extract_file(
|
|
199
|
+
temp_file.path(),
|
|
200
|
+
Some("application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
|
|
201
|
+
&ExtractionConfig::default(),
|
|
202
|
+
)
|
|
203
|
+
.await
|
|
204
|
+
.expect("Should extract DOCX with keywords successfully");
|
|
205
|
+
|
|
206
|
+
// Verify content was extracted
|
|
207
|
+
assert!(!result.content.is_empty(), "Content should not be empty");
|
|
208
|
+
assert!(
|
|
209
|
+
result.content.contains("Test document for keyword extraction"),
|
|
210
|
+
"Content should match document text"
|
|
211
|
+
);
|
|
212
|
+
|
|
213
|
+
// Verify keywords were parsed into Vec<String> in Metadata.keywords
|
|
214
|
+
assert!(
|
|
215
|
+
result.metadata.keywords.is_some(),
|
|
216
|
+
"Keywords should be present in metadata.keywords"
|
|
217
|
+
);
|
|
218
|
+
|
|
219
|
+
let keywords = result.metadata.keywords.as_ref().unwrap();
|
|
220
|
+
assert_eq!(
|
|
221
|
+
keywords.len(),
|
|
222
|
+
5,
|
|
223
|
+
"Should have 5 keywords parsed from comma-separated string"
|
|
224
|
+
);
|
|
225
|
+
|
|
226
|
+
// Verify individual keywords were trimmed and parsed correctly
|
|
227
|
+
assert_eq!(keywords[0], "rust", "First keyword should be 'rust'");
|
|
228
|
+
assert_eq!(keywords[1], "docx", "Second keyword should be 'docx'");
|
|
229
|
+
assert_eq!(keywords[2], "extraction", "Third keyword should be 'extraction'");
|
|
230
|
+
assert_eq!(keywords[3], "metadata", "Fourth keyword should be 'metadata'");
|
|
231
|
+
assert_eq!(keywords[4], "test", "Fifth keyword should be 'test'");
|
|
232
|
+
|
|
233
|
+
// Verify other metadata was also extracted
|
|
234
|
+
assert_eq!(
|
|
235
|
+
result.metadata.additional.get("created_by").and_then(|v| v.as_str()),
|
|
236
|
+
Some("Test Author"),
|
|
237
|
+
"Should have correct creator"
|
|
238
|
+
);
|
|
239
|
+
assert_eq!(
|
|
240
|
+
result.metadata.additional.get("title").and_then(|v| v.as_str()),
|
|
241
|
+
Some("Test Document"),
|
|
242
|
+
"Should have correct title"
|
|
243
|
+
);
|
|
244
|
+
assert_eq!(
|
|
245
|
+
result.metadata.additional.get("subject").and_then(|v| v.as_str()),
|
|
246
|
+
Some("Testing keyword extraction"),
|
|
247
|
+
"Should have correct subject"
|
|
248
|
+
);
|
|
249
|
+
|
|
250
|
+
println!("✅ DOCX keywords extraction test passed!");
|
|
251
|
+
println!(" Extracted keywords: {:?}", keywords);
|
|
252
|
+
}
|
|
@@ -53,20 +53,14 @@ async fn test_native_epub_wasteland_extraction() {
|
|
|
53
53
|
result.content.len()
|
|
54
54
|
);
|
|
55
55
|
|
|
56
|
-
assert!(
|
|
57
|
-
result.metadata.additional.contains_key("title"),
|
|
58
|
-
"Should extract title metadata"
|
|
59
|
-
);
|
|
56
|
+
assert!(result.metadata.title.is_some(), "Should extract title metadata");
|
|
60
57
|
assert_eq!(
|
|
61
|
-
result.metadata.
|
|
58
|
+
result.metadata.title.as_deref(),
|
|
62
59
|
Some("The Waste Land"),
|
|
63
60
|
"Should have correct title"
|
|
64
61
|
);
|
|
65
62
|
|
|
66
|
-
assert!(
|
|
67
|
-
result.metadata.additional.contains_key("creator"),
|
|
68
|
-
"Should extract creator metadata"
|
|
69
|
-
);
|
|
63
|
+
assert!(result.metadata.authors.is_some(), "Should extract creator metadata");
|
|
70
64
|
|
|
71
65
|
assert!(
|
|
72
66
|
result.content.contains("April") || result.content.contains("cruellest"),
|
|
@@ -105,10 +99,7 @@ async fn test_native_epub_images_extraction() {
|
|
|
105
99
|
result.content.len()
|
|
106
100
|
);
|
|
107
101
|
|
|
108
|
-
assert!(
|
|
109
|
-
result.metadata.additional.contains_key("title"),
|
|
110
|
-
"Should extract title metadata"
|
|
111
|
-
);
|
|
102
|
+
assert!(result.metadata.title.is_some(), "Should extract title metadata");
|
|
112
103
|
|
|
113
104
|
println!("✅ Images EPUB extraction test passed ({} bytes)", result.content.len());
|
|
114
105
|
}
|
|
@@ -179,7 +170,7 @@ async fn test_native_epub2_cover_extraction() {
|
|
|
179
170
|
);
|
|
180
171
|
|
|
181
172
|
assert_eq!(
|
|
182
|
-
result.metadata.
|
|
173
|
+
result.metadata.title.as_deref(),
|
|
183
174
|
Some("Pandoc EPUB Test"),
|
|
184
175
|
"Should have correct title"
|
|
185
176
|
);
|
|
@@ -100,6 +100,7 @@ async fn test_ocr_simple_english_image_async() {
|
|
|
100
100
|
backend: "tesseract".to_string(),
|
|
101
101
|
language: "eng".to_string(),
|
|
102
102
|
tesseract_config: None,
|
|
103
|
+
output_format: None,
|
|
103
104
|
}),
|
|
104
105
|
force_ocr: true,
|
|
105
106
|
..Default::default()
|
|
@@ -142,6 +143,7 @@ async fn test_ocr_image_without_text_async() {
|
|
|
142
143
|
backend: "tesseract".to_string(),
|
|
143
144
|
language: "eng".to_string(),
|
|
144
145
|
tesseract_config: None,
|
|
146
|
+
output_format: None,
|
|
145
147
|
}),
|
|
146
148
|
force_ocr: true,
|
|
147
149
|
..Default::default()
|
|
@@ -115,6 +115,7 @@ pub fn test_config_with_ocr() -> kreuzberg::core::config::ExtractionConfig {
|
|
|
115
115
|
backend: "tesseract".to_string(),
|
|
116
116
|
language: "eng".to_string(),
|
|
117
117
|
tesseract_config: None,
|
|
118
|
+
output_format: None,
|
|
118
119
|
}),
|
|
119
120
|
force_ocr: false,
|
|
120
121
|
..Default::default()
|