RubyGems - kreuzberg - Versions diffs - 4.0.8 → 4.1.0 - Mend

kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (308) hide show

checksums.yaml +4 -4
data/Gemfile.lock +2 -2
data/README.md +1 -1
data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
data/ext/kreuzberg_rb/native/src/result.rs +326 -0
data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
data/lib/kreuzberg/config.rb +66 -0
data/lib/kreuzberg/result.rb +107 -2
data/lib/kreuzberg/types.rb +104 -0
data/lib/kreuzberg/version.rb +1 -1
data/lib/kreuzberg.rb +0 -4
data/sig/kreuzberg.rbs +105 -1
data/vendor/Cargo.toml +3 -3
data/vendor/kreuzberg/Cargo.toml +4 -3
data/vendor/kreuzberg/README.md +1 -1
data/vendor/kreuzberg/src/api/config.rs +69 -0
data/vendor/kreuzberg/src/api/handlers.rs +99 -2
data/vendor/kreuzberg/src/api/mod.rs +14 -7
data/vendor/kreuzberg/src/api/router.rs +214 -0
data/vendor/kreuzberg/src/api/startup.rs +243 -0
data/vendor/kreuzberg/src/api/types.rs +78 -0
data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
data/vendor/kreuzberg/src/cache/core.rs +428 -0
data/vendor/kreuzberg/src/cache/mod.rs +21 -843
data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
data/vendor/kreuzberg/src/chunking/config.rs +52 -0
data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
data/vendor/kreuzberg/src/core/config/page.rs +57 -0
data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
data/vendor/kreuzberg/src/core/mod.rs +4 -1
data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
data/vendor/kreuzberg/src/embeddings.rs +136 -13
data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
data/vendor/kreuzberg/src/extractors/email.rs +2 -0
data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
data/vendor/kreuzberg/src/extractors/html.rs +80 -8
data/vendor/kreuzberg/src/extractors/image.rs +8 -1
data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
data/vendor/kreuzberg/src/extractors/text.rs +4 -0
data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
data/vendor/kreuzberg/src/lib.rs +2 -2
data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
data/vendor/kreuzberg/src/mcp/format.rs +211 -0
data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
data/vendor/kreuzberg/src/mcp/params.rs +196 -0
data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
data/vendor/kreuzberg/src/text/quality.rs +1 -1
data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
data/vendor/kreuzberg/src/types/djot.rs +209 -0
data/vendor/kreuzberg/src/types/extraction.rs +301 -0
data/vendor/kreuzberg/src/types/formats.rs +443 -0
data/vendor/kreuzberg/src/types/metadata.rs +560 -0
data/vendor/kreuzberg/src/types/mod.rs +281 -0
data/vendor/kreuzberg/src/types/page.rs +182 -0
data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
data/vendor/kreuzberg/src/types/tables.rs +39 -0
data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
data/vendor/kreuzberg/tests/api_embed.rs +6 -9
data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
data/vendor/kreuzberg/tests/core_integration.rs +1 -0
data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
data/vendor/kreuzberg/tests/format_integration.rs +2 -0
data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
data/vendor/kreuzberg-ffi/src/error.rs +46 -14
data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
data/vendor/kreuzberg-ffi/src/result.rs +148 -122
data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
metadata +200 -28
data/vendor/kreuzberg/src/api/server.rs +0 -518
data/vendor/kreuzberg/src/core/config.rs +0 -1914
data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
data/vendor/kreuzberg/src/types.rs +0 -1713
data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
data/vendor/kreuzberg-ffi/src/config.rs +0 -1341

data/vendor/kreuzberg/tests/api_chunk.rs ADDED Viewed

@@ -0,0 +1,313 @@
+//! Integration tests for the /chunk API endpoint.
+#![cfg(feature = "api")]
+use axum::{
+    body::Body,
+    http::{Request, StatusCode},
+};
+use serde_json::json;
+use tower::ServiceExt;
+use kreuzberg::{ExtractionConfig, api::create_router};
+#[tokio::test]
+async fn test_chunk_basic() {
+    let app = create_router(ExtractionConfig::default());
+    let response = app
+        .oneshot(
+            Request::builder()
+                .uri("/chunk")
+                .method("POST")
+                .header("content-type", "application/json")
+                .body(Body::from(
+                    json!({
+                        "text": "Short text. More text here. Even more content to chunk."
+                    })
+                    .to_string(),
+                ))
+                .unwrap(),
+        )
+        .await
+        .unwrap();
+    assert_eq!(response.status(), StatusCode::OK);
+}
+#[tokio::test]
+async fn test_chunk_empty_text_returns_400() {
+    let app = create_router(ExtractionConfig::default());
+    let response = app
+        .oneshot(
+            Request::builder()
+                .uri("/chunk")
+                .method("POST")
+                .header("content-type", "application/json")
+                .body(Body::from(json!({"text": ""}).to_string()))
+                .unwrap(),
+        )
+        .await
+        .unwrap();
+    assert_eq!(response.status(), StatusCode::BAD_REQUEST);
+}
+#[tokio::test]
+async fn test_chunk_markdown_strategy() {
+    let app = create_router(ExtractionConfig::default());
+    let response = app
+        .oneshot(
+            Request::builder()
+                .uri("/chunk")
+                .method("POST")
+                .header("content-type", "application/json")
+                .body(Body::from(
+                    json!({
+                        "text": "# Heading\n\nParagraph text here.",
+                        "chunker_type": "markdown"
+                    })
+                    .to_string(),
+                ))
+                .unwrap(),
+        )
+        .await
+        .unwrap();
+    assert_eq!(response.status(), StatusCode::OK);
+}
+#[tokio::test]
+async fn test_chunk_response_structure() {
+    use kreuzberg::api::ChunkResponse;
+    let app = create_router(ExtractionConfig::default());
+    let response = app
+        .oneshot(
+            Request::builder()
+                .uri("/chunk")
+                .method("POST")
+                .header("content-type", "application/json")
+                .body(Body::from(
+                    json!({
+                        "text": "This is a test. Another sentence here. And one more sentence to ensure we get chunks.",
+                        "config": {
+                            "max_characters": 50,
+                            "overlap": 10,
+                            "trim": true
+                        },
+                        "chunker_type": "text"
+                    })
+                    .to_string(),
+                ))
+                .unwrap(),
+        )
+        .await
+        .unwrap();
+    assert_eq!(response.status(), StatusCode::OK);
+    let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
+    let chunk_response: ChunkResponse = serde_json::from_slice(&body).unwrap();
+    // Verify response structure
+    assert!(chunk_response.chunk_count > 0);
+    assert_eq!(chunk_response.chunks.len(), chunk_response.chunk_count);
+    assert_eq!(chunk_response.chunker_type, "text");
+    assert_eq!(chunk_response.config.max_characters, 50);
+    assert_eq!(chunk_response.config.overlap, 10);
+    assert!(chunk_response.config.trim);
+    assert!(chunk_response.input_size_bytes > 0);
+    // Verify chunk metadata
+    for (idx, chunk) in chunk_response.chunks.iter().enumerate() {
+        assert!(!chunk.content.is_empty());
+        assert_eq!(chunk.chunk_index, idx);
+        assert_eq!(chunk.total_chunks, chunk_response.chunk_count);
+        assert!(chunk.byte_end > chunk.byte_start);
+    }
+}
+#[tokio::test]
+async fn test_chunk_invalid_strategy_returns_400() {
+    let app = create_router(ExtractionConfig::default());
+    let response = app
+        .oneshot(
+            Request::builder()
+                .uri("/chunk")
+                .method("POST")
+                .header("content-type", "application/json")
+                .body(Body::from(
+                    json!({
+                        "text": "Test text",
+                        "chunker_type": "invalid_type"
+                    })
+                    .to_string(),
+                ))
+                .unwrap(),
+        )
+        .await
+        .unwrap();
+    assert_eq!(response.status(), StatusCode::BAD_REQUEST);
+}
+#[tokio::test]
+async fn test_chunk_with_defaults() {
+    use kreuzberg::api::ChunkResponse;
+    let app = create_router(ExtractionConfig::default());
+    let response = app
+        .oneshot(
+            Request::builder()
+                .uri("/chunk")
+                .method("POST")
+                .header("content-type", "application/json")
+                .body(Body::from(
+                    json!({
+                        "text": "This is a test sentence. Another sentence here."
+                    })
+                    .to_string(),
+                ))
+                .unwrap(),
+        )
+        .await
+        .unwrap();
+    assert_eq!(response.status(), StatusCode::OK);
+    let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
+    let chunk_response: ChunkResponse = serde_json::from_slice(&body).unwrap();
+    // Verify defaults are applied
+    assert_eq!(chunk_response.config.max_characters, 2000);
+    assert_eq!(chunk_response.config.overlap, 100);
+    assert!(chunk_response.config.trim);
+    assert_eq!(chunk_response.chunker_type, "text");
+}
+#[tokio::test]
+async fn test_chunk_malformed_json_returns_400() {
+    let app = create_router(ExtractionConfig::default());
+    let response = app
+        .oneshot(
+            Request::builder()
+                .uri("/chunk")
+                .method("POST")
+                .header("content-type", "application/json")
+                .body(Body::from("{invalid json}"))
+                .unwrap(),
+        )
+        .await
+        .unwrap();
+    assert_eq!(response.status(), StatusCode::BAD_REQUEST);
+}
+#[tokio::test]
+async fn test_chunk_case_insensitive_chunker_type() {
+    use kreuzberg::api::ChunkResponse;
+    let app = create_router(ExtractionConfig::default());
+    let response = app
+        .oneshot(
+            Request::builder()
+                .uri("/chunk")
+                .method("POST")
+                .header("content-type", "application/json")
+                .body(Body::from(
+                    json!({
+                        "text": "# Title\n\nContent here.",
+                        "chunker_type": "MARKDOWN"
+                    })
+                    .to_string(),
+                ))
+                .unwrap(),
+        )
+        .await
+        .unwrap();
+    assert_eq!(response.status(), StatusCode::OK);
+    let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
+    let chunk_response: ChunkResponse = serde_json::from_slice(&body).unwrap();
+    // Verify it's normalized to lowercase
+    assert_eq!(chunk_response.chunker_type, "markdown");
+}
+#[tokio::test]
+async fn test_chunk_long_text() {
+    use kreuzberg::api::ChunkResponse;
+    let app = create_router(ExtractionConfig::default());
+    let long_text = "Lorem ipsum dolor sit amet. ".repeat(200);
+    let response = app
+        .oneshot(
+            Request::builder()
+                .uri("/chunk")
+                .method("POST")
+                .header("content-type", "application/json")
+                .body(Body::from(
+                    json!({
+                        "text": long_text,
+                        "config": {
+                            "max_characters": 500,
+                            "overlap": 50
+                        }
+                    })
+                    .to_string(),
+                ))
+                .unwrap(),
+        )
+        .await
+        .unwrap();
+    assert_eq!(response.status(), StatusCode::OK);
+    let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
+    let chunk_response: ChunkResponse = serde_json::from_slice(&body).unwrap();
+    // Should have multiple chunks
+    assert!(chunk_response.chunk_count > 1);
+    assert_eq!(chunk_response.chunks.len(), chunk_response.chunk_count);
+}
+#[tokio::test]
+async fn test_chunk_custom_config() {
+    use kreuzberg::api::ChunkResponse;
+    let app = create_router(ExtractionConfig::default());
+    let response = app
+        .oneshot(
+            Request::builder()
+                .uri("/chunk")
+                .method("POST")
+                .header("content-type", "application/json")
+                .body(Body::from(
+                    json!({
+                        "text": "Test sentence one. Test sentence two. Test sentence three.",
+                        "config": {
+                            "max_characters": 30,
+                            "overlap": 5,
+                            "trim": false
+                        },
+                        "chunker_type": "text"
+                    })
+                    .to_string(),
+                ))
+                .unwrap(),
+        )
+        .await
+        .unwrap();
+    assert_eq!(response.status(), StatusCode::OK);
+    let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
+    let chunk_response: ChunkResponse = serde_json::from_slice(&body).unwrap();
+    // Verify custom config was applied
+    assert_eq!(chunk_response.config.max_characters, 30);
+    assert_eq!(chunk_response.config.overlap, 5);
+    assert!(!chunk_response.config.trim);
+}

data/vendor/kreuzberg/tests/api_embed.rs CHANGED Viewed

@@ -84,9 +84,8 @@ async fn test_embed_with_custom_config() {
         "texts": ["Test embedding with custom config"],
         "config": {
             "model": {
-                "preset": {
-                    "name": "fast"
-                }
+                "type": "preset",
+                "name": "fast"
             },
             "batch_size": 32
         }
@@ -295,9 +294,8 @@ async fn test_embed_different_presets() {
         "texts": ["Test text"],
         "config": {
             "model": {
-                "preset": {
-                    "name": "fast"
-                }
+                "type": "preset",
+                "name": "fast"
             }
         }
     });
@@ -327,9 +325,8 @@ async fn test_embed_different_presets() {
         "texts": ["Test text"],
         "config": {
             "model": {
-                "preset": {
-                    "name": "balanced"
-                }
+                "type": "preset",
+                "name": "balanced"
             }
         }
     });

data/vendor/kreuzberg/tests/batch_orchestration.rs CHANGED Viewed

@@ -257,6 +257,7 @@ fn test_ocr_multipage_efficiency() {
             backend: "tesseract".to_string(),
             language: "eng".to_string(),
             tesseract_config: None,
+            output_format: None,
         }),
         force_ocr: false,
         use_cache: true,

data/vendor/kreuzberg/tests/concurrency_stress.rs CHANGED Viewed

@@ -193,6 +193,7 @@ async fn test_concurrent_ocr_processing() {
             backend: "tesseract".to_string(),
             language: "eng".to_string(),
             tesseract_config: None,
+            output_format: None,
         }),
         force_ocr: false,
         use_cache: true,
@@ -262,6 +263,7 @@ fn test_concurrent_ocr_cache_stress() {
             backend: "tesseract".to_string(),
             language: "eng".to_string(),
             tesseract_config: None,
+            output_format: None,
         }),
         force_ocr: false,
         use_cache: true,
@@ -313,7 +315,10 @@ fn test_concurrent_ocr_cache_stress() {
 /// - Pipeline can process multiple results in parallel
 /// - Processors don't interfere with each other
 /// - Registry reads are thread-safe
+///
+/// Note: This test is flaky due to timing-dependent concurrent operations.
 #[tokio::test]
+#[ignore = "flaky concurrency test - timing dependent on system load"]
 async fn test_concurrent_pipeline_processing() {
     struct ConcurrentTestProcessor;
@@ -378,6 +383,8 @@ async fn test_concurrent_pipeline_processing() {
                 chunks: None,
                 images: None,
                 pages: None,
+                elements: None,
+                djot_content: None,
             };
             run_pipeline(result, &config).await

data/vendor/kreuzberg/tests/core_integration.rs CHANGED Viewed

@@ -459,6 +459,7 @@ async fn test_extraction_with_ocr_config() {
             tesseract_config: None,
             backend: "tesseract".to_string(),
             language: "eng".to_string(),
+            output_format: None,
         }),
         force_ocr: true,
         ..Default::default()

data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs CHANGED Viewed

@@ -120,3 +120,133 @@ async fn test_docx_minimal_metadata_extraction() {
     println!("✅ DOCX minimal metadata extraction test passed!");
 }
+#[tokio::test]
+async fn test_docx_keywords_extraction() {
+    // Regression test: the comma-separated <cp:keywords> string in a DOCX must be
+    // parsed into Vec<String> in Metadata.keywords, one trimmed entry per keyword.
+    //
+    // Addresses GitHub issue #309: DOCX keyword extraction was returning
+    // strings instead of parsed keyword lists, causing FunctionClauseError
+    // in the Elixir binding.
+    use std::io::Write;
+    use tempfile::NamedTempFile;
+    use zip::CompressionMethod;
+    use zip::write::{FileOptions, ZipWriter};
+    // Build a minimal DOCX (a ZIP archive with stored, uncompressed entries) in a temp file
+    let mut temp_file = NamedTempFile::new().expect("Failed to create temp file");
+    {
+        let mut zip = ZipWriter::new(&mut temp_file);
+        let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
+        // Add [Content_Types].xml declaring the main document and core-properties parts
+        zip.start_file("[Content_Types].xml", options).unwrap();
+        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
+<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
+  <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
+  <Default Extension="xml" ContentType="application/xml"/>
+  <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
+  <Override PartName="/docProps/core.xml" ContentType="application/vnd.openxmlformats-package.core-properties+xml"/>
+</Types>"#).unwrap();
+        // Add _rels/.rels pointing at word/document.xml and docProps/core.xml
+        zip.start_file("_rels/.rels", options).unwrap();
+        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
+<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
+  <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
+  <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties" Target="docProps/core.xml"/>
+</Relationships>"#).unwrap();
+        // Add word/document.xml containing a single paragraph of known text
+        zip.start_file("word/document.xml", options).unwrap();
+        zip.write_all(
+            br#"<?xml version="1.0" encoding="UTF-8"?>
+<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
+  <w:body>
+    <w:p>
+      <w:r>
+        <w:t>Test document for keyword extraction</w:t>
+      </w:r>
+    </w:p>
+  </w:body>
+</w:document>"#,
+        )
+        .unwrap();
+        // Add docProps/core.xml with title, creator, subject and five comma-separated keywords
+        zip.start_file("docProps/core.xml", options).unwrap();
+        zip.write_all(
+            br#"<?xml version="1.0" encoding="UTF-8"?>
+<cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
+                   xmlns:dc="http://purl.org/dc/elements/1.1/"
+                   xmlns:dcterms="http://purl.org/dc/terms/">
+  <dc:title>Test Document</dc:title>
+  <dc:creator>Test Author</dc:creator>
+  <cp:keywords>rust, docx, extraction, metadata, test</cp:keywords>
+  <dc:subject>Testing keyword extraction</dc:subject>
+</cp:coreProperties>"#,
+        )
+        .unwrap();
+        zip.finish().unwrap();
+    }
+    // Extract the DOCX file, passing the DOCX MIME type explicitly and the default config
+    let result = extract_file(
+        temp_file.path(),
+        Some("application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
+        &ExtractionConfig::default(),
+    )
+    .await
+    .expect("Should extract DOCX with keywords successfully");
+    // Verify the paragraph text written to word/document.xml was extracted
+    assert!(!result.content.is_empty(), "Content should not be empty");
+    assert!(
+        result.content.contains("Test document for keyword extraction"),
+        "Content should match document text"
+    );
+    // Verify keywords were parsed into Vec<String> in Metadata.keywords
+    assert!(
+        result.metadata.keywords.is_some(),
+        "Keywords should be present in metadata.keywords"
+    );
+    let keywords = result.metadata.keywords.as_ref().unwrap();
+    assert_eq!(
+        keywords.len(),
+        5,
+        "Should have 5 keywords parsed from comma-separated string"
+    );
+    // Verify individual keywords were trimmed and parsed correctly, in source order
+    assert_eq!(keywords[0], "rust", "First keyword should be 'rust'");
+    assert_eq!(keywords[1], "docx", "Second keyword should be 'docx'");
+    assert_eq!(keywords[2], "extraction", "Third keyword should be 'extraction'");
+    assert_eq!(keywords[3], "metadata", "Fourth keyword should be 'metadata'");
+    assert_eq!(keywords[4], "test", "Fifth keyword should be 'test'");
+    // Verify other core properties are in metadata.additional (dc:creator under "created_by")
+    assert_eq!(
+        result.metadata.additional.get("created_by").and_then(|v| v.as_str()),
+        Some("Test Author"),
+        "Should have correct creator"
+    );
+    assert_eq!(
+        result.metadata.additional.get("title").and_then(|v| v.as_str()),
+        Some("Test Document"),
+        "Should have correct title"
+    );
+    assert_eq!(
+        result.metadata.additional.get("subject").and_then(|v| v.as_str()),
+        Some("Testing keyword extraction"),
+        "Should have correct subject"
+    );
+    println!("✅ DOCX keywords extraction test passed!");
+    println!("   Extracted keywords: {:?}", keywords);
+}

data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs CHANGED Viewed

@@ -53,20 +53,14 @@ async fn test_native_epub_wasteland_extraction() {
         result.content.len()
     );
-    assert!(
-        result.metadata.additional.contains_key("title"),
-        "Should extract title metadata"
-    );
+    assert!(result.metadata.title.is_some(), "Should extract title metadata");
     assert_eq!(
-        result.metadata.additional.get("title").and_then(|v| v.as_str()),
+        result.metadata.title.as_deref(),
         Some("The Waste Land"),
         "Should have correct title"
     );
-    assert!(
-        result.metadata.additional.contains_key("creator"),
-        "Should extract creator metadata"
-    );
+    assert!(result.metadata.authors.is_some(), "Should extract creator metadata");
     assert!(
         result.content.contains("April") || result.content.contains("cruellest"),
@@ -105,10 +99,7 @@ async fn test_native_epub_images_extraction() {
         result.content.len()
     );
-    assert!(
-        result.metadata.additional.contains_key("title"),
-        "Should extract title metadata"
-    );
+    assert!(result.metadata.title.is_some(), "Should extract title metadata");
     println!("✅ Images EPUB extraction test passed ({} bytes)", result.content.len());
 }
@@ -179,7 +170,7 @@ async fn test_native_epub2_cover_extraction() {
     );
     assert_eq!(
-        result.metadata.additional.get("title").and_then(|v| v.as_str()),
+        result.metadata.title.as_deref(),
         Some("Pandoc EPUB Test"),
         "Should have correct title"
     );

data/vendor/kreuzberg/tests/format_integration.rs CHANGED Viewed

@@ -100,6 +100,7 @@ async fn test_ocr_simple_english_image_async() {
             backend: "tesseract".to_string(),
             language: "eng".to_string(),
             tesseract_config: None,
+            output_format: None,
         }),
         force_ocr: true,
         ..Default::default()
@@ -142,6 +143,7 @@ async fn test_ocr_image_without_text_async() {
             backend: "tesseract".to_string(),
             language: "eng".to_string(),
             tesseract_config: None,
+            output_format: None,
         }),
         force_ocr: true,
         ..Default::default()

data/vendor/kreuzberg/tests/helpers/mod.rs CHANGED Viewed

@@ -115,6 +115,7 @@ pub fn test_config_with_ocr() -> kreuzberg::core::config::ExtractionConfig {
             backend: "tesseract".to_string(),
             language: "eng".to_string(),
             tesseract_config: None,
+            output_format: None,
         }),
         force_ocr: false,
         ..Default::default()