kreuzberg 4.2.6 → 4.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +7 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +36 -9
- data/ext/kreuzberg_rb/native/Cargo.toml +32 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +4 -2
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -1
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +1 -1
- data/ext/kreuzberg_rb/native/src/result.rs +5 -3
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +228 -37
- data/spec/binding/batch_operations_spec.rb +2 -0
- data/vendor/Cargo.toml +3 -2
- data/vendor/kreuzberg/Cargo.toml +2 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/error.rs +29 -1
- data/vendor/kreuzberg/src/api/handlers.rs +28 -25
- data/vendor/kreuzberg/src/api/openapi.rs +14 -1
- data/vendor/kreuzberg/src/chunking/config.rs +2 -37
- data/vendor/kreuzberg/src/chunking/core.rs +78 -2
- data/vendor/kreuzberg/src/chunking/mod.rs +1 -1
- data/vendor/kreuzberg/src/chunking/processor.rs +15 -17
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +13 -9
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +12 -12
- data/vendor/kreuzberg/src/core/config/mod.rs +1 -1
- data/vendor/kreuzberg/src/core/config/processing.rs +65 -8
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +8 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +5 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +9 -9
- data/vendor/kreuzberg/src/core/extractor/file.rs +4 -2
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +7 -7
- data/vendor/kreuzberg/src/core/extractor/sync.rs +3 -3
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +2 -1
- data/vendor/kreuzberg/src/core/pipeline/features.rs +16 -22
- data/vendor/kreuzberg/src/core/pipeline/format.rs +20 -18
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +40 -35
- data/vendor/kreuzberg/src/extraction/email.rs +31 -19
- data/vendor/kreuzberg/src/extraction/excel.rs +6 -5
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +6 -1
- data/vendor/kreuzberg/src/extraction/html/types.rs +4 -3
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +10 -9
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +10 -8
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +8 -4
- data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
- data/vendor/kreuzberg/src/extraction/transform/content.rs +1 -1
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -7
- data/vendor/kreuzberg/src/extractors/archive.rs +7 -5
- data/vendor/kreuzberg/src/extractors/bibtex.rs +34 -17
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +7 -10
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +4 -2
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +3 -2
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +1 -1
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +2 -4
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +1 -1
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +4 -5
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +1 -1
- data/vendor/kreuzberg/src/extractors/docbook.rs +1 -1
- data/vendor/kreuzberg/src/extractors/docx.rs +32 -24
- data/vendor/kreuzberg/src/extractors/email.rs +5 -3
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +10 -10
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +7 -3
- data/vendor/kreuzberg/src/extractors/excel.rs +8 -6
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -1
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +15 -10
- data/vendor/kreuzberg/src/extractors/html.rs +1 -1
- data/vendor/kreuzberg/src/extractors/image.rs +3 -3
- data/vendor/kreuzberg/src/extractors/jats/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/jupyter.rs +11 -9
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +4 -3
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/markdown.rs +6 -4
- data/vendor/kreuzberg/src/extractors/odt.rs +38 -21
- data/vendor/kreuzberg/src/extractors/opml/core.rs +1 -1
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +13 -9
- data/vendor/kreuzberg/src/extractors/orgmode.rs +11 -9
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +10 -3
- data/vendor/kreuzberg/src/extractors/pptx.rs +13 -11
- data/vendor/kreuzberg/src/extractors/rst.rs +15 -13
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +22 -21
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/structured.rs +10 -5
- data/vendor/kreuzberg/src/extractors/text.rs +2 -2
- data/vendor/kreuzberg/src/extractors/typst.rs +11 -5
- data/vendor/kreuzberg/src/extractors/xml.rs +1 -1
- data/vendor/kreuzberg/src/keywords/processor.rs +9 -8
- data/vendor/kreuzberg/src/language_detection/processor.rs +6 -5
- data/vendor/kreuzberg/src/lib.rs +1 -1
- data/vendor/kreuzberg/src/mcp/errors.rs +7 -6
- data/vendor/kreuzberg/src/mcp/format.rs +5 -4
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +3 -2
- data/vendor/kreuzberg/src/ocr/hocr.rs +4 -2
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +128 -14
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +129 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +36 -6
- data/vendor/kreuzberg/src/ocr/types.rs +3 -4
- data/vendor/kreuzberg/src/ocr/validation.rs +14 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +1 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +3 -2
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +5 -4
- data/vendor/kreuzberg/src/plugins/ocr.rs +5 -4
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +13 -12
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +3 -2
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +3 -2
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +15 -14
- data/vendor/kreuzberg/src/text/quality.rs +13 -13
- data/vendor/kreuzberg/src/text/quality_processor.rs +7 -6
- data/vendor/kreuzberg/src/types/djot.rs +15 -4
- data/vendor/kreuzberg/src/types/extraction.rs +24 -4
- data/vendor/kreuzberg/src/types/formats.rs +9 -5
- data/vendor/kreuzberg/src/types/metadata.rs +68 -7
- data/vendor/kreuzberg/src/types/mod.rs +7 -5
- data/vendor/kreuzberg/src/types/page.rs +9 -0
- data/vendor/kreuzberg/src/types/tables.rs +2 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +2 -1
- data/vendor/kreuzberg/tests/config_behavioral.rs +12 -16
- data/vendor/kreuzberg/tests/config_features.rs +19 -11
- data/vendor/kreuzberg/tests/config_loading_tests.rs +9 -9
- data/vendor/kreuzberg/tests/contract_mcp.rs +2 -2
- data/vendor/kreuzberg/tests/core_integration.rs +5 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/pipeline_integration.rs +36 -32
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +19 -13
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +3 -2
- data/vendor/kreuzberg/tests/plugin_system.rs +7 -6
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +1 -1
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -1
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +3 -2
- data/vendor/kreuzberg-ffi/kreuzberg.h +32 -0
- data/vendor/kreuzberg-ffi/src/error.rs +56 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +6 -5
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +1 -1
- data/vendor/kreuzberg-ffi/src/result.rs +2 -1
- data/vendor/kreuzberg-ffi/src/result_view.rs +3 -2
- data/vendor/kreuzberg-ffi/src/string_intern.rs +3 -3
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +2 -2
@@ -4,10 +4,12 @@
 //! and recursive processing of `<outline>` elements in the `<body>` section.

 use crate::Result;
-use
+use ahash::AHashMap;
+use std::borrow::Cow;

 #[cfg(feature = "office")]
 use roxmltree::Node;
+use serde_json;

 /// Extract OPML content and metadata from raw bytes.
 ///

@@ -20,7 +22,9 @@ use roxmltree::Node;
 /// - Extracted content as a String (outline hierarchy with indentation)
 /// - Metadata HashMap with key-value pairs from the head section
 #[cfg(feature = "office")]
-pub(crate) fn extract_content_and_metadata(
+pub(crate) fn extract_content_and_metadata(
+    content: &[u8],
+) -> Result<(String, AHashMap<Cow<'static, str>, serde_json::Value>)> {
     let doc = roxmltree::Document::parse(
         std::str::from_utf8(content)
             .map_err(|e| crate::KreuzbergError::Other(format!("Invalid UTF-8 in OPML: {}", e)))?,

@@ -28,7 +32,7 @@ pub(crate) fn extract_content_and_metadata(content: &[u8]) -> Result<(String, Ha
         .map_err(|e| crate::KreuzbergError::Other(format!("Failed to parse OPML: {}", e)))?;

     let mut extracted_content = String::new();
-    let mut metadata =
+    let mut metadata = AHashMap::new();

     if let Some(opml) = doc.root().children().find(|n| n.tag_name().name() == "opml") {
         if let Some(head) = opml.children().find(|n| n.tag_name().name() == "head") {

@@ -60,7 +64,7 @@ pub(crate) fn extract_content_and_metadata(content: &[u8]) -> Result<(String, Ha
 /// - ownerName: Document owner's name
 /// - ownerEmail: Document owner's email
 #[cfg(feature = "office")]
-fn extract_metadata_from_head(head: Node, metadata: &mut
+fn extract_metadata_from_head(head: Node, metadata: &mut AHashMap<Cow<'static, str>, serde_json::Value>) {
     for child in head.children().filter(|n| n.is_element()) {
         let tag = child.tag_name().name();
         let text = child.text().unwrap_or("").trim();

@@ -71,19 +75,19 @@ fn extract_metadata_from_head(head: Node, metadata: &mut HashMap<String, serde_j

         match tag {
             "title" => {
-                metadata.insert("title"
+                metadata.insert(Cow::Borrowed("title"), serde_json::json!(text));
             }
             "dateCreated" => {
-                metadata.insert("dateCreated"
+                metadata.insert(Cow::Borrowed("dateCreated"), serde_json::json!(text));
             }
             "dateModified" => {
-                metadata.insert("dateModified"
+                metadata.insert(Cow::Borrowed("dateModified"), serde_json::json!(text));
             }
             "ownerName" => {
-                metadata.insert("ownerName"
+                metadata.insert(Cow::Borrowed("ownerName"), serde_json::json!(text));
             }
             "ownerEmail" => {
-                metadata.insert("ownerEmail"
+                metadata.insert(Cow::Borrowed("ownerEmail"), serde_json::json!(text));
             }
             _ => {}
         }
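
The hunks above, and most of the extractor changes below, share one theme: metadata maps move from HashMap<String, serde_json::Value> to AHashMap<Cow<'static, str>, serde_json::Value>, so the common case of a static key name no longer allocates a String per insert. A minimal sketch of the pattern, assuming only the public ahash and serde_json crates; the key and value literals here are illustrative, not taken from the package:

use ahash::AHashMap;
use serde_json::Value;
use std::borrow::Cow;

fn main() {
    let mut metadata: AHashMap<Cow<'static, str>, Value> = AHashMap::new();

    // Static keys borrow a 'static literal: no per-insert allocation.
    metadata.insert(Cow::Borrowed("title"), serde_json::json!("My Outline"));

    // Keys built at runtime still allocate, wrapped in Cow::Owned.
    let directive = "author";
    metadata.insert(
        Cow::Owned(format!("directive_{}", directive)),
        serde_json::json!("Jane Doe"),
    );

    // Lookups accept a plain &str because Cow<str> implements Borrow<str>.
    assert!(metadata.contains_key("title"));
    assert_eq!(metadata.len(), 2);
}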
@@ -23,9 +23,11 @@ use crate::plugins::{DocumentExtractor, Plugin};
 #[cfg(feature = "office")]
 use crate::types::{ExtractionResult, Metadata, Table};
 #[cfg(feature = "office")]
+use ahash::AHashMap;
+#[cfg(feature = "office")]
 use async_trait::async_trait;
 #[cfg(feature = "office")]
-use std::
+use std::borrow::Cow;

 #[cfg(feature = "office")]
 use org::Org;

@@ -57,33 +59,33 @@ impl OrgModeExtractor {
     /// Also extracts document structure and content in parallel.
     fn extract_metadata_and_content(org_text: &str, org: &Org) -> (Metadata, String) {
         let mut metadata = Metadata::default();
-        let mut additional =
+        let mut additional: AHashMap<Cow<'static, str>, serde_json::Value> = Default::default();

         for line in org_text.lines().take(100) {
             let trimmed = line.trim();

             if let Some(rest) = trimmed.strip_prefix("#+TITLE:") {
                 let value = rest.trim().to_string();
-                additional.insert("title"
+                additional.insert(Cow::Borrowed("title"), serde_json::json!(value));
             } else if let Some(rest) = trimmed.strip_prefix("#+AUTHOR:") {
                 let value = rest.trim().to_string();
-                additional.insert("author"
-                additional.insert("authors"
+                additional.insert(Cow::Borrowed("author"), serde_json::json!(&value));
+                additional.insert(Cow::Borrowed("authors"), serde_json::json!(vec![value]));
             } else if let Some(rest) = trimmed.strip_prefix("#+DATE:") {
                 let value = rest.trim().to_string();
                 metadata.created_at = Some(value.clone());
-                additional.insert("date"
+                additional.insert(Cow::Borrowed("date"), serde_json::json!(value));
             } else if let Some(rest) = trimmed.strip_prefix("#+KEYWORDS:") {
                 let value = rest.trim();
                 let keywords: Vec<&str> = value.split(',').map(|s| s.trim()).collect();
-                additional.insert("keywords"
+                additional.insert(Cow::Borrowed("keywords"), serde_json::json!(keywords));
             } else if let Some(rest) = trimmed.strip_prefix("#+")
                 && let Some((key, val)) = rest.split_once(':')
             {
                 let key_lower = key.trim().to_lowercase();
                 let value = val.trim();
                 if !key_lower.is_empty() && !value.is_empty() {
-                    additional.insert(format!("directive_{}", key_lower), serde_json::json!(value));
+                    additional.insert(Cow::Owned(format!("directive_{}", key_lower)), serde_json::json!(value));
                 }
             }
         }

@@ -298,7 +300,7 @@ impl DocumentExtractor for OrgModeExtractor {

         Ok(ExtractionResult {
             content: extracted_content,
-            mime_type: mime_type.to_string(),
+            mime_type: mime_type.to_string().into(),
             metadata,
             tables,
             detected_languages: None,
@@ -7,6 +7,8 @@ mod extraction;
 mod ocr;
 mod pages;

+use bytes::Bytes;
+
 use crate::Result;
 use crate::core::config::ExtractionConfig;
 use crate::plugins::{DocumentExtractor, Plugin};

@@ -236,9 +238,14 @@ impl DocumentExtractor for PdfExtractor {
             .into_iter()
             .enumerate()
             .map(|(idx, img)| {
-                let format = img
+                let format = img
+                    .filters
+                    .first()
+                    .cloned()
+                    .map(std::borrow::Cow::Owned)
+                    .unwrap_or(std::borrow::Cow::Borrowed("unknown"));
                 crate::types::ExtractedImage {
-                    data: img.data,
+                    data: Bytes::from(img.data),
                     format,
                     image_index: idx,
                     page_number: Some(img.page_number),

@@ -265,7 +272,7 @@ impl DocumentExtractor for PdfExtractor {

         Ok(ExtractionResult {
             content: text,
-            mime_type: mime_type.to_string(),
+            mime_type: mime_type.to_string().into(),
             metadata: Metadata {
                 #[cfg(feature = "pdf")]
                 title: pdf_metadata.title.clone(),
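
Besides the metadata key change, the PDF extractor now stores image bytes as bytes::Bytes and derives the format label from the first PDF filter, falling back to a borrowed "unknown". A small sketch of those two conversions, assuming only the bytes crate; RawImage is a hypothetical stand-in for the backend's image record, mirroring the data/filters fields used in the hunk above:

use bytes::Bytes;
use std::borrow::Cow;

// Hypothetical stand-in for the raw image record produced by the PDF backend.
struct RawImage {
    data: Vec<u8>,
    filters: Vec<String>,
}

fn main() {
    let img = RawImage {
        data: vec![0xFF, 0xD8, 0xFF],
        filters: vec!["DCTDecode".to_string()],
    };

    // The first filter name becomes the format label; otherwise borrow "unknown".
    let format: Cow<'static, str> = img
        .filters
        .first()
        .cloned()
        .map(Cow::Owned)
        .unwrap_or(Cow::Borrowed("unknown"));

    // Bytes::from(Vec<u8>) takes ownership of the buffer without copying it.
    let data: Bytes = Bytes::from(img.data);

    assert_eq!(format, "DCTDecode");
    assert_eq!(data.len(), 3);
}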
@@ -6,7 +6,9 @@ use crate::Result;
 use crate::core::config::ExtractionConfig;
 use crate::plugins::{DocumentExtractor, Plugin};
 use crate::types::{ExtractionResult, Metadata};
+use ahash::AHashMap;
 use async_trait::async_trait;
+use std::borrow::Cow;
 use std::path::Path;

 #[cfg(feature = "ocr")]

@@ -66,7 +68,7 @@ impl PptxExtractor {
             Ok(ocr_extraction) => {
                 let extraction_result = ExtractionResult {
                     content: ocr_extraction.content,
-                    mime_type: ocr_extraction.mime_type,
+                    mime_type: ocr_extraction.mime_type.into(),
                     metadata: Metadata::default(),
                     tables: vec![],
                     detected_languages: None,

@@ -137,10 +139,10 @@ impl DocumentExtractor for PptxExtractor {
             crate::extraction::pptx::extract_pptx_from_bytes(content, extract_images, config.pages.as_ref())?
         };

-        let mut additional =
-        additional.insert("slide_count"
-        additional.insert("image_count"
-        additional.insert("table_count"
+        let mut additional: AHashMap<Cow<'static, str>, serde_json::Value> = AHashMap::new();
+        additional.insert(Cow::Borrowed("slide_count"), serde_json::json!(pptx_result.slide_count));
+        additional.insert(Cow::Borrowed("image_count"), serde_json::json!(pptx_result.image_count));
+        additional.insert(Cow::Borrowed("table_count"), serde_json::json!(pptx_result.table_count));

         let images = if extract_images {
             // Image extraction is enabled, return images or empty vector

@@ -174,7 +176,7 @@ impl DocumentExtractor for PptxExtractor {

         Ok(ExtractionResult {
             content: pptx_result.content,
-            mime_type: mime_type.to_string(),
+            mime_type: mime_type.to_string().into(),
             metadata,
             pages: pptx_result.page_contents,
             tables: vec![],

@@ -202,10 +204,10 @@ impl DocumentExtractor for PptxExtractor {
         let pptx_result =
             crate::extraction::pptx::extract_pptx_from_path(path_str, extract_images, config.pages.as_ref())?;

-        let mut additional =
-        additional.insert("slide_count"
-        additional.insert("image_count"
-        additional.insert("table_count"
+        let mut additional: AHashMap<Cow<'static, str>, serde_json::Value> = AHashMap::new();
+        additional.insert(Cow::Borrowed("slide_count"), serde_json::json!(pptx_result.slide_count));
+        additional.insert(Cow::Borrowed("image_count"), serde_json::json!(pptx_result.image_count));
+        additional.insert(Cow::Borrowed("table_count"), serde_json::json!(pptx_result.table_count));

         let images = if extract_images {
             // Image extraction is enabled, return images or empty vector

@@ -239,7 +241,7 @@ impl DocumentExtractor for PptxExtractor {

         Ok(ExtractionResult {
             content: pptx_result.content,
-            mime_type: mime_type.to_string(),
+            mime_type: mime_type.to_string().into(),
             metadata,
             pages: pptx_result.page_contents,
             tables: vec![],
@@ -21,9 +21,11 @@ use crate::plugins::{DocumentExtractor, Plugin};
 #[cfg(feature = "office")]
 use crate::types::{ExtractionResult, Metadata, Table};
 #[cfg(feature = "office")]
+use ahash::AHashMap;
+#[cfg(feature = "office")]
 use async_trait::async_trait;
 #[cfg(feature = "office")]
-use std::
+use std::borrow::Cow;

 /// Native Rust reStructuredText extractor.
 ///

@@ -48,7 +50,7 @@ impl RstExtractor {
     /// Uses document tree parsing and fallback text extraction.
     fn extract_text_and_metadata(content: &str) -> (String, Metadata) {
         let mut metadata = Metadata::default();
-        let mut additional =
+        let mut additional: AHashMap<Cow<'static, str>, serde_json::Value> = AHashMap::new();

         let text = Self::extract_text_from_rst(content, &mut additional);

@@ -60,7 +62,7 @@ impl RstExtractor {
     ///
     /// This is the main extraction engine that processes RST line-by-line
     /// and extracts all document content including headings, code blocks, lists, etc.
-    fn extract_text_from_rst(content: &str, metadata: &mut
+    fn extract_text_from_rst(content: &str, metadata: &mut AHashMap<Cow<'static, str>, serde_json::Value>) -> String {
         let mut output = String::new();
         let lines: Vec<&str> = content.lines().collect();
         let mut i = 0;

@@ -228,24 +230,24 @@ impl RstExtractor {
     }

     /// Add a metadata field from RST field list.
-    fn add_metadata_field(key: &str, value: &str, metadata: &mut
+    fn add_metadata_field(key: &str, value: &str, metadata: &mut AHashMap<Cow<'static, str>, serde_json::Value>) {
        let key_lower = key.to_lowercase();
        match key_lower.as_str() {
            "author" | "authors" => {
-                metadata.insert("author"
+                metadata.insert(Cow::Borrowed("author"), serde_json::Value::String(value.to_string()));
            }
            "date" => {
-                metadata.insert("date"
+                metadata.insert(Cow::Borrowed("date"), serde_json::Value::String(value.to_string()));
            }
            "version" | "revision" => {
-                metadata.insert("version"
+                metadata.insert(Cow::Borrowed("version"), serde_json::Value::String(value.to_string()));
            }
            "title" => {
-                metadata.insert("title"
+                metadata.insert(Cow::Borrowed("title"), serde_json::Value::String(value.to_string()));
            }
            _ => {
                metadata.insert(
-                    format!("field_{}", key_lower),
+                    Cow::Owned(format!("field_{}", key_lower)),
                    serde_json::Value::String(value.to_string()),
                );
            }

@@ -447,7 +449,7 @@ impl DocumentExtractor for RstExtractor {

         Ok(ExtractionResult {
             content: extracted_text,
-            mime_type: mime_type.to_string(),
+            mime_type: mime_type.to_string().into(),
             metadata,
             tables,
             detected_languages: None,

@@ -504,7 +506,7 @@ This is a paragraph.
 Another paragraph.
 "#;

-        let mut metadata =
+        let mut metadata = AHashMap::new();
         let output = RstExtractor::extract_text_from_rst(content, &mut metadata);
         assert!(output.contains("Title"));
         assert!(output.contains("This is a paragraph"));

@@ -522,7 +524,7 @@ Another paragraph.
 Some text after.
 "#;

-        let mut metadata =
+        let mut metadata = AHashMap::new();
         let output = RstExtractor::extract_text_from_rst(content, &mut metadata);
         assert!(output.contains("code-block"));
         assert!(output.contains("def hello"));

@@ -540,7 +542,7 @@ First paragraph.
 Second paragraph.
 "#;

-        let mut metadata =
+        let mut metadata = AHashMap::new();
         let output = RstExtractor::extract_text_from_rst(content, &mut metadata);
         assert!(output.contains("First paragraph"));
         assert!(output.contains("Second paragraph"));
@@ -1,8 +1,9 @@
 //! Metadata extraction from RTF documents.

 use crate::extractors::rtf::encoding::parse_rtf_control_word;
+use ahash::AHashMap;
 use serde_json::Value;
-use std::
+use std::borrow::Cow;

 /// Parse a `{\\creatim ...}` or `{\\revtim ...}` RTF info block into ISO 8601 format.
 pub fn parse_rtf_datetime(segment: &str) -> Option<String> {

@@ -45,8 +46,8 @@ pub fn parse_rtf_datetime(segment: &str) -> Option<String> {
 }

 /// Extract metadata from the RTF `\\info` block and augment with computed statistics.
-pub fn extract_rtf_metadata(rtf_content: &str, extracted_text: &str) ->
-    let mut metadata:
+pub fn extract_rtf_metadata(rtf_content: &str, extracted_text: &str) -> AHashMap<Cow<'static, str>, Value> {
+    let mut metadata: AHashMap<Cow<'static, str>, Value> = AHashMap::new();

     if let Some(start) = rtf_content.find("{\\info") {
         let slice = &rtf_content[start..];

@@ -120,68 +121,68 @@ pub fn extract_rtf_metadata(rtf_content: &str, extracted_text: &str) -> HashMap<
            "author" => {
                if !trimmed.is_empty() {
                    let author = trimmed.to_string();
-                    metadata.insert("created_by"
-                    metadata.insert("authors"
+                    metadata.insert(Cow::Borrowed("created_by"), Value::String(author.clone()));
+                    metadata.insert(Cow::Borrowed("authors"), Value::Array(vec![Value::String(author)]));
                }
            }
            "operator" => {
                if !trimmed.is_empty() {
-                    metadata.insert("modified_by"
+                    metadata.insert(Cow::Borrowed("modified_by"), Value::String(trimmed.to_string()));
                }
            }
            "title" => {
                if !trimmed.is_empty() {
-                    metadata.insert("title"
+                    metadata.insert(Cow::Borrowed("title"), Value::String(trimmed.to_string()));
                }
            }
            "subject" => {
                if !trimmed.is_empty() {
-                    metadata.insert("subject"
+                    metadata.insert(Cow::Borrowed("subject"), Value::String(trimmed.to_string()));
                }
            }
            "generator" => {
                if !trimmed.is_empty() {
-                    metadata.insert("generator"
+                    metadata.insert(Cow::Borrowed("generator"), Value::String(trimmed.to_string()));
                }
            }
            "creatim" => {
                if let Some(dt) = parse_rtf_datetime(trimmed) {
-                    metadata.insert("created_at"
+                    metadata.insert(Cow::Borrowed("created_at"), Value::String(dt));
                }
            }
            "revtim" => {
                if let Some(dt) = parse_rtf_datetime(trimmed) {
-                    metadata.insert("modified_at"
+                    metadata.insert(Cow::Borrowed("modified_at"), Value::String(dt));
                }
            }
            "version" => {
                if let Some(val) = numeric.or_else(|| trimmed.parse::<i32>().ok()) {
-                    metadata.insert("revision"
+                    metadata.insert(Cow::Borrowed("revision"), Value::String(val.to_string()));
                }
            }
            "nofpages" => {
                if let Some(val) = numeric.or_else(|| trimmed.parse::<i32>().ok()) {
-                    metadata.insert("page_count"
+                    metadata.insert(Cow::Borrowed("page_count"), Value::Number(val.into()));
                }
            }
            "nofwords" => {
                if let Some(val) = numeric.or_else(|| trimmed.parse::<i32>().ok()) {
-                    metadata.insert("word_count"
+                    metadata.insert(Cow::Borrowed("word_count"), Value::Number(val.into()));
                }
            }
            "nofchars" => {
                if let Some(val) = numeric.or_else(|| trimmed.parse::<i32>().ok()) {
-                    metadata.insert("character_count"
+                    metadata.insert(Cow::Borrowed("character_count"), Value::Number(val.into()));
                }
            }
            "lines" => {
                if let Some(val) = numeric.or_else(|| trimmed.parse::<i32>().ok()) {
-                    metadata.insert("line_count"
+                    metadata.insert(Cow::Borrowed("line_count"), Value::Number(val.into()));
                }
            }
            "paragraphs" => {
                if let Some(val) = numeric.or_else(|| trimmed.parse::<i32>().ok()) {
-                    metadata.insert("paragraph_count"
+                    metadata.insert(Cow::Borrowed("paragraph_count"), Value::Number(val.into()));
                }
            }
            _ => {}

@@ -193,22 +194,22 @@ pub fn extract_rtf_metadata(rtf_content: &str, extracted_text: &str) -> HashMap<
     if !cleaned_text.is_empty() {
         let word_count = cleaned_text.split_whitespace().count() as i64;
         metadata
-            .entry("word_count"
+            .entry(Cow::Borrowed("word_count"))
             .or_insert(Value::Number(word_count.into()));

         let character_count = cleaned_text.chars().count() as i64;
         metadata
-            .entry("character_count"
+            .entry(Cow::Borrowed("character_count"))
             .or_insert(Value::Number(character_count.into()));

         let line_count = cleaned_text.lines().count() as i64;
         metadata
-            .entry("line_count"
+            .entry(Cow::Borrowed("line_count"))
             .or_insert(Value::Number(line_count.into()));

         let paragraph_count = cleaned_text.split("\n\n").filter(|p| !p.trim().is_empty()).count() as i64;
         metadata
-            .entry("paragraph_count"
+            .entry(Cow::Borrowed("paragraph_count"))
             .or_insert(Value::Number(paragraph_count.into()));
     }

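
The RTF change keeps the same precedence rule as before: values parsed from the {\\info ...} block win, and counts recomputed from the extracted text only fill gaps, which is what the entry().or_insert() calls express. A minimal sketch of that behaviour with the new Cow keys, assuming only ahash and serde_json; the numbers are made up for illustration:

use ahash::AHashMap;
use serde_json::Value;
use std::borrow::Cow;

fn main() {
    let mut metadata: AHashMap<Cow<'static, str>, Value> = AHashMap::new();

    // Pretend \nofwords was already parsed from the {\info ...} block.
    metadata.insert(Cow::Borrowed("word_count"), Value::Number(120.into()));

    // Value recomputed from the extracted text.
    let computed_word_count: i64 = 118;

    // entry().or_insert() keeps the parsed value and only fills in a missing one.
    metadata
        .entry(Cow::Borrowed("word_count"))
        .or_insert(Value::Number(computed_word_count.into()));

    assert_eq!(metadata["word_count"], Value::Number(120.into()));
}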
@@ -4,7 +4,9 @@ use crate::Result;
 use crate::core::config::ExtractionConfig;
 use crate::plugins::{DocumentExtractor, Plugin};
 use crate::types::{ExtractionResult, Metadata};
+use ahash::AHashMap;
 use async_trait::async_trait;
+use std::borrow::Cow;
 #[cfg(feature = "tokio-runtime")]
 use std::path::Path;

@@ -63,20 +65,23 @@ impl DocumentExtractor for StructuredExtractor {
             _ => return Err(crate::KreuzbergError::UnsupportedFormat(mime_type.to_string())),
         };

-        let mut additional =
+        let mut additional = AHashMap::new();
         additional.insert(
-            "field_count"
+            Cow::Borrowed("field_count"),
             serde_json::json!(structured_result.text_fields.len()),
         );
-        additional.insert(
+        additional.insert(
+            Cow::Borrowed("data_format"),
+            serde_json::json!(structured_result.format),
+        );

         for (key, value) in structured_result.metadata {
-            additional.insert(key, serde_json::json!(value));
+            additional.insert(Cow::Owned(key), serde_json::json!(value));
         }

         Ok(ExtractionResult {
             content: structured_result.content,
-            mime_type: mime_type.to_string(),
+            mime_type: mime_type.to_string().into(),
             metadata: Metadata {
                 additional,
                 ..Default::default()
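
The recurring mime_type: mime_type.to_string().into() edits suggest that the mime_type field on ExtractionResult is now a Cow<'static, str> rather than a String; the field's declaration is not shown in this diff, so treat that as an assumption. If so, owned strings convert without copying and static literals convert without allocating, as this standalone sketch (with a hypothetical ResultStub mirror type) shows:

use std::borrow::Cow;

// Hypothetical mirror of the field type the `.into()` calls above suggest.
struct ResultStub {
    mime_type: Cow<'static, str>,
}

fn main() {
    // An owned String converts into Cow::Owned without copying the buffer.
    let detected: String = "application/json".to_string();
    let a = ResultStub { mime_type: detected.into() };

    // A 'static literal converts into Cow::Borrowed with no allocation at all.
    let b = ResultStub { mime_type: "text/plain".into() };

    assert!(matches!(a.mime_type, Cow::Owned(_)));
    assert!(matches!(b.mime_type, Cow::Borrowed(_)));
}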
@@ -74,7 +74,7 @@ impl DocumentExtractor for PlainTextExtractor {

         Ok(ExtractionResult {
             content: text,
-            mime_type: mime_type.to_string(),
+            mime_type: mime_type.to_string().into(),
             metadata: crate::types::Metadata {
                 format: Some(crate::types::FormatMetadata::Text(crate::types::TextMetadata {
                     line_count,
@@ -169,7 +169,7 @@ impl DocumentExtractor for MarkdownExtractor {

         Ok(ExtractionResult {
             content: text_result.content,
-            mime_type: mime_type.to_string(),
+            mime_type: mime_type.to_string().into(),
             metadata: crate::types::Metadata {
                 format: Some(crate::types::FormatMetadata::Text(crate::types::TextMetadata {
                     line_count: text_result.line_count,
@@ -29,6 +29,8 @@ use crate::types::{ExtractionResult, Metadata};
 use async_trait::async_trait;
 #[cfg(feature = "office")]
 use regex::Regex;
+#[cfg(feature = "office")]
+use std::borrow::Cow;

 /// Typst document extractor
 #[cfg(feature = "office")]

@@ -106,7 +108,7 @@ impl DocumentExtractor for TypstExtractor {

         Ok(ExtractionResult {
             content: text,
-            mime_type: mime_type.to_string(),
+            mime_type: mime_type.to_string().into(),
             metadata,
             tables: Vec::new(),
             detected_languages: None,

@@ -151,11 +153,11 @@ impl TypstParser {

     fn extract_metadata(&mut self) {
         if let Some(title) = self.extract_quoted_value("title") {
-            self.metadata.additional.insert("title"
+            self.metadata.additional.insert(Cow::Borrowed("title"), title.into());
         }

         if let Some(author) = self.extract_quoted_value("author") {
-            self.metadata.additional.insert("author"
+            self.metadata.additional.insert(Cow::Borrowed("author"), author.into());
         }

         if let Some(date) = self.extract_quoted_value("date") {

@@ -163,11 +165,15 @@ impl TypstParser {
         }

         if let Some(subject) = self.extract_quoted_value("subject") {
-            self.metadata
+            self.metadata
+                .additional
+                .insert(Cow::Borrowed("subject"), subject.into());
         }

         if let Some(keywords) = self.extract_keywords() {
-            self.metadata
+            self.metadata
+                .additional
+                .insert(Cow::Borrowed("keywords"), keywords.into());
         }
     }

@@ -58,7 +58,7 @@ impl SyncExtractor for XmlExtractor {

         Ok(ExtractionResult {
             content: xml_result.content,
-            mime_type: mime_type.to_string(),
+            mime_type: mime_type.to_string().into(),
             metadata: crate::types::Metadata {
                 format: Some(crate::types::FormatMetadata::Xml(crate::types::XmlMetadata {
                     element_count: xml_result.element_count,