kreuzberg 4.2.6 → 4.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +7 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +36 -9
- data/ext/kreuzberg_rb/native/Cargo.toml +32 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +4 -2
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -1
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +1 -1
- data/ext/kreuzberg_rb/native/src/result.rs +5 -3
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +228 -37
- data/spec/binding/batch_operations_spec.rb +2 -0
- data/vendor/Cargo.toml +3 -2
- data/vendor/kreuzberg/Cargo.toml +2 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/error.rs +29 -1
- data/vendor/kreuzberg/src/api/handlers.rs +28 -25
- data/vendor/kreuzberg/src/api/openapi.rs +14 -1
- data/vendor/kreuzberg/src/chunking/config.rs +2 -37
- data/vendor/kreuzberg/src/chunking/core.rs +78 -2
- data/vendor/kreuzberg/src/chunking/mod.rs +1 -1
- data/vendor/kreuzberg/src/chunking/processor.rs +15 -17
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +13 -9
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +12 -12
- data/vendor/kreuzberg/src/core/config/mod.rs +1 -1
- data/vendor/kreuzberg/src/core/config/processing.rs +65 -8
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +8 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +5 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +9 -9
- data/vendor/kreuzberg/src/core/extractor/file.rs +4 -2
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +7 -7
- data/vendor/kreuzberg/src/core/extractor/sync.rs +3 -3
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +2 -1
- data/vendor/kreuzberg/src/core/pipeline/features.rs +16 -22
- data/vendor/kreuzberg/src/core/pipeline/format.rs +20 -18
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +40 -35
- data/vendor/kreuzberg/src/extraction/email.rs +31 -19
- data/vendor/kreuzberg/src/extraction/excel.rs +6 -5
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +6 -1
- data/vendor/kreuzberg/src/extraction/html/types.rs +4 -3
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +10 -9
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +10 -8
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +8 -4
- data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
- data/vendor/kreuzberg/src/extraction/transform/content.rs +1 -1
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -7
- data/vendor/kreuzberg/src/extractors/archive.rs +7 -5
- data/vendor/kreuzberg/src/extractors/bibtex.rs +34 -17
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +7 -10
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +4 -2
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +3 -2
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +1 -1
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +2 -4
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +1 -1
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +4 -5
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +1 -1
- data/vendor/kreuzberg/src/extractors/docbook.rs +1 -1
- data/vendor/kreuzberg/src/extractors/docx.rs +32 -24
- data/vendor/kreuzberg/src/extractors/email.rs +5 -3
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +10 -10
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +7 -3
- data/vendor/kreuzberg/src/extractors/excel.rs +8 -6
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -1
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +15 -10
- data/vendor/kreuzberg/src/extractors/html.rs +1 -1
- data/vendor/kreuzberg/src/extractors/image.rs +3 -3
- data/vendor/kreuzberg/src/extractors/jats/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/jupyter.rs +11 -9
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +4 -3
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/markdown.rs +6 -4
- data/vendor/kreuzberg/src/extractors/odt.rs +38 -21
- data/vendor/kreuzberg/src/extractors/opml/core.rs +1 -1
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +13 -9
- data/vendor/kreuzberg/src/extractors/orgmode.rs +11 -9
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +10 -3
- data/vendor/kreuzberg/src/extractors/pptx.rs +13 -11
- data/vendor/kreuzberg/src/extractors/rst.rs +15 -13
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +22 -21
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/structured.rs +10 -5
- data/vendor/kreuzberg/src/extractors/text.rs +2 -2
- data/vendor/kreuzberg/src/extractors/typst.rs +11 -5
- data/vendor/kreuzberg/src/extractors/xml.rs +1 -1
- data/vendor/kreuzberg/src/keywords/processor.rs +9 -8
- data/vendor/kreuzberg/src/language_detection/processor.rs +6 -5
- data/vendor/kreuzberg/src/lib.rs +1 -1
- data/vendor/kreuzberg/src/mcp/errors.rs +7 -6
- data/vendor/kreuzberg/src/mcp/format.rs +5 -4
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +3 -2
- data/vendor/kreuzberg/src/ocr/hocr.rs +4 -2
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +128 -14
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +129 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +36 -6
- data/vendor/kreuzberg/src/ocr/types.rs +3 -4
- data/vendor/kreuzberg/src/ocr/validation.rs +14 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +1 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +3 -2
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +5 -4
- data/vendor/kreuzberg/src/plugins/ocr.rs +5 -4
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +13 -12
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +3 -2
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +3 -2
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +15 -14
- data/vendor/kreuzberg/src/text/quality.rs +13 -13
- data/vendor/kreuzberg/src/text/quality_processor.rs +7 -6
- data/vendor/kreuzberg/src/types/djot.rs +15 -4
- data/vendor/kreuzberg/src/types/extraction.rs +24 -4
- data/vendor/kreuzberg/src/types/formats.rs +9 -5
- data/vendor/kreuzberg/src/types/metadata.rs +68 -7
- data/vendor/kreuzberg/src/types/mod.rs +7 -5
- data/vendor/kreuzberg/src/types/page.rs +9 -0
- data/vendor/kreuzberg/src/types/tables.rs +2 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +2 -1
- data/vendor/kreuzberg/tests/config_behavioral.rs +12 -16
- data/vendor/kreuzberg/tests/config_features.rs +19 -11
- data/vendor/kreuzberg/tests/config_loading_tests.rs +9 -9
- data/vendor/kreuzberg/tests/contract_mcp.rs +2 -2
- data/vendor/kreuzberg/tests/core_integration.rs +5 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/pipeline_integration.rs +36 -32
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +19 -13
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +3 -2
- data/vendor/kreuzberg/tests/plugin_system.rs +7 -6
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +1 -1
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -1
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +3 -2
- data/vendor/kreuzberg-ffi/kreuzberg.h +32 -0
- data/vendor/kreuzberg-ffi/src/error.rs +56 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +6 -5
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +1 -1
- data/vendor/kreuzberg-ffi/src/result.rs +2 -1
- data/vendor/kreuzberg-ffi/src/result_view.rs +3 -2
- data/vendor/kreuzberg-ffi/src/string_intern.rs +3 -3
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +2 -2
|
@@ -5,7 +5,9 @@ use crate::core::config::ExtractionConfig;
|
|
|
5
5
|
use crate::extractors::SyncExtractor;
|
|
6
6
|
use crate::plugins::{DocumentExtractor, Plugin};
|
|
7
7
|
use crate::types::{EmailMetadata, ExtractionResult, Metadata};
|
|
8
|
+
use ahash::AHashMap;
|
|
8
9
|
use async_trait::async_trait;
|
|
10
|
+
use std::borrow::Cow;
|
|
9
11
|
#[cfg(feature = "tokio-runtime")]
|
|
10
12
|
use std::path::Path;
|
|
11
13
|
|
|
@@ -66,14 +68,14 @@ impl SyncExtractor for EmailExtractor {
|
|
|
66
68
|
attachments: attachment_names,
|
|
67
69
|
};
|
|
68
70
|
|
|
69
|
-
let mut additional =
|
|
71
|
+
let mut additional = AHashMap::new();
|
|
70
72
|
for (key, value) in &email_result.metadata {
|
|
71
|
-
additional.insert(key.clone(), serde_json::json!(value));
|
|
73
|
+
additional.insert(Cow::Owned(key.clone()), serde_json::json!(value));
|
|
72
74
|
}
|
|
73
75
|
|
|
74
76
|
Ok(ExtractionResult {
|
|
75
77
|
content: text,
|
|
76
|
-
mime_type: mime_type.to_string(),
|
|
78
|
+
mime_type: mime_type.to_string().into(),
|
|
77
79
|
metadata: Metadata {
|
|
78
80
|
format: Some(crate::types::FormatMetadata::Email(email_metadata)),
|
|
79
81
|
subject: email_result.subject.clone(),
|
|
@@ -27,24 +27,24 @@ pub(super) fn extract_metadata(opf_xml: &str) -> Result<(OepbMetadata, BTreeMap<
|
|
|
27
27
|
|
|
28
28
|
let (epub_metadata, _) = parse_opf(opf_xml)?;
|
|
29
29
|
|
|
30
|
-
if let Some(identifier) = epub_metadata.identifier
|
|
31
|
-
additional_metadata.insert("identifier".to_string(), serde_json::json!(identifier));
|
|
30
|
+
if let Some(ref identifier) = epub_metadata.identifier {
|
|
31
|
+
additional_metadata.insert("identifier".to_string(), serde_json::json!(identifier.clone()));
|
|
32
32
|
}
|
|
33
33
|
|
|
34
|
-
if let Some(publisher) = epub_metadata.publisher
|
|
35
|
-
additional_metadata.insert("publisher".to_string(), serde_json::json!(publisher));
|
|
34
|
+
if let Some(ref publisher) = epub_metadata.publisher {
|
|
35
|
+
additional_metadata.insert("publisher".to_string(), serde_json::json!(publisher.clone()));
|
|
36
36
|
}
|
|
37
37
|
|
|
38
|
-
if let Some(subject) = epub_metadata.subject
|
|
39
|
-
additional_metadata.insert("subject".to_string(), serde_json::json!(subject));
|
|
38
|
+
if let Some(ref subject) = epub_metadata.subject {
|
|
39
|
+
additional_metadata.insert("subject".to_string(), serde_json::json!(subject.clone()));
|
|
40
40
|
}
|
|
41
41
|
|
|
42
|
-
if let Some(description) = epub_metadata.description
|
|
43
|
-
additional_metadata.insert("description".to_string(), serde_json::json!(description));
|
|
42
|
+
if let Some(ref description) = epub_metadata.description {
|
|
43
|
+
additional_metadata.insert("description".to_string(), serde_json::json!(description.clone()));
|
|
44
44
|
}
|
|
45
45
|
|
|
46
|
-
if let Some(rights) = epub_metadata.rights
|
|
47
|
-
additional_metadata.insert("rights".to_string(), serde_json::json!(rights));
|
|
46
|
+
if let Some(ref rights) = epub_metadata.rights {
|
|
47
|
+
additional_metadata.insert("rights".to_string(), serde_json::json!(rights.clone()));
|
|
48
48
|
}
|
|
49
49
|
|
|
50
50
|
Ok((epub_metadata, additional_metadata))
|
|
@@ -19,7 +19,9 @@ use crate::Result;
|
|
|
19
19
|
use crate::core::config::ExtractionConfig;
|
|
20
20
|
use crate::plugins::{DocumentExtractor, Plugin};
|
|
21
21
|
use crate::types::{ExtractionResult, Metadata};
|
|
22
|
+
use ahash::AHashMap;
|
|
22
23
|
use async_trait::async_trait;
|
|
24
|
+
use std::borrow::Cow;
|
|
23
25
|
use std::io::Cursor;
|
|
24
26
|
use zip::ZipArchive;
|
|
25
27
|
|
|
@@ -112,12 +114,14 @@ impl DocumentExtractor for EpubExtractor {
|
|
|
112
114
|
let extracted_content = extract_content(&mut archive, &opf_path, &manifest_dir)?;
|
|
113
115
|
|
|
114
116
|
let (epub_metadata, additional_metadata) = extract_metadata(&opf_xml)?;
|
|
115
|
-
let metadata_map:
|
|
116
|
-
|
|
117
|
+
let metadata_map: AHashMap<Cow<'static, str>, serde_json::Value> = additional_metadata
|
|
118
|
+
.into_iter()
|
|
119
|
+
.map(|(k, v)| (Cow::Owned(k), v))
|
|
120
|
+
.collect();
|
|
117
121
|
|
|
118
122
|
Ok(ExtractionResult {
|
|
119
123
|
content: extracted_content,
|
|
120
|
-
mime_type: mime_type.to_string(),
|
|
124
|
+
mime_type: mime_type.to_string().into(),
|
|
121
125
|
metadata: Metadata {
|
|
122
126
|
title: epub_metadata.title,
|
|
123
127
|
authors: epub_metadata.creator.map(|c| vec![c]),
|
|
@@ -4,7 +4,9 @@ use crate::Result;
|
|
|
4
4
|
use crate::core::config::ExtractionConfig;
|
|
5
5
|
use crate::plugins::{DocumentExtractor, Plugin};
|
|
6
6
|
use crate::types::{ExcelMetadata, ExtractionResult, Metadata, Table};
|
|
7
|
+
use ahash::AHashMap;
|
|
7
8
|
use async_trait::async_trait;
|
|
9
|
+
use std::borrow::Cow;
|
|
8
10
|
use std::path::Path;
|
|
9
11
|
|
|
10
12
|
/// Excel spreadsheet extractor using calamine.
|
|
@@ -120,16 +122,16 @@ impl DocumentExtractor for ExcelExtractor {
|
|
|
120
122
|
sheet_names,
|
|
121
123
|
};
|
|
122
124
|
|
|
123
|
-
let mut additional =
|
|
125
|
+
let mut additional = AHashMap::new();
|
|
124
126
|
for (key, value) in &workbook.metadata {
|
|
125
127
|
if key != "sheet_count" && key != "sheet_names" {
|
|
126
|
-
additional.insert(key.clone(), serde_json::json!(value));
|
|
128
|
+
additional.insert(Cow::Owned(key.clone()), serde_json::json!(value));
|
|
127
129
|
}
|
|
128
130
|
}
|
|
129
131
|
|
|
130
132
|
Ok(ExtractionResult {
|
|
131
133
|
content: markdown,
|
|
132
|
-
mime_type: mime_type.to_string(),
|
|
134
|
+
mime_type: mime_type.to_string().into(),
|
|
133
135
|
metadata: Metadata {
|
|
134
136
|
format: Some(crate::types::FormatMetadata::Excel(excel_metadata)),
|
|
135
137
|
additional,
|
|
@@ -166,16 +168,16 @@ impl DocumentExtractor for ExcelExtractor {
|
|
|
166
168
|
sheet_names,
|
|
167
169
|
};
|
|
168
170
|
|
|
169
|
-
let mut additional =
|
|
171
|
+
let mut additional = AHashMap::new();
|
|
170
172
|
for (key, value) in &workbook.metadata {
|
|
171
173
|
if key != "sheet_count" && key != "sheet_names" {
|
|
172
|
-
additional.insert(key.clone(), serde_json::json!(value));
|
|
174
|
+
additional.insert(Cow::Owned(key.clone()), serde_json::json!(value));
|
|
173
175
|
}
|
|
174
176
|
}
|
|
175
177
|
|
|
176
178
|
Ok(ExtractionResult {
|
|
177
179
|
content: markdown,
|
|
178
|
-
mime_type: mime_type.to_string(),
|
|
180
|
+
mime_type: mime_type.to_string().into(),
|
|
179
181
|
metadata: Metadata {
|
|
180
182
|
format: Some(crate::types::FormatMetadata::Excel(excel_metadata)),
|
|
181
183
|
additional,
|
|
@@ -434,7 +434,7 @@ impl DocumentExtractor for FictionBookExtractor {
|
|
|
434
434
|
|
|
435
435
|
Ok(ExtractionResult {
|
|
436
436
|
content: extracted_content,
|
|
437
|
-
mime_type: mime_type.to_string(),
|
|
437
|
+
mime_type: mime_type.to_string().into(),
|
|
438
438
|
metadata,
|
|
439
439
|
tables: vec![],
|
|
440
440
|
detected_languages: None,
|
|
@@ -9,6 +9,7 @@
|
|
|
9
9
|
use crate::types::Metadata;
|
|
10
10
|
|
|
11
11
|
use serde_yaml_ng::Value as YamlValue;
|
|
12
|
+
use std::borrow::Cow;
|
|
12
13
|
|
|
13
14
|
/// Extract YAML frontmatter from document content.
|
|
14
15
|
///
|
|
@@ -126,12 +127,12 @@ pub fn extract_metadata_from_yaml(yaml: &YamlValue) -> Metadata {
|
|
|
126
127
|
|
|
127
128
|
// Title
|
|
128
129
|
if let Some(title) = yaml.get("title").and_then(|v| v.as_str()) {
|
|
129
|
-
metadata.additional.insert("title"
|
|
130
|
+
metadata.additional.insert(Cow::Borrowed("title"), title.into());
|
|
130
131
|
}
|
|
131
132
|
|
|
132
133
|
// Author
|
|
133
134
|
if let Some(author) = yaml.get("author").and_then(|v| v.as_str()) {
|
|
134
|
-
metadata.additional.insert("author"
|
|
135
|
+
metadata.additional.insert(Cow::Borrowed("author"), author.into());
|
|
135
136
|
}
|
|
136
137
|
|
|
137
138
|
// Date (map to created_at)
|
|
@@ -143,11 +144,13 @@ pub fn extract_metadata_from_yaml(yaml: &YamlValue) -> Metadata {
|
|
|
143
144
|
if let Some(keywords) = yaml.get("keywords") {
|
|
144
145
|
match keywords {
|
|
145
146
|
YamlValue::String(s) => {
|
|
146
|
-
metadata.additional.insert("keywords"
|
|
147
|
+
metadata.additional.insert(Cow::Borrowed("keywords"), s.clone().into());
|
|
147
148
|
}
|
|
148
149
|
YamlValue::Sequence(seq) => {
|
|
149
150
|
let keywords_str = seq.iter().filter_map(|v| v.as_str()).collect::<Vec<_>>().join(", ");
|
|
150
|
-
metadata
|
|
151
|
+
metadata
|
|
152
|
+
.additional
|
|
153
|
+
.insert(Cow::Borrowed("keywords"), keywords_str.into());
|
|
151
154
|
}
|
|
152
155
|
_ => {}
|
|
153
156
|
}
|
|
@@ -160,7 +163,9 @@ pub fn extract_metadata_from_yaml(yaml: &YamlValue) -> Metadata {
|
|
|
160
163
|
|
|
161
164
|
// Abstract
|
|
162
165
|
if let Some(abstract_text) = yaml.get("abstract").and_then(|v| v.as_str()) {
|
|
163
|
-
metadata
|
|
166
|
+
metadata
|
|
167
|
+
.additional
|
|
168
|
+
.insert(Cow::Borrowed("abstract"), abstract_text.into());
|
|
164
169
|
}
|
|
165
170
|
|
|
166
171
|
// Subject (overrides description if both present)
|
|
@@ -170,18 +175,18 @@ pub fn extract_metadata_from_yaml(yaml: &YamlValue) -> Metadata {
|
|
|
170
175
|
|
|
171
176
|
// Category
|
|
172
177
|
if let Some(category) = yaml.get("category").and_then(|v| v.as_str()) {
|
|
173
|
-
metadata.additional.insert("category"
|
|
178
|
+
metadata.additional.insert(Cow::Borrowed("category"), category.into());
|
|
174
179
|
}
|
|
175
180
|
|
|
176
181
|
// Tags (support both string and array)
|
|
177
182
|
if let Some(tags) = yaml.get("tags") {
|
|
178
183
|
match tags {
|
|
179
184
|
YamlValue::String(s) => {
|
|
180
|
-
metadata.additional.insert("tags"
|
|
185
|
+
metadata.additional.insert(Cow::Borrowed("tags"), s.clone().into());
|
|
181
186
|
}
|
|
182
187
|
YamlValue::Sequence(seq) => {
|
|
183
188
|
let tags_str = seq.iter().filter_map(|v| v.as_str()).collect::<Vec<_>>().join(", ");
|
|
184
|
-
metadata.additional.insert("tags"
|
|
189
|
+
metadata.additional.insert(Cow::Borrowed("tags"), tags_str.into());
|
|
185
190
|
}
|
|
186
191
|
_ => {}
|
|
187
192
|
}
|
|
@@ -189,12 +194,12 @@ pub fn extract_metadata_from_yaml(yaml: &YamlValue) -> Metadata {
|
|
|
189
194
|
|
|
190
195
|
// Language
|
|
191
196
|
if let Some(language) = yaml.get("language").and_then(|v| v.as_str()) {
|
|
192
|
-
metadata.additional.insert("language"
|
|
197
|
+
metadata.additional.insert(Cow::Borrowed("language"), language.into());
|
|
193
198
|
}
|
|
194
199
|
|
|
195
200
|
// Version
|
|
196
201
|
if let Some(version) = yaml.get("version").and_then(|v| v.as_str()) {
|
|
197
|
-
metadata.additional.insert("version"
|
|
202
|
+
metadata.additional.insert(Cow::Borrowed("version"), version.into());
|
|
198
203
|
}
|
|
199
204
|
|
|
200
205
|
metadata
|
|
@@ -219,7 +219,7 @@ impl SyncExtractor for HtmlExtractor {
|
|
|
219
219
|
|
|
220
220
|
Ok(ExtractionResult {
|
|
221
221
|
content: content_text,
|
|
222
|
-
mime_type: result_mime_type.to_string(),
|
|
222
|
+
mime_type: result_mime_type.to_string().into(),
|
|
223
223
|
metadata: Metadata {
|
|
224
224
|
format: html_metadata.map(|m| crate::types::FormatMetadata::Html(Box::new(m))),
|
|
225
225
|
..Default::default()
|
|
@@ -128,7 +128,7 @@ impl DocumentExtractor for ImageExtractor {
|
|
|
128
128
|
let mut ocr_result = self.extract_with_ocr(content, mime_type, config).await?;
|
|
129
129
|
|
|
130
130
|
ocr_result.metadata.format = Some(crate::types::FormatMetadata::Image(image_metadata));
|
|
131
|
-
ocr_result.mime_type = mime_type.to_string();
|
|
131
|
+
ocr_result.mime_type = mime_type.to_string().into();
|
|
132
132
|
|
|
133
133
|
return Ok(ocr_result);
|
|
134
134
|
}
|
|
@@ -141,7 +141,7 @@ impl DocumentExtractor for ImageExtractor {
|
|
|
141
141
|
|
|
142
142
|
return Ok(ExtractionResult {
|
|
143
143
|
content: content_text,
|
|
144
|
-
mime_type: mime_type.to_string(),
|
|
144
|
+
mime_type: mime_type.to_string().into(),
|
|
145
145
|
metadata: Metadata {
|
|
146
146
|
format: Some(crate::types::FormatMetadata::Image(image_metadata)),
|
|
147
147
|
..Default::default()
|
|
@@ -161,7 +161,7 @@ impl DocumentExtractor for ImageExtractor {
|
|
|
161
161
|
"Image: {} {}x{}",
|
|
162
162
|
extraction_metadata.format, extraction_metadata.width, extraction_metadata.height
|
|
163
163
|
),
|
|
164
|
-
mime_type: mime_type.to_string(),
|
|
164
|
+
mime_type: mime_type.to_string().into(),
|
|
165
165
|
metadata: Metadata {
|
|
166
166
|
format: Some(crate::types::FormatMetadata::Image(image_metadata)),
|
|
167
167
|
..Default::default()
|
|
@@ -18,11 +18,13 @@ use crate::plugins::{DocumentExtractor, Plugin};
|
|
|
18
18
|
#[cfg(feature = "office")]
|
|
19
19
|
use crate::types::{ExtractionResult, Metadata};
|
|
20
20
|
#[cfg(feature = "office")]
|
|
21
|
+
use ahash::AHashMap;
|
|
22
|
+
#[cfg(feature = "office")]
|
|
21
23
|
use async_trait::async_trait;
|
|
22
24
|
#[cfg(feature = "office")]
|
|
23
25
|
use serde_json::{Value, json};
|
|
24
26
|
#[cfg(feature = "office")]
|
|
25
|
-
use std::
|
|
27
|
+
use std::borrow::Cow;
|
|
26
28
|
|
|
27
29
|
/// Jupyter Notebook extractor.
|
|
28
30
|
///
|
|
@@ -42,32 +44,32 @@ impl JupyterExtractor {
|
|
|
42
44
|
}
|
|
43
45
|
|
|
44
46
|
/// Extract content from a Jupyter notebook.
|
|
45
|
-
fn extract_notebook(content: &[u8]) -> Result<(String,
|
|
47
|
+
fn extract_notebook(content: &[u8]) -> Result<(String, AHashMap<Cow<'static, str>, Value>)> {
|
|
46
48
|
let notebook: Value = serde_json::from_slice(content)
|
|
47
49
|
.map_err(|e| crate::KreuzbergError::parsing(format!("Failed to parse JSON: {}", e)))?;
|
|
48
50
|
|
|
49
51
|
let mut extracted_content = String::new();
|
|
50
|
-
let mut metadata =
|
|
52
|
+
let mut metadata = AHashMap::new();
|
|
51
53
|
|
|
52
54
|
if let Some(notebook_metadata) = notebook.get("metadata").and_then(|m| m.as_object()) {
|
|
53
55
|
if let Some(kernelspec) = notebook_metadata.get("kernelspec")
|
|
54
56
|
&& let Some(name) = kernelspec.get("name").and_then(|n| n.as_str())
|
|
55
57
|
{
|
|
56
58
|
extracted_content.push_str(&format!("Kernelspec: {}\n", name));
|
|
57
|
-
metadata.insert("kernelspec"
|
|
59
|
+
metadata.insert(Cow::Borrowed("kernelspec"), kernelspec.clone());
|
|
58
60
|
}
|
|
59
61
|
|
|
60
62
|
if let Some(language_info) = notebook_metadata.get("language_info")
|
|
61
63
|
&& let Some(name) = language_info.get("name").and_then(|n| n.as_str())
|
|
62
64
|
{
|
|
63
65
|
extracted_content.push_str(&format!("Language: {}\n", name));
|
|
64
|
-
metadata.insert("language_info"
|
|
66
|
+
metadata.insert(Cow::Borrowed("language_info"), language_info.clone());
|
|
65
67
|
}
|
|
66
68
|
}
|
|
67
69
|
|
|
68
70
|
if let Some(nbformat) = notebook.get("nbformat") {
|
|
69
71
|
extracted_content.push_str(&format!("NBFormat: {}\n", nbformat));
|
|
70
|
-
metadata.insert("nbformat"
|
|
72
|
+
metadata.insert(Cow::Borrowed("nbformat"), nbformat.clone());
|
|
71
73
|
}
|
|
72
74
|
|
|
73
75
|
extracted_content.push('\n');
|
|
@@ -86,7 +88,7 @@ impl JupyterExtractor {
|
|
|
86
88
|
cell: &Value,
|
|
87
89
|
cell_idx: usize,
|
|
88
90
|
content: &mut String,
|
|
89
|
-
_metadata: &mut
|
|
91
|
+
_metadata: &mut AHashMap<Cow<'static, str>, Value>,
|
|
90
92
|
) -> Result<()> {
|
|
91
93
|
let cell_type = cell.get("cell_type").and_then(|t| t.as_str()).unwrap_or("unknown");
|
|
92
94
|
|
|
@@ -324,14 +326,14 @@ impl DocumentExtractor for JupyterExtractor {
|
|
|
324
326
|
) -> Result<ExtractionResult> {
|
|
325
327
|
let (extracted_content, additional_metadata) = Self::extract_notebook(content)?;
|
|
326
328
|
|
|
327
|
-
let mut metadata_additional =
|
|
329
|
+
let mut metadata_additional = AHashMap::new();
|
|
328
330
|
for (key, value) in additional_metadata {
|
|
329
331
|
metadata_additional.insert(key, json!(value));
|
|
330
332
|
}
|
|
331
333
|
|
|
332
334
|
Ok(ExtractionResult {
|
|
333
335
|
content: extracted_content,
|
|
334
|
-
mime_type: mime_type.to_string(),
|
|
336
|
+
mime_type: mime_type.to_string().into(),
|
|
335
337
|
metadata: Metadata {
|
|
336
338
|
additional: metadata_additional,
|
|
337
339
|
..Default::default()
|
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
|
|
6
6
|
use super::utilities::extract_braced;
|
|
7
7
|
use crate::types::Metadata;
|
|
8
|
+
use std::borrow::Cow;
|
|
8
9
|
|
|
9
10
|
/// Extracts metadata from a LaTeX line.
|
|
10
11
|
///
|
|
@@ -13,15 +14,15 @@ use crate::types::Metadata;
|
|
|
13
14
|
pub fn extract_metadata_from_line(line: &str, metadata: &mut Metadata) {
|
|
14
15
|
if line.starts_with("\\title{") {
|
|
15
16
|
if let Some(title) = extract_braced(line, "title") {
|
|
16
|
-
metadata.additional.insert("title"
|
|
17
|
+
metadata.additional.insert(Cow::Borrowed("title"), title.into());
|
|
17
18
|
}
|
|
18
19
|
} else if line.starts_with("\\author{") {
|
|
19
20
|
if let Some(author) = extract_braced(line, "author") {
|
|
20
|
-
metadata.additional.insert("author"
|
|
21
|
+
metadata.additional.insert(Cow::Borrowed("author"), author.into());
|
|
21
22
|
}
|
|
22
23
|
} else if line.starts_with("\\date{")
|
|
23
24
|
&& let Some(date) = extract_braced(line, "date")
|
|
24
25
|
{
|
|
25
|
-
metadata.additional.insert("date"
|
|
26
|
+
metadata.additional.insert(Cow::Borrowed("date"), date.into());
|
|
26
27
|
}
|
|
27
28
|
}
|
|
@@ -28,6 +28,8 @@ use crate::types::{ExtractionResult, Metadata, Table};
|
|
|
28
28
|
use async_trait::async_trait;
|
|
29
29
|
#[cfg(feature = "office")]
|
|
30
30
|
use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
|
|
31
|
+
#[cfg(feature = "office")]
|
|
32
|
+
use std::borrow::Cow;
|
|
31
33
|
|
|
32
34
|
/// Enhanced Markdown extractor with metadata and table support.
|
|
33
35
|
///
|
|
@@ -113,7 +115,7 @@ impl MarkdownExtractor {
|
|
|
113
115
|
if !current_row.is_empty()
|
|
114
116
|
&& let Some((ref mut rows, _)) = current_table
|
|
115
117
|
{
|
|
116
|
-
rows.push(current_row
|
|
118
|
+
rows.push(std::mem::take(&mut current_row));
|
|
117
119
|
}
|
|
118
120
|
current_row = Vec::new();
|
|
119
121
|
}
|
|
@@ -121,7 +123,7 @@ impl MarkdownExtractor {
|
|
|
121
123
|
if !current_row.is_empty()
|
|
122
124
|
&& let Some((ref mut rows, _)) = current_table
|
|
123
125
|
{
|
|
124
|
-
rows.push(current_row
|
|
126
|
+
rows.push(std::mem::take(&mut current_row));
|
|
125
127
|
}
|
|
126
128
|
current_row = Vec::new();
|
|
127
129
|
}
|
|
@@ -211,7 +213,7 @@ impl DocumentExtractor for MarkdownExtractor {
|
|
|
211
213
|
if !metadata.additional.contains_key("title")
|
|
212
214
|
&& let Some(title) = extract_title_from_content(&remaining_content)
|
|
213
215
|
{
|
|
214
|
-
metadata.additional.insert("title"
|
|
216
|
+
metadata.additional.insert(Cow::Borrowed("title"), title.into());
|
|
215
217
|
}
|
|
216
218
|
|
|
217
219
|
let parser = Parser::new_ext(&remaining_content, Options::ENABLE_TABLES);
|
|
@@ -223,7 +225,7 @@ impl DocumentExtractor for MarkdownExtractor {
|
|
|
223
225
|
|
|
224
226
|
Ok(ExtractionResult {
|
|
225
227
|
content: extracted_text,
|
|
226
|
-
mime_type: mime_type.to_string(),
|
|
228
|
+
mime_type: mime_type.to_string().into(),
|
|
227
229
|
metadata,
|
|
228
230
|
tables,
|
|
229
231
|
detected_languages: None,
|
|
@@ -9,8 +9,10 @@ use crate::core::config::ExtractionConfig;
|
|
|
9
9
|
use crate::extraction::{cells_to_markdown, office_metadata};
|
|
10
10
|
use crate::plugins::{DocumentExtractor, Plugin};
|
|
11
11
|
use crate::types::{ExtractionResult, Metadata, Table};
|
|
12
|
+
use ahash::AHashMap;
|
|
12
13
|
use async_trait::async_trait;
|
|
13
14
|
use roxmltree::Document;
|
|
15
|
+
use std::borrow::Cow;
|
|
14
16
|
use std::io::Cursor;
|
|
15
17
|
|
|
16
18
|
/// High-performance ODT extractor using native Rust XML parsing.
|
|
@@ -475,7 +477,7 @@ impl DocumentExtractor for OdtExtractor {
|
|
|
475
477
|
(combined_text, tables)
|
|
476
478
|
};
|
|
477
479
|
|
|
478
|
-
let mut metadata_map =
|
|
480
|
+
let mut metadata_map = AHashMap::new();
|
|
479
481
|
|
|
480
482
|
let cursor = Cursor::new(content_owned.clone());
|
|
481
483
|
let mut archive = zip::ZipArchive::new(cursor).map_err(|e| {
|
|
@@ -484,80 +486,95 @@ impl DocumentExtractor for OdtExtractor {
|
|
|
484
486
|
|
|
485
487
|
if let Ok(odt_props) = office_metadata::extract_odt_properties(&mut archive) {
|
|
486
488
|
if let Some(title) = odt_props.title {
|
|
487
|
-
metadata_map.insert("title"
|
|
489
|
+
metadata_map.insert(Cow::Borrowed("title"), serde_json::Value::String(title));
|
|
488
490
|
}
|
|
489
491
|
if let Some(creator) = odt_props.creator {
|
|
490
492
|
metadata_map.insert(
|
|
491
|
-
"authors"
|
|
493
|
+
Cow::Borrowed("authors"),
|
|
492
494
|
serde_json::Value::Array(vec![serde_json::Value::String(creator.clone())]),
|
|
493
495
|
);
|
|
494
|
-
metadata_map.insert("created_by"
|
|
496
|
+
metadata_map.insert(Cow::Borrowed("created_by"), serde_json::Value::String(creator));
|
|
495
497
|
}
|
|
496
498
|
if let Some(initial_creator) = odt_props.initial_creator {
|
|
497
499
|
metadata_map.insert(
|
|
498
|
-
"initial_creator"
|
|
500
|
+
Cow::Borrowed("initial_creator"),
|
|
499
501
|
serde_json::Value::String(initial_creator),
|
|
500
502
|
);
|
|
501
503
|
}
|
|
502
504
|
if let Some(subject) = odt_props.subject {
|
|
503
|
-
metadata_map.insert("subject"
|
|
505
|
+
metadata_map.insert(Cow::Borrowed("subject"), serde_json::Value::String(subject));
|
|
504
506
|
}
|
|
505
507
|
if let Some(keywords) = odt_props.keywords {
|
|
506
|
-
metadata_map.insert("keywords"
|
|
508
|
+
metadata_map.insert(Cow::Borrowed("keywords"), serde_json::Value::String(keywords));
|
|
507
509
|
}
|
|
508
510
|
if let Some(description) = odt_props.description {
|
|
509
|
-
metadata_map.insert("description"
|
|
511
|
+
metadata_map.insert(Cow::Borrowed("description"), serde_json::Value::String(description));
|
|
510
512
|
}
|
|
511
513
|
if let Some(creation_date) = odt_props.creation_date {
|
|
512
|
-
metadata_map.insert("created_at"
|
|
514
|
+
metadata_map.insert(Cow::Borrowed("created_at"), serde_json::Value::String(creation_date));
|
|
513
515
|
}
|
|
514
516
|
if let Some(date) = odt_props.date {
|
|
515
|
-
metadata_map.insert("modified_at"
|
|
517
|
+
metadata_map.insert(Cow::Borrowed("modified_at"), serde_json::Value::String(date));
|
|
516
518
|
}
|
|
517
519
|
if let Some(language) = odt_props.language {
|
|
518
|
-
metadata_map.insert("language"
|
|
520
|
+
metadata_map.insert(Cow::Borrowed("language"), serde_json::Value::String(language));
|
|
519
521
|
}
|
|
520
522
|
if let Some(generator) = odt_props.generator {
|
|
521
|
-
metadata_map.insert("generator"
|
|
523
|
+
metadata_map.insert(Cow::Borrowed("generator"), serde_json::Value::String(generator));
|
|
522
524
|
}
|
|
523
525
|
if let Some(editing_duration) = odt_props.editing_duration {
|
|
524
526
|
metadata_map.insert(
|
|
525
|
-
"editing_duration"
|
|
527
|
+
Cow::Borrowed("editing_duration"),
|
|
526
528
|
serde_json::Value::String(editing_duration),
|
|
527
529
|
);
|
|
528
530
|
}
|
|
529
531
|
if let Some(editing_cycles) = odt_props.editing_cycles {
|
|
530
|
-
metadata_map.insert(
|
|
532
|
+
metadata_map.insert(
|
|
533
|
+
Cow::Borrowed("editing_cycles"),
|
|
534
|
+
serde_json::Value::String(editing_cycles),
|
|
535
|
+
);
|
|
531
536
|
}
|
|
532
537
|
if let Some(page_count) = odt_props.page_count {
|
|
533
|
-
metadata_map.insert(
|
|
538
|
+
metadata_map.insert(
|
|
539
|
+
Cow::Borrowed("page_count"),
|
|
540
|
+
serde_json::Value::Number(page_count.into()),
|
|
541
|
+
);
|
|
534
542
|
}
|
|
535
543
|
if let Some(word_count) = odt_props.word_count {
|
|
536
|
-
metadata_map.insert(
|
|
544
|
+
metadata_map.insert(
|
|
545
|
+
Cow::Borrowed("word_count"),
|
|
546
|
+
serde_json::Value::Number(word_count.into()),
|
|
547
|
+
);
|
|
537
548
|
}
|
|
538
549
|
if let Some(character_count) = odt_props.character_count {
|
|
539
550
|
metadata_map.insert(
|
|
540
|
-
"character_count"
|
|
551
|
+
Cow::Borrowed("character_count"),
|
|
541
552
|
serde_json::Value::Number(character_count.into()),
|
|
542
553
|
);
|
|
543
554
|
}
|
|
544
555
|
if let Some(paragraph_count) = odt_props.paragraph_count {
|
|
545
556
|
metadata_map.insert(
|
|
546
|
-
"paragraph_count"
|
|
557
|
+
Cow::Borrowed("paragraph_count"),
|
|
547
558
|
serde_json::Value::Number(paragraph_count.into()),
|
|
548
559
|
);
|
|
549
560
|
}
|
|
550
561
|
if let Some(table_count) = odt_props.table_count {
|
|
551
|
-
metadata_map.insert(
|
|
562
|
+
metadata_map.insert(
|
|
563
|
+
Cow::Borrowed("table_count"),
|
|
564
|
+
serde_json::Value::Number(table_count.into()),
|
|
565
|
+
);
|
|
552
566
|
}
|
|
553
567
|
if let Some(image_count) = odt_props.image_count {
|
|
554
|
-
metadata_map.insert(
|
|
568
|
+
metadata_map.insert(
|
|
569
|
+
Cow::Borrowed("image_count"),
|
|
570
|
+
serde_json::Value::Number(image_count.into()),
|
|
571
|
+
);
|
|
555
572
|
}
|
|
556
573
|
}
|
|
557
574
|
|
|
558
575
|
Ok(ExtractionResult {
|
|
559
576
|
content: text,
|
|
560
|
-
mime_type: mime_type.to_string(),
|
|
577
|
+
mime_type: mime_type.to_string().into(),
|
|
561
578
|
metadata: Metadata {
|
|
562
579
|
additional: metadata_map,
|
|
563
580
|
..Default::default()
|