kreuzberg 4.2.6 → 4.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +7 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +36 -9
- data/ext/kreuzberg_rb/native/Cargo.toml +32 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +4 -2
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -1
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +1 -1
- data/ext/kreuzberg_rb/native/src/result.rs +5 -3
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +228 -37
- data/spec/binding/batch_operations_spec.rb +2 -0
- data/vendor/Cargo.toml +3 -2
- data/vendor/kreuzberg/Cargo.toml +2 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/error.rs +29 -1
- data/vendor/kreuzberg/src/api/handlers.rs +28 -25
- data/vendor/kreuzberg/src/api/openapi.rs +14 -1
- data/vendor/kreuzberg/src/chunking/config.rs +2 -37
- data/vendor/kreuzberg/src/chunking/core.rs +78 -2
- data/vendor/kreuzberg/src/chunking/mod.rs +1 -1
- data/vendor/kreuzberg/src/chunking/processor.rs +15 -17
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +13 -9
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +12 -12
- data/vendor/kreuzberg/src/core/config/mod.rs +1 -1
- data/vendor/kreuzberg/src/core/config/processing.rs +65 -8
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +8 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +5 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +9 -9
- data/vendor/kreuzberg/src/core/extractor/file.rs +4 -2
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +7 -7
- data/vendor/kreuzberg/src/core/extractor/sync.rs +3 -3
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +2 -1
- data/vendor/kreuzberg/src/core/pipeline/features.rs +16 -22
- data/vendor/kreuzberg/src/core/pipeline/format.rs +20 -18
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +40 -35
- data/vendor/kreuzberg/src/extraction/email.rs +31 -19
- data/vendor/kreuzberg/src/extraction/excel.rs +6 -5
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +6 -1
- data/vendor/kreuzberg/src/extraction/html/types.rs +4 -3
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +10 -9
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +10 -8
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +8 -4
- data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
- data/vendor/kreuzberg/src/extraction/transform/content.rs +1 -1
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -7
- data/vendor/kreuzberg/src/extractors/archive.rs +7 -5
- data/vendor/kreuzberg/src/extractors/bibtex.rs +34 -17
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +7 -10
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +4 -2
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +3 -2
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +1 -1
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +2 -4
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +1 -1
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +4 -5
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +1 -1
- data/vendor/kreuzberg/src/extractors/docbook.rs +1 -1
- data/vendor/kreuzberg/src/extractors/docx.rs +32 -24
- data/vendor/kreuzberg/src/extractors/email.rs +5 -3
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +10 -10
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +7 -3
- data/vendor/kreuzberg/src/extractors/excel.rs +8 -6
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -1
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +15 -10
- data/vendor/kreuzberg/src/extractors/html.rs +1 -1
- data/vendor/kreuzberg/src/extractors/image.rs +3 -3
- data/vendor/kreuzberg/src/extractors/jats/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/jupyter.rs +11 -9
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +4 -3
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/markdown.rs +6 -4
- data/vendor/kreuzberg/src/extractors/odt.rs +38 -21
- data/vendor/kreuzberg/src/extractors/opml/core.rs +1 -1
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +13 -9
- data/vendor/kreuzberg/src/extractors/orgmode.rs +11 -9
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +10 -3
- data/vendor/kreuzberg/src/extractors/pptx.rs +13 -11
- data/vendor/kreuzberg/src/extractors/rst.rs +15 -13
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +22 -21
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/structured.rs +10 -5
- data/vendor/kreuzberg/src/extractors/text.rs +2 -2
- data/vendor/kreuzberg/src/extractors/typst.rs +11 -5
- data/vendor/kreuzberg/src/extractors/xml.rs +1 -1
- data/vendor/kreuzberg/src/keywords/processor.rs +9 -8
- data/vendor/kreuzberg/src/language_detection/processor.rs +6 -5
- data/vendor/kreuzberg/src/lib.rs +1 -1
- data/vendor/kreuzberg/src/mcp/errors.rs +7 -6
- data/vendor/kreuzberg/src/mcp/format.rs +5 -4
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +3 -2
- data/vendor/kreuzberg/src/ocr/hocr.rs +4 -2
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +128 -14
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +129 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +36 -6
- data/vendor/kreuzberg/src/ocr/types.rs +3 -4
- data/vendor/kreuzberg/src/ocr/validation.rs +14 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +1 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +3 -2
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +5 -4
- data/vendor/kreuzberg/src/plugins/ocr.rs +5 -4
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +13 -12
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +3 -2
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +3 -2
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +15 -14
- data/vendor/kreuzberg/src/text/quality.rs +13 -13
- data/vendor/kreuzberg/src/text/quality_processor.rs +7 -6
- data/vendor/kreuzberg/src/types/djot.rs +15 -4
- data/vendor/kreuzberg/src/types/extraction.rs +24 -4
- data/vendor/kreuzberg/src/types/formats.rs +9 -5
- data/vendor/kreuzberg/src/types/metadata.rs +68 -7
- data/vendor/kreuzberg/src/types/mod.rs +7 -5
- data/vendor/kreuzberg/src/types/page.rs +9 -0
- data/vendor/kreuzberg/src/types/tables.rs +2 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +2 -1
- data/vendor/kreuzberg/tests/config_behavioral.rs +12 -16
- data/vendor/kreuzberg/tests/config_features.rs +19 -11
- data/vendor/kreuzberg/tests/config_loading_tests.rs +9 -9
- data/vendor/kreuzberg/tests/contract_mcp.rs +2 -2
- data/vendor/kreuzberg/tests/core_integration.rs +5 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/pipeline_integration.rs +36 -32
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +19 -13
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +3 -2
- data/vendor/kreuzberg/tests/plugin_system.rs +7 -6
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +1 -1
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -1
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +3 -2
- data/vendor/kreuzberg-ffi/kreuzberg.h +32 -0
- data/vendor/kreuzberg-ffi/src/error.rs +56 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +6 -5
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +1 -1
- data/vendor/kreuzberg-ffi/src/result.rs +2 -1
- data/vendor/kreuzberg-ffi/src/result_view.rs +3 -2
- data/vendor/kreuzberg-ffi/src/string_intern.rs +3 -3
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +2 -2
|
@@ -21,6 +21,8 @@ use crate::types::{Element, ExtractionResult};
|
|
|
21
21
|
use content::{
|
|
22
22
|
add_page_break, format_table_as_text, process_content, process_hierarchy, process_images, process_tables,
|
|
23
23
|
};
|
|
24
|
+
#[cfg(test)]
|
|
25
|
+
use std::borrow::Cow;
|
|
24
26
|
|
|
25
27
|
/// Transform an extraction result into semantic elements.
|
|
26
28
|
///
|
|
@@ -117,7 +119,7 @@ pub fn transform_extraction_result_to_elements(result: &ExtractionResult) -> Vec
|
|
|
117
119
|
element_index: Some(elements.len()),
|
|
118
120
|
additional: {
|
|
119
121
|
let mut m = std::collections::HashMap::new();
|
|
120
|
-
m.insert("format".to_string(), image.format.
|
|
122
|
+
m.insert("format".to_string(), image.format.to_string());
|
|
121
123
|
if let Some(width) = image.width {
|
|
122
124
|
m.insert("width".to_string(), width.to_string());
|
|
123
125
|
}
|
|
@@ -138,6 +140,7 @@ pub fn transform_extraction_result_to_elements(result: &ExtractionResult) -> Vec
|
|
|
138
140
|
#[cfg(test)]
|
|
139
141
|
mod tests {
|
|
140
142
|
use super::*;
|
|
143
|
+
use bytes::Bytes;
|
|
141
144
|
|
|
142
145
|
#[test]
|
|
143
146
|
fn test_detect_bullet_items() {
|
|
@@ -262,7 +265,7 @@ mod tests {
|
|
|
262
265
|
// Create a mock result with pages and hierarchy
|
|
263
266
|
let result = ExtractionResult {
|
|
264
267
|
content: "Full document content".to_string(),
|
|
265
|
-
mime_type: "application/pdf"
|
|
268
|
+
mime_type: Cow::Borrowed("application/pdf"),
|
|
266
269
|
metadata: test_metadata(Some("Test Document".to_string())),
|
|
267
270
|
tables: vec![],
|
|
268
271
|
detected_languages: None,
|
|
@@ -358,8 +361,8 @@ mod tests {
|
|
|
358
361
|
};
|
|
359
362
|
|
|
360
363
|
let image = ExtractedImage {
|
|
361
|
-
data:
|
|
362
|
-
format: "jpeg"
|
|
364
|
+
data: Bytes::from_static(&[1, 2, 3, 4]),
|
|
365
|
+
format: std::borrow::Cow::Borrowed("jpeg"),
|
|
363
366
|
image_index: 0,
|
|
364
367
|
page_number: Some(1),
|
|
365
368
|
width: Some(640),
|
|
@@ -373,7 +376,7 @@ mod tests {
|
|
|
373
376
|
|
|
374
377
|
let result = ExtractionResult {
|
|
375
378
|
content: "Test content".to_string(),
|
|
376
|
-
mime_type: "application/pdf"
|
|
379
|
+
mime_type: Cow::Borrowed("application/pdf"),
|
|
377
380
|
metadata: test_metadata(Some("Test".to_string())),
|
|
378
381
|
tables: vec![],
|
|
379
382
|
detected_languages: None,
|
|
@@ -421,7 +424,7 @@ mod tests {
|
|
|
421
424
|
// Create a result without pages
|
|
422
425
|
let result = ExtractionResult {
|
|
423
426
|
content: "Simple text content\n\nSecond paragraph".to_string(),
|
|
424
|
-
mime_type: "text/plain"
|
|
427
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
425
428
|
metadata: test_metadata(Some("Simple Doc".to_string())),
|
|
426
429
|
tables: vec![],
|
|
427
430
|
detected_languages: None,
|
|
@@ -453,7 +456,7 @@ mod tests {
|
|
|
453
456
|
|
|
454
457
|
let result = ExtractionResult {
|
|
455
458
|
content: "First paragraph.\n\nSecond paragraph.\n\nThird paragraph.".to_string(),
|
|
456
|
-
mime_type: "text/plain"
|
|
459
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
457
460
|
metadata: test_metadata(None),
|
|
458
461
|
tables: vec![],
|
|
459
462
|
detected_languages: None,
|
|
@@ -8,7 +8,9 @@ use crate::extraction::archive::{
|
|
|
8
8
|
};
|
|
9
9
|
use crate::plugins::{DocumentExtractor, Plugin};
|
|
10
10
|
use crate::types::{ArchiveMetadata, ExtractionResult, Metadata};
|
|
11
|
+
use ahash::AHashMap;
|
|
11
12
|
use async_trait::async_trait;
|
|
13
|
+
use std::borrow::Cow;
|
|
12
14
|
use std::collections::HashMap;
|
|
13
15
|
|
|
14
16
|
/// Build an ExtractionResult from archive metadata and text contents.
|
|
@@ -18,7 +20,7 @@ use std::collections::HashMap;
|
|
|
18
20
|
fn build_archive_result(
|
|
19
21
|
extraction_metadata: ExtractedMetadata,
|
|
20
22
|
text_contents: HashMap<String, String>,
|
|
21
|
-
format_name: &str,
|
|
23
|
+
format_name: &'static str,
|
|
22
24
|
mime_type: &str,
|
|
23
25
|
) -> ExtractionResult {
|
|
24
26
|
let file_names: Vec<String> = extraction_metadata
|
|
@@ -28,14 +30,14 @@ fn build_archive_result(
|
|
|
28
30
|
.collect();
|
|
29
31
|
|
|
30
32
|
let archive_metadata = ArchiveMetadata {
|
|
31
|
-
format: format_name
|
|
33
|
+
format: Cow::Borrowed(format_name),
|
|
32
34
|
file_count: extraction_metadata.file_count,
|
|
33
35
|
file_list: file_names,
|
|
34
36
|
total_size: extraction_metadata.total_size as usize,
|
|
35
37
|
compressed_size: None,
|
|
36
38
|
};
|
|
37
39
|
|
|
38
|
-
let mut additional =
|
|
40
|
+
let mut additional = AHashMap::new();
|
|
39
41
|
let file_details: Vec<serde_json::Value> = extraction_metadata
|
|
40
42
|
.file_list
|
|
41
43
|
.iter()
|
|
@@ -47,7 +49,7 @@ fn build_archive_result(
|
|
|
47
49
|
})
|
|
48
50
|
})
|
|
49
51
|
.collect();
|
|
50
|
-
additional.insert("files"
|
|
52
|
+
additional.insert(Cow::Borrowed("files"), serde_json::json!(file_details));
|
|
51
53
|
|
|
52
54
|
let mut output = format!(
|
|
53
55
|
"{} Archive ({} files, {} bytes)\n\n",
|
|
@@ -67,7 +69,7 @@ fn build_archive_result(
|
|
|
67
69
|
|
|
68
70
|
ExtractionResult {
|
|
69
71
|
content: output,
|
|
70
|
-
mime_type: mime_type.to_string(),
|
|
72
|
+
mime_type: mime_type.to_string().into(),
|
|
71
73
|
metadata: Metadata {
|
|
72
74
|
format: Some(crate::types::FormatMetadata::Archive(archive_metadata)),
|
|
73
75
|
additional,
|
|
@@ -7,8 +7,10 @@ use crate::Result;
|
|
|
7
7
|
use crate::core::config::ExtractionConfig;
|
|
8
8
|
use crate::plugins::{DocumentExtractor, Plugin};
|
|
9
9
|
use crate::types::{ExtractionResult, Metadata};
|
|
10
|
+
use ahash::AHashMap;
|
|
10
11
|
use async_trait::async_trait;
|
|
11
|
-
use std::
|
|
12
|
+
use std::borrow::Cow;
|
|
13
|
+
use std::collections::HashSet;
|
|
12
14
|
|
|
13
15
|
#[cfg(feature = "office")]
|
|
14
16
|
use biblatex::{Bibliography, ChunksExt};
|
|
@@ -79,7 +81,7 @@ impl DocumentExtractor for BibtexExtractor {
|
|
|
79
81
|
let mut entries_vec = Vec::new();
|
|
80
82
|
let mut authors_set = HashSet::new();
|
|
81
83
|
let mut years_set = HashSet::new();
|
|
82
|
-
let mut entry_types_map =
|
|
84
|
+
let mut entry_types_map: AHashMap<String, i32> = AHashMap::new();
|
|
83
85
|
let mut formatted_entries = String::new();
|
|
84
86
|
|
|
85
87
|
match Bibliography::parse(&bibtex_str) {
|
|
@@ -129,19 +131,19 @@ impl DocumentExtractor for BibtexExtractor {
|
|
|
129
131
|
}
|
|
130
132
|
}
|
|
131
133
|
|
|
132
|
-
let mut additional =
|
|
134
|
+
let mut additional: AHashMap<Cow<'static, str>, serde_json::Value> = AHashMap::new();
|
|
133
135
|
|
|
134
|
-
additional.insert("entry_count"
|
|
136
|
+
additional.insert(Cow::Borrowed("entry_count"), serde_json::json!(entries_vec.len()));
|
|
135
137
|
|
|
136
138
|
let mut authors_list: Vec<String> = authors_set.into_iter().collect();
|
|
137
139
|
authors_list.sort();
|
|
138
|
-
additional.insert("authors"
|
|
140
|
+
additional.insert(Cow::Borrowed("authors"), serde_json::json!(authors_list));
|
|
139
141
|
|
|
140
142
|
if !years_set.is_empty() {
|
|
141
143
|
let min_year = years_set.iter().min().copied().unwrap_or(0);
|
|
142
144
|
let max_year = years_set.iter().max().copied().unwrap_or(0);
|
|
143
145
|
additional.insert(
|
|
144
|
-
"year_range"
|
|
146
|
+
Cow::Borrowed("year_range"),
|
|
145
147
|
serde_json::json!({
|
|
146
148
|
"min": min_year,
|
|
147
149
|
"max": max_year,
|
|
@@ -155,14 +157,14 @@ impl DocumentExtractor for BibtexExtractor {
|
|
|
155
157
|
for (entry_type, count) in entry_types_map {
|
|
156
158
|
entry_types_json[entry_type] = serde_json::json!(count);
|
|
157
159
|
}
|
|
158
|
-
additional.insert("entry_types"
|
|
160
|
+
additional.insert(Cow::Borrowed("entry_types"), entry_types_json);
|
|
159
161
|
}
|
|
160
162
|
|
|
161
|
-
additional.insert("citation_keys"
|
|
163
|
+
additional.insert(Cow::Borrowed("citation_keys"), serde_json::json!(entries_vec));
|
|
162
164
|
|
|
163
165
|
Ok(ExtractionResult {
|
|
164
166
|
content: formatted_entries,
|
|
165
|
-
mime_type: mime_type.to_string(),
|
|
167
|
+
mime_type: mime_type.to_string().into(),
|
|
166
168
|
metadata: Metadata {
|
|
167
169
|
additional,
|
|
168
170
|
..Default::default()
|
|
@@ -222,7 +224,10 @@ mod tests {
|
|
|
222
224
|
assert!(result.content.contains("Sample Title"));
|
|
223
225
|
|
|
224
226
|
let metadata = &result.metadata;
|
|
225
|
-
assert_eq!(
|
|
227
|
+
assert_eq!(
|
|
228
|
+
metadata.additional.get(&Cow::Borrowed("entry_count")),
|
|
229
|
+
Some(&serde_json::json!(1))
|
|
230
|
+
);
|
|
226
231
|
}
|
|
227
232
|
|
|
228
233
|
#[tokio::test]
|
|
@@ -258,15 +263,18 @@ mod tests {
|
|
|
258
263
|
|
|
259
264
|
let metadata = &result.metadata;
|
|
260
265
|
|
|
261
|
-
assert_eq!(
|
|
266
|
+
assert_eq!(
|
|
267
|
+
metadata.additional.get(&Cow::Borrowed("entry_count")),
|
|
268
|
+
Some(&serde_json::json!(3))
|
|
269
|
+
);
|
|
262
270
|
|
|
263
|
-
if let Some(keys) = metadata.additional.get("citation_keys")
|
|
271
|
+
if let Some(keys) = metadata.additional.get(&Cow::Borrowed("citation_keys"))
|
|
264
272
|
&& let Some(keys_array) = keys.as_array()
|
|
265
273
|
{
|
|
266
274
|
assert_eq!(keys_array.len(), 3);
|
|
267
275
|
}
|
|
268
276
|
|
|
269
|
-
if let Some(types) = metadata.additional.get("entry_types") {
|
|
277
|
+
if let Some(types) = metadata.additional.get(&Cow::Borrowed("entry_types")) {
|
|
270
278
|
assert!(types.get("article").is_some());
|
|
271
279
|
assert!(types.get("book").is_some());
|
|
272
280
|
assert!(types.get("inproceedings").is_some());
|
|
@@ -330,7 +338,10 @@ mod tests {
|
|
|
330
338
|
assert!(result.content.contains("The TeXbook"));
|
|
331
339
|
|
|
332
340
|
let metadata = &result.metadata;
|
|
333
|
-
assert_eq!(
|
|
341
|
+
assert_eq!(
|
|
342
|
+
metadata.additional.get(&Cow::Borrowed("entry_count")),
|
|
343
|
+
Some(&serde_json::json!(1))
|
|
344
|
+
);
|
|
334
345
|
|
|
335
346
|
if let Some(year_range) = metadata.additional.get("year_range") {
|
|
336
347
|
assert_eq!(year_range.get("min"), Some(&serde_json::json!(1984)));
|
|
@@ -368,7 +379,10 @@ mod tests {
|
|
|
368
379
|
let result = result.expect("Should extract valid metadata");
|
|
369
380
|
let metadata = &result.metadata;
|
|
370
381
|
|
|
371
|
-
assert_eq!(
|
|
382
|
+
assert_eq!(
|
|
383
|
+
metadata.additional.get(&Cow::Borrowed("entry_count")),
|
|
384
|
+
Some(&serde_json::json!(3))
|
|
385
|
+
);
|
|
372
386
|
|
|
373
387
|
if let Some(authors) = metadata.additional.get("authors")
|
|
374
388
|
&& let Some(authors_array) = authors.as_array()
|
|
@@ -381,7 +395,7 @@ mod tests {
|
|
|
381
395
|
assert_eq!(year_range.get("max"), Some(&serde_json::json!(2021)));
|
|
382
396
|
}
|
|
383
397
|
|
|
384
|
-
if let Some(types) = metadata.additional.get("entry_types") {
|
|
398
|
+
if let Some(types) = metadata.additional.get(&Cow::Borrowed("entry_types")) {
|
|
385
399
|
assert_eq!(types.get("article"), Some(&serde_json::json!(2)));
|
|
386
400
|
assert_eq!(types.get("book"), Some(&serde_json::json!(1)));
|
|
387
401
|
}
|
|
@@ -401,7 +415,10 @@ mod tests {
|
|
|
401
415
|
let result = result.expect("Should extract empty bibliography");
|
|
402
416
|
let metadata = &result.metadata;
|
|
403
417
|
|
|
404
|
-
assert_eq!(
|
|
418
|
+
assert_eq!(
|
|
419
|
+
metadata.additional.get(&Cow::Borrowed("entry_count")),
|
|
420
|
+
Some(&serde_json::json!(0))
|
|
421
|
+
);
|
|
405
422
|
}
|
|
406
423
|
|
|
407
424
|
#[tokio::test]
|
|
@@ -2,8 +2,6 @@
|
|
|
2
2
|
//!
|
|
3
3
|
//! Handles parsing of Djot attributes from jotdown events and string syntax.
|
|
4
4
|
|
|
5
|
-
use std::collections::HashMap;
|
|
6
|
-
|
|
7
5
|
/// Parse jotdown attributes into our Attributes representation.
|
|
8
6
|
///
|
|
9
7
|
/// Converts jotdown's internal attribute representation to Kreuzberg's
|
|
@@ -14,7 +12,7 @@ pub fn parse_jotdown_attributes(attrs: &jotdown::Attributes) -> crate::types::At
|
|
|
14
12
|
|
|
15
13
|
let mut id = None;
|
|
16
14
|
let mut classes = Vec::new();
|
|
17
|
-
let mut key_values =
|
|
15
|
+
let mut key_values = Vec::new();
|
|
18
16
|
|
|
19
17
|
for (kind, value) in attrs.iter() {
|
|
20
18
|
match kind {
|
|
@@ -26,7 +24,7 @@ pub fn parse_jotdown_attributes(attrs: &jotdown::Attributes) -> crate::types::At
|
|
|
26
24
|
classes.push(value.to_string());
|
|
27
25
|
}
|
|
28
26
|
AttributeKind::Pair { key } => {
|
|
29
|
-
key_values.
|
|
27
|
+
key_values.push((key.to_string(), value.to_string()));
|
|
30
28
|
}
|
|
31
29
|
AttributeKind::Comment => {
|
|
32
30
|
// Comments are ignored in our representation
|
|
@@ -49,7 +47,7 @@ pub fn parse_djot_attributes(attr_str: &str) -> crate::types::Attributes {
|
|
|
49
47
|
let mut attrs = Attributes {
|
|
50
48
|
id: None,
|
|
51
49
|
classes: Vec::new(),
|
|
52
|
-
key_values:
|
|
50
|
+
key_values: Vec::new(),
|
|
53
51
|
};
|
|
54
52
|
|
|
55
53
|
// Simple parser for attribute syntax
|
|
@@ -66,7 +64,7 @@ pub fn parse_djot_attributes(attr_str: &str) -> crate::types::Attributes {
|
|
|
66
64
|
// Key-value pair
|
|
67
65
|
if let Some((key, value)) = token.split_once('=') {
|
|
68
66
|
let clean_value = value.trim_matches('"').trim_matches('\'');
|
|
69
|
-
attrs.key_values.
|
|
67
|
+
attrs.key_values.push((key.to_string(), clean_value.to_string()));
|
|
70
68
|
}
|
|
71
69
|
}
|
|
72
70
|
}
|
|
@@ -106,12 +104,11 @@ mod tests {
|
|
|
106
104
|
|
|
107
105
|
#[test]
|
|
108
106
|
fn test_render_attributes_with_all_parts() {
|
|
109
|
-
let
|
|
107
|
+
let attrs = crate::types::Attributes {
|
|
110
108
|
id: Some("my-id".to_string()),
|
|
111
109
|
classes: vec!["class1".to_string(), "class2".to_string()],
|
|
112
|
-
key_values:
|
|
110
|
+
key_values: vec![("data-test".to_string(), "value".to_string())],
|
|
113
111
|
};
|
|
114
|
-
attrs.key_values.insert("data-test".to_string(), "value".to_string());
|
|
115
112
|
|
|
116
113
|
let rendered = render_attributes(&attrs);
|
|
117
114
|
assert!(rendered.contains("#my-id"));
|
|
@@ -125,7 +122,7 @@ mod tests {
|
|
|
125
122
|
let attrs = crate::types::Attributes {
|
|
126
123
|
id: None,
|
|
127
124
|
classes: vec![],
|
|
128
|
-
key_values:
|
|
125
|
+
key_values: Vec::new(),
|
|
129
126
|
};
|
|
130
127
|
|
|
131
128
|
let rendered = render_attributes(&attrs);
|
|
@@ -7,6 +7,8 @@
|
|
|
7
7
|
|
|
8
8
|
use super::rendering::render_block_to_djot;
|
|
9
9
|
use jotdown::Parser;
|
|
10
|
+
#[cfg(test)]
|
|
11
|
+
use std::borrow::Cow;
|
|
10
12
|
|
|
11
13
|
/// Convert DjotContent back to djot markup.
|
|
12
14
|
///
|
|
@@ -150,7 +152,7 @@ mod tests {
|
|
|
150
152
|
fn test_extraction_result_to_djot_with_djot_content() {
|
|
151
153
|
let result = ExtractionResult {
|
|
152
154
|
content: "Test content".to_string(),
|
|
153
|
-
mime_type: "text/djot"
|
|
155
|
+
mime_type: Cow::Borrowed("text/djot"),
|
|
154
156
|
metadata: Metadata::default(),
|
|
155
157
|
tables: vec![],
|
|
156
158
|
detected_languages: None,
|
|
@@ -191,7 +193,7 @@ mod tests {
|
|
|
191
193
|
fn test_extraction_result_to_djot_without_djot_content() {
|
|
192
194
|
let result = ExtractionResult {
|
|
193
195
|
content: "Paragraph one\n\nParagraph two".to_string(),
|
|
194
|
-
mime_type: "text/plain"
|
|
196
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
195
197
|
metadata: Metadata::default(),
|
|
196
198
|
tables: vec![],
|
|
197
199
|
detected_languages: None,
|
|
@@ -9,6 +9,7 @@ use crate::plugins::{DocumentExtractor, Plugin};
|
|
|
9
9
|
use crate::types::{ExtractionResult, Metadata};
|
|
10
10
|
use async_trait::async_trait;
|
|
11
11
|
use jotdown::{Event, Parser};
|
|
12
|
+
use std::borrow::Cow;
|
|
12
13
|
|
|
13
14
|
/// Djot markup extractor with metadata and table support.
|
|
14
15
|
///
|
|
@@ -90,7 +91,7 @@ impl DocumentExtractor for DjotExtractor {
|
|
|
90
91
|
if !metadata.additional.contains_key("title")
|
|
91
92
|
&& let Some(title) = crate::extractors::frontmatter_utils::extract_title_from_content(&remaining_content)
|
|
92
93
|
{
|
|
93
|
-
metadata.additional.insert("title"
|
|
94
|
+
metadata.additional.insert(Cow::Borrowed("title"), title.into());
|
|
94
95
|
}
|
|
95
96
|
|
|
96
97
|
// Parse with jotdown and collect events once for extraction
|
|
@@ -105,7 +106,7 @@ impl DocumentExtractor for DjotExtractor {
|
|
|
105
106
|
|
|
106
107
|
Ok(ExtractionResult {
|
|
107
108
|
content: extracted_text,
|
|
108
|
-
mime_type: mime_type.to_string(),
|
|
109
|
+
mime_type: mime_type.to_string().into(),
|
|
109
110
|
metadata,
|
|
110
111
|
tables,
|
|
111
112
|
detected_languages: None,
|
|
@@ -135,7 +135,7 @@ pub(super) fn handle_block_start(
|
|
|
135
135
|
}
|
|
136
136
|
Container::TaskListItem { checked } => {
|
|
137
137
|
let mut attrs = parsed_attrs.unwrap_or_default();
|
|
138
|
-
attrs.key_values.
|
|
138
|
+
attrs.key_values.push(("checked".to_string(), checked.to_string()));
|
|
139
139
|
push_block(
|
|
140
140
|
state,
|
|
141
141
|
FormattedBlock {
|
|
@@ -14,8 +14,6 @@ use super::text_extraction::extract_text_from_events;
|
|
|
14
14
|
use crate::extractors::djot_format::attributes::parse_jotdown_attributes;
|
|
15
15
|
use crate::types::{Attributes, DjotContent, DjotImage, DjotLink, FormattedBlock};
|
|
16
16
|
use jotdown::{Container, Event};
|
|
17
|
-
use std::collections::HashMap;
|
|
18
|
-
|
|
19
17
|
/// Extract complete djot content with 100% feature extraction.
|
|
20
18
|
///
|
|
21
19
|
/// Processes ALL djot events to build a rich DjotContent structure including:
|
|
@@ -42,7 +40,7 @@ pub fn extract_complete_djot_content(
|
|
|
42
40
|
let mut images = Vec::new();
|
|
43
41
|
let mut links = Vec::new();
|
|
44
42
|
let mut footnotes = Vec::new();
|
|
45
|
-
let attributes_map:
|
|
43
|
+
let attributes_map: Vec<(String, Attributes)> = Vec::new();
|
|
46
44
|
|
|
47
45
|
let mut state = ExtractionState::new();
|
|
48
46
|
|
|
@@ -186,7 +184,7 @@ fn handle_start_event(
|
|
|
186
184
|
};
|
|
187
185
|
|
|
188
186
|
// Try block handlers first
|
|
189
|
-
if handle_block_start(state, container, attrs, parsed_attrs.
|
|
187
|
+
if handle_block_start(state, container, attrs, parsed_attrs.as_ref().cloned(), footnotes) {
|
|
190
188
|
return;
|
|
191
189
|
}
|
|
192
190
|
|
|
@@ -9,7 +9,7 @@ use std::collections::HashMap;
|
|
|
9
9
|
pub(super) fn handle_footnote_reference(state: &mut ExtractionState, label: &str) {
|
|
10
10
|
state.flush_text();
|
|
11
11
|
|
|
12
|
-
let mut meta = HashMap::new();
|
|
12
|
+
let mut meta: HashMap<String, String> = HashMap::new();
|
|
13
13
|
meta.insert("label".to_string(), label.to_string());
|
|
14
14
|
|
|
15
15
|
state.current_inline_elements.push(InlineElement {
|
|
@@ -3,7 +3,6 @@
|
|
|
3
3
|
use super::state::ExtractionState;
|
|
4
4
|
use crate::types::{DjotImage, DjotLink, InlineElement, InlineType};
|
|
5
5
|
use jotdown::Container;
|
|
6
|
-
use std::collections::HashMap;
|
|
7
6
|
|
|
8
7
|
/// Handle start of inline elements.
|
|
9
8
|
pub(super) fn handle_inline_start(
|
|
@@ -123,7 +122,7 @@ pub(super) fn handle_math_end(state: &mut ExtractionState, display: bool) {
|
|
|
123
122
|
let math_text = std::mem::take(&mut state.math_content);
|
|
124
123
|
state.inline_type_stack.pop();
|
|
125
124
|
|
|
126
|
-
let mut meta = HashMap::new();
|
|
125
|
+
let mut meta: std::collections::HashMap<String, String> = std::collections::HashMap::new();
|
|
127
126
|
meta.insert("display".to_string(), display.to_string());
|
|
128
127
|
|
|
129
128
|
state.current_inline_elements.push(InlineElement {
|
|
@@ -144,7 +143,7 @@ pub(super) fn finalize_inline_element(state: &mut ExtractionState, container: &C
|
|
|
144
143
|
if matches!(container, Container::RawInline { .. })
|
|
145
144
|
&& let Some(fmt) = state.raw_format.take()
|
|
146
145
|
{
|
|
147
|
-
let mut m = HashMap::new();
|
|
146
|
+
let mut m: std::collections::HashMap<String, String> = std::collections::HashMap::new();
|
|
148
147
|
m.insert("format".to_string(), fmt);
|
|
149
148
|
meta = Some(m);
|
|
150
149
|
}
|
|
@@ -167,7 +166,7 @@ pub(super) fn handle_link_end(state: &mut ExtractionState, url: &str, links: &mu
|
|
|
167
166
|
}
|
|
168
167
|
state.inline_type_stack.pop();
|
|
169
168
|
|
|
170
|
-
let mut meta = HashMap::new();
|
|
169
|
+
let mut meta: std::collections::HashMap<String, String> = std::collections::HashMap::new();
|
|
171
170
|
meta.insert("href".to_string(), url.to_string());
|
|
172
171
|
|
|
173
172
|
state.current_inline_elements.push(InlineElement {
|
|
@@ -188,7 +187,7 @@ pub(super) fn handle_image_end(state: &mut ExtractionState, src: &str, images: &
|
|
|
188
187
|
}
|
|
189
188
|
state.inline_type_stack.pop();
|
|
190
189
|
|
|
191
|
-
let mut meta = HashMap::new();
|
|
190
|
+
let mut meta: std::collections::HashMap<String, String> = std::collections::HashMap::new();
|
|
192
191
|
meta.insert("src".to_string(), src.to_string());
|
|
193
192
|
|
|
194
193
|
state.current_inline_elements.push(InlineElement {
|
|
@@ -43,7 +43,7 @@ pub fn extract_tables_from_events(events: &[Event]) -> Vec<Table> {
|
|
|
43
43
|
if !current_row.is_empty()
|
|
44
44
|
&& let Some((ref mut rows, _)) = current_table
|
|
45
45
|
{
|
|
46
|
-
rows.push(current_row
|
|
46
|
+
rows.push(std::mem::take(&mut current_row));
|
|
47
47
|
}
|
|
48
48
|
current_row = Vec::new();
|
|
49
49
|
}
|
|
@@ -9,7 +9,9 @@ use crate::core::config::ExtractionConfig;
|
|
|
9
9
|
use crate::extraction::{cells_to_markdown, office_metadata};
|
|
10
10
|
use crate::plugins::{DocumentExtractor, Plugin};
|
|
11
11
|
use crate::types::{ExtractionResult, Metadata, PageBoundary, PageInfo, PageStructure, PageUnitType, Table};
|
|
12
|
+
use ahash::AHashMap;
|
|
12
13
|
use async_trait::async_trait;
|
|
14
|
+
use std::borrow::Cow;
|
|
13
15
|
use std::io::Cursor;
|
|
14
16
|
|
|
15
17
|
/// High-performance DOCX extractor using docx-lite.
|
|
@@ -181,22 +183,22 @@ impl DocumentExtractor for DocxExtractor {
|
|
|
181
183
|
.map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to open ZIP archive: {}", e)))?
|
|
182
184
|
};
|
|
183
185
|
|
|
184
|
-
let mut metadata_map =
|
|
186
|
+
let mut metadata_map = AHashMap::new();
|
|
185
187
|
let mut parsed_keywords: Option<Vec<String>> = None;
|
|
186
188
|
|
|
187
189
|
if let Ok(core) = office_metadata::extract_core_properties(&mut archive) {
|
|
188
190
|
if let Some(title) = core.title {
|
|
189
|
-
metadata_map.insert("title"
|
|
191
|
+
metadata_map.insert(Cow::Borrowed("title"), serde_json::Value::String(title));
|
|
190
192
|
}
|
|
191
193
|
if let Some(creator) = core.creator {
|
|
192
194
|
metadata_map.insert(
|
|
193
|
-
"authors"
|
|
195
|
+
Cow::Borrowed("authors"),
|
|
194
196
|
serde_json::Value::Array(vec![serde_json::Value::String(creator.clone())]),
|
|
195
197
|
);
|
|
196
|
-
metadata_map.insert("created_by"
|
|
198
|
+
metadata_map.insert(Cow::Borrowed("created_by"), serde_json::Value::String(creator));
|
|
197
199
|
}
|
|
198
200
|
if let Some(subject) = core.subject {
|
|
199
|
-
metadata_map.insert("subject"
|
|
201
|
+
metadata_map.insert(Cow::Borrowed("subject"), serde_json::Value::String(subject));
|
|
200
202
|
}
|
|
201
203
|
if let Some(keywords) = core.keywords {
|
|
202
204
|
// Parse comma-separated keywords into Vec<String>
|
|
@@ -209,70 +211,76 @@ impl DocumentExtractor for DocxExtractor {
|
|
|
209
211
|
);
|
|
210
212
|
}
|
|
211
213
|
if let Some(description) = core.description {
|
|
212
|
-
metadata_map.insert("description"
|
|
214
|
+
metadata_map.insert(Cow::Borrowed("description"), serde_json::Value::String(description));
|
|
213
215
|
}
|
|
214
216
|
if let Some(modified_by) = core.last_modified_by {
|
|
215
|
-
metadata_map.insert("modified_by"
|
|
217
|
+
metadata_map.insert(Cow::Borrowed("modified_by"), serde_json::Value::String(modified_by));
|
|
216
218
|
}
|
|
217
219
|
if let Some(created) = core.created {
|
|
218
|
-
metadata_map.insert("created_at"
|
|
220
|
+
metadata_map.insert(Cow::Borrowed("created_at"), serde_json::Value::String(created));
|
|
219
221
|
}
|
|
220
222
|
if let Some(modified) = core.modified {
|
|
221
|
-
metadata_map.insert("modified_at"
|
|
223
|
+
metadata_map.insert(Cow::Borrowed("modified_at"), serde_json::Value::String(modified));
|
|
222
224
|
}
|
|
223
225
|
if let Some(revision) = core.revision {
|
|
224
|
-
metadata_map.insert("revision"
|
|
226
|
+
metadata_map.insert(Cow::Borrowed("revision"), serde_json::Value::String(revision));
|
|
225
227
|
}
|
|
226
228
|
if let Some(category) = core.category {
|
|
227
|
-
metadata_map.insert("category"
|
|
229
|
+
metadata_map.insert(Cow::Borrowed("category"), serde_json::Value::String(category));
|
|
228
230
|
}
|
|
229
231
|
if let Some(content_status) = core.content_status {
|
|
230
|
-
metadata_map.insert(
|
|
232
|
+
metadata_map.insert(
|
|
233
|
+
Cow::Borrowed("content_status"),
|
|
234
|
+
serde_json::Value::String(content_status),
|
|
235
|
+
);
|
|
231
236
|
}
|
|
232
237
|
if let Some(language) = core.language {
|
|
233
|
-
metadata_map.insert("language"
|
|
238
|
+
metadata_map.insert(Cow::Borrowed("language"), serde_json::Value::String(language));
|
|
234
239
|
}
|
|
235
240
|
}
|
|
236
241
|
|
|
237
242
|
if let Ok(app) = office_metadata::extract_docx_app_properties(&mut archive) {
|
|
238
243
|
if let Some(pages) = app.pages {
|
|
239
|
-
metadata_map.insert("page_count"
|
|
244
|
+
metadata_map.insert(Cow::Borrowed("page_count"), serde_json::Value::Number(pages.into()));
|
|
240
245
|
}
|
|
241
246
|
if let Some(words) = app.words {
|
|
242
|
-
metadata_map.insert("word_count"
|
|
247
|
+
metadata_map.insert(Cow::Borrowed("word_count"), serde_json::Value::Number(words.into()));
|
|
243
248
|
}
|
|
244
249
|
if let Some(chars) = app.characters {
|
|
245
|
-
metadata_map.insert(
|
|
250
|
+
metadata_map.insert(
|
|
251
|
+
Cow::Borrowed("character_count"),
|
|
252
|
+
serde_json::Value::Number(chars.into()),
|
|
253
|
+
);
|
|
246
254
|
}
|
|
247
255
|
if let Some(lines) = app.lines {
|
|
248
|
-
metadata_map.insert("line_count"
|
|
256
|
+
metadata_map.insert(Cow::Borrowed("line_count"), serde_json::Value::Number(lines.into()));
|
|
249
257
|
}
|
|
250
258
|
if let Some(paragraphs) = app.paragraphs {
|
|
251
259
|
metadata_map.insert(
|
|
252
|
-
"paragraph_count"
|
|
260
|
+
Cow::Borrowed("paragraph_count"),
|
|
253
261
|
serde_json::Value::Number(paragraphs.into()),
|
|
254
262
|
);
|
|
255
263
|
}
|
|
256
264
|
if let Some(template) = app.template {
|
|
257
|
-
metadata_map.insert("template"
|
|
265
|
+
metadata_map.insert(Cow::Borrowed("template"), serde_json::Value::String(template));
|
|
258
266
|
}
|
|
259
267
|
if let Some(company) = app.company {
|
|
260
|
-
metadata_map.insert("
|
|
268
|
+
metadata_map.insert(Cow::Borrowed("company"), serde_json::Value::String(company));
|
|
261
269
|
}
|
|
262
270
|
if let Some(time) = app.total_time {
|
|
263
271
|
metadata_map.insert(
|
|
264
|
-
"total_editing_time_minutes"
|
|
272
|
+
Cow::Borrowed("total_editing_time_minutes"),
|
|
265
273
|
serde_json::Value::Number(time.into()),
|
|
266
274
|
);
|
|
267
275
|
}
|
|
268
276
|
if let Some(application) = app.application {
|
|
269
|
-
metadata_map.insert("application"
|
|
277
|
+
metadata_map.insert(Cow::Borrowed("application"), serde_json::Value::String(application));
|
|
270
278
|
}
|
|
271
279
|
}
|
|
272
280
|
|
|
273
281
|
if let Ok(custom) = office_metadata::extract_custom_properties(&mut archive) {
|
|
274
282
|
for (key, value) in custom {
|
|
275
|
-
metadata_map.insert(format!("custom_{}", key), value);
|
|
283
|
+
metadata_map.insert(Cow::Owned(format!("custom_{}", key)), value);
|
|
276
284
|
}
|
|
277
285
|
}
|
|
278
286
|
|
|
@@ -301,7 +309,7 @@ impl DocumentExtractor for DocxExtractor {
|
|
|
301
309
|
|
|
302
310
|
Ok(ExtractionResult {
|
|
303
311
|
content: text,
|
|
304
|
-
mime_type: mime_type.to_string(),
|
|
312
|
+
mime_type: mime_type.to_string().into(),
|
|
305
313
|
metadata: Metadata {
|
|
306
314
|
pages: page_structure,
|
|
307
315
|
keywords: parsed_keywords,
|