kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +11 -11
- data/README.md +5 -10
- data/examples/async_patterns.rb +0 -1
- data/ext/kreuzberg_rb/extconf.rb +0 -10
- data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
- data/ext/kreuzberg_rb/native/build.rs +2 -0
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
- data/kreuzberg.gemspec +14 -57
- data/lib/kreuzberg/cache_api.rb +0 -1
- data/lib/kreuzberg/cli.rb +2 -2
- data/lib/kreuzberg/config.rb +2 -9
- data/lib/kreuzberg/errors.rb +7 -75
- data/lib/kreuzberg/extraction_api.rb +0 -1
- data/lib/kreuzberg/setup_lib_path.rb +0 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -21
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +3 -55
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/spec_helper.rb +1 -1
- data/vendor/kreuzberg/Cargo.toml +42 -112
- data/vendor/kreuzberg/README.md +2 -2
- data/vendor/kreuzberg/build.rs +4 -18
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/extractor.rs +81 -202
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +1 -4
- data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
- data/vendor/kreuzberg/src/embeddings.rs +16 -125
- data/vendor/kreuzberg/src/error.rs +1 -1
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
- data/vendor/kreuzberg/src/extraction/image.rs +13 -13
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
- data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
- data/vendor/kreuzberg/src/extractors/email.rs +0 -14
- data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
- data/vendor/kreuzberg/src/extractors/html.rs +154 -137
- data/vendor/kreuzberg/src/extractors/image.rs +4 -7
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
- data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
- data/vendor/kreuzberg/src/extractors/text.rs +5 -23
- data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/lib.rs +1 -4
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
- data/vendor/kreuzberg/src/mcp/server.rs +3 -5
- data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
- data/vendor/kreuzberg/src/pdf/error.rs +1 -1
- data/vendor/kreuzberg/src/pdf/table.rs +44 -17
- data/vendor/kreuzberg/src/pdf/text.rs +3 -0
- data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/types.rs +12 -42
- data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
- data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
- data/vendor/kreuzberg/tests/config_features.rs +0 -18
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
- data/vendor/kreuzberg/tests/core_integration.rs +7 -24
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
- data/vendor/kreuzberg/tests/security_validation.rs +1 -12
- metadata +25 -90
- data/.rubocop.yml +0 -538
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
- data/lib/kreuzberg/error_context.rb +0 -32
- data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
- data/vendor/rb-sys/Cargo.lock +0 -393
- data/vendor/rb-sys/Cargo.toml +0 -70
- data/vendor/rb-sys/Cargo.toml.orig +0 -57
- data/vendor/rb-sys/LICENSE-APACHE +0 -190
- data/vendor/rb-sys/LICENSE-MIT +0 -21
- data/vendor/rb-sys/bin/release.sh +0 -21
- data/vendor/rb-sys/build/features.rs +0 -108
- data/vendor/rb-sys/build/main.rs +0 -246
- data/vendor/rb-sys/build/stable_api_config.rs +0 -153
- data/vendor/rb-sys/build/version.rs +0 -48
- data/vendor/rb-sys/readme.md +0 -36
- data/vendor/rb-sys/src/bindings.rs +0 -21
- data/vendor/rb-sys/src/hidden.rs +0 -11
- data/vendor/rb-sys/src/lib.rs +0 -34
- data/vendor/rb-sys/src/macros.rs +0 -371
- data/vendor/rb-sys/src/memory.rs +0 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
- data/vendor/rb-sys/src/special_consts.rs +0 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
- data/vendor/rb-sys/src/stable_api.rs +0 -261
- data/vendor/rb-sys/src/symbol.rs +0 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
- data/vendor/rb-sys/src/utils.rs +0 -89
- data/vendor/rb-sys/src/value_type.rs +0 -7
|
@@ -1,32 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'json'
|
|
4
|
-
|
|
5
|
-
module Kreuzberg
|
|
6
|
-
# ErrorContext module provides access to FFI error introspection functions.
|
|
7
|
-
# Retrieve the last error code and panic context information from errors.
|
|
8
|
-
module ErrorContext
|
|
9
|
-
class << self
|
|
10
|
-
def last_error_code
|
|
11
|
-
Kreuzberg._last_error_code_native
|
|
12
|
-
rescue StandardError
|
|
13
|
-
0
|
|
14
|
-
end
|
|
15
|
-
|
|
16
|
-
def last_panic_context
|
|
17
|
-
json_str = Kreuzberg._last_panic_context_json_native
|
|
18
|
-
return nil unless json_str
|
|
19
|
-
|
|
20
|
-
Errors::PanicContext.from_json(json_str)
|
|
21
|
-
rescue StandardError
|
|
22
|
-
nil
|
|
23
|
-
end
|
|
24
|
-
|
|
25
|
-
def last_panic_context_json
|
|
26
|
-
Kreuzberg._last_panic_context_json_native
|
|
27
|
-
rescue StandardError
|
|
28
|
-
nil
|
|
29
|
-
end
|
|
30
|
-
end
|
|
31
|
-
end
|
|
32
|
-
end
|
|
@@ -1,48 +0,0 @@
|
|
|
1
|
-
use criterion::{Criterion, criterion_group, criterion_main};
|
|
2
|
-
use std::hint::black_box;
|
|
3
|
-
|
|
4
|
-
fn bench_text_extraction(c: &mut Criterion) {
|
|
5
|
-
let runtime = tokio::runtime::Runtime::new().unwrap();
|
|
6
|
-
|
|
7
|
-
c.bench_function("extract_text_no_otel", |b| {
|
|
8
|
-
b.iter(|| {
|
|
9
|
-
runtime.block_on(async {
|
|
10
|
-
use kreuzberg::core::config::ExtractionConfig;
|
|
11
|
-
use kreuzberg::core::extractor::extract_bytes;
|
|
12
|
-
|
|
13
|
-
let test_content = black_box(b"Hello, World! This is a test document.");
|
|
14
|
-
let config = ExtractionConfig::default();
|
|
15
|
-
|
|
16
|
-
extract_bytes(test_content, "text/plain", &config).await
|
|
17
|
-
})
|
|
18
|
-
});
|
|
19
|
-
});
|
|
20
|
-
}
|
|
21
|
-
|
|
22
|
-
fn bench_cache_operations(c: &mut Criterion) {
|
|
23
|
-
use kreuzberg::cache::GenericCache;
|
|
24
|
-
use tempfile::tempdir;
|
|
25
|
-
|
|
26
|
-
let temp_dir = tempdir().unwrap();
|
|
27
|
-
let cache = GenericCache::new(
|
|
28
|
-
"bench".to_string(),
|
|
29
|
-
Some(temp_dir.path().to_str().unwrap().to_string()),
|
|
30
|
-
30.0,
|
|
31
|
-
500.0,
|
|
32
|
-
1000.0,
|
|
33
|
-
)
|
|
34
|
-
.unwrap();
|
|
35
|
-
|
|
36
|
-
c.bench_function("cache_set_get", |b| {
|
|
37
|
-
b.iter(|| {
|
|
38
|
-
let key = black_box("bench_key");
|
|
39
|
-
let data = black_box(b"benchmark data".to_vec());
|
|
40
|
-
|
|
41
|
-
cache.set(key, data.clone(), None).unwrap();
|
|
42
|
-
cache.get(key, None).unwrap()
|
|
43
|
-
});
|
|
44
|
-
});
|
|
45
|
-
}
|
|
46
|
-
|
|
47
|
-
criterion_group!(benches, bench_text_extraction, bench_cache_operations);
|
|
48
|
-
criterion_main!(benches);
|
|
@@ -1,213 +0,0 @@
|
|
|
1
|
-
//! Markdown table formatting utilities
|
|
2
|
-
//!
|
|
3
|
-
//! This module provides utilities for converting tabular data into GitHub-Flavored Markdown (GFM) tables.
|
|
4
|
-
//! It's used by multiple extractors (DOCX, HTML) that need to represent structured table data in markdown format.
|
|
5
|
-
|
|
6
|
-
/// Converts a 2D vector of cell strings into a GitHub-Flavored Markdown table.
|
|
7
|
-
///
|
|
8
|
-
/// # Behavior
|
|
9
|
-
///
|
|
10
|
-
/// - The first row is treated as the header row
|
|
11
|
-
/// - A separator row is inserted after the header
|
|
12
|
-
/// - Pipe characters (`|`) in cell content are automatically escaped with backslash
|
|
13
|
-
/// - Irregular tables (rows with varying column counts) are padded with empty cells to match the header
|
|
14
|
-
/// - Returns an empty string for empty input
|
|
15
|
-
///
|
|
16
|
-
/// # Arguments
|
|
17
|
-
///
|
|
18
|
-
/// * `cells` - A slice of vectors representing table rows, where each inner vector contains cell values
|
|
19
|
-
///
|
|
20
|
-
/// # Returns
|
|
21
|
-
///
|
|
22
|
-
/// A `String` containing the GFM markdown table representation
|
|
23
|
-
///
|
|
24
|
-
/// # Examples
|
|
25
|
-
///
|
|
26
|
-
/// ```
|
|
27
|
-
/// # use kreuzberg::extraction::cells_to_markdown;
|
|
28
|
-
/// let cells = vec![
|
|
29
|
-
/// vec!["Name".to_string(), "Age".to_string()],
|
|
30
|
-
/// vec!["Alice".to_string(), "30".to_string()],
|
|
31
|
-
/// vec!["Bob".to_string(), "25".to_string()],
|
|
32
|
-
/// ];
|
|
33
|
-
///
|
|
34
|
-
/// let markdown = cells_to_markdown(&cells);
|
|
35
|
-
/// assert!(markdown.contains("| Name | Age |"));
|
|
36
|
-
/// assert!(markdown.contains("|------|------|"));
|
|
37
|
-
/// ```
|
|
38
|
-
pub fn cells_to_markdown(cells: &[Vec<String>]) -> String {
|
|
39
|
-
if cells.is_empty() {
|
|
40
|
-
return String::new();
|
|
41
|
-
}
|
|
42
|
-
|
|
43
|
-
let mut markdown = String::new();
|
|
44
|
-
|
|
45
|
-
let num_cols = cells.first().map(|r| r.len()).unwrap_or(0);
|
|
46
|
-
if num_cols == 0 {
|
|
47
|
-
return String::new();
|
|
48
|
-
}
|
|
49
|
-
|
|
50
|
-
if let Some(header) = cells.first() {
|
|
51
|
-
markdown.push('|');
|
|
52
|
-
for cell in header {
|
|
53
|
-
markdown.push(' ');
|
|
54
|
-
let escaped = cell.replace('|', "\\|");
|
|
55
|
-
markdown.push_str(&escaped);
|
|
56
|
-
markdown.push_str(" |");
|
|
57
|
-
}
|
|
58
|
-
markdown.push('\n');
|
|
59
|
-
|
|
60
|
-
markdown.push('|');
|
|
61
|
-
for _ in 0..num_cols {
|
|
62
|
-
markdown.push_str("------|");
|
|
63
|
-
}
|
|
64
|
-
markdown.push('\n');
|
|
65
|
-
}
|
|
66
|
-
|
|
67
|
-
for row in cells.iter().skip(1) {
|
|
68
|
-
markdown.push('|');
|
|
69
|
-
for (idx, cell) in row.iter().enumerate() {
|
|
70
|
-
if idx >= num_cols {
|
|
71
|
-
break;
|
|
72
|
-
}
|
|
73
|
-
markdown.push(' ');
|
|
74
|
-
let escaped = cell.replace('|', "\\|");
|
|
75
|
-
markdown.push_str(&escaped);
|
|
76
|
-
markdown.push_str(" |");
|
|
77
|
-
}
|
|
78
|
-
for _ in row.len()..num_cols {
|
|
79
|
-
markdown.push_str(" |");
|
|
80
|
-
}
|
|
81
|
-
markdown.push('\n');
|
|
82
|
-
}
|
|
83
|
-
|
|
84
|
-
markdown
|
|
85
|
-
}
|
|
86
|
-
|
|
87
|
-
#[cfg(test)]
|
|
88
|
-
mod tests {
|
|
89
|
-
use super::*;
|
|
90
|
-
|
|
91
|
-
#[test]
|
|
92
|
-
fn test_markdown_formatting_from_simple_table() {
|
|
93
|
-
let cells = vec![
|
|
94
|
-
vec!["Header1".to_string(), "Header2".to_string()],
|
|
95
|
-
vec!["Row1Col1".to_string(), "Row1Col2".to_string()],
|
|
96
|
-
vec!["Row2Col1".to_string(), "Row2Col2".to_string()],
|
|
97
|
-
];
|
|
98
|
-
|
|
99
|
-
let markdown = cells_to_markdown(&cells);
|
|
100
|
-
|
|
101
|
-
assert!(markdown.contains("| Header1 | Header2 |"));
|
|
102
|
-
assert!(markdown.contains("|------|------|"));
|
|
103
|
-
assert!(markdown.contains("| Row1Col1 | Row1Col2 |"));
|
|
104
|
-
assert!(markdown.contains("| Row2Col1 | Row2Col2 |"));
|
|
105
|
-
|
|
106
|
-
let lines: Vec<&str> = markdown.lines().collect();
|
|
107
|
-
assert_eq!(lines.len(), 4);
|
|
108
|
-
}
|
|
109
|
-
|
|
110
|
-
#[test]
|
|
111
|
-
fn test_markdown_handles_empty_input() {
|
|
112
|
-
let cells: Vec<Vec<String>> = vec![];
|
|
113
|
-
|
|
114
|
-
let markdown = cells_to_markdown(&cells);
|
|
115
|
-
|
|
116
|
-
assert_eq!(markdown, "");
|
|
117
|
-
}
|
|
118
|
-
|
|
119
|
-
#[test]
|
|
120
|
-
fn test_markdown_escapes_pipe_characters() {
|
|
121
|
-
let cells = vec![vec!["Header".to_string()], vec!["Cell with | pipe".to_string()]];
|
|
122
|
-
|
|
123
|
-
let markdown = cells_to_markdown(&cells);
|
|
124
|
-
|
|
125
|
-
assert!(markdown.contains("Cell with \\| pipe"));
|
|
126
|
-
|
|
127
|
-
for line in markdown.lines() {
|
|
128
|
-
if !line.is_empty() {
|
|
129
|
-
assert!(line.starts_with('|'));
|
|
130
|
-
assert!(line.ends_with('|'));
|
|
131
|
-
}
|
|
132
|
-
}
|
|
133
|
-
}
|
|
134
|
-
|
|
135
|
-
#[test]
|
|
136
|
-
fn test_markdown_pads_irregular_tables() {
|
|
137
|
-
let cells = vec![
|
|
138
|
-
vec!["H1".to_string(), "H2".to_string(), "H3".to_string()],
|
|
139
|
-
vec!["R1C1".to_string(), "R1C2".to_string()],
|
|
140
|
-
vec!["R2C1".to_string(), "R2C2".to_string(), "R2C3".to_string()],
|
|
141
|
-
];
|
|
142
|
-
|
|
143
|
-
let markdown = cells_to_markdown(&cells);
|
|
144
|
-
|
|
145
|
-
assert!(markdown.contains("| H1 | H2 | H3 |"));
|
|
146
|
-
|
|
147
|
-
assert!(markdown.contains("| R1C1 | R1C2 | |"));
|
|
148
|
-
|
|
149
|
-
let lines: Vec<&str> = markdown.lines().filter(|l| !l.is_empty()).collect();
|
|
150
|
-
let pipe_counts: Vec<usize> = lines
|
|
151
|
-
.iter()
|
|
152
|
-
.map(|line| line.chars().filter(|c| *c == '|').count())
|
|
153
|
-
.collect();
|
|
154
|
-
assert!(pipe_counts.iter().all(|&count| count == pipe_counts[0]));
|
|
155
|
-
}
|
|
156
|
-
|
|
157
|
-
#[test]
|
|
158
|
-
fn test_markdown_single_row_table() {
|
|
159
|
-
let cells = vec![vec!["OnlyHeader".to_string()]];
|
|
160
|
-
|
|
161
|
-
let markdown = cells_to_markdown(&cells);
|
|
162
|
-
|
|
163
|
-
assert!(markdown.contains("| OnlyHeader |"));
|
|
164
|
-
assert!(markdown.contains("|------|"));
|
|
165
|
-
|
|
166
|
-
let lines: Vec<&str> = markdown.lines().collect();
|
|
167
|
-
assert_eq!(lines.len(), 2);
|
|
168
|
-
}
|
|
169
|
-
|
|
170
|
-
#[test]
|
|
171
|
-
fn test_markdown_single_column_table() {
|
|
172
|
-
let cells = vec![
|
|
173
|
-
vec!["Header".to_string()],
|
|
174
|
-
vec!["Data1".to_string()],
|
|
175
|
-
vec!["Data2".to_string()],
|
|
176
|
-
];
|
|
177
|
-
|
|
178
|
-
let markdown = cells_to_markdown(&cells);
|
|
179
|
-
|
|
180
|
-
assert!(markdown.contains("| Header |"));
|
|
181
|
-
assert!(markdown.contains("|------|"));
|
|
182
|
-
assert!(markdown.contains("| Data1 |"));
|
|
183
|
-
assert!(markdown.contains("| Data2 |"));
|
|
184
|
-
}
|
|
185
|
-
|
|
186
|
-
#[test]
|
|
187
|
-
fn test_markdown_special_characters() {
|
|
188
|
-
let cells = vec![
|
|
189
|
-
vec!["*Header*".to_string(), "#Title".to_string()],
|
|
190
|
-
vec!["**Bold**".to_string(), "~~Strike~~".to_string()],
|
|
191
|
-
];
|
|
192
|
-
|
|
193
|
-
let markdown = cells_to_markdown(&cells);
|
|
194
|
-
|
|
195
|
-
assert!(markdown.contains("*Header*"));
|
|
196
|
-
assert!(markdown.contains("#Title"));
|
|
197
|
-
assert!(markdown.contains("**Bold**"));
|
|
198
|
-
assert!(markdown.contains("~~Strike~~"));
|
|
199
|
-
}
|
|
200
|
-
|
|
201
|
-
#[test]
|
|
202
|
-
fn test_markdown_unicode_content() {
|
|
203
|
-
let cells = vec![
|
|
204
|
-
vec!["Emoji".to_string(), "Accents".to_string()],
|
|
205
|
-
vec!["🎉 Party".to_string(), "Café".to_string()],
|
|
206
|
-
];
|
|
207
|
-
|
|
208
|
-
let markdown = cells_to_markdown(&cells);
|
|
209
|
-
|
|
210
|
-
assert!(markdown.contains("🎉 Party"));
|
|
211
|
-
assert!(markdown.contains("Café"));
|
|
212
|
-
}
|
|
213
|
-
}
|
|
@@ -1,287 +0,0 @@
|
|
|
1
|
-
//! ODT (OpenDocument) metadata extraction from meta.xml
|
|
2
|
-
//!
|
|
3
|
-
//! Extracts metadata from OpenDocument Text files following the OASIS OpenDocument standard.
|
|
4
|
-
|
|
5
|
-
use crate::error::{KreuzbergError, Result};
|
|
6
|
-
use std::io::Read;
|
|
7
|
-
use zip::ZipArchive;
|
|
8
|
-
|
|
9
|
-
/// OpenDocument metadata from meta.xml
|
|
10
|
-
///
|
|
11
|
-
/// Contains metadata fields defined by the OASIS OpenDocument Format standard.
|
|
12
|
-
/// Uses Dublin Core elements (dc:) and OpenDocument meta elements (meta:).
|
|
13
|
-
#[derive(Debug, Clone, Default, PartialEq)]
|
|
14
|
-
pub struct OdtProperties {
|
|
15
|
-
/// Document title (dc:title)
|
|
16
|
-
pub title: Option<String>,
|
|
17
|
-
/// Document subject/topic (dc:subject)
|
|
18
|
-
pub subject: Option<String>,
|
|
19
|
-
/// Current document creator/author (dc:creator)
|
|
20
|
-
pub creator: Option<String>,
|
|
21
|
-
/// Initial creator of the document (meta:initial-creator)
|
|
22
|
-
pub initial_creator: Option<String>,
|
|
23
|
-
/// Keywords or tags (meta:keyword)
|
|
24
|
-
pub keywords: Option<String>,
|
|
25
|
-
/// Document description (dc:description)
|
|
26
|
-
pub description: Option<String>,
|
|
27
|
-
/// Current modification date (dc:date)
|
|
28
|
-
pub date: Option<String>,
|
|
29
|
-
/// Initial creation date (meta:creation-date)
|
|
30
|
-
pub creation_date: Option<String>,
|
|
31
|
-
/// Document language (dc:language)
|
|
32
|
-
pub language: Option<String>,
|
|
33
|
-
/// Generator/application that created the document (meta:generator)
|
|
34
|
-
pub generator: Option<String>,
|
|
35
|
-
/// Editing duration in ISO 8601 format (meta:editing-duration)
|
|
36
|
-
pub editing_duration: Option<String>,
|
|
37
|
-
/// Number of edits/revisions (meta:editing-cycles)
|
|
38
|
-
pub editing_cycles: Option<String>,
|
|
39
|
-
/// Document statistics - page count (meta:page-count)
|
|
40
|
-
pub page_count: Option<i32>,
|
|
41
|
-
/// Document statistics - word count (meta:word-count)
|
|
42
|
-
pub word_count: Option<i32>,
|
|
43
|
-
/// Document statistics - character count (meta:character-count)
|
|
44
|
-
pub character_count: Option<i32>,
|
|
45
|
-
/// Document statistics - paragraph count (meta:paragraph-count)
|
|
46
|
-
pub paragraph_count: Option<i32>,
|
|
47
|
-
/// Document statistics - table count (meta:table-count)
|
|
48
|
-
pub table_count: Option<i32>,
|
|
49
|
-
/// Document statistics - image count (meta:image-count)
|
|
50
|
-
pub image_count: Option<i32>,
|
|
51
|
-
}
|
|
52
|
-
|
|
53
|
-
/// Extract ODT metadata from an OpenDocument file
|
|
54
|
-
///
|
|
55
|
-
/// Parses `meta.xml` from the ZIP archive and extracts OpenDocument metadata.
|
|
56
|
-
///
|
|
57
|
-
/// # Arguments
|
|
58
|
-
///
|
|
59
|
-
/// * `archive` - ZIP archive containing the OpenDocument file
|
|
60
|
-
///
|
|
61
|
-
/// # Returns
|
|
62
|
-
///
|
|
63
|
-
/// Returns `OdtProperties` with extracted metadata. Fields that are not present
|
|
64
|
-
/// in the document will be `None`.
|
|
65
|
-
///
|
|
66
|
-
/// # Errors
|
|
67
|
-
///
|
|
68
|
-
/// Returns an error if:
|
|
69
|
-
/// - The ZIP archive cannot be read
|
|
70
|
-
/// - The meta.xml file is malformed
|
|
71
|
-
/// - XML parsing fails
|
|
72
|
-
///
|
|
73
|
-
/// # Example
|
|
74
|
-
///
|
|
75
|
-
/// ```no_run
|
|
76
|
-
/// use kreuzberg::extraction::office_metadata::extract_odt_properties;
|
|
77
|
-
/// use std::fs::File;
|
|
78
|
-
/// use zip::ZipArchive;
|
|
79
|
-
///
|
|
80
|
-
/// let file = File::open("document.odt")?;
|
|
81
|
-
/// let mut archive = ZipArchive::new(file)?;
|
|
82
|
-
/// let props = extract_odt_properties(&mut archive)?;
|
|
83
|
-
///
|
|
84
|
-
/// println!("Title: {:?}", props.title);
|
|
85
|
-
/// println!("Creator: {:?}", props.creator);
|
|
86
|
-
/// println!("Created: {:?}", props.creation_date);
|
|
87
|
-
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
|
88
|
-
/// ```
|
|
89
|
-
pub fn extract_odt_properties<R: Read + std::io::Seek>(archive: &mut ZipArchive<R>) -> Result<OdtProperties> {
|
|
90
|
-
let mut xml_content = String::new();
|
|
91
|
-
|
|
92
|
-
match archive.by_name("meta.xml") {
|
|
93
|
-
Ok(mut file) => {
|
|
94
|
-
file.read_to_string(&mut xml_content)
|
|
95
|
-
.map_err(|e| KreuzbergError::parsing(format!("Failed to read meta.xml: {}", e)))?;
|
|
96
|
-
}
|
|
97
|
-
Err(_) => {
|
|
98
|
-
return Ok(OdtProperties::default());
|
|
99
|
-
}
|
|
100
|
-
}
|
|
101
|
-
|
|
102
|
-
let doc = roxmltree::Document::parse(&xml_content)
|
|
103
|
-
.map_err(|e| KreuzbergError::parsing(format!("Failed to parse meta.xml: {}", e)))?;
|
|
104
|
-
|
|
105
|
-
let root = doc.root_element();
|
|
106
|
-
|
|
107
|
-
// Extract Dublin Core elements
|
|
108
|
-
let title = super::parse_xml_text(root, "title");
|
|
109
|
-
let subject = super::parse_xml_text(root, "subject");
|
|
110
|
-
let creator = super::parse_xml_text(root, "creator");
|
|
111
|
-
let description = super::parse_xml_text(root, "description");
|
|
112
|
-
let language = super::parse_xml_text(root, "language");
|
|
113
|
-
let date = super::parse_xml_text(root, "date");
|
|
114
|
-
|
|
115
|
-
// Extract OpenDocument meta elements
|
|
116
|
-
let initial_creator = super::parse_xml_text(root, "initial-creator");
|
|
117
|
-
let keywords = super::parse_xml_text(root, "keyword");
|
|
118
|
-
let creation_date = super::parse_xml_text(root, "creation-date");
|
|
119
|
-
let generator = super::parse_xml_text(root, "generator");
|
|
120
|
-
let editing_duration = super::parse_xml_text(root, "editing-duration");
|
|
121
|
-
let editing_cycles = super::parse_xml_text(root, "editing-cycles");
|
|
122
|
-
|
|
123
|
-
// Extract document statistics
|
|
124
|
-
let page_count = super::parse_xml_int(root, "page-count");
|
|
125
|
-
let word_count = super::parse_xml_int(root, "word-count");
|
|
126
|
-
let character_count = super::parse_xml_int(root, "character-count");
|
|
127
|
-
let paragraph_count = super::parse_xml_int(root, "paragraph-count");
|
|
128
|
-
let table_count = super::parse_xml_int(root, "table-count");
|
|
129
|
-
let image_count = super::parse_xml_int(root, "image-count");
|
|
130
|
-
|
|
131
|
-
Ok(OdtProperties {
|
|
132
|
-
title,
|
|
133
|
-
subject,
|
|
134
|
-
creator,
|
|
135
|
-
initial_creator,
|
|
136
|
-
keywords,
|
|
137
|
-
description,
|
|
138
|
-
date,
|
|
139
|
-
creation_date,
|
|
140
|
-
language,
|
|
141
|
-
generator,
|
|
142
|
-
editing_duration,
|
|
143
|
-
editing_cycles,
|
|
144
|
-
page_count,
|
|
145
|
-
word_count,
|
|
146
|
-
character_count,
|
|
147
|
-
paragraph_count,
|
|
148
|
-
table_count,
|
|
149
|
-
image_count,
|
|
150
|
-
})
|
|
151
|
-
}
|
|
152
|
-
|
|
153
|
-
#[cfg(test)]
|
|
154
|
-
mod tests {
|
|
155
|
-
use super::*;
|
|
156
|
-
use std::io::{Cursor, Write};
|
|
157
|
-
|
|
158
|
-
fn create_test_zip_with_meta_xml(meta_xml: &str) -> ZipArchive<Cursor<Vec<u8>>> {
|
|
159
|
-
let buffer = Vec::new();
|
|
160
|
-
let cursor = Cursor::new(buffer);
|
|
161
|
-
let mut zip = zip::ZipWriter::new(cursor);
|
|
162
|
-
|
|
163
|
-
let options = zip::write::FileOptions::<()>::default().compression_method(zip::CompressionMethod::Stored);
|
|
164
|
-
|
|
165
|
-
zip.start_file("meta.xml", options).unwrap();
|
|
166
|
-
zip.write_all(meta_xml.as_bytes()).unwrap();
|
|
167
|
-
|
|
168
|
-
let cursor = zip.finish().unwrap();
|
|
169
|
-
ZipArchive::new(cursor).unwrap()
|
|
170
|
-
}
|
|
171
|
-
|
|
172
|
-
#[test]
|
|
173
|
-
fn test_extract_odt_properties_full() {
|
|
174
|
-
let meta_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
|
|
175
|
-
<office:document-meta xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"
|
|
176
|
-
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
|
177
|
-
xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
|
|
178
|
-
office:version="1.3">
|
|
179
|
-
<office:meta>
|
|
180
|
-
<dc:title>Test Document</dc:title>
|
|
181
|
-
<dc:subject>Testing</dc:subject>
|
|
182
|
-
<dc:creator>John Doe</dc:creator>
|
|
183
|
-
<meta:initial-creator>Jane Smith</meta:initial-creator>
|
|
184
|
-
<dc:description>A test document for ODT metadata</dc:description>
|
|
185
|
-
<meta:keyword>test, metadata, odt</meta:keyword>
|
|
186
|
-
<dc:language>en-US</dc:language>
|
|
187
|
-
<meta:creation-date>2024-01-01T10:00:00Z</meta:creation-date>
|
|
188
|
-
<dc:date>2024-01-02T15:30:00Z</dc:date>
|
|
189
|
-
<meta:generator>LibreOffice/24.2</meta:generator>
|
|
190
|
-
<meta:editing-duration>PT2H30M</meta:editing-duration>
|
|
191
|
-
<meta:editing-cycles>5</meta:editing-cycles>
|
|
192
|
-
<meta:page-count>10</meta:page-count>
|
|
193
|
-
<meta:word-count>1500</meta:word-count>
|
|
194
|
-
<meta:character-count>9000</meta:character-count>
|
|
195
|
-
<meta:paragraph-count>45</meta:paragraph-count>
|
|
196
|
-
<meta:table-count>3</meta:table-count>
|
|
197
|
-
<meta:image-count>7</meta:image-count>
|
|
198
|
-
</office:meta>
|
|
199
|
-
</office:document-meta>"#;
|
|
200
|
-
|
|
201
|
-
let mut archive = create_test_zip_with_meta_xml(meta_xml);
|
|
202
|
-
let props = extract_odt_properties(&mut archive).unwrap();
|
|
203
|
-
|
|
204
|
-
assert_eq!(props.title, Some("Test Document".to_string()));
|
|
205
|
-
assert_eq!(props.subject, Some("Testing".to_string()));
|
|
206
|
-
assert_eq!(props.creator, Some("John Doe".to_string()));
|
|
207
|
-
assert_eq!(props.initial_creator, Some("Jane Smith".to_string()));
|
|
208
|
-
assert_eq!(props.keywords, Some("test, metadata, odt".to_string()));
|
|
209
|
-
assert_eq!(props.description, Some("A test document for ODT metadata".to_string()));
|
|
210
|
-
assert_eq!(props.language, Some("en-US".to_string()));
|
|
211
|
-
assert_eq!(props.creation_date, Some("2024-01-01T10:00:00Z".to_string()));
|
|
212
|
-
assert_eq!(props.date, Some("2024-01-02T15:30:00Z".to_string()));
|
|
213
|
-
assert_eq!(props.generator, Some("LibreOffice/24.2".to_string()));
|
|
214
|
-
assert_eq!(props.editing_duration, Some("PT2H30M".to_string()));
|
|
215
|
-
assert_eq!(props.editing_cycles, Some("5".to_string()));
|
|
216
|
-
assert_eq!(props.page_count, Some(10));
|
|
217
|
-
assert_eq!(props.word_count, Some(1500));
|
|
218
|
-
assert_eq!(props.character_count, Some(9000));
|
|
219
|
-
assert_eq!(props.paragraph_count, Some(45));
|
|
220
|
-
assert_eq!(props.table_count, Some(3));
|
|
221
|
-
assert_eq!(props.image_count, Some(7));
|
|
222
|
-
}
|
|
223
|
-
|
|
224
|
-
#[test]
|
|
225
|
-
fn test_extract_odt_properties_minimal() {
|
|
226
|
-
let meta_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
|
|
227
|
-
<office:document-meta xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"
|
|
228
|
-
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
|
229
|
-
xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
|
|
230
|
-
office:version="1.3">
|
|
231
|
-
<office:meta>
|
|
232
|
-
<dc:creator>Alice</dc:creator>
|
|
233
|
-
<meta:creation-date>2024-01-01T10:00:00Z</meta:creation-date>
|
|
234
|
-
</office:meta>
|
|
235
|
-
</office:document-meta>"#;
|
|
236
|
-
|
|
237
|
-
let mut archive = create_test_zip_with_meta_xml(meta_xml);
|
|
238
|
-
let props = extract_odt_properties(&mut archive).unwrap();
|
|
239
|
-
|
|
240
|
-
assert_eq!(props.creator, Some("Alice".to_string()));
|
|
241
|
-
assert_eq!(props.creation_date, Some("2024-01-01T10:00:00Z".to_string()));
|
|
242
|
-
assert_eq!(props.title, None);
|
|
243
|
-
assert_eq!(props.keywords, None);
|
|
244
|
-
assert_eq!(props.word_count, None);
|
|
245
|
-
}
|
|
246
|
-
|
|
247
|
-
#[test]
|
|
248
|
-
fn test_extract_odt_properties_empty_elements() {
|
|
249
|
-
let meta_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
|
|
250
|
-
<office:document-meta xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"
|
|
251
|
-
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
|
252
|
-
xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
|
|
253
|
-
office:version="1.3">
|
|
254
|
-
<office:meta>
|
|
255
|
-
<dc:title></dc:title>
|
|
256
|
-
<dc:creator>Bob</dc:creator>
|
|
257
|
-
</office:meta>
|
|
258
|
-
</office:document-meta>"#;
|
|
259
|
-
|
|
260
|
-
let mut archive = create_test_zip_with_meta_xml(meta_xml);
|
|
261
|
-
let props = extract_odt_properties(&mut archive).unwrap();
|
|
262
|
-
|
|
263
|
-
assert_eq!(props.title, None);
|
|
264
|
-
assert_eq!(props.creator, Some("Bob".to_string()));
|
|
265
|
-
}
|
|
266
|
-
|
|
267
|
-
#[test]
|
|
268
|
-
fn test_extract_odt_properties_missing_file() {
|
|
269
|
-
let buffer = Vec::new();
|
|
270
|
-
let cursor = Cursor::new(buffer);
|
|
271
|
-
let zip = zip::ZipWriter::new(cursor);
|
|
272
|
-
let cursor = zip.finish().unwrap();
|
|
273
|
-
let mut archive = ZipArchive::new(cursor).unwrap();
|
|
274
|
-
|
|
275
|
-
let props = extract_odt_properties(&mut archive).unwrap();
|
|
276
|
-
assert_eq!(props, OdtProperties::default());
|
|
277
|
-
}
|
|
278
|
-
|
|
279
|
-
#[test]
|
|
280
|
-
fn test_extract_odt_properties_malformed_xml() {
|
|
281
|
-
let meta_xml = "not valid xml <";
|
|
282
|
-
let mut archive = create_test_zip_with_meta_xml(meta_xml);
|
|
283
|
-
|
|
284
|
-
let result = extract_odt_properties(&mut archive);
|
|
285
|
-
assert!(result.is_err());
|
|
286
|
-
}
|
|
287
|
-
}
|