kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +11 -11
  6. data/README.md +5 -10
  7. data/examples/async_patterns.rb +0 -1
  8. data/ext/kreuzberg_rb/extconf.rb +0 -10
  9. data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
  10. data/ext/kreuzberg_rb/native/build.rs +2 -0
  11. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  12. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  13. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  14. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  15. data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
  16. data/kreuzberg.gemspec +14 -57
  17. data/lib/kreuzberg/cache_api.rb +0 -1
  18. data/lib/kreuzberg/cli.rb +2 -2
  19. data/lib/kreuzberg/config.rb +2 -9
  20. data/lib/kreuzberg/errors.rb +7 -75
  21. data/lib/kreuzberg/extraction_api.rb +0 -1
  22. data/lib/kreuzberg/setup_lib_path.rb +0 -1
  23. data/lib/kreuzberg/version.rb +1 -1
  24. data/lib/kreuzberg.rb +0 -21
  25. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  26. data/sig/kreuzberg.rbs +3 -55
  27. data/spec/binding/cli_proxy_spec.rb +4 -2
  28. data/spec/binding/cli_spec.rb +11 -12
  29. data/spec/examples.txt +104 -0
  30. data/spec/fixtures/config.yaml +1 -0
  31. data/spec/spec_helper.rb +1 -1
  32. data/vendor/kreuzberg/Cargo.toml +42 -112
  33. data/vendor/kreuzberg/README.md +2 -2
  34. data/vendor/kreuzberg/build.rs +4 -18
  35. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  36. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  37. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  38. data/vendor/kreuzberg/src/core/extractor.rs +81 -202
  39. data/vendor/kreuzberg/src/core/io.rs +2 -4
  40. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  41. data/vendor/kreuzberg/src/core/mod.rs +1 -4
  42. data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
  43. data/vendor/kreuzberg/src/embeddings.rs +16 -125
  44. data/vendor/kreuzberg/src/error.rs +1 -1
  45. data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
  46. data/vendor/kreuzberg/src/extraction/image.rs +13 -13
  47. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
  48. data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
  49. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  50. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  51. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  52. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  53. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  54. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  55. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  56. data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
  57. data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
  58. data/vendor/kreuzberg/src/extractors/email.rs +0 -14
  59. data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
  60. data/vendor/kreuzberg/src/extractors/html.rs +154 -137
  61. data/vendor/kreuzberg/src/extractors/image.rs +4 -7
  62. data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
  63. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  64. data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
  65. data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
  66. data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
  67. data/vendor/kreuzberg/src/extractors/text.rs +5 -23
  68. data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
  69. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  70. data/vendor/kreuzberg/src/lib.rs +1 -4
  71. data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
  72. data/vendor/kreuzberg/src/mcp/server.rs +3 -5
  73. data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
  74. data/vendor/kreuzberg/src/pdf/error.rs +1 -1
  75. data/vendor/kreuzberg/src/pdf/table.rs +44 -17
  76. data/vendor/kreuzberg/src/pdf/text.rs +3 -0
  77. data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
  78. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
  79. data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
  80. data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
  81. data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
  82. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  83. data/vendor/kreuzberg/src/types.rs +12 -42
  84. data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
  85. data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
  86. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  87. data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
  88. data/vendor/kreuzberg/tests/config_features.rs +0 -18
  89. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
  90. data/vendor/kreuzberg/tests/core_integration.rs +7 -24
  91. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  92. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  93. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  94. data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
  95. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
  96. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
  97. data/vendor/kreuzberg/tests/security_validation.rs +1 -12
  98. metadata +25 -90
  99. data/.rubocop.yml +0 -538
  100. data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
  101. data/lib/kreuzberg/error_context.rb +0 -32
  102. data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
  103. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
  104. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
  105. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
  106. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
  107. data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
  108. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
  109. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  110. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
  111. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
  112. data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
  113. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
  114. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  115. data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
  116. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
  117. data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
  119. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  121. data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
  122. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  123. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  124. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  125. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
  126. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  127. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  128. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  129. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  130. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  131. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  132. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  133. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  134. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  135. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
  136. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  137. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  138. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
  139. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
  140. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
  141. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
  142. data/vendor/rb-sys/.cargo-ok +0 -1
  143. data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
  144. data/vendor/rb-sys/Cargo.lock +0 -393
  145. data/vendor/rb-sys/Cargo.toml +0 -70
  146. data/vendor/rb-sys/Cargo.toml.orig +0 -57
  147. data/vendor/rb-sys/LICENSE-APACHE +0 -190
  148. data/vendor/rb-sys/LICENSE-MIT +0 -21
  149. data/vendor/rb-sys/bin/release.sh +0 -21
  150. data/vendor/rb-sys/build/features.rs +0 -108
  151. data/vendor/rb-sys/build/main.rs +0 -246
  152. data/vendor/rb-sys/build/stable_api_config.rs +0 -153
  153. data/vendor/rb-sys/build/version.rs +0 -48
  154. data/vendor/rb-sys/readme.md +0 -36
  155. data/vendor/rb-sys/src/bindings.rs +0 -21
  156. data/vendor/rb-sys/src/hidden.rs +0 -11
  157. data/vendor/rb-sys/src/lib.rs +0 -34
  158. data/vendor/rb-sys/src/macros.rs +0 -371
  159. data/vendor/rb-sys/src/memory.rs +0 -53
  160. data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
  161. data/vendor/rb-sys/src/special_consts.rs +0 -31
  162. data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
  163. data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
  164. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
  165. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
  166. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
  167. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
  168. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
  169. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
  170. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
  171. data/vendor/rb-sys/src/stable_api.rs +0 -261
  172. data/vendor/rb-sys/src/symbol.rs +0 -31
  173. data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
  174. data/vendor/rb-sys/src/utils.rs +0 -89
  175. data/vendor/rb-sys/src/value_type.rs +0 -7
@@ -1,32 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'json'
4
-
5
- module Kreuzberg
6
- # ErrorContext module provides access to FFI error introspection functions.
7
- # Retrieve the last error code and panic context information from errors.
8
- module ErrorContext
9
- class << self
10
- def last_error_code
11
- Kreuzberg._last_error_code_native
12
- rescue StandardError
13
- 0
14
- end
15
-
16
- def last_panic_context
17
- json_str = Kreuzberg._last_panic_context_json_native
18
- return nil unless json_str
19
-
20
- Errors::PanicContext.from_json(json_str)
21
- rescue StandardError
22
- nil
23
- end
24
-
25
- def last_panic_context_json
26
- Kreuzberg._last_panic_context_json_native
27
- rescue StandardError
28
- nil
29
- end
30
- end
31
- end
32
- end
@@ -1,48 +0,0 @@
1
- use criterion::{Criterion, criterion_group, criterion_main};
2
- use std::hint::black_box;
3
-
4
- fn bench_text_extraction(c: &mut Criterion) {
5
- let runtime = tokio::runtime::Runtime::new().unwrap();
6
-
7
- c.bench_function("extract_text_no_otel", |b| {
8
- b.iter(|| {
9
- runtime.block_on(async {
10
- use kreuzberg::core::config::ExtractionConfig;
11
- use kreuzberg::core::extractor::extract_bytes;
12
-
13
- let test_content = black_box(b"Hello, World! This is a test document.");
14
- let config = ExtractionConfig::default();
15
-
16
- extract_bytes(test_content, "text/plain", &config).await
17
- })
18
- });
19
- });
20
- }
21
-
22
- fn bench_cache_operations(c: &mut Criterion) {
23
- use kreuzberg::cache::GenericCache;
24
- use tempfile::tempdir;
25
-
26
- let temp_dir = tempdir().unwrap();
27
- let cache = GenericCache::new(
28
- "bench".to_string(),
29
- Some(temp_dir.path().to_str().unwrap().to_string()),
30
- 30.0,
31
- 500.0,
32
- 1000.0,
33
- )
34
- .unwrap();
35
-
36
- c.bench_function("cache_set_get", |b| {
37
- b.iter(|| {
38
- let key = black_box("bench_key");
39
- let data = black_box(b"benchmark data".to_vec());
40
-
41
- cache.set(key, data.clone(), None).unwrap();
42
- cache.get(key, None).unwrap()
43
- });
44
- });
45
- }
46
-
47
- criterion_group!(benches, bench_text_extraction, bench_cache_operations);
48
- criterion_main!(benches);
@@ -1,213 +0,0 @@
1
- //! Markdown table formatting utilities
2
- //!
3
- //! This module provides utilities for converting tabular data into GitHub-Flavored Markdown (GFM) tables.
4
- //! It's used by multiple extractors (DOCX, HTML) that need to represent structured table data in markdown format.
5
-
6
- /// Converts a 2D vector of cell strings into a GitHub-Flavored Markdown table.
7
- ///
8
- /// # Behavior
9
- ///
10
- /// - The first row is treated as the header row
11
- /// - A separator row is inserted after the header
12
- /// - Pipe characters (`|`) in cell content are automatically escaped with backslash
13
- /// - Irregular tables (rows with varying column counts) are padded with empty cells to match the header
14
- /// - Returns an empty string for empty input
15
- ///
16
- /// # Arguments
17
- ///
18
- /// * `cells` - A slice of vectors representing table rows, where each inner vector contains cell values
19
- ///
20
- /// # Returns
21
- ///
22
- /// A `String` containing the GFM markdown table representation
23
- ///
24
- /// # Examples
25
- ///
26
- /// ```
27
- /// # use kreuzberg::extraction::cells_to_markdown;
28
- /// let cells = vec![
29
- /// vec!["Name".to_string(), "Age".to_string()],
30
- /// vec!["Alice".to_string(), "30".to_string()],
31
- /// vec!["Bob".to_string(), "25".to_string()],
32
- /// ];
33
- ///
34
- /// let markdown = cells_to_markdown(&cells);
35
- /// assert!(markdown.contains("| Name | Age |"));
36
- /// assert!(markdown.contains("|------|------|"));
37
- /// ```
38
- pub fn cells_to_markdown(cells: &[Vec<String>]) -> String {
39
- if cells.is_empty() {
40
- return String::new();
41
- }
42
-
43
- let mut markdown = String::new();
44
-
45
- let num_cols = cells.first().map(|r| r.len()).unwrap_or(0);
46
- if num_cols == 0 {
47
- return String::new();
48
- }
49
-
50
- if let Some(header) = cells.first() {
51
- markdown.push('|');
52
- for cell in header {
53
- markdown.push(' ');
54
- let escaped = cell.replace('|', "\\|");
55
- markdown.push_str(&escaped);
56
- markdown.push_str(" |");
57
- }
58
- markdown.push('\n');
59
-
60
- markdown.push('|');
61
- for _ in 0..num_cols {
62
- markdown.push_str("------|");
63
- }
64
- markdown.push('\n');
65
- }
66
-
67
- for row in cells.iter().skip(1) {
68
- markdown.push('|');
69
- for (idx, cell) in row.iter().enumerate() {
70
- if idx >= num_cols {
71
- break;
72
- }
73
- markdown.push(' ');
74
- let escaped = cell.replace('|', "\\|");
75
- markdown.push_str(&escaped);
76
- markdown.push_str(" |");
77
- }
78
- for _ in row.len()..num_cols {
79
- markdown.push_str(" |");
80
- }
81
- markdown.push('\n');
82
- }
83
-
84
- markdown
85
- }
86
-
87
- #[cfg(test)]
88
- mod tests {
89
- use super::*;
90
-
91
- #[test]
92
- fn test_markdown_formatting_from_simple_table() {
93
- let cells = vec![
94
- vec!["Header1".to_string(), "Header2".to_string()],
95
- vec!["Row1Col1".to_string(), "Row1Col2".to_string()],
96
- vec!["Row2Col1".to_string(), "Row2Col2".to_string()],
97
- ];
98
-
99
- let markdown = cells_to_markdown(&cells);
100
-
101
- assert!(markdown.contains("| Header1 | Header2 |"));
102
- assert!(markdown.contains("|------|------|"));
103
- assert!(markdown.contains("| Row1Col1 | Row1Col2 |"));
104
- assert!(markdown.contains("| Row2Col1 | Row2Col2 |"));
105
-
106
- let lines: Vec<&str> = markdown.lines().collect();
107
- assert_eq!(lines.len(), 4);
108
- }
109
-
110
- #[test]
111
- fn test_markdown_handles_empty_input() {
112
- let cells: Vec<Vec<String>> = vec![];
113
-
114
- let markdown = cells_to_markdown(&cells);
115
-
116
- assert_eq!(markdown, "");
117
- }
118
-
119
- #[test]
120
- fn test_markdown_escapes_pipe_characters() {
121
- let cells = vec![vec!["Header".to_string()], vec!["Cell with | pipe".to_string()]];
122
-
123
- let markdown = cells_to_markdown(&cells);
124
-
125
- assert!(markdown.contains("Cell with \\| pipe"));
126
-
127
- for line in markdown.lines() {
128
- if !line.is_empty() {
129
- assert!(line.starts_with('|'));
130
- assert!(line.ends_with('|'));
131
- }
132
- }
133
- }
134
-
135
- #[test]
136
- fn test_markdown_pads_irregular_tables() {
137
- let cells = vec![
138
- vec!["H1".to_string(), "H2".to_string(), "H3".to_string()],
139
- vec!["R1C1".to_string(), "R1C2".to_string()],
140
- vec!["R2C1".to_string(), "R2C2".to_string(), "R2C3".to_string()],
141
- ];
142
-
143
- let markdown = cells_to_markdown(&cells);
144
-
145
- assert!(markdown.contains("| H1 | H2 | H3 |"));
146
-
147
- assert!(markdown.contains("| R1C1 | R1C2 | |"));
148
-
149
- let lines: Vec<&str> = markdown.lines().filter(|l| !l.is_empty()).collect();
150
- let pipe_counts: Vec<usize> = lines
151
- .iter()
152
- .map(|line| line.chars().filter(|c| *c == '|').count())
153
- .collect();
154
- assert!(pipe_counts.iter().all(|&count| count == pipe_counts[0]));
155
- }
156
-
157
- #[test]
158
- fn test_markdown_single_row_table() {
159
- let cells = vec![vec!["OnlyHeader".to_string()]];
160
-
161
- let markdown = cells_to_markdown(&cells);
162
-
163
- assert!(markdown.contains("| OnlyHeader |"));
164
- assert!(markdown.contains("|------|"));
165
-
166
- let lines: Vec<&str> = markdown.lines().collect();
167
- assert_eq!(lines.len(), 2);
168
- }
169
-
170
- #[test]
171
- fn test_markdown_single_column_table() {
172
- let cells = vec![
173
- vec!["Header".to_string()],
174
- vec!["Data1".to_string()],
175
- vec!["Data2".to_string()],
176
- ];
177
-
178
- let markdown = cells_to_markdown(&cells);
179
-
180
- assert!(markdown.contains("| Header |"));
181
- assert!(markdown.contains("|------|"));
182
- assert!(markdown.contains("| Data1 |"));
183
- assert!(markdown.contains("| Data2 |"));
184
- }
185
-
186
- #[test]
187
- fn test_markdown_special_characters() {
188
- let cells = vec![
189
- vec!["*Header*".to_string(), "#Title".to_string()],
190
- vec!["**Bold**".to_string(), "~~Strike~~".to_string()],
191
- ];
192
-
193
- let markdown = cells_to_markdown(&cells);
194
-
195
- assert!(markdown.contains("*Header*"));
196
- assert!(markdown.contains("#Title"));
197
- assert!(markdown.contains("**Bold**"));
198
- assert!(markdown.contains("~~Strike~~"));
199
- }
200
-
201
- #[test]
202
- fn test_markdown_unicode_content() {
203
- let cells = vec![
204
- vec!["Emoji".to_string(), "Accents".to_string()],
205
- vec!["🎉 Party".to_string(), "Café".to_string()],
206
- ];
207
-
208
- let markdown = cells_to_markdown(&cells);
209
-
210
- assert!(markdown.contains("🎉 Party"));
211
- assert!(markdown.contains("Café"));
212
- }
213
- }
@@ -1,287 +0,0 @@
1
- //! ODT (OpenDocument) metadata extraction from meta.xml
2
- //!
3
- //! Extracts metadata from OpenDocument Text files following the OASIS OpenDocument standard.
4
-
5
- use crate::error::{KreuzbergError, Result};
6
- use std::io::Read;
7
- use zip::ZipArchive;
8
-
9
- /// OpenDocument metadata from meta.xml
10
- ///
11
- /// Contains metadata fields defined by the OASIS OpenDocument Format standard.
12
- /// Uses Dublin Core elements (dc:) and OpenDocument meta elements (meta:).
13
- #[derive(Debug, Clone, Default, PartialEq)]
14
- pub struct OdtProperties {
15
- /// Document title (dc:title)
16
- pub title: Option<String>,
17
- /// Document subject/topic (dc:subject)
18
- pub subject: Option<String>,
19
- /// Current document creator/author (dc:creator)
20
- pub creator: Option<String>,
21
- /// Initial creator of the document (meta:initial-creator)
22
- pub initial_creator: Option<String>,
23
- /// Keywords or tags (meta:keyword)
24
- pub keywords: Option<String>,
25
- /// Document description (dc:description)
26
- pub description: Option<String>,
27
- /// Current modification date (dc:date)
28
- pub date: Option<String>,
29
- /// Initial creation date (meta:creation-date)
30
- pub creation_date: Option<String>,
31
- /// Document language (dc:language)
32
- pub language: Option<String>,
33
- /// Generator/application that created the document (meta:generator)
34
- pub generator: Option<String>,
35
- /// Editing duration in ISO 8601 format (meta:editing-duration)
36
- pub editing_duration: Option<String>,
37
- /// Number of edits/revisions (meta:editing-cycles)
38
- pub editing_cycles: Option<String>,
39
- /// Document statistics - page count (meta:page-count)
40
- pub page_count: Option<i32>,
41
- /// Document statistics - word count (meta:word-count)
42
- pub word_count: Option<i32>,
43
- /// Document statistics - character count (meta:character-count)
44
- pub character_count: Option<i32>,
45
- /// Document statistics - paragraph count (meta:paragraph-count)
46
- pub paragraph_count: Option<i32>,
47
- /// Document statistics - table count (meta:table-count)
48
- pub table_count: Option<i32>,
49
- /// Document statistics - image count (meta:image-count)
50
- pub image_count: Option<i32>,
51
- }
52
-
53
- /// Extract ODT metadata from an OpenDocument file
54
- ///
55
- /// Parses `meta.xml` from the ZIP archive and extracts OpenDocument metadata.
56
- ///
57
- /// # Arguments
58
- ///
59
- /// * `archive` - ZIP archive containing the OpenDocument file
60
- ///
61
- /// # Returns
62
- ///
63
- /// Returns `OdtProperties` with extracted metadata. Fields that are not present
64
- /// in the document will be `None`.
65
- ///
66
- /// # Errors
67
- ///
68
- /// Returns an error if:
69
- /// - The ZIP archive cannot be read
70
- /// - The meta.xml file is malformed
71
- /// - XML parsing fails
72
- ///
73
- /// # Example
74
- ///
75
- /// ```no_run
76
- /// use kreuzberg::extraction::office_metadata::extract_odt_properties;
77
- /// use std::fs::File;
78
- /// use zip::ZipArchive;
79
- ///
80
- /// let file = File::open("document.odt")?;
81
- /// let mut archive = ZipArchive::new(file)?;
82
- /// let props = extract_odt_properties(&mut archive)?;
83
- ///
84
- /// println!("Title: {:?}", props.title);
85
- /// println!("Creator: {:?}", props.creator);
86
- /// println!("Created: {:?}", props.creation_date);
87
- /// # Ok::<(), Box<dyn std::error::Error>>(())
88
- /// ```
89
- pub fn extract_odt_properties<R: Read + std::io::Seek>(archive: &mut ZipArchive<R>) -> Result<OdtProperties> {
90
- let mut xml_content = String::new();
91
-
92
- match archive.by_name("meta.xml") {
93
- Ok(mut file) => {
94
- file.read_to_string(&mut xml_content)
95
- .map_err(|e| KreuzbergError::parsing(format!("Failed to read meta.xml: {}", e)))?;
96
- }
97
- Err(_) => {
98
- return Ok(OdtProperties::default());
99
- }
100
- }
101
-
102
- let doc = roxmltree::Document::parse(&xml_content)
103
- .map_err(|e| KreuzbergError::parsing(format!("Failed to parse meta.xml: {}", e)))?;
104
-
105
- let root = doc.root_element();
106
-
107
- // Extract Dublin Core elements
108
- let title = super::parse_xml_text(root, "title");
109
- let subject = super::parse_xml_text(root, "subject");
110
- let creator = super::parse_xml_text(root, "creator");
111
- let description = super::parse_xml_text(root, "description");
112
- let language = super::parse_xml_text(root, "language");
113
- let date = super::parse_xml_text(root, "date");
114
-
115
- // Extract OpenDocument meta elements
116
- let initial_creator = super::parse_xml_text(root, "initial-creator");
117
- let keywords = super::parse_xml_text(root, "keyword");
118
- let creation_date = super::parse_xml_text(root, "creation-date");
119
- let generator = super::parse_xml_text(root, "generator");
120
- let editing_duration = super::parse_xml_text(root, "editing-duration");
121
- let editing_cycles = super::parse_xml_text(root, "editing-cycles");
122
-
123
- // Extract document statistics
124
- let page_count = super::parse_xml_int(root, "page-count");
125
- let word_count = super::parse_xml_int(root, "word-count");
126
- let character_count = super::parse_xml_int(root, "character-count");
127
- let paragraph_count = super::parse_xml_int(root, "paragraph-count");
128
- let table_count = super::parse_xml_int(root, "table-count");
129
- let image_count = super::parse_xml_int(root, "image-count");
130
-
131
- Ok(OdtProperties {
132
- title,
133
- subject,
134
- creator,
135
- initial_creator,
136
- keywords,
137
- description,
138
- date,
139
- creation_date,
140
- language,
141
- generator,
142
- editing_duration,
143
- editing_cycles,
144
- page_count,
145
- word_count,
146
- character_count,
147
- paragraph_count,
148
- table_count,
149
- image_count,
150
- })
151
- }
152
-
153
- #[cfg(test)]
154
- mod tests {
155
- use super::*;
156
- use std::io::{Cursor, Write};
157
-
158
- fn create_test_zip_with_meta_xml(meta_xml: &str) -> ZipArchive<Cursor<Vec<u8>>> {
159
- let buffer = Vec::new();
160
- let cursor = Cursor::new(buffer);
161
- let mut zip = zip::ZipWriter::new(cursor);
162
-
163
- let options = zip::write::FileOptions::<()>::default().compression_method(zip::CompressionMethod::Stored);
164
-
165
- zip.start_file("meta.xml", options).unwrap();
166
- zip.write_all(meta_xml.as_bytes()).unwrap();
167
-
168
- let cursor = zip.finish().unwrap();
169
- ZipArchive::new(cursor).unwrap()
170
- }
171
-
172
- #[test]
173
- fn test_extract_odt_properties_full() {
174
- let meta_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
175
- <office:document-meta xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"
176
- xmlns:dc="http://purl.org/dc/elements/1.1/"
177
- xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
178
- office:version="1.3">
179
- <office:meta>
180
- <dc:title>Test Document</dc:title>
181
- <dc:subject>Testing</dc:subject>
182
- <dc:creator>John Doe</dc:creator>
183
- <meta:initial-creator>Jane Smith</meta:initial-creator>
184
- <dc:description>A test document for ODT metadata</dc:description>
185
- <meta:keyword>test, metadata, odt</meta:keyword>
186
- <dc:language>en-US</dc:language>
187
- <meta:creation-date>2024-01-01T10:00:00Z</meta:creation-date>
188
- <dc:date>2024-01-02T15:30:00Z</dc:date>
189
- <meta:generator>LibreOffice/24.2</meta:generator>
190
- <meta:editing-duration>PT2H30M</meta:editing-duration>
191
- <meta:editing-cycles>5</meta:editing-cycles>
192
- <meta:page-count>10</meta:page-count>
193
- <meta:word-count>1500</meta:word-count>
194
- <meta:character-count>9000</meta:character-count>
195
- <meta:paragraph-count>45</meta:paragraph-count>
196
- <meta:table-count>3</meta:table-count>
197
- <meta:image-count>7</meta:image-count>
198
- </office:meta>
199
- </office:document-meta>"#;
200
-
201
- let mut archive = create_test_zip_with_meta_xml(meta_xml);
202
- let props = extract_odt_properties(&mut archive).unwrap();
203
-
204
- assert_eq!(props.title, Some("Test Document".to_string()));
205
- assert_eq!(props.subject, Some("Testing".to_string()));
206
- assert_eq!(props.creator, Some("John Doe".to_string()));
207
- assert_eq!(props.initial_creator, Some("Jane Smith".to_string()));
208
- assert_eq!(props.keywords, Some("test, metadata, odt".to_string()));
209
- assert_eq!(props.description, Some("A test document for ODT metadata".to_string()));
210
- assert_eq!(props.language, Some("en-US".to_string()));
211
- assert_eq!(props.creation_date, Some("2024-01-01T10:00:00Z".to_string()));
212
- assert_eq!(props.date, Some("2024-01-02T15:30:00Z".to_string()));
213
- assert_eq!(props.generator, Some("LibreOffice/24.2".to_string()));
214
- assert_eq!(props.editing_duration, Some("PT2H30M".to_string()));
215
- assert_eq!(props.editing_cycles, Some("5".to_string()));
216
- assert_eq!(props.page_count, Some(10));
217
- assert_eq!(props.word_count, Some(1500));
218
- assert_eq!(props.character_count, Some(9000));
219
- assert_eq!(props.paragraph_count, Some(45));
220
- assert_eq!(props.table_count, Some(3));
221
- assert_eq!(props.image_count, Some(7));
222
- }
223
-
224
- #[test]
225
- fn test_extract_odt_properties_minimal() {
226
- let meta_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
227
- <office:document-meta xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"
228
- xmlns:dc="http://purl.org/dc/elements/1.1/"
229
- xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
230
- office:version="1.3">
231
- <office:meta>
232
- <dc:creator>Alice</dc:creator>
233
- <meta:creation-date>2024-01-01T10:00:00Z</meta:creation-date>
234
- </office:meta>
235
- </office:document-meta>"#;
236
-
237
- let mut archive = create_test_zip_with_meta_xml(meta_xml);
238
- let props = extract_odt_properties(&mut archive).unwrap();
239
-
240
- assert_eq!(props.creator, Some("Alice".to_string()));
241
- assert_eq!(props.creation_date, Some("2024-01-01T10:00:00Z".to_string()));
242
- assert_eq!(props.title, None);
243
- assert_eq!(props.keywords, None);
244
- assert_eq!(props.word_count, None);
245
- }
246
-
247
- #[test]
248
- fn test_extract_odt_properties_empty_elements() {
249
- let meta_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
250
- <office:document-meta xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"
251
- xmlns:dc="http://purl.org/dc/elements/1.1/"
252
- xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
253
- office:version="1.3">
254
- <office:meta>
255
- <dc:title></dc:title>
256
- <dc:creator>Bob</dc:creator>
257
- </office:meta>
258
- </office:document-meta>"#;
259
-
260
- let mut archive = create_test_zip_with_meta_xml(meta_xml);
261
- let props = extract_odt_properties(&mut archive).unwrap();
262
-
263
- assert_eq!(props.title, None);
264
- assert_eq!(props.creator, Some("Bob".to_string()));
265
- }
266
-
267
- #[test]
268
- fn test_extract_odt_properties_missing_file() {
269
- let buffer = Vec::new();
270
- let cursor = Cursor::new(buffer);
271
- let zip = zip::ZipWriter::new(cursor);
272
- let cursor = zip.finish().unwrap();
273
- let mut archive = ZipArchive::new(cursor).unwrap();
274
-
275
- let props = extract_odt_properties(&mut archive).unwrap();
276
- assert_eq!(props, OdtProperties::default());
277
- }
278
-
279
- #[test]
280
- fn test_extract_odt_properties_malformed_xml() {
281
- let meta_xml = "not valid xml <";
282
- let mut archive = create_test_zip_with_meta_xml(meta_xml);
283
-
284
- let result = extract_odt_properties(&mut archive);
285
- assert!(result.is_err());
286
- }
287
- }