kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175)
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +11 -11
  6. data/README.md +5 -10
  7. data/examples/async_patterns.rb +0 -1
  8. data/ext/kreuzberg_rb/extconf.rb +0 -10
  9. data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
  10. data/ext/kreuzberg_rb/native/build.rs +2 -0
  11. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  12. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  13. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  14. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  15. data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
  16. data/kreuzberg.gemspec +14 -57
  17. data/lib/kreuzberg/cache_api.rb +0 -1
  18. data/lib/kreuzberg/cli.rb +2 -2
  19. data/lib/kreuzberg/config.rb +2 -9
  20. data/lib/kreuzberg/errors.rb +7 -75
  21. data/lib/kreuzberg/extraction_api.rb +0 -1
  22. data/lib/kreuzberg/setup_lib_path.rb +0 -1
  23. data/lib/kreuzberg/version.rb +1 -1
  24. data/lib/kreuzberg.rb +0 -21
  25. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  26. data/sig/kreuzberg.rbs +3 -55
  27. data/spec/binding/cli_proxy_spec.rb +4 -2
  28. data/spec/binding/cli_spec.rb +11 -12
  29. data/spec/examples.txt +104 -0
  30. data/spec/fixtures/config.yaml +1 -0
  31. data/spec/spec_helper.rb +1 -1
  32. data/vendor/kreuzberg/Cargo.toml +42 -112
  33. data/vendor/kreuzberg/README.md +2 -2
  34. data/vendor/kreuzberg/build.rs +4 -18
  35. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  36. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  37. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  38. data/vendor/kreuzberg/src/core/extractor.rs +81 -202
  39. data/vendor/kreuzberg/src/core/io.rs +2 -4
  40. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  41. data/vendor/kreuzberg/src/core/mod.rs +1 -4
  42. data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
  43. data/vendor/kreuzberg/src/embeddings.rs +16 -125
  44. data/vendor/kreuzberg/src/error.rs +1 -1
  45. data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
  46. data/vendor/kreuzberg/src/extraction/image.rs +13 -13
  47. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
  48. data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
  49. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  50. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  51. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  52. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  53. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  54. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  55. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  56. data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
  57. data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
  58. data/vendor/kreuzberg/src/extractors/email.rs +0 -14
  59. data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
  60. data/vendor/kreuzberg/src/extractors/html.rs +154 -137
  61. data/vendor/kreuzberg/src/extractors/image.rs +4 -7
  62. data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
  63. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  64. data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
  65. data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
  66. data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
  67. data/vendor/kreuzberg/src/extractors/text.rs +5 -23
  68. data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
  69. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  70. data/vendor/kreuzberg/src/lib.rs +1 -4
  71. data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
  72. data/vendor/kreuzberg/src/mcp/server.rs +3 -5
  73. data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
  74. data/vendor/kreuzberg/src/pdf/error.rs +1 -1
  75. data/vendor/kreuzberg/src/pdf/table.rs +44 -17
  76. data/vendor/kreuzberg/src/pdf/text.rs +3 -0
  77. data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
  78. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
  79. data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
  80. data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
  81. data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
  82. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  83. data/vendor/kreuzberg/src/types.rs +12 -42
  84. data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
  85. data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
  86. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  87. data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
  88. data/vendor/kreuzberg/tests/config_features.rs +0 -18
  89. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
  90. data/vendor/kreuzberg/tests/core_integration.rs +7 -24
  91. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  92. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  93. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  94. data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
  95. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
  96. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
  97. data/vendor/kreuzberg/tests/security_validation.rs +1 -12
  98. metadata +25 -90
  99. data/.rubocop.yml +0 -538
  100. data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
  101. data/lib/kreuzberg/error_context.rb +0 -32
  102. data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
  103. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
  104. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
  105. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
  106. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
  107. data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
  108. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
  109. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  110. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
  111. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
  112. data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
  113. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
  114. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  115. data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
  116. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
  117. data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
  119. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  121. data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
  122. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  123. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  124. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  125. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
  126. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  127. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  128. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  129. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  130. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  131. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  132. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  133. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  134. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  135. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
  136. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  137. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  138. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
  139. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
  140. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
  141. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
  142. data/vendor/rb-sys/.cargo-ok +0 -1
  143. data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
  144. data/vendor/rb-sys/Cargo.lock +0 -393
  145. data/vendor/rb-sys/Cargo.toml +0 -70
  146. data/vendor/rb-sys/Cargo.toml.orig +0 -57
  147. data/vendor/rb-sys/LICENSE-APACHE +0 -190
  148. data/vendor/rb-sys/LICENSE-MIT +0 -21
  149. data/vendor/rb-sys/bin/release.sh +0 -21
  150. data/vendor/rb-sys/build/features.rs +0 -108
  151. data/vendor/rb-sys/build/main.rs +0 -246
  152. data/vendor/rb-sys/build/stable_api_config.rs +0 -153
  153. data/vendor/rb-sys/build/version.rs +0 -48
  154. data/vendor/rb-sys/readme.md +0 -36
  155. data/vendor/rb-sys/src/bindings.rs +0 -21
  156. data/vendor/rb-sys/src/hidden.rs +0 -11
  157. data/vendor/rb-sys/src/lib.rs +0 -34
  158. data/vendor/rb-sys/src/macros.rs +0 -371
  159. data/vendor/rb-sys/src/memory.rs +0 -53
  160. data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
  161. data/vendor/rb-sys/src/special_consts.rs +0 -31
  162. data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
  163. data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
  164. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
  165. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
  166. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
  167. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
  168. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
  169. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
  170. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
  171. data/vendor/rb-sys/src/stable_api.rs +0 -261
  172. data/vendor/rb-sys/src/symbol.rs +0 -31
  173. data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
  174. data/vendor/rb-sys/src/utils.rs +0 -89
  175. data/vendor/rb-sys/src/value_type.rs +0 -7
@@ -1,469 +0,0 @@
1
- //! BibTeX bibliography extractor.
2
- //!
3
- //! Extracts and parses BibTeX bibliography files, providing structured access
4
- //! to bibliography entries, metadata, and author information.
5
-
6
- use crate::Result;
7
- use crate::core::config::ExtractionConfig;
8
- use crate::plugins::{DocumentExtractor, Plugin};
9
- use crate::types::{ExtractionResult, Metadata};
10
- use async_trait::async_trait;
11
- use std::collections::{HashMap, HashSet};
12
-
13
- #[cfg(feature = "office")]
14
- use biblatex::{Bibliography, ChunksExt};
15
-
16
- /// BibTeX bibliography extractor.
17
- ///
18
- /// Parses BibTeX files and extracts structured bibliography data including
19
- /// entries, authors, publication years, and entry type distribution.
20
- pub struct BibtexExtractor;
21
-
22
- impl BibtexExtractor {
23
- /// Create a new BibTeX extractor.
24
- pub fn new() -> Self {
25
- Self
26
- }
27
- }
28
-
29
- impl Default for BibtexExtractor {
30
- fn default() -> Self {
31
- Self::new()
32
- }
33
- }
34
-
35
- impl Plugin for BibtexExtractor {
36
- fn name(&self) -> &str {
37
- "bibtex-extractor"
38
- }
39
-
40
- fn version(&self) -> String {
41
- env!("CARGO_PKG_VERSION").to_string()
42
- }
43
-
44
- fn initialize(&self) -> Result<()> {
45
- Ok(())
46
- }
47
-
48
- fn shutdown(&self) -> Result<()> {
49
- Ok(())
50
- }
51
-
52
- fn description(&self) -> &str {
53
- "Extracts and parses BibTeX bibliography files with structured metadata"
54
- }
55
-
56
- fn author(&self) -> &str {
57
- "Kreuzberg Team"
58
- }
59
- }
60
-
61
- #[cfg(feature = "office")]
62
- #[async_trait]
63
- impl DocumentExtractor for BibtexExtractor {
64
- #[cfg_attr(feature = "otel", tracing::instrument(
65
- skip(self, content, _config),
66
- fields(
67
- extractor.name = self.name(),
68
- content.size_bytes = content.len(),
69
- )
70
- ))]
71
- async fn extract_bytes(
72
- &self,
73
- content: &[u8],
74
- mime_type: &str,
75
- _config: &ExtractionConfig,
76
- ) -> Result<ExtractionResult> {
77
- let bibtex_str = String::from_utf8_lossy(content);
78
-
79
- let mut entries_vec = Vec::new();
80
- let mut authors_set = HashSet::new();
81
- let mut years_set = HashSet::new();
82
- let mut entry_types_map = HashMap::new();
83
- let mut formatted_entries = String::new();
84
-
85
- match Bibliography::parse(&bibtex_str) {
86
- Ok(bib) => {
87
- for entry in bib.iter() {
88
- let key = entry.key.clone();
89
- let entry_type = entry.entry_type.clone();
90
-
91
- formatted_entries.push_str(&format!("@{} {{\n", entry_type));
92
- formatted_entries.push_str(&format!(" key = {},\n", key));
93
-
94
- for (field_name, field_chunks) in &entry.fields {
95
- let field_text = field_chunks.format_verbatim();
96
- formatted_entries.push_str(&format!(" {} = {},\n", field_name, field_text));
97
-
98
- if field_name.to_lowercase() == "author" {
99
- let authors_text = field_chunks.format_verbatim();
100
- for author in authors_text.split(" and ") {
101
- let trimmed_author = author.trim().to_string();
102
- if !trimmed_author.is_empty() {
103
- authors_set.insert(trimmed_author);
104
- }
105
- }
106
- }
107
-
108
- if field_name.to_lowercase() == "year" {
109
- let year_str = field_chunks.format_verbatim();
110
- if let Ok(year) = year_str.parse::<u32>() {
111
- years_set.insert(year);
112
- }
113
- }
114
- }
115
-
116
- formatted_entries.push_str("}\n\n");
117
-
118
- *entry_types_map
119
- .entry(entry_type.to_string().to_lowercase())
120
- .or_insert(0) += 1;
121
-
122
- entries_vec.push(key);
123
- }
124
- }
125
- Err(_err) => {
126
- #[cfg(feature = "otel")]
127
- tracing::warn!("BibTeX parsing failed, returning raw content: {}", _err);
128
- formatted_entries = bibtex_str.to_string();
129
- }
130
- }
131
-
132
- let mut additional = HashMap::new();
133
-
134
- additional.insert("entry_count".to_string(), serde_json::json!(entries_vec.len()));
135
-
136
- let mut authors_list: Vec<String> = authors_set.into_iter().collect();
137
- authors_list.sort();
138
- additional.insert("authors".to_string(), serde_json::json!(authors_list));
139
-
140
- if !years_set.is_empty() {
141
- let min_year = years_set.iter().min().copied().unwrap_or(0);
142
- let max_year = years_set.iter().max().copied().unwrap_or(0);
143
- additional.insert(
144
- "year_range".to_string(),
145
- serde_json::json!({
146
- "min": min_year,
147
- "max": max_year,
148
- "years": years_set.into_iter().collect::<Vec<_>>()
149
- }),
150
- );
151
- }
152
-
153
- if !entry_types_map.is_empty() {
154
- let mut entry_types_json = serde_json::json!({});
155
- for (entry_type, count) in entry_types_map {
156
- entry_types_json[entry_type] = serde_json::json!(count);
157
- }
158
- additional.insert("entry_types".to_string(), entry_types_json);
159
- }
160
-
161
- additional.insert("citation_keys".to_string(), serde_json::json!(entries_vec));
162
-
163
- Ok(ExtractionResult {
164
- content: formatted_entries,
165
- mime_type: mime_type.to_string(),
166
- metadata: Metadata {
167
- additional,
168
- ..Default::default()
169
- },
170
- tables: vec![],
171
- detected_languages: None,
172
- chunks: None,
173
- images: None,
174
- })
175
- }
176
-
177
- fn supported_mime_types(&self) -> &[&str] {
178
- &["application/x-bibtex", "text/x-bibtex"]
179
- }
180
-
181
- fn priority(&self) -> i32 {
182
- 50
183
- }
184
- }
185
-
186
- #[cfg(all(test, feature = "office"))]
187
- mod tests {
188
- use super::*;
189
-
190
- #[tokio::test]
191
- async fn test_can_extract_bibtex_mime_types() {
192
- let extractor = BibtexExtractor::new();
193
- let supported = extractor.supported_mime_types();
194
-
195
- assert!(supported.contains(&"application/x-bibtex"));
196
- assert!(supported.contains(&"text/x-bibtex"));
197
- assert_eq!(supported.len(), 2);
198
- }
199
-
200
- #[tokio::test]
201
- async fn test_extract_simple_bibtex() {
202
- let extractor = BibtexExtractor::new();
203
- let bibtex_content = br#"@article{key2023,
204
- title = {Sample Title},
205
- author = {John Doe},
206
- year = {2023}
207
- }"#;
208
-
209
- let config = ExtractionConfig::default();
210
- let result = extractor
211
- .extract_bytes(bibtex_content, "application/x-bibtex", &config)
212
- .await;
213
-
214
- assert!(result.is_ok());
215
- let result = result.expect("Should extract valid BibTeX entry");
216
-
217
- assert!(result.content.contains("@article"));
218
- assert!(result.content.contains("key2023"));
219
- assert!(result.content.contains("Sample Title"));
220
-
221
- let metadata = &result.metadata;
222
- assert_eq!(metadata.additional.get("entry_count"), Some(&serde_json::json!(1)));
223
- }
224
-
225
- #[tokio::test]
226
- async fn test_extract_multiple_entries() {
227
- let extractor = BibtexExtractor::new();
228
- let bibtex_content = br#"@article{first2020,
229
- title = {First Paper},
230
- author = {Author One},
231
- year = {2020},
232
- journal = {Test Journal}
233
- }
234
-
235
- @book{second2021,
236
- title = {Test Book},
237
- author = {Author Two},
238
- year = {2021},
239
- publisher = {Test Publisher}
240
- }
241
-
242
- @inproceedings{third2022,
243
- title = {Conference Paper},
244
- author = {Author Three},
245
- year = {2022}
246
- }"#;
247
-
248
- let config = ExtractionConfig::default();
249
- let result = extractor
250
- .extract_bytes(bibtex_content, "application/x-bibtex", &config)
251
- .await;
252
-
253
- assert!(result.is_ok());
254
- let result = result.expect("Should extract valid BibTeX entries");
255
-
256
- let metadata = &result.metadata;
257
-
258
- assert_eq!(metadata.additional.get("entry_count"), Some(&serde_json::json!(3)));
259
-
260
- if let Some(keys) = metadata.additional.get("citation_keys")
261
- && let Some(keys_array) = keys.as_array()
262
- {
263
- assert_eq!(keys_array.len(), 3);
264
- }
265
-
266
- if let Some(types) = metadata.additional.get("entry_types") {
267
- assert!(types.get("article").is_some());
268
- assert!(types.get("book").is_some());
269
- assert!(types.get("inproceedings").is_some());
270
- }
271
- }
272
-
273
- #[tokio::test]
274
- async fn test_extract_article_entry() {
275
- let extractor = BibtexExtractor::new();
276
- let bibtex_content = br#"@article{einstein1905,
277
- author = {Albert Einstein},
278
- title = {On the Electrodynamics of Moving Bodies},
279
- journal = {Annalen der Physik},
280
- year = {1905},
281
- volume = {17},
282
- pages = {891-921}
283
- }"#;
284
-
285
- let config = ExtractionConfig::default();
286
- let result = extractor
287
- .extract_bytes(bibtex_content, "application/x-bibtex", &config)
288
- .await;
289
-
290
- assert!(result.is_ok());
291
- let result = result.expect("Should extract valid article entry");
292
-
293
- assert!(result.content.contains("@article"));
294
- assert!(result.content.contains("einstein1905"));
295
- assert!(result.content.contains("On the Electrodynamics of Moving Bodies"));
296
- assert!(result.content.contains("Annalen der Physik"));
297
-
298
- let metadata = &result.metadata;
299
- if let Some(authors) = metadata.additional.get("authors")
300
- && let Some(authors_array) = authors.as_array()
301
- {
302
- assert!(!authors_array.is_empty());
303
- assert!(authors_array[0].as_str().unwrap_or("").contains("Einstein"));
304
- }
305
- }
306
-
307
- #[tokio::test]
308
- async fn test_extract_book_entry() {
309
- let extractor = BibtexExtractor::new();
310
- let bibtex_content = br#"@book{knuth1984,
311
- author = {Donald E. Knuth},
312
- title = {The TeXbook},
313
- publisher = {Addison-Wesley},
314
- year = {1984}
315
- }"#;
316
-
317
- let config = ExtractionConfig::default();
318
- let result = extractor
319
- .extract_bytes(bibtex_content, "application/x-bibtex", &config)
320
- .await;
321
-
322
- assert!(result.is_ok());
323
- let result = result.expect("Should extract valid book entry");
324
-
325
- assert!(result.content.contains("@book"));
326
- assert!(result.content.contains("knuth1984"));
327
- assert!(result.content.contains("The TeXbook"));
328
-
329
- let metadata = &result.metadata;
330
- assert_eq!(metadata.additional.get("entry_count"), Some(&serde_json::json!(1)));
331
-
332
- if let Some(year_range) = metadata.additional.get("year_range") {
333
- assert_eq!(year_range.get("min"), Some(&serde_json::json!(1984)));
334
- assert_eq!(year_range.get("max"), Some(&serde_json::json!(1984)));
335
- }
336
- }
337
-
338
- #[tokio::test]
339
- async fn test_extract_metadata() {
340
- let extractor = BibtexExtractor::new();
341
- let bibtex_content = br#"@article{paper1,
342
- author = {Alice Smith and Bob Jones},
343
- title = {Title 1},
344
- year = {2020}
345
- }
346
-
347
- @article{paper2,
348
- author = {Charlie Brown},
349
- title = {Title 2},
350
- year = {2021}
351
- }
352
-
353
- @book{book1,
354
- author = {David Lee},
355
- title = {Book Title},
356
- year = {2019}
357
- }"#;
358
-
359
- let config = ExtractionConfig::default();
360
- let result = extractor
361
- .extract_bytes(bibtex_content, "application/x-bibtex", &config)
362
- .await;
363
-
364
- assert!(result.is_ok());
365
- let result = result.expect("Should extract valid metadata");
366
- let metadata = &result.metadata;
367
-
368
- assert_eq!(metadata.additional.get("entry_count"), Some(&serde_json::json!(3)));
369
-
370
- if let Some(authors) = metadata.additional.get("authors")
371
- && let Some(authors_array) = authors.as_array()
372
- {
373
- assert!(authors_array.len() >= 4);
374
- }
375
-
376
- if let Some(year_range) = metadata.additional.get("year_range") {
377
- assert_eq!(year_range.get("min"), Some(&serde_json::json!(2019)));
378
- assert_eq!(year_range.get("max"), Some(&serde_json::json!(2021)));
379
- }
380
-
381
- if let Some(types) = metadata.additional.get("entry_types") {
382
- assert_eq!(types.get("article"), Some(&serde_json::json!(2)));
383
- assert_eq!(types.get("book"), Some(&serde_json::json!(1)));
384
- }
385
- }
386
-
387
- #[tokio::test]
388
- async fn test_empty_bibliography() {
389
- let extractor = BibtexExtractor::new();
390
- let bibtex_content = b"";
391
-
392
- let config = ExtractionConfig::default();
393
- let result = extractor
394
- .extract_bytes(bibtex_content, "application/x-bibtex", &config)
395
- .await;
396
-
397
- assert!(result.is_ok());
398
- let result = result.expect("Should extract empty bibliography");
399
- let metadata = &result.metadata;
400
-
401
- assert_eq!(metadata.additional.get("entry_count"), Some(&serde_json::json!(0)));
402
- }
403
-
404
- #[tokio::test]
405
- async fn test_malformed_entry() {
406
- let extractor = BibtexExtractor::new();
407
- let bibtex_content = br#"@article{incomplete
408
- title = {Missing fields}
409
-
410
- Some random text that's not valid BibTeX"#;
411
-
412
- let config = ExtractionConfig::default();
413
- let result = extractor
414
- .extract_bytes(bibtex_content, "application/x-bibtex", &config)
415
- .await;
416
-
417
- assert!(result.is_ok());
418
- let result = result.expect("Should extract malformed entry as raw content");
419
-
420
- assert!(!result.content.is_empty());
421
- }
422
-
423
- #[tokio::test]
424
- async fn test_multiple_authors_extraction() {
425
- let extractor = BibtexExtractor::new();
426
- let bibtex_content = br#"@article{collab2022,
427
- author = {First Author and Second Author and Third Author},
428
- title = {Collaborative Work},
429
- year = {2022}
430
- }"#;
431
-
432
- let config = ExtractionConfig::default();
433
- let result = extractor
434
- .extract_bytes(bibtex_content, "application/x-bibtex", &config)
435
- .await;
436
-
437
- assert!(result.is_ok());
438
- let result = result.expect("Should extract multiple authors");
439
- let metadata = &result.metadata;
440
-
441
- if let Some(authors) = metadata.additional.get("authors")
442
- && let Some(authors_array) = authors.as_array()
443
- {
444
- assert!(authors_array.len() >= 3);
445
- }
446
- }
447
-
448
- #[tokio::test]
449
- async fn test_bibtex_extractor_plugin_interface() {
450
- let extractor = BibtexExtractor::new();
451
- assert_eq!(extractor.name(), "bibtex-extractor");
452
- assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
453
- assert_eq!(extractor.priority(), 50);
454
- assert!(!extractor.supported_mime_types().is_empty());
455
- }
456
-
457
- #[test]
458
- fn test_bibtex_extractor_default() {
459
- let extractor = BibtexExtractor;
460
- assert_eq!(extractor.name(), "bibtex-extractor");
461
- }
462
-
463
- #[tokio::test]
464
- async fn test_bibtex_extractor_initialize_shutdown() {
465
- let extractor = BibtexExtractor::new();
466
- assert!(extractor.initialize().is_ok());
467
- assert!(extractor.shutdown().is_ok());
468
- }
469
- }