kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +11 -11
- data/README.md +5 -10
- data/examples/async_patterns.rb +0 -1
- data/ext/kreuzberg_rb/extconf.rb +0 -10
- data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
- data/ext/kreuzberg_rb/native/build.rs +2 -0
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
- data/kreuzberg.gemspec +14 -57
- data/lib/kreuzberg/cache_api.rb +0 -1
- data/lib/kreuzberg/cli.rb +2 -2
- data/lib/kreuzberg/config.rb +2 -9
- data/lib/kreuzberg/errors.rb +7 -75
- data/lib/kreuzberg/extraction_api.rb +0 -1
- data/lib/kreuzberg/setup_lib_path.rb +0 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -21
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +3 -55
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/spec_helper.rb +1 -1
- data/vendor/kreuzberg/Cargo.toml +42 -112
- data/vendor/kreuzberg/README.md +2 -2
- data/vendor/kreuzberg/build.rs +4 -18
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/extractor.rs +81 -202
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +1 -4
- data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
- data/vendor/kreuzberg/src/embeddings.rs +16 -125
- data/vendor/kreuzberg/src/error.rs +1 -1
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
- data/vendor/kreuzberg/src/extraction/image.rs +13 -13
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
- data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
- data/vendor/kreuzberg/src/extractors/email.rs +0 -14
- data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
- data/vendor/kreuzberg/src/extractors/html.rs +154 -137
- data/vendor/kreuzberg/src/extractors/image.rs +4 -7
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
- data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
- data/vendor/kreuzberg/src/extractors/text.rs +5 -23
- data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/lib.rs +1 -4
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
- data/vendor/kreuzberg/src/mcp/server.rs +3 -5
- data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
- data/vendor/kreuzberg/src/pdf/error.rs +1 -1
- data/vendor/kreuzberg/src/pdf/table.rs +44 -17
- data/vendor/kreuzberg/src/pdf/text.rs +3 -0
- data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/types.rs +12 -42
- data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
- data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
- data/vendor/kreuzberg/tests/config_features.rs +0 -18
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
- data/vendor/kreuzberg/tests/core_integration.rs +7 -24
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
- data/vendor/kreuzberg/tests/security_validation.rs +1 -12
- metadata +25 -90
- data/.rubocop.yml +0 -538
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
- data/lib/kreuzberg/error_context.rb +0 -32
- data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
- data/vendor/rb-sys/Cargo.lock +0 -393
- data/vendor/rb-sys/Cargo.toml +0 -70
- data/vendor/rb-sys/Cargo.toml.orig +0 -57
- data/vendor/rb-sys/LICENSE-APACHE +0 -190
- data/vendor/rb-sys/LICENSE-MIT +0 -21
- data/vendor/rb-sys/bin/release.sh +0 -21
- data/vendor/rb-sys/build/features.rs +0 -108
- data/vendor/rb-sys/build/main.rs +0 -246
- data/vendor/rb-sys/build/stable_api_config.rs +0 -153
- data/vendor/rb-sys/build/version.rs +0 -48
- data/vendor/rb-sys/readme.md +0 -36
- data/vendor/rb-sys/src/bindings.rs +0 -21
- data/vendor/rb-sys/src/hidden.rs +0 -11
- data/vendor/rb-sys/src/lib.rs +0 -34
- data/vendor/rb-sys/src/macros.rs +0 -371
- data/vendor/rb-sys/src/memory.rs +0 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
- data/vendor/rb-sys/src/special_consts.rs +0 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
- data/vendor/rb-sys/src/stable_api.rs +0 -261
- data/vendor/rb-sys/src/symbol.rs +0 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
- data/vendor/rb-sys/src/utils.rs +0 -89
- data/vendor/rb-sys/src/value_type.rs +0 -7
|
@@ -1,469 +0,0 @@
|
|
|
1
|
-
//! BibTeX bibliography extractor.
|
|
2
|
-
//!
|
|
3
|
-
//! Extracts and parses BibTeX bibliography files, providing structured access
|
|
4
|
-
//! to bibliography entries, metadata, and author information.
|
|
5
|
-
|
|
6
|
-
use crate::Result;
|
|
7
|
-
use crate::core::config::ExtractionConfig;
|
|
8
|
-
use crate::plugins::{DocumentExtractor, Plugin};
|
|
9
|
-
use crate::types::{ExtractionResult, Metadata};
|
|
10
|
-
use async_trait::async_trait;
|
|
11
|
-
use std::collections::{HashMap, HashSet};
|
|
12
|
-
|
|
13
|
-
#[cfg(feature = "office")]
|
|
14
|
-
use biblatex::{Bibliography, ChunksExt};
|
|
15
|
-
|
|
16
|
-
/// BibTeX bibliography extractor.
|
|
17
|
-
///
|
|
18
|
-
/// Parses BibTeX files and extracts structured bibliography data including
|
|
19
|
-
/// entries, authors, publication years, and entry type distribution.
|
|
20
|
-
// Unit struct: it declares no fields, so a value carries no per-instance state.
pub struct BibtexExtractor;
|
|
21
|
-
|
|
22
|
-
impl BibtexExtractor {
|
|
23
|
-
/// Create a new BibTeX extractor.
|
|
24
|
-
pub fn new() -> Self {
|
|
25
|
-
Self
|
|
26
|
-
}
|
|
27
|
-
}
|
|
28
|
-
|
|
29
|
-
impl Default for BibtexExtractor {
|
|
30
|
-
fn default() -> Self {
|
|
31
|
-
Self::new()
|
|
32
|
-
}
|
|
33
|
-
}
|
|
34
|
-
|
|
35
|
-
impl Plugin for BibtexExtractor {
|
|
36
|
-
fn name(&self) -> &str {
|
|
37
|
-
"bibtex-extractor"
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
fn version(&self) -> String {
|
|
41
|
-
env!("CARGO_PKG_VERSION").to_string()
|
|
42
|
-
}
|
|
43
|
-
|
|
44
|
-
fn initialize(&self) -> Result<()> {
|
|
45
|
-
Ok(())
|
|
46
|
-
}
|
|
47
|
-
|
|
48
|
-
fn shutdown(&self) -> Result<()> {
|
|
49
|
-
Ok(())
|
|
50
|
-
}
|
|
51
|
-
|
|
52
|
-
fn description(&self) -> &str {
|
|
53
|
-
"Extracts and parses BibTeX bibliography files with structured metadata"
|
|
54
|
-
}
|
|
55
|
-
|
|
56
|
-
fn author(&self) -> &str {
|
|
57
|
-
"Kreuzberg Team"
|
|
58
|
-
}
|
|
59
|
-
}
|
|
60
|
-
|
|
61
|
-
#[cfg(feature = "office")]
#[async_trait]
impl DocumentExtractor for BibtexExtractor {
    /// Parses `content` as BibTeX and returns a text rendering plus summary metadata.
    ///
    /// The bytes are decoded lossily as UTF-8. When parsing succeeds, every entry is
    /// rendered as `@type { key = ..., field = ..., }` and these keys are written to
    /// `metadata.additional`:
    ///
    /// - `entry_count`: number of parsed entries.
    /// - `authors`: sorted, de-duplicated author names (the `author` field split on
    ///   `" and "`).
    /// - `year_range`: `min`, `max` and the ascending list of distinct `years`; present
    ///   only when at least one `year` field parses as an unsigned integer.
    /// - `entry_types`: number of entries per lower-cased entry type; present only when
    ///   at least one entry was parsed.
    /// - `citation_keys`: entry keys in the order the bibliography yields them.
    ///
    /// When parsing fails, the decoded input is returned verbatim as the content and the
    /// metadata describes an empty bibliography. This method never returns `Err`.
    ///
    /// `mime_type` is echoed back unchanged in the result; `_config` is unused.
    #[cfg_attr(feature = "otel", tracing::instrument(
        skip(self, content, _config),
        fields(
            extractor.name = self.name(),
            content.size_bytes = content.len(),
        )
    ))]
    async fn extract_bytes(
        &self,
        content: &[u8],
        mime_type: &str,
        _config: &ExtractionConfig,
    ) -> Result<ExtractionResult> {
        let bibtex_str = String::from_utf8_lossy(content);

        let mut entries_vec = Vec::new();
        let mut authors_set = HashSet::new();
        let mut years_set = HashSet::new();
        let mut entry_types_map: HashMap<String, usize> = HashMap::new();
        let mut formatted_entries = String::new();

        match Bibliography::parse(&bibtex_str) {
            Ok(bib) => {
                for entry in bib.iter() {
                    let entry_type = &entry.entry_type;

                    formatted_entries.push_str(&format!("@{} {{\n", entry_type));
                    formatted_entries.push_str(&format!(" key = {},\n", entry.key));

                    for (field_name, field_chunks) in &entry.fields {
                        // Render each field once and reuse the text for both the output
                        // and the metadata below.
                        let field_text = field_chunks.format_verbatim();
                        formatted_entries.push_str(&format!(" {} = {},\n", field_name, field_text));

                        // Both targets are plain ASCII, so an allocation-free ASCII
                        // case-insensitive comparison is sufficient.
                        if field_name.eq_ignore_ascii_case("author") {
                            authors_set.extend(
                                field_text
                                    .split(" and ")
                                    .map(str::trim)
                                    .filter(|author| !author.is_empty())
                                    .map(str::to_owned),
                            );
                        } else if field_name.eq_ignore_ascii_case("year") {
                            // Trim first so values such as `{ 2020 }` are not dropped.
                            if let Ok(year) = field_text.trim().parse::<u32>() {
                                years_set.insert(year);
                            }
                        }
                    }

                    formatted_entries.push_str("}\n\n");

                    *entry_types_map
                        .entry(entry_type.to_string().to_lowercase())
                        .or_insert(0) += 1;

                    entries_vec.push(entry.key.clone());
                }
            }
            Err(_err) => {
                #[cfg(feature = "otel")]
                tracing::warn!("BibTeX parsing failed, returning raw content: {}", _err);
                // Best-effort fallback: hand back the decoded input instead of failing.
                formatted_entries = bibtex_str.into_owned();
            }
        }

        let mut additional = HashMap::new();

        additional.insert("entry_count".to_string(), serde_json::json!(entries_vec.len()));

        let mut authors_list: Vec<String> = authors_set.into_iter().collect();
        authors_list.sort();
        additional.insert("authors".to_string(), serde_json::json!(authors_list));

        // Sort the distinct years so the emitted `years` array is deterministic; a
        // `HashSet` iterates in arbitrary order.
        let mut years: Vec<u32> = years_set.into_iter().collect();
        years.sort_unstable();
        if let (Some(&min_year), Some(&max_year)) = (years.first(), years.last()) {
            additional.insert(
                "year_range".to_string(),
                serde_json::json!({
                    "min": min_year,
                    "max": max_year,
                    "years": years
                }),
            );
        }

        if !entry_types_map.is_empty() {
            additional.insert("entry_types".to_string(), serde_json::json!(entry_types_map));
        }

        additional.insert("citation_keys".to_string(), serde_json::json!(entries_vec));

        Ok(ExtractionResult {
            content: formatted_entries,
            mime_type: mime_type.to_string(),
            metadata: Metadata {
                additional,
                ..Default::default()
            },
            tables: vec![],
            detected_languages: None,
            chunks: None,
            images: None,
        })
    }

    /// MIME types this extractor declares support for.
    fn supported_mime_types(&self) -> &[&str] {
        &["application/x-bibtex", "text/x-bibtex"]
    }

    /// Priority value reported for this extractor (`50`).
    fn priority(&self) -> i32 {
        50
    }
}
|
|
185
|
-
|
|
186
|
-
#[cfg(all(test, feature = "office"))]
mod tests {
    use super::*;

    /// Runs the extractor over `content` with the default configuration and unwraps
    /// the result, failing the test if extraction returns an error.
    async fn extract(content: &[u8]) -> ExtractionResult {
        BibtexExtractor::new()
            .extract_bytes(content, "application/x-bibtex", &ExtractionConfig::default())
            .await
            .expect("BibTeX extraction should not fail")
    }

    /// Looks up a key in `metadata.additional`, failing the test when it is absent so
    /// that the assertions depending on it can never be skipped silently.
    fn required<'a>(result: &'a ExtractionResult, key: &str) -> &'a serde_json::Value {
        result
            .metadata
            .additional
            .get(key)
            .unwrap_or_else(|| panic!("metadata key `{key}` should be present"))
    }

    /// Same as [`required`], additionally demanding that the value is a JSON array.
    fn required_array<'a>(result: &'a ExtractionResult, key: &str) -> &'a Vec<serde_json::Value> {
        required(result, key)
            .as_array()
            .unwrap_or_else(|| panic!("metadata key `{key}` should be a JSON array"))
    }

    #[test]
    fn test_can_extract_bibtex_mime_types() {
        let extractor = BibtexExtractor::new();
        let supported = extractor.supported_mime_types();

        assert!(supported.contains(&"application/x-bibtex"));
        assert!(supported.contains(&"text/x-bibtex"));
        assert_eq!(supported.len(), 2);
    }

    #[tokio::test]
    async fn test_extract_simple_bibtex() {
        let result = extract(
            br#"@article{key2023,
    title = {Sample Title},
    author = {John Doe},
    year = {2023}
}"#,
        )
        .await;

        assert!(result.content.contains("@article"));
        assert!(result.content.contains("key2023"));
        assert!(result.content.contains("Sample Title"));

        assert_eq!(required(&result, "entry_count"), &serde_json::json!(1));
    }

    #[tokio::test]
    async fn test_extract_multiple_entries() {
        let result = extract(
            br#"@article{first2020,
    title = {First Paper},
    author = {Author One},
    year = {2020},
    journal = {Test Journal}
}

@book{second2021,
    title = {Test Book},
    author = {Author Two},
    year = {2021},
    publisher = {Test Publisher}
}

@inproceedings{third2022,
    title = {Conference Paper},
    author = {Author Three},
    year = {2022}
}"#,
        )
        .await;

        assert_eq!(required(&result, "entry_count"), &serde_json::json!(3));
        assert_eq!(required_array(&result, "citation_keys").len(), 3);

        let types = required(&result, "entry_types");
        assert!(types.get("article").is_some());
        assert!(types.get("book").is_some());
        assert!(types.get("inproceedings").is_some());
    }

    #[tokio::test]
    async fn test_extract_article_entry() {
        let result = extract(
            br#"@article{einstein1905,
    author = {Albert Einstein},
    title = {On the Electrodynamics of Moving Bodies},
    journal = {Annalen der Physik},
    year = {1905},
    volume = {17},
    pages = {891-921}
}"#,
        )
        .await;

        assert!(result.content.contains("@article"));
        assert!(result.content.contains("einstein1905"));
        assert!(result.content.contains("On the Electrodynamics of Moving Bodies"));
        assert!(result.content.contains("Annalen der Physik"));

        let authors = required_array(&result, "authors");
        let first_author = authors
            .first()
            .and_then(serde_json::Value::as_str)
            .expect("at least one author name should be extracted");
        assert!(first_author.contains("Einstein"));
    }

    #[tokio::test]
    async fn test_extract_book_entry() {
        let result = extract(
            br#"@book{knuth1984,
    author = {Donald E. Knuth},
    title = {The TeXbook},
    publisher = {Addison-Wesley},
    year = {1984}
}"#,
        )
        .await;

        assert!(result.content.contains("@book"));
        assert!(result.content.contains("knuth1984"));
        assert!(result.content.contains("The TeXbook"));

        assert_eq!(required(&result, "entry_count"), &serde_json::json!(1));

        let year_range = required(&result, "year_range");
        assert_eq!(year_range.get("min"), Some(&serde_json::json!(1984)));
        assert_eq!(year_range.get("max"), Some(&serde_json::json!(1984)));
    }

    #[tokio::test]
    async fn test_extract_metadata() {
        let result = extract(
            br#"@article{paper1,
    author = {Alice Smith and Bob Jones},
    title = {Title 1},
    year = {2020}
}

@article{paper2,
    author = {Charlie Brown},
    title = {Title 2},
    year = {2021}
}

@book{book1,
    author = {David Lee},
    title = {Book Title},
    year = {2019}
}"#,
        )
        .await;

        assert_eq!(required(&result, "entry_count"), &serde_json::json!(3));
        assert!(required_array(&result, "authors").len() >= 4);

        let year_range = required(&result, "year_range");
        assert_eq!(year_range.get("min"), Some(&serde_json::json!(2019)));
        assert_eq!(year_range.get("max"), Some(&serde_json::json!(2021)));

        let types = required(&result, "entry_types");
        assert_eq!(types.get("article"), Some(&serde_json::json!(2)));
        assert_eq!(types.get("book"), Some(&serde_json::json!(1)));
    }

    #[tokio::test]
    async fn test_empty_bibliography() {
        let result = extract(b"").await;

        assert_eq!(required(&result, "entry_count"), &serde_json::json!(0));
    }

    #[tokio::test]
    async fn test_malformed_entry() {
        // Extraction must still succeed; the extractor either recovers what it can or
        // falls back to the raw input, so the content is never empty here.
        let result = extract(
            br#"@article{incomplete
    title = {Missing fields}

Some random text that's not valid BibTeX"#,
        )
        .await;

        assert!(!result.content.is_empty());
    }

    #[tokio::test]
    async fn test_multiple_authors_extraction() {
        let result = extract(
            br#"@article{collab2022,
    author = {First Author and Second Author and Third Author},
    title = {Collaborative Work},
    year = {2022}
}"#,
        )
        .await;

        assert!(required_array(&result, "authors").len() >= 3);
    }

    #[test]
    fn test_bibtex_extractor_plugin_interface() {
        let extractor = BibtexExtractor::new();
        assert_eq!(extractor.name(), "bibtex-extractor");
        assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
        assert_eq!(extractor.priority(), 50);
        assert!(!extractor.supported_mime_types().is_empty());
    }

    #[test]
    fn test_bibtex_extractor_default() {
        // Go through `Default` explicitly so the trait impl is actually exercised.
        let extractor = BibtexExtractor::default();
        assert_eq!(extractor.name(), "bibtex-extractor");
    }

    #[test]
    fn test_bibtex_extractor_initialize_shutdown() {
        let extractor = BibtexExtractor::new();
        assert!(extractor.initialize().is_ok());
        assert!(extractor.shutdown().is_ok());
    }
}
|