kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +11 -11
- data/README.md +5 -10
- data/examples/async_patterns.rb +0 -1
- data/ext/kreuzberg_rb/extconf.rb +0 -10
- data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
- data/ext/kreuzberg_rb/native/build.rs +2 -0
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
- data/kreuzberg.gemspec +14 -57
- data/lib/kreuzberg/cache_api.rb +0 -1
- data/lib/kreuzberg/cli.rb +2 -2
- data/lib/kreuzberg/config.rb +2 -9
- data/lib/kreuzberg/errors.rb +7 -75
- data/lib/kreuzberg/extraction_api.rb +0 -1
- data/lib/kreuzberg/setup_lib_path.rb +0 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -21
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +3 -55
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/spec_helper.rb +1 -1
- data/vendor/kreuzberg/Cargo.toml +42 -112
- data/vendor/kreuzberg/README.md +2 -2
- data/vendor/kreuzberg/build.rs +4 -18
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/extractor.rs +81 -202
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +1 -4
- data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
- data/vendor/kreuzberg/src/embeddings.rs +16 -125
- data/vendor/kreuzberg/src/error.rs +1 -1
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
- data/vendor/kreuzberg/src/extraction/image.rs +13 -13
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
- data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
- data/vendor/kreuzberg/src/extractors/email.rs +0 -14
- data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
- data/vendor/kreuzberg/src/extractors/html.rs +154 -137
- data/vendor/kreuzberg/src/extractors/image.rs +4 -7
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
- data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
- data/vendor/kreuzberg/src/extractors/text.rs +5 -23
- data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/lib.rs +1 -4
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
- data/vendor/kreuzberg/src/mcp/server.rs +3 -5
- data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
- data/vendor/kreuzberg/src/pdf/error.rs +1 -1
- data/vendor/kreuzberg/src/pdf/table.rs +44 -17
- data/vendor/kreuzberg/src/pdf/text.rs +3 -0
- data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/types.rs +12 -42
- data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
- data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
- data/vendor/kreuzberg/tests/config_features.rs +0 -18
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
- data/vendor/kreuzberg/tests/core_integration.rs +7 -24
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
- data/vendor/kreuzberg/tests/security_validation.rs +1 -12
- metadata +25 -90
- data/.rubocop.yml +0 -538
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
- data/lib/kreuzberg/error_context.rb +0 -32
- data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
- data/vendor/rb-sys/Cargo.lock +0 -393
- data/vendor/rb-sys/Cargo.toml +0 -70
- data/vendor/rb-sys/Cargo.toml.orig +0 -57
- data/vendor/rb-sys/LICENSE-APACHE +0 -190
- data/vendor/rb-sys/LICENSE-MIT +0 -21
- data/vendor/rb-sys/bin/release.sh +0 -21
- data/vendor/rb-sys/build/features.rs +0 -108
- data/vendor/rb-sys/build/main.rs +0 -246
- data/vendor/rb-sys/build/stable_api_config.rs +0 -153
- data/vendor/rb-sys/build/version.rs +0 -48
- data/vendor/rb-sys/readme.md +0 -36
- data/vendor/rb-sys/src/bindings.rs +0 -21
- data/vendor/rb-sys/src/hidden.rs +0 -11
- data/vendor/rb-sys/src/lib.rs +0 -34
- data/vendor/rb-sys/src/macros.rs +0 -371
- data/vendor/rb-sys/src/memory.rs +0 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
- data/vendor/rb-sys/src/special_consts.rs +0 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
- data/vendor/rb-sys/src/stable_api.rs +0 -261
- data/vendor/rb-sys/src/symbol.rs +0 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
- data/vendor/rb-sys/src/utils.rs +0 -89
- data/vendor/rb-sys/src/value_type.rs +0 -7
|
@@ -1,154 +0,0 @@
|
|
|
1
|
-
use std::any::Any;
|
|
2
|
-
use std::time::{SystemTime, UNIX_EPOCH};
|
|
3
|
-
|
|
4
|
-
/// Context information captured when a panic occurs.
|
|
5
|
-
///
|
|
6
|
-
/// This struct stores detailed information about where and when a panic happened,
|
|
7
|
-
/// enabling better error reporting across FFI boundaries.
|
|
8
|
-
#[derive(Debug, Clone)]
|
|
9
|
-
pub struct PanicContext {
|
|
10
|
-
/// Source file where the panic occurred
|
|
11
|
-
pub file: &'static str,
|
|
12
|
-
/// Line number where the panic occurred
|
|
13
|
-
pub line: u32,
|
|
14
|
-
/// Function name where the panic occurred
|
|
15
|
-
pub function: &'static str,
|
|
16
|
-
/// Panic message extracted from the panic payload
|
|
17
|
-
pub message: String,
|
|
18
|
-
/// Timestamp when the panic was captured
|
|
19
|
-
pub timestamp: SystemTime,
|
|
20
|
-
}
|
|
21
|
-
|
|
22
|
-
impl PanicContext {
|
|
23
|
-
/// Creates a new PanicContext with the given parameters.
|
|
24
|
-
///
|
|
25
|
-
/// # Arguments
|
|
26
|
-
///
|
|
27
|
-
/// * `file` - Source file path
|
|
28
|
-
/// * `line` - Line number
|
|
29
|
-
/// * `function` - Function name
|
|
30
|
-
/// * `panic_info` - The panic payload to extract message from
|
|
31
|
-
pub fn new(file: &'static str, line: u32, function: &'static str, panic_info: &dyn Any) -> Self {
|
|
32
|
-
let timestamp = std::panic::catch_unwind(SystemTime::now).unwrap_or(UNIX_EPOCH);
|
|
33
|
-
|
|
34
|
-
Self {
|
|
35
|
-
file,
|
|
36
|
-
line,
|
|
37
|
-
function,
|
|
38
|
-
message: extract_panic_message(panic_info),
|
|
39
|
-
timestamp,
|
|
40
|
-
}
|
|
41
|
-
}
|
|
42
|
-
|
|
43
|
-
/// Formats the panic context as a human-readable string.
|
|
44
|
-
pub fn format(&self) -> String {
|
|
45
|
-
format!(
|
|
46
|
-
"Panic at {}:{}:{} - {}",
|
|
47
|
-
self.file, self.line, self.function, self.message
|
|
48
|
-
)
|
|
49
|
-
}
|
|
50
|
-
}
|
|
51
|
-
|
|
52
|
-
/// Maximum panic message length to prevent DoS attacks
|
|
53
|
-
const MAX_PANIC_MESSAGE_LEN: usize = 4096;
|
|
54
|
-
|
|
55
|
-
/// Extracts a human-readable message from a panic payload.
|
|
56
|
-
///
|
|
57
|
-
/// Attempts to downcast the panic payload to common types (String, &str)
|
|
58
|
-
/// to extract a meaningful error message.
|
|
59
|
-
///
|
|
60
|
-
/// Message is truncated to 4KB to prevent DoS attacks via extremely large panic messages.
|
|
61
|
-
///
|
|
62
|
-
/// # Arguments
|
|
63
|
-
///
|
|
64
|
-
/// * `panic_info` - The panic payload from catch_unwind
|
|
65
|
-
///
|
|
66
|
-
/// # Returns
|
|
67
|
-
///
|
|
68
|
-
/// A string representation of the panic message (truncated if necessary)
|
|
69
|
-
pub fn extract_panic_message(panic_info: &dyn Any) -> String {
|
|
70
|
-
let msg = if let Some(s) = panic_info.downcast_ref::<String>() {
|
|
71
|
-
s.clone()
|
|
72
|
-
} else if let Some(s) = panic_info.downcast_ref::<&str>() {
|
|
73
|
-
(*s).to_string()
|
|
74
|
-
} else {
|
|
75
|
-
"Unknown panic payload".to_string()
|
|
76
|
-
};
|
|
77
|
-
|
|
78
|
-
if msg.len() > MAX_PANIC_MESSAGE_LEN {
|
|
79
|
-
let truncate_at = msg.floor_char_boundary(MAX_PANIC_MESSAGE_LEN);
|
|
80
|
-
format!("{}... [truncated]", &msg[..truncate_at])
|
|
81
|
-
} else {
|
|
82
|
-
msg
|
|
83
|
-
}
|
|
84
|
-
}
|
|
85
|
-
|
|
86
|
-
#[cfg(test)]
|
|
87
|
-
mod tests {
|
|
88
|
-
use super::*;
|
|
89
|
-
|
|
90
|
-
#[test]
|
|
91
|
-
fn test_extract_panic_message_string() {
|
|
92
|
-
let panic_msg = "test panic".to_string();
|
|
93
|
-
let msg = extract_panic_message(&panic_msg);
|
|
94
|
-
assert_eq!(msg, "test panic");
|
|
95
|
-
}
|
|
96
|
-
|
|
97
|
-
#[test]
|
|
98
|
-
fn test_extract_panic_message_str() {
|
|
99
|
-
let panic_msg: &str = "test panic";
|
|
100
|
-
let msg = extract_panic_message(&panic_msg);
|
|
101
|
-
assert_eq!(msg, "test panic");
|
|
102
|
-
}
|
|
103
|
-
|
|
104
|
-
#[test]
|
|
105
|
-
fn test_extract_panic_message_unknown() {
|
|
106
|
-
let panic_msg = 42i32;
|
|
107
|
-
let msg = extract_panic_message(&panic_msg);
|
|
108
|
-
assert_eq!(msg, "Unknown panic payload");
|
|
109
|
-
}
|
|
110
|
-
|
|
111
|
-
#[test]
|
|
112
|
-
fn test_panic_context_format() {
|
|
113
|
-
let panic_msg = "test error".to_string();
|
|
114
|
-
let ctx = PanicContext::new("test.rs", 42, "test_function", &panic_msg);
|
|
115
|
-
|
|
116
|
-
let formatted = ctx.format();
|
|
117
|
-
assert!(formatted.contains("test.rs"));
|
|
118
|
-
assert!(formatted.contains("42"));
|
|
119
|
-
assert!(formatted.contains("test_function"));
|
|
120
|
-
assert!(formatted.contains("test error"));
|
|
121
|
-
}
|
|
122
|
-
|
|
123
|
-
#[test]
|
|
124
|
-
fn test_panic_message_truncation() {
|
|
125
|
-
let long_msg = "x".repeat(5000);
|
|
126
|
-
let msg = extract_panic_message(&long_msg);
|
|
127
|
-
assert!(msg.len() <= MAX_PANIC_MESSAGE_LEN + 20);
|
|
128
|
-
assert!(msg.ends_with("... [truncated]"));
|
|
129
|
-
}
|
|
130
|
-
|
|
131
|
-
#[test]
|
|
132
|
-
fn test_panic_message_truncation_utf8_boundary() {
|
|
133
|
-
let mut msg = "x".repeat(4093);
|
|
134
|
-
msg.push('🦀');
|
|
135
|
-
msg.push_str("yyy");
|
|
136
|
-
|
|
137
|
-
let truncated = extract_panic_message(&msg);
|
|
138
|
-
|
|
139
|
-
assert!(truncated.ends_with("... [truncated]"));
|
|
140
|
-
|
|
141
|
-
assert!(std::str::from_utf8(truncated.as_bytes()).is_ok());
|
|
142
|
-
|
|
143
|
-
assert!(!truncated.contains("🦀"));
|
|
144
|
-
assert!(!truncated.contains("yyy"));
|
|
145
|
-
}
|
|
146
|
-
|
|
147
|
-
#[test]
|
|
148
|
-
fn test_panic_message_no_truncation_needed() {
|
|
149
|
-
let short_msg = "short".to_string();
|
|
150
|
-
let msg = extract_panic_message(&short_msg);
|
|
151
|
-
assert_eq!(msg, "short");
|
|
152
|
-
assert!(!msg.contains("[truncated]"));
|
|
153
|
-
}
|
|
154
|
-
}
|
|
@@ -1,52 +0,0 @@
|
|
|
1
|
-
#![cfg(feature = "api")]
|
|
2
|
-
//! Integration test for the `/extract` API handler using multipart uploads.
|
|
3
|
-
|
|
4
|
-
use axum::{
|
|
5
|
-
body::{Body, to_bytes},
|
|
6
|
-
http::{Request, StatusCode},
|
|
7
|
-
};
|
|
8
|
-
use kreuzberg::{
|
|
9
|
-
ExtractionConfig,
|
|
10
|
-
api::{ApiSizeLimits, create_router_with_limits},
|
|
11
|
-
};
|
|
12
|
-
use serde_json::Value;
|
|
13
|
-
use tower::ServiceExt;
|
|
14
|
-
|
|
15
|
-
#[tokio::test]
|
|
16
|
-
async fn test_extract_accepts_single_file_multipart() {
|
|
17
|
-
let router = create_router_with_limits(ExtractionConfig::default(), ApiSizeLimits::from_mb(5, 5));
|
|
18
|
-
|
|
19
|
-
let boundary = "X-BOUNDARY";
|
|
20
|
-
let body = format!(
|
|
21
|
-
"--{boundary}\r\n\
|
|
22
|
-
Content-Disposition: form-data; name=\"files\"; filename=\"test.txt\"\r\n\
|
|
23
|
-
Content-Type: text/plain\r\n\
|
|
24
|
-
\r\n\
|
|
25
|
-
Hello world\r\n\
|
|
26
|
-
--{boundary}--\r\n"
|
|
27
|
-
);
|
|
28
|
-
let body_bytes = body.into_bytes();
|
|
29
|
-
|
|
30
|
-
let request = Request::builder()
|
|
31
|
-
.method("POST")
|
|
32
|
-
.uri("/extract")
|
|
33
|
-
.header("content-type", format!("multipart/form-data; boundary={boundary}"))
|
|
34
|
-
.header("content-length", body_bytes.len())
|
|
35
|
-
.body(Body::from(body_bytes))
|
|
36
|
-
.expect("Failed to build request");
|
|
37
|
-
|
|
38
|
-
let response = router.oneshot(request).await.expect("Request failed");
|
|
39
|
-
assert_eq!(response.status(), StatusCode::OK);
|
|
40
|
-
|
|
41
|
-
let bytes = to_bytes(response.into_body(), 1_000_000)
|
|
42
|
-
.await
|
|
43
|
-
.expect("Failed to read body");
|
|
44
|
-
let value: Value = serde_json::from_slice(&bytes).expect("Response JSON parse failed");
|
|
45
|
-
let content = value
|
|
46
|
-
.get(0)
|
|
47
|
-
.and_then(|v| v.get("content"))
|
|
48
|
-
.and_then(Value::as_str)
|
|
49
|
-
.expect("Response should include extracted content");
|
|
50
|
-
|
|
51
|
-
assert_eq!(content.trim_end_matches('\n'), "Hello world");
|
|
52
|
-
}
|
|
@@ -1,421 +0,0 @@
|
|
|
1
|
-
#![cfg(feature = "office")]
|
|
2
|
-
//! Comprehensive test for BibTeX extractor parity with Pandoc
|
|
3
|
-
|
|
4
|
-
use kreuzberg::core::config::ExtractionConfig;
|
|
5
|
-
use kreuzberg::extractors::BibtexExtractor;
|
|
6
|
-
use kreuzberg::plugins::DocumentExtractor;
|
|
7
|
-
|
|
8
|
-
mod helpers;
|
|
9
|
-
use helpers::get_test_file_path;
|
|
10
|
-
|
|
11
|
-
#[tokio::test]
|
|
12
|
-
async fn test_all_entry_types() {
|
|
13
|
-
let extractor = BibtexExtractor::new();
|
|
14
|
-
|
|
15
|
-
let test_cases = vec![
|
|
16
|
-
(
|
|
17
|
-
"@article{test, author={John Doe}, title={Test}, journal={Journal}, year={2023}}",
|
|
18
|
-
"article",
|
|
19
|
-
),
|
|
20
|
-
(
|
|
21
|
-
"@book{test, author={John Doe}, title={Test}, publisher={Publisher}, year={2023}}",
|
|
22
|
-
"book",
|
|
23
|
-
),
|
|
24
|
-
(
|
|
25
|
-
"@inproceedings{test, author={John Doe}, title={Test}, booktitle={Conference}, year={2023}}",
|
|
26
|
-
"inproceedings",
|
|
27
|
-
),
|
|
28
|
-
(
|
|
29
|
-
"@phdthesis{test, author={John Doe}, title={Test}, school={University}, year={2023}}",
|
|
30
|
-
"phdthesis",
|
|
31
|
-
),
|
|
32
|
-
(
|
|
33
|
-
"@mastersthesis{test, author={John Doe}, title={Test}, school={University}, year={2023}}",
|
|
34
|
-
"mastersthesis",
|
|
35
|
-
),
|
|
36
|
-
(
|
|
37
|
-
"@techreport{test, author={John Doe}, title={Test}, institution={Institute}, year={2023}}",
|
|
38
|
-
"techreport",
|
|
39
|
-
),
|
|
40
|
-
("@manual{test, title={Test Manual}, year={2023}}", "manual"),
|
|
41
|
-
("@misc{test, author={John Doe}, title={Test}, year={2023}}", "misc"),
|
|
42
|
-
(
|
|
43
|
-
"@unpublished{test, author={John Doe}, title={Test}, note={Unpublished}, year={2023}}",
|
|
44
|
-
"unpublished",
|
|
45
|
-
),
|
|
46
|
-
(
|
|
47
|
-
"@incollection{test, author={John Doe}, title={Test}, booktitle={Book}, publisher={Pub}, year={2023}}",
|
|
48
|
-
"incollection",
|
|
49
|
-
),
|
|
50
|
-
(
|
|
51
|
-
"@inbook{test, author={John Doe}, title={Test}, chapter={5}, publisher={Pub}, year={2023}}",
|
|
52
|
-
"inbook",
|
|
53
|
-
),
|
|
54
|
-
(
|
|
55
|
-
"@proceedings{test, title={Conference Proceedings}, year={2023}}",
|
|
56
|
-
"proceedings",
|
|
57
|
-
),
|
|
58
|
-
("@booklet{test, title={Booklet}, year={2023}}", "booklet"),
|
|
59
|
-
];
|
|
60
|
-
|
|
61
|
-
for (bibtex_content, expected_type) in test_cases {
|
|
62
|
-
let config = ExtractionConfig::default();
|
|
63
|
-
let result = extractor
|
|
64
|
-
.extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
|
|
65
|
-
.await;
|
|
66
|
-
|
|
67
|
-
assert!(result.is_ok(), "Failed to parse {} entry", expected_type);
|
|
68
|
-
let result = result.unwrap();
|
|
69
|
-
|
|
70
|
-
if let Some(entry_types) = result.metadata.additional.get("entry_types") {
|
|
71
|
-
assert!(entry_types.as_object().is_some(), "Entry types should be an object");
|
|
72
|
-
println!("Entry type '{}' extracted successfully", expected_type);
|
|
73
|
-
}
|
|
74
|
-
}
|
|
75
|
-
}
|
|
76
|
-
|
|
77
|
-
#[tokio::test]
|
|
78
|
-
async fn test_all_common_fields() {
|
|
79
|
-
let extractor = BibtexExtractor::new();
|
|
80
|
-
|
|
81
|
-
let bibtex_content = r#"
|
|
82
|
-
@article{comprehensive,
|
|
83
|
-
author = {Smith, John and Doe, Jane},
|
|
84
|
-
title = {Comprehensive Test},
|
|
85
|
-
journal = {Test Journal},
|
|
86
|
-
year = {2023},
|
|
87
|
-
volume = {42},
|
|
88
|
-
number = {3},
|
|
89
|
-
pages = {123--145},
|
|
90
|
-
month = {June},
|
|
91
|
-
doi = {10.1234/test.001},
|
|
92
|
-
url = {https://example.com},
|
|
93
|
-
issn = {1234-5678},
|
|
94
|
-
isbn = {978-0-12-345678-9},
|
|
95
|
-
abstract = {This is an abstract},
|
|
96
|
-
keywords = {test, bibtex},
|
|
97
|
-
note = {Additional notes},
|
|
98
|
-
publisher = {Test Publisher},
|
|
99
|
-
address = {Test City},
|
|
100
|
-
edition = {2nd},
|
|
101
|
-
editor = {Editor Name},
|
|
102
|
-
series = {Test Series},
|
|
103
|
-
organization = {Test Org},
|
|
104
|
-
institution = {Test Institute},
|
|
105
|
-
school = {Test School},
|
|
106
|
-
howpublished = {Online},
|
|
107
|
-
type = {Research Article},
|
|
108
|
-
chapter = {5},
|
|
109
|
-
booktitle = {Book Title}
|
|
110
|
-
}
|
|
111
|
-
"#;
|
|
112
|
-
|
|
113
|
-
let config = ExtractionConfig::default();
|
|
114
|
-
let result = extractor
|
|
115
|
-
.extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
|
|
116
|
-
.await;
|
|
117
|
-
|
|
118
|
-
assert!(result.is_ok());
|
|
119
|
-
let result = result.unwrap();
|
|
120
|
-
|
|
121
|
-
let content = &result.content;
|
|
122
|
-
|
|
123
|
-
let expected_fields = vec![
|
|
124
|
-
"author",
|
|
125
|
-
"title",
|
|
126
|
-
"journal",
|
|
127
|
-
"year",
|
|
128
|
-
"volume",
|
|
129
|
-
"number",
|
|
130
|
-
"pages",
|
|
131
|
-
"month",
|
|
132
|
-
"doi",
|
|
133
|
-
"url",
|
|
134
|
-
"issn",
|
|
135
|
-
"isbn",
|
|
136
|
-
"abstract",
|
|
137
|
-
"keywords",
|
|
138
|
-
"note",
|
|
139
|
-
"publisher",
|
|
140
|
-
"address",
|
|
141
|
-
"edition",
|
|
142
|
-
"editor",
|
|
143
|
-
"series",
|
|
144
|
-
"organization",
|
|
145
|
-
"institution",
|
|
146
|
-
"school",
|
|
147
|
-
"howpublished",
|
|
148
|
-
"type",
|
|
149
|
-
"chapter",
|
|
150
|
-
"booktitle",
|
|
151
|
-
];
|
|
152
|
-
|
|
153
|
-
let num_fields = expected_fields.len();
|
|
154
|
-
for field in expected_fields {
|
|
155
|
-
assert!(content.contains(field), "Field '{}' should be present in output", field);
|
|
156
|
-
}
|
|
157
|
-
|
|
158
|
-
println!("All {} fields were extracted", num_fields);
|
|
159
|
-
}
|
|
160
|
-
|
|
161
|
-
#[tokio::test]
|
|
162
|
-
async fn test_author_parsing() {
|
|
163
|
-
let extractor = BibtexExtractor::new();
|
|
164
|
-
|
|
165
|
-
let test_cases = vec![
|
|
166
|
-
("author = {John Doe}", vec!["John Doe"]),
|
|
167
|
-
("author = {John Doe and Jane Smith}", vec!["John Doe", "Jane Smith"]),
|
|
168
|
-
("author = {Smith, John and Doe, Jane}", vec!["Smith, John", "Doe, Jane"]),
|
|
169
|
-
(
|
|
170
|
-
"author = {John Doe and Jane Smith and Bob Jones}",
|
|
171
|
-
vec!["John Doe", "Jane Smith", "Bob Jones"],
|
|
172
|
-
),
|
|
173
|
-
("author = {van der Berg, Hans}", vec!["van der Berg, Hans"]),
|
|
174
|
-
("author = {Smith, Jr., John}", vec!["Smith, Jr., John"]),
|
|
175
|
-
];
|
|
176
|
-
|
|
177
|
-
for (author_field, expected_authors) in test_cases {
|
|
178
|
-
let bibtex = format!("@article{{test, {}, title={{Test}}, year={{2023}}}}", author_field);
|
|
179
|
-
|
|
180
|
-
let config = ExtractionConfig::default();
|
|
181
|
-
let result = extractor
|
|
182
|
-
.extract_bytes(bibtex.as_bytes(), "application/x-bibtex", &config)
|
|
183
|
-
.await;
|
|
184
|
-
|
|
185
|
-
assert!(result.is_ok());
|
|
186
|
-
let result = result.unwrap();
|
|
187
|
-
|
|
188
|
-
if let Some(authors) = result.metadata.additional.get("authors") {
|
|
189
|
-
let authors_array = authors.as_array().expect("Authors should be an array");
|
|
190
|
-
|
|
191
|
-
for expected_author in &expected_authors {
|
|
192
|
-
let found = authors_array
|
|
193
|
-
.iter()
|
|
194
|
-
.any(|a| a.as_str().map(|s| s.contains(expected_author)).unwrap_or(false));
|
|
195
|
-
assert!(
|
|
196
|
-
found,
|
|
197
|
-
"Expected author '{}' not found in {:?}",
|
|
198
|
-
expected_author, authors_array
|
|
199
|
-
);
|
|
200
|
-
}
|
|
201
|
-
}
|
|
202
|
-
}
|
|
203
|
-
}
|
|
204
|
-
|
|
205
|
-
#[tokio::test]
|
|
206
|
-
async fn test_special_characters() {
|
|
207
|
-
let extractor = BibtexExtractor::new();
|
|
208
|
-
|
|
209
|
-
let bibtex_content = r#"
|
|
210
|
-
@article{special,
|
|
211
|
-
author = {M{\"u}ller, Hans and Sch{\"o}n, Anna and Garc{\'\i}a, Jos{\'e}},
|
|
212
|
-
title = {Special Characters in {BibTeX}: {\"O}berblick},
|
|
213
|
-
journal = {Test Journal},
|
|
214
|
-
year = {2022}
|
|
215
|
-
}
|
|
216
|
-
"#;
|
|
217
|
-
|
|
218
|
-
let config = ExtractionConfig::default();
|
|
219
|
-
let result = extractor
|
|
220
|
-
.extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
|
|
221
|
-
.await;
|
|
222
|
-
|
|
223
|
-
assert!(result.is_ok());
|
|
224
|
-
let result = result.unwrap();
|
|
225
|
-
|
|
226
|
-
assert_eq!(
|
|
227
|
-
result.metadata.additional.get("entry_count"),
|
|
228
|
-
Some(&serde_json::json!(1))
|
|
229
|
-
);
|
|
230
|
-
|
|
231
|
-
if let Some(authors) = result.metadata.additional.get("authors") {
|
|
232
|
-
let authors_array = authors.as_array().expect("Authors should be an array");
|
|
233
|
-
assert!(authors_array.len() >= 3, "Should have 3 authors");
|
|
234
|
-
}
|
|
235
|
-
}
|
|
236
|
-
|
|
237
|
-
#[tokio::test]
|
|
238
|
-
async fn test_year_range_extraction() {
|
|
239
|
-
let extractor = BibtexExtractor::new();
|
|
240
|
-
|
|
241
|
-
let bibtex_content = r#"
|
|
242
|
-
@article{old, author={A}, title={Old}, year={1990}}
|
|
243
|
-
@article{mid, author={B}, title={Mid}, year={2005}}
|
|
244
|
-
@article{new, author={C}, title={New}, year={2023}}
|
|
245
|
-
"#;
|
|
246
|
-
|
|
247
|
-
let config = ExtractionConfig::default();
|
|
248
|
-
let result = extractor
|
|
249
|
-
.extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
|
|
250
|
-
.await;
|
|
251
|
-
|
|
252
|
-
assert!(result.is_ok());
|
|
253
|
-
let result = result.unwrap();
|
|
254
|
-
|
|
255
|
-
if let Some(year_range) = result.metadata.additional.get("year_range") {
|
|
256
|
-
assert_eq!(year_range.get("min"), Some(&serde_json::json!(1990)));
|
|
257
|
-
assert_eq!(year_range.get("max"), Some(&serde_json::json!(2023)));
|
|
258
|
-
|
|
259
|
-
if let Some(years) = year_range.get("years") {
|
|
260
|
-
let years_array = years.as_array().expect("Years should be an array");
|
|
261
|
-
assert_eq!(years_array.len(), 3, "Should have 3 unique years");
|
|
262
|
-
}
|
|
263
|
-
} else {
|
|
264
|
-
panic!("Year range not extracted");
|
|
265
|
-
}
|
|
266
|
-
}
|
|
267
|
-
|
|
268
|
-
#[tokio::test]
|
|
269
|
-
async fn test_citation_keys_extraction() {
|
|
270
|
-
let extractor = BibtexExtractor::new();
|
|
271
|
-
|
|
272
|
-
let bibtex_content = r#"
|
|
273
|
-
@article{key1, author={A}, title={T1}, year={2023}}
|
|
274
|
-
@book{key2, author={B}, title={T2}, year={2023}}
|
|
275
|
-
@inproceedings{key3, author={C}, title={T3}, year={2023}}
|
|
276
|
-
"#;
|
|
277
|
-
|
|
278
|
-
let config = ExtractionConfig::default();
|
|
279
|
-
let result = extractor
|
|
280
|
-
.extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
|
|
281
|
-
.await;
|
|
282
|
-
|
|
283
|
-
assert!(result.is_ok());
|
|
284
|
-
let result = result.unwrap();
|
|
285
|
-
|
|
286
|
-
if let Some(citation_keys) = result.metadata.additional.get("citation_keys") {
|
|
287
|
-
let keys_array = citation_keys.as_array().expect("Citation keys should be an array");
|
|
288
|
-
assert_eq!(keys_array.len(), 3);
|
|
289
|
-
|
|
290
|
-
let expected_keys = vec!["key1", "key2", "key3"];
|
|
291
|
-
for expected_key in expected_keys {
|
|
292
|
-
let found = keys_array.iter().any(|k| k.as_str() == Some(expected_key));
|
|
293
|
-
assert!(found, "Citation key '{}' not found", expected_key);
|
|
294
|
-
}
|
|
295
|
-
} else {
|
|
296
|
-
panic!("Citation keys not extracted");
|
|
297
|
-
}
|
|
298
|
-
}
|
|
299
|
-
|
|
300
|
-
#[tokio::test]
|
|
301
|
-
async fn test_entry_type_distribution() {
|
|
302
|
-
let extractor = BibtexExtractor::new();
|
|
303
|
-
|
|
304
|
-
let bibtex_content = r#"
|
|
305
|
-
@article{a1, author={A}, title={T1}, year={2023}}
|
|
306
|
-
@article{a2, author={B}, title={T2}, year={2023}}
|
|
307
|
-
@book{b1, author={C}, title={T3}, year={2023}}
|
|
308
|
-
@inproceedings{c1, author={D}, title={T4}, year={2023}}
|
|
309
|
-
@inproceedings{c2, author={E}, title={T5}, year={2023}}
|
|
310
|
-
@inproceedings{c3, author={F}, title={T6}, year={2023}}
|
|
311
|
-
"#;
|
|
312
|
-
|
|
313
|
-
let config = ExtractionConfig::default();
|
|
314
|
-
let result = extractor
|
|
315
|
-
.extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
|
|
316
|
-
.await;
|
|
317
|
-
|
|
318
|
-
assert!(result.is_ok());
|
|
319
|
-
let result = result.unwrap();
|
|
320
|
-
|
|
321
|
-
if let Some(entry_types) = result.metadata.additional.get("entry_types") {
|
|
322
|
-
let types_obj = entry_types.as_object().expect("Entry types should be an object");
|
|
323
|
-
|
|
324
|
-
assert_eq!(types_obj.get("article"), Some(&serde_json::json!(2)));
|
|
325
|
-
assert_eq!(types_obj.get("book"), Some(&serde_json::json!(1)));
|
|
326
|
-
assert_eq!(types_obj.get("inproceedings"), Some(&serde_json::json!(3)));
|
|
327
|
-
} else {
|
|
328
|
-
panic!("Entry types not extracted");
|
|
329
|
-
}
|
|
330
|
-
}
|
|
331
|
-
|
|
332
|
-
#[tokio::test]
|
|
333
|
-
async fn test_unicode_support() {
|
|
334
|
-
let extractor = BibtexExtractor::new();
|
|
335
|
-
|
|
336
|
-
let bibtex_content = r#"
|
|
337
|
-
@article{unicode,
|
|
338
|
-
author = {Müller, Hans and Søren, Kierkegård},
|
|
339
|
-
title = {Unicode in BibTeX: A Global Perspective},
|
|
340
|
-
journal = {International Journal},
|
|
341
|
-
year = {2023}
|
|
342
|
-
}
|
|
343
|
-
"#;
|
|
344
|
-
|
|
345
|
-
let config = ExtractionConfig::default();
|
|
346
|
-
let result = extractor
|
|
347
|
-
.extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
|
|
348
|
-
.await;
|
|
349
|
-
|
|
350
|
-
assert!(result.is_ok());
|
|
351
|
-
let result = result.unwrap();
|
|
352
|
-
|
|
353
|
-
assert_eq!(
|
|
354
|
-
result.metadata.additional.get("entry_count"),
|
|
355
|
-
Some(&serde_json::json!(1))
|
|
356
|
-
);
|
|
357
|
-
}
|
|
358
|
-
|
|
359
|
-
#[tokio::test]
|
|
360
|
-
async fn test_empty_fields() {
|
|
361
|
-
let extractor = BibtexExtractor::new();
|
|
362
|
-
|
|
363
|
-
let bibtex_content = r#"
|
|
364
|
-
@article{empty,
|
|
365
|
-
author = {Smith, John},
|
|
366
|
-
title = {Test},
|
|
367
|
-
journal = {},
|
|
368
|
-
year = {2023},
|
|
369
|
-
volume = {}
|
|
370
|
-
}
|
|
371
|
-
"#;
|
|
372
|
-
|
|
373
|
-
let config = ExtractionConfig::default();
|
|
374
|
-
let result = extractor
|
|
375
|
-
.extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
|
|
376
|
-
.await;
|
|
377
|
-
|
|
378
|
-
assert!(result.is_ok());
|
|
379
|
-
let result = result.unwrap();
|
|
380
|
-
assert_eq!(
|
|
381
|
-
result.metadata.additional.get("entry_count"),
|
|
382
|
-
Some(&serde_json::json!(1))
|
|
383
|
-
);
|
|
384
|
-
}
|
|
385
|
-
|
|
386
|
-
#[tokio::test]
|
|
387
|
-
async fn test_comprehensive_file() {
|
|
388
|
-
let extractor = BibtexExtractor::new();
|
|
389
|
-
|
|
390
|
-
let fixture_path = get_test_file_path("bibtex/comprehensive.bib");
|
|
391
|
-
let bibtex_content = std::fs::read(&fixture_path)
|
|
392
|
-
.unwrap_or_else(|err| panic!("Failed to read test file at {}: {}", fixture_path.display(), err));
|
|
393
|
-
|
|
394
|
-
let config = ExtractionConfig::default();
|
|
395
|
-
let result = extractor
|
|
396
|
-
.extract_bytes(&bibtex_content, "application/x-bibtex", &config)
|
|
397
|
-
.await;
|
|
398
|
-
|
|
399
|
-
assert!(result.is_ok());
|
|
400
|
-
let result = result.unwrap();
|
|
401
|
-
|
|
402
|
-
assert_eq!(
|
|
403
|
-
result.metadata.additional.get("entry_count"),
|
|
404
|
-
Some(&serde_json::json!(20))
|
|
405
|
-
);
|
|
406
|
-
|
|
407
|
-
if let Some(entry_types) = result.metadata.additional.get("entry_types") {
|
|
408
|
-
let types_obj = entry_types.as_object().expect("Entry types should be an object");
|
|
409
|
-
assert!(types_obj.len() >= 10, "Should have at least 10 different entry types");
|
|
410
|
-
}
|
|
411
|
-
|
|
412
|
-
if let Some(authors) = result.metadata.additional.get("authors") {
|
|
413
|
-
let authors_array = authors.as_array().expect("Authors should be an array");
|
|
414
|
-
assert!(authors_array.len() > 10, "Should have many unique authors");
|
|
415
|
-
}
|
|
416
|
-
|
|
417
|
-
if let Some(year_range) = result.metadata.additional.get("year_range") {
|
|
418
|
-
assert!(year_range.get("min").is_some());
|
|
419
|
-
assert!(year_range.get("max").is_some());
|
|
420
|
-
}
|
|
421
|
-
}
|