kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +11 -11
  6. data/README.md +5 -10
  7. data/examples/async_patterns.rb +0 -1
  8. data/ext/kreuzberg_rb/extconf.rb +0 -10
  9. data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
  10. data/ext/kreuzberg_rb/native/build.rs +2 -0
  11. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  12. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  13. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  14. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  15. data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
  16. data/kreuzberg.gemspec +14 -57
  17. data/lib/kreuzberg/cache_api.rb +0 -1
  18. data/lib/kreuzberg/cli.rb +2 -2
  19. data/lib/kreuzberg/config.rb +2 -9
  20. data/lib/kreuzberg/errors.rb +7 -75
  21. data/lib/kreuzberg/extraction_api.rb +0 -1
  22. data/lib/kreuzberg/setup_lib_path.rb +0 -1
  23. data/lib/kreuzberg/version.rb +1 -1
  24. data/lib/kreuzberg.rb +0 -21
  25. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  26. data/sig/kreuzberg.rbs +3 -55
  27. data/spec/binding/cli_proxy_spec.rb +4 -2
  28. data/spec/binding/cli_spec.rb +11 -12
  29. data/spec/examples.txt +104 -0
  30. data/spec/fixtures/config.yaml +1 -0
  31. data/spec/spec_helper.rb +1 -1
  32. data/vendor/kreuzberg/Cargo.toml +42 -112
  33. data/vendor/kreuzberg/README.md +2 -2
  34. data/vendor/kreuzberg/build.rs +4 -18
  35. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  36. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  37. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  38. data/vendor/kreuzberg/src/core/extractor.rs +81 -202
  39. data/vendor/kreuzberg/src/core/io.rs +2 -4
  40. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  41. data/vendor/kreuzberg/src/core/mod.rs +1 -4
  42. data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
  43. data/vendor/kreuzberg/src/embeddings.rs +16 -125
  44. data/vendor/kreuzberg/src/error.rs +1 -1
  45. data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
  46. data/vendor/kreuzberg/src/extraction/image.rs +13 -13
  47. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
  48. data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
  49. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  50. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  51. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  52. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  53. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  54. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  55. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  56. data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
  57. data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
  58. data/vendor/kreuzberg/src/extractors/email.rs +0 -14
  59. data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
  60. data/vendor/kreuzberg/src/extractors/html.rs +154 -137
  61. data/vendor/kreuzberg/src/extractors/image.rs +4 -7
  62. data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
  63. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  64. data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
  65. data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
  66. data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
  67. data/vendor/kreuzberg/src/extractors/text.rs +5 -23
  68. data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
  69. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  70. data/vendor/kreuzberg/src/lib.rs +1 -4
  71. data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
  72. data/vendor/kreuzberg/src/mcp/server.rs +3 -5
  73. data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
  74. data/vendor/kreuzberg/src/pdf/error.rs +1 -1
  75. data/vendor/kreuzberg/src/pdf/table.rs +44 -17
  76. data/vendor/kreuzberg/src/pdf/text.rs +3 -0
  77. data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
  78. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
  79. data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
  80. data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
  81. data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
  82. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  83. data/vendor/kreuzberg/src/types.rs +12 -42
  84. data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
  85. data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
  86. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  87. data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
  88. data/vendor/kreuzberg/tests/config_features.rs +0 -18
  89. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
  90. data/vendor/kreuzberg/tests/core_integration.rs +7 -24
  91. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  92. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  93. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  94. data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
  95. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
  96. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
  97. data/vendor/kreuzberg/tests/security_validation.rs +1 -12
  98. metadata +25 -90
  99. data/.rubocop.yml +0 -538
  100. data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
  101. data/lib/kreuzberg/error_context.rb +0 -32
  102. data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
  103. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
  104. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
  105. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
  106. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
  107. data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
  108. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
  109. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  110. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
  111. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
  112. data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
  113. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
  114. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  115. data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
  116. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
  117. data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
  119. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  121. data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
  122. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  123. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  124. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  125. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
  126. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  127. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  128. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  129. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  130. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  131. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  132. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  133. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  134. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  135. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
  136. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  137. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  138. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
  139. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
  140. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
  141. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
  142. data/vendor/rb-sys/.cargo-ok +0 -1
  143. data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
  144. data/vendor/rb-sys/Cargo.lock +0 -393
  145. data/vendor/rb-sys/Cargo.toml +0 -70
  146. data/vendor/rb-sys/Cargo.toml.orig +0 -57
  147. data/vendor/rb-sys/LICENSE-APACHE +0 -190
  148. data/vendor/rb-sys/LICENSE-MIT +0 -21
  149. data/vendor/rb-sys/bin/release.sh +0 -21
  150. data/vendor/rb-sys/build/features.rs +0 -108
  151. data/vendor/rb-sys/build/main.rs +0 -246
  152. data/vendor/rb-sys/build/stable_api_config.rs +0 -153
  153. data/vendor/rb-sys/build/version.rs +0 -48
  154. data/vendor/rb-sys/readme.md +0 -36
  155. data/vendor/rb-sys/src/bindings.rs +0 -21
  156. data/vendor/rb-sys/src/hidden.rs +0 -11
  157. data/vendor/rb-sys/src/lib.rs +0 -34
  158. data/vendor/rb-sys/src/macros.rs +0 -371
  159. data/vendor/rb-sys/src/memory.rs +0 -53
  160. data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
  161. data/vendor/rb-sys/src/special_consts.rs +0 -31
  162. data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
  163. data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
  164. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
  165. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
  166. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
  167. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
  168. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
  169. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
  170. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
  171. data/vendor/rb-sys/src/stable_api.rs +0 -261
  172. data/vendor/rb-sys/src/symbol.rs +0 -31
  173. data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
  174. data/vendor/rb-sys/src/utils.rs +0 -89
  175. data/vendor/rb-sys/src/value_type.rs +0 -7
@@ -1,154 +0,0 @@
1
- use std::any::Any;
2
- use std::time::{SystemTime, UNIX_EPOCH};
3
-
4
- /// Context information captured when a panic occurs.
5
- ///
6
- /// This struct stores detailed information about where and when a panic happened,
7
- /// enabling better error reporting across FFI boundaries.
8
- #[derive(Debug, Clone)]
9
- pub struct PanicContext {
10
- /// Source file where the panic occurred
11
- pub file: &'static str,
12
- /// Line number where the panic occurred
13
- pub line: u32,
14
- /// Function name where the panic occurred
15
- pub function: &'static str,
16
- /// Panic message extracted from the panic payload
17
- pub message: String,
18
- /// Timestamp when the panic was captured
19
- pub timestamp: SystemTime,
20
- }
21
-
22
- impl PanicContext {
23
- /// Creates a new PanicContext with the given parameters.
24
- ///
25
- /// # Arguments
26
- ///
27
- /// * `file` - Source file path
28
- /// * `line` - Line number
29
- /// * `function` - Function name
30
- /// * `panic_info` - The panic payload to extract message from
31
- pub fn new(file: &'static str, line: u32, function: &'static str, panic_info: &dyn Any) -> Self {
32
- let timestamp = std::panic::catch_unwind(SystemTime::now).unwrap_or(UNIX_EPOCH);
33
-
34
- Self {
35
- file,
36
- line,
37
- function,
38
- message: extract_panic_message(panic_info),
39
- timestamp,
40
- }
41
- }
42
-
43
- /// Formats the panic context as a human-readable string.
44
- pub fn format(&self) -> String {
45
- format!(
46
- "Panic at {}:{}:{} - {}",
47
- self.file, self.line, self.function, self.message
48
- )
49
- }
50
- }
51
-
52
- /// Maximum panic message length to prevent DoS attacks
53
- const MAX_PANIC_MESSAGE_LEN: usize = 4096;
54
-
55
- /// Extracts a human-readable message from a panic payload.
56
- ///
57
- /// Attempts to downcast the panic payload to common types (String, &str)
58
- /// to extract a meaningful error message.
59
- ///
60
- /// Message is truncated to 4KB to prevent DoS attacks via extremely large panic messages.
61
- ///
62
- /// # Arguments
63
- ///
64
- /// * `panic_info` - The panic payload from catch_unwind
65
- ///
66
- /// # Returns
67
- ///
68
- /// A string representation of the panic message (truncated if necessary)
69
- pub fn extract_panic_message(panic_info: &dyn Any) -> String {
70
- let msg = if let Some(s) = panic_info.downcast_ref::<String>() {
71
- s.clone()
72
- } else if let Some(s) = panic_info.downcast_ref::<&str>() {
73
- (*s).to_string()
74
- } else {
75
- "Unknown panic payload".to_string()
76
- };
77
-
78
- if msg.len() > MAX_PANIC_MESSAGE_LEN {
79
- let truncate_at = msg.floor_char_boundary(MAX_PANIC_MESSAGE_LEN);
80
- format!("{}... [truncated]", &msg[..truncate_at])
81
- } else {
82
- msg
83
- }
84
- }
85
-
86
- #[cfg(test)]
87
- mod tests {
88
- use super::*;
89
-
90
- #[test]
91
- fn test_extract_panic_message_string() {
92
- let panic_msg = "test panic".to_string();
93
- let msg = extract_panic_message(&panic_msg);
94
- assert_eq!(msg, "test panic");
95
- }
96
-
97
- #[test]
98
- fn test_extract_panic_message_str() {
99
- let panic_msg: &str = "test panic";
100
- let msg = extract_panic_message(&panic_msg);
101
- assert_eq!(msg, "test panic");
102
- }
103
-
104
- #[test]
105
- fn test_extract_panic_message_unknown() {
106
- let panic_msg = 42i32;
107
- let msg = extract_panic_message(&panic_msg);
108
- assert_eq!(msg, "Unknown panic payload");
109
- }
110
-
111
- #[test]
112
- fn test_panic_context_format() {
113
- let panic_msg = "test error".to_string();
114
- let ctx = PanicContext::new("test.rs", 42, "test_function", &panic_msg);
115
-
116
- let formatted = ctx.format();
117
- assert!(formatted.contains("test.rs"));
118
- assert!(formatted.contains("42"));
119
- assert!(formatted.contains("test_function"));
120
- assert!(formatted.contains("test error"));
121
- }
122
-
123
- #[test]
124
- fn test_panic_message_truncation() {
125
- let long_msg = "x".repeat(5000);
126
- let msg = extract_panic_message(&long_msg);
127
- assert!(msg.len() <= MAX_PANIC_MESSAGE_LEN + 20);
128
- assert!(msg.ends_with("... [truncated]"));
129
- }
130
-
131
- #[test]
132
- fn test_panic_message_truncation_utf8_boundary() {
133
- let mut msg = "x".repeat(4093);
134
- msg.push('🦀');
135
- msg.push_str("yyy");
136
-
137
- let truncated = extract_panic_message(&msg);
138
-
139
- assert!(truncated.ends_with("... [truncated]"));
140
-
141
- assert!(std::str::from_utf8(truncated.as_bytes()).is_ok());
142
-
143
- assert!(!truncated.contains("🦀"));
144
- assert!(!truncated.contains("yyy"));
145
- }
146
-
147
- #[test]
148
- fn test_panic_message_no_truncation_needed() {
149
- let short_msg = "short".to_string();
150
- let msg = extract_panic_message(&short_msg);
151
- assert_eq!(msg, "short");
152
- assert!(!msg.contains("[truncated]"));
153
- }
154
- }
@@ -1,52 +0,0 @@
1
- #![cfg(feature = "api")]
2
- //! Integration test for the `/extract` API handler using multipart uploads.
3
-
4
- use axum::{
5
- body::{Body, to_bytes},
6
- http::{Request, StatusCode},
7
- };
8
- use kreuzberg::{
9
- ExtractionConfig,
10
- api::{ApiSizeLimits, create_router_with_limits},
11
- };
12
- use serde_json::Value;
13
- use tower::ServiceExt;
14
-
15
- #[tokio::test]
16
- async fn test_extract_accepts_single_file_multipart() {
17
- let router = create_router_with_limits(ExtractionConfig::default(), ApiSizeLimits::from_mb(5, 5));
18
-
19
- let boundary = "X-BOUNDARY";
20
- let body = format!(
21
- "--{boundary}\r\n\
22
- Content-Disposition: form-data; name=\"files\"; filename=\"test.txt\"\r\n\
23
- Content-Type: text/plain\r\n\
24
- \r\n\
25
- Hello world\r\n\
26
- --{boundary}--\r\n"
27
- );
28
- let body_bytes = body.into_bytes();
29
-
30
- let request = Request::builder()
31
- .method("POST")
32
- .uri("/extract")
33
- .header("content-type", format!("multipart/form-data; boundary={boundary}"))
34
- .header("content-length", body_bytes.len())
35
- .body(Body::from(body_bytes))
36
- .expect("Failed to build request");
37
-
38
- let response = router.oneshot(request).await.expect("Request failed");
39
- assert_eq!(response.status(), StatusCode::OK);
40
-
41
- let bytes = to_bytes(response.into_body(), 1_000_000)
42
- .await
43
- .expect("Failed to read body");
44
- let value: Value = serde_json::from_slice(&bytes).expect("Response JSON parse failed");
45
- let content = value
46
- .get(0)
47
- .and_then(|v| v.get("content"))
48
- .and_then(Value::as_str)
49
- .expect("Response should include extracted content");
50
-
51
- assert_eq!(content.trim_end_matches('\n'), "Hello world");
52
- }
@@ -1,421 +0,0 @@
1
- #![cfg(feature = "office")]
2
- //! Comprehensive test for BibTeX extractor parity with Pandoc
3
-
4
- use kreuzberg::core::config::ExtractionConfig;
5
- use kreuzberg::extractors::BibtexExtractor;
6
- use kreuzberg::plugins::DocumentExtractor;
7
-
8
- mod helpers;
9
- use helpers::get_test_file_path;
10
-
11
- #[tokio::test]
12
- async fn test_all_entry_types() {
13
- let extractor = BibtexExtractor::new();
14
-
15
- let test_cases = vec![
16
- (
17
- "@article{test, author={John Doe}, title={Test}, journal={Journal}, year={2023}}",
18
- "article",
19
- ),
20
- (
21
- "@book{test, author={John Doe}, title={Test}, publisher={Publisher}, year={2023}}",
22
- "book",
23
- ),
24
- (
25
- "@inproceedings{test, author={John Doe}, title={Test}, booktitle={Conference}, year={2023}}",
26
- "inproceedings",
27
- ),
28
- (
29
- "@phdthesis{test, author={John Doe}, title={Test}, school={University}, year={2023}}",
30
- "phdthesis",
31
- ),
32
- (
33
- "@mastersthesis{test, author={John Doe}, title={Test}, school={University}, year={2023}}",
34
- "mastersthesis",
35
- ),
36
- (
37
- "@techreport{test, author={John Doe}, title={Test}, institution={Institute}, year={2023}}",
38
- "techreport",
39
- ),
40
- ("@manual{test, title={Test Manual}, year={2023}}", "manual"),
41
- ("@misc{test, author={John Doe}, title={Test}, year={2023}}", "misc"),
42
- (
43
- "@unpublished{test, author={John Doe}, title={Test}, note={Unpublished}, year={2023}}",
44
- "unpublished",
45
- ),
46
- (
47
- "@incollection{test, author={John Doe}, title={Test}, booktitle={Book}, publisher={Pub}, year={2023}}",
48
- "incollection",
49
- ),
50
- (
51
- "@inbook{test, author={John Doe}, title={Test}, chapter={5}, publisher={Pub}, year={2023}}",
52
- "inbook",
53
- ),
54
- (
55
- "@proceedings{test, title={Conference Proceedings}, year={2023}}",
56
- "proceedings",
57
- ),
58
- ("@booklet{test, title={Booklet}, year={2023}}", "booklet"),
59
- ];
60
-
61
- for (bibtex_content, expected_type) in test_cases {
62
- let config = ExtractionConfig::default();
63
- let result = extractor
64
- .extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
65
- .await;
66
-
67
- assert!(result.is_ok(), "Failed to parse {} entry", expected_type);
68
- let result = result.unwrap();
69
-
70
- if let Some(entry_types) = result.metadata.additional.get("entry_types") {
71
- assert!(entry_types.as_object().is_some(), "Entry types should be an object");
72
- println!("Entry type '{}' extracted successfully", expected_type);
73
- }
74
- }
75
- }
76
-
77
- #[tokio::test]
78
- async fn test_all_common_fields() {
79
- let extractor = BibtexExtractor::new();
80
-
81
- let bibtex_content = r#"
82
- @article{comprehensive,
83
- author = {Smith, John and Doe, Jane},
84
- title = {Comprehensive Test},
85
- journal = {Test Journal},
86
- year = {2023},
87
- volume = {42},
88
- number = {3},
89
- pages = {123--145},
90
- month = {June},
91
- doi = {10.1234/test.001},
92
- url = {https://example.com},
93
- issn = {1234-5678},
94
- isbn = {978-0-12-345678-9},
95
- abstract = {This is an abstract},
96
- keywords = {test, bibtex},
97
- note = {Additional notes},
98
- publisher = {Test Publisher},
99
- address = {Test City},
100
- edition = {2nd},
101
- editor = {Editor Name},
102
- series = {Test Series},
103
- organization = {Test Org},
104
- institution = {Test Institute},
105
- school = {Test School},
106
- howpublished = {Online},
107
- type = {Research Article},
108
- chapter = {5},
109
- booktitle = {Book Title}
110
- }
111
- "#;
112
-
113
- let config = ExtractionConfig::default();
114
- let result = extractor
115
- .extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
116
- .await;
117
-
118
- assert!(result.is_ok());
119
- let result = result.unwrap();
120
-
121
- let content = &result.content;
122
-
123
- let expected_fields = vec![
124
- "author",
125
- "title",
126
- "journal",
127
- "year",
128
- "volume",
129
- "number",
130
- "pages",
131
- "month",
132
- "doi",
133
- "url",
134
- "issn",
135
- "isbn",
136
- "abstract",
137
- "keywords",
138
- "note",
139
- "publisher",
140
- "address",
141
- "edition",
142
- "editor",
143
- "series",
144
- "organization",
145
- "institution",
146
- "school",
147
- "howpublished",
148
- "type",
149
- "chapter",
150
- "booktitle",
151
- ];
152
-
153
- let num_fields = expected_fields.len();
154
- for field in expected_fields {
155
- assert!(content.contains(field), "Field '{}' should be present in output", field);
156
- }
157
-
158
- println!("All {} fields were extracted", num_fields);
159
- }
160
-
161
- #[tokio::test]
162
- async fn test_author_parsing() {
163
- let extractor = BibtexExtractor::new();
164
-
165
- let test_cases = vec![
166
- ("author = {John Doe}", vec!["John Doe"]),
167
- ("author = {John Doe and Jane Smith}", vec!["John Doe", "Jane Smith"]),
168
- ("author = {Smith, John and Doe, Jane}", vec!["Smith, John", "Doe, Jane"]),
169
- (
170
- "author = {John Doe and Jane Smith and Bob Jones}",
171
- vec!["John Doe", "Jane Smith", "Bob Jones"],
172
- ),
173
- ("author = {van der Berg, Hans}", vec!["van der Berg, Hans"]),
174
- ("author = {Smith, Jr., John}", vec!["Smith, Jr., John"]),
175
- ];
176
-
177
- for (author_field, expected_authors) in test_cases {
178
- let bibtex = format!("@article{{test, {}, title={{Test}}, year={{2023}}}}", author_field);
179
-
180
- let config = ExtractionConfig::default();
181
- let result = extractor
182
- .extract_bytes(bibtex.as_bytes(), "application/x-bibtex", &config)
183
- .await;
184
-
185
- assert!(result.is_ok());
186
- let result = result.unwrap();
187
-
188
- if let Some(authors) = result.metadata.additional.get("authors") {
189
- let authors_array = authors.as_array().expect("Authors should be an array");
190
-
191
- for expected_author in &expected_authors {
192
- let found = authors_array
193
- .iter()
194
- .any(|a| a.as_str().map(|s| s.contains(expected_author)).unwrap_or(false));
195
- assert!(
196
- found,
197
- "Expected author '{}' not found in {:?}",
198
- expected_author, authors_array
199
- );
200
- }
201
- }
202
- }
203
- }
204
-
205
- #[tokio::test]
206
- async fn test_special_characters() {
207
- let extractor = BibtexExtractor::new();
208
-
209
- let bibtex_content = r#"
210
- @article{special,
211
- author = {M{\"u}ller, Hans and Sch{\"o}n, Anna and Garc{\'\i}a, Jos{\'e}},
212
- title = {Special Characters in {BibTeX}: {\"O}berblick},
213
- journal = {Test Journal},
214
- year = {2022}
215
- }
216
- "#;
217
-
218
- let config = ExtractionConfig::default();
219
- let result = extractor
220
- .extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
221
- .await;
222
-
223
- assert!(result.is_ok());
224
- let result = result.unwrap();
225
-
226
- assert_eq!(
227
- result.metadata.additional.get("entry_count"),
228
- Some(&serde_json::json!(1))
229
- );
230
-
231
- if let Some(authors) = result.metadata.additional.get("authors") {
232
- let authors_array = authors.as_array().expect("Authors should be an array");
233
- assert!(authors_array.len() >= 3, "Should have 3 authors");
234
- }
235
- }
236
-
237
- #[tokio::test]
238
- async fn test_year_range_extraction() {
239
- let extractor = BibtexExtractor::new();
240
-
241
- let bibtex_content = r#"
242
- @article{old, author={A}, title={Old}, year={1990}}
243
- @article{mid, author={B}, title={Mid}, year={2005}}
244
- @article{new, author={C}, title={New}, year={2023}}
245
- "#;
246
-
247
- let config = ExtractionConfig::default();
248
- let result = extractor
249
- .extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
250
- .await;
251
-
252
- assert!(result.is_ok());
253
- let result = result.unwrap();
254
-
255
- if let Some(year_range) = result.metadata.additional.get("year_range") {
256
- assert_eq!(year_range.get("min"), Some(&serde_json::json!(1990)));
257
- assert_eq!(year_range.get("max"), Some(&serde_json::json!(2023)));
258
-
259
- if let Some(years) = year_range.get("years") {
260
- let years_array = years.as_array().expect("Years should be an array");
261
- assert_eq!(years_array.len(), 3, "Should have 3 unique years");
262
- }
263
- } else {
264
- panic!("Year range not extracted");
265
- }
266
- }
267
-
268
- #[tokio::test]
269
- async fn test_citation_keys_extraction() {
270
- let extractor = BibtexExtractor::new();
271
-
272
- let bibtex_content = r#"
273
- @article{key1, author={A}, title={T1}, year={2023}}
274
- @book{key2, author={B}, title={T2}, year={2023}}
275
- @inproceedings{key3, author={C}, title={T3}, year={2023}}
276
- "#;
277
-
278
- let config = ExtractionConfig::default();
279
- let result = extractor
280
- .extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
281
- .await;
282
-
283
- assert!(result.is_ok());
284
- let result = result.unwrap();
285
-
286
- if let Some(citation_keys) = result.metadata.additional.get("citation_keys") {
287
- let keys_array = citation_keys.as_array().expect("Citation keys should be an array");
288
- assert_eq!(keys_array.len(), 3);
289
-
290
- let expected_keys = vec!["key1", "key2", "key3"];
291
- for expected_key in expected_keys {
292
- let found = keys_array.iter().any(|k| k.as_str() == Some(expected_key));
293
- assert!(found, "Citation key '{}' not found", expected_key);
294
- }
295
- } else {
296
- panic!("Citation keys not extracted");
297
- }
298
- }
299
-
300
- #[tokio::test]
301
- async fn test_entry_type_distribution() {
302
- let extractor = BibtexExtractor::new();
303
-
304
- let bibtex_content = r#"
305
- @article{a1, author={A}, title={T1}, year={2023}}
306
- @article{a2, author={B}, title={T2}, year={2023}}
307
- @book{b1, author={C}, title={T3}, year={2023}}
308
- @inproceedings{c1, author={D}, title={T4}, year={2023}}
309
- @inproceedings{c2, author={E}, title={T5}, year={2023}}
310
- @inproceedings{c3, author={F}, title={T6}, year={2023}}
311
- "#;
312
-
313
- let config = ExtractionConfig::default();
314
- let result = extractor
315
- .extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
316
- .await;
317
-
318
- assert!(result.is_ok());
319
- let result = result.unwrap();
320
-
321
- if let Some(entry_types) = result.metadata.additional.get("entry_types") {
322
- let types_obj = entry_types.as_object().expect("Entry types should be an object");
323
-
324
- assert_eq!(types_obj.get("article"), Some(&serde_json::json!(2)));
325
- assert_eq!(types_obj.get("book"), Some(&serde_json::json!(1)));
326
- assert_eq!(types_obj.get("inproceedings"), Some(&serde_json::json!(3)));
327
- } else {
328
- panic!("Entry types not extracted");
329
- }
330
- }
331
-
332
- #[tokio::test]
333
- async fn test_unicode_support() {
334
- let extractor = BibtexExtractor::new();
335
-
336
- let bibtex_content = r#"
337
- @article{unicode,
338
- author = {Müller, Hans and Søren, Kierkegård},
339
- title = {Unicode in BibTeX: A Global Perspective},
340
- journal = {International Journal},
341
- year = {2023}
342
- }
343
- "#;
344
-
345
- let config = ExtractionConfig::default();
346
- let result = extractor
347
- .extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
348
- .await;
349
-
350
- assert!(result.is_ok());
351
- let result = result.unwrap();
352
-
353
- assert_eq!(
354
- result.metadata.additional.get("entry_count"),
355
- Some(&serde_json::json!(1))
356
- );
357
- }
358
-
359
- #[tokio::test]
360
- async fn test_empty_fields() {
361
- let extractor = BibtexExtractor::new();
362
-
363
- let bibtex_content = r#"
364
- @article{empty,
365
- author = {Smith, John},
366
- title = {Test},
367
- journal = {},
368
- year = {2023},
369
- volume = {}
370
- }
371
- "#;
372
-
373
- let config = ExtractionConfig::default();
374
- let result = extractor
375
- .extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
376
- .await;
377
-
378
- assert!(result.is_ok());
379
- let result = result.unwrap();
380
- assert_eq!(
381
- result.metadata.additional.get("entry_count"),
382
- Some(&serde_json::json!(1))
383
- );
384
- }
385
-
386
- #[tokio::test]
387
- async fn test_comprehensive_file() {
388
- let extractor = BibtexExtractor::new();
389
-
390
- let fixture_path = get_test_file_path("bibtex/comprehensive.bib");
391
- let bibtex_content = std::fs::read(&fixture_path)
392
- .unwrap_or_else(|err| panic!("Failed to read test file at {}: {}", fixture_path.display(), err));
393
-
394
- let config = ExtractionConfig::default();
395
- let result = extractor
396
- .extract_bytes(&bibtex_content, "application/x-bibtex", &config)
397
- .await;
398
-
399
- assert!(result.is_ok());
400
- let result = result.unwrap();
401
-
402
- assert_eq!(
403
- result.metadata.additional.get("entry_count"),
404
- Some(&serde_json::json!(20))
405
- );
406
-
407
- if let Some(entry_types) = result.metadata.additional.get("entry_types") {
408
- let types_obj = entry_types.as_object().expect("Entry types should be an object");
409
- assert!(types_obj.len() >= 10, "Should have at least 10 different entry types");
410
- }
411
-
412
- if let Some(authors) = result.metadata.additional.get("authors") {
413
- let authors_array = authors.as_array().expect("Authors should be an array");
414
- assert!(authors_array.len() > 10, "Should have many unique authors");
415
- }
416
-
417
- if let Some(year_range) = result.metadata.additional.get("year_range") {
418
- assert!(year_range.get("min").is_some());
419
- assert!(year_range.get("max").is_some());
420
- }
421
- }