kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +11 -11
  6. data/README.md +5 -10
  7. data/examples/async_patterns.rb +0 -1
  8. data/ext/kreuzberg_rb/extconf.rb +0 -10
  9. data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
  10. data/ext/kreuzberg_rb/native/build.rs +2 -0
  11. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  12. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  13. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  14. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  15. data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
  16. data/kreuzberg.gemspec +14 -57
  17. data/lib/kreuzberg/cache_api.rb +0 -1
  18. data/lib/kreuzberg/cli.rb +2 -2
  19. data/lib/kreuzberg/config.rb +2 -9
  20. data/lib/kreuzberg/errors.rb +7 -75
  21. data/lib/kreuzberg/extraction_api.rb +0 -1
  22. data/lib/kreuzberg/setup_lib_path.rb +0 -1
  23. data/lib/kreuzberg/version.rb +1 -1
  24. data/lib/kreuzberg.rb +0 -21
  25. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  26. data/sig/kreuzberg.rbs +3 -55
  27. data/spec/binding/cli_proxy_spec.rb +4 -2
  28. data/spec/binding/cli_spec.rb +11 -12
  29. data/spec/examples.txt +104 -0
  30. data/spec/fixtures/config.yaml +1 -0
  31. data/spec/spec_helper.rb +1 -1
  32. data/vendor/kreuzberg/Cargo.toml +42 -112
  33. data/vendor/kreuzberg/README.md +2 -2
  34. data/vendor/kreuzberg/build.rs +4 -18
  35. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  36. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  37. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  38. data/vendor/kreuzberg/src/core/extractor.rs +81 -202
  39. data/vendor/kreuzberg/src/core/io.rs +2 -4
  40. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  41. data/vendor/kreuzberg/src/core/mod.rs +1 -4
  42. data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
  43. data/vendor/kreuzberg/src/embeddings.rs +16 -125
  44. data/vendor/kreuzberg/src/error.rs +1 -1
  45. data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
  46. data/vendor/kreuzberg/src/extraction/image.rs +13 -13
  47. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
  48. data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
  49. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  50. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  51. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  52. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  53. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  54. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  55. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  56. data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
  57. data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
  58. data/vendor/kreuzberg/src/extractors/email.rs +0 -14
  59. data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
  60. data/vendor/kreuzberg/src/extractors/html.rs +154 -137
  61. data/vendor/kreuzberg/src/extractors/image.rs +4 -7
  62. data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
  63. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  64. data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
  65. data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
  66. data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
  67. data/vendor/kreuzberg/src/extractors/text.rs +5 -23
  68. data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
  69. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  70. data/vendor/kreuzberg/src/lib.rs +1 -4
  71. data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
  72. data/vendor/kreuzberg/src/mcp/server.rs +3 -5
  73. data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
  74. data/vendor/kreuzberg/src/pdf/error.rs +1 -1
  75. data/vendor/kreuzberg/src/pdf/table.rs +44 -17
  76. data/vendor/kreuzberg/src/pdf/text.rs +3 -0
  77. data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
  78. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
  79. data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
  80. data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
  81. data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
  82. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  83. data/vendor/kreuzberg/src/types.rs +12 -42
  84. data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
  85. data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
  86. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  87. data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
  88. data/vendor/kreuzberg/tests/config_features.rs +0 -18
  89. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
  90. data/vendor/kreuzberg/tests/core_integration.rs +7 -24
  91. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  92. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  93. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  94. data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
  95. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
  96. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
  97. data/vendor/kreuzberg/tests/security_validation.rs +1 -12
  98. metadata +25 -90
  99. data/.rubocop.yml +0 -538
  100. data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
  101. data/lib/kreuzberg/error_context.rb +0 -32
  102. data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
  103. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
  104. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
  105. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
  106. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
  107. data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
  108. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
  109. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  110. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
  111. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
  112. data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
  113. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
  114. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  115. data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
  116. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
  117. data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
  119. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  121. data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
  122. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  123. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  124. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  125. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
  126. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  127. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  128. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  129. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  130. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  131. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  132. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  133. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  134. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  135. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
  136. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  137. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  138. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
  139. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
  140. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
  141. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
  142. data/vendor/rb-sys/.cargo-ok +0 -1
  143. data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
  144. data/vendor/rb-sys/Cargo.lock +0 -393
  145. data/vendor/rb-sys/Cargo.toml +0 -70
  146. data/vendor/rb-sys/Cargo.toml.orig +0 -57
  147. data/vendor/rb-sys/LICENSE-APACHE +0 -190
  148. data/vendor/rb-sys/LICENSE-MIT +0 -21
  149. data/vendor/rb-sys/bin/release.sh +0 -21
  150. data/vendor/rb-sys/build/features.rs +0 -108
  151. data/vendor/rb-sys/build/main.rs +0 -246
  152. data/vendor/rb-sys/build/stable_api_config.rs +0 -153
  153. data/vendor/rb-sys/build/version.rs +0 -48
  154. data/vendor/rb-sys/readme.md +0 -36
  155. data/vendor/rb-sys/src/bindings.rs +0 -21
  156. data/vendor/rb-sys/src/hidden.rs +0 -11
  157. data/vendor/rb-sys/src/lib.rs +0 -34
  158. data/vendor/rb-sys/src/macros.rs +0 -371
  159. data/vendor/rb-sys/src/memory.rs +0 -53
  160. data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
  161. data/vendor/rb-sys/src/special_consts.rs +0 -31
  162. data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
  163. data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
  164. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
  165. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
  166. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
  167. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
  168. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
  169. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
  170. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
  171. data/vendor/rb-sys/src/stable_api.rs +0 -261
  172. data/vendor/rb-sys/src/symbol.rs +0 -31
  173. data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
  174. data/vendor/rb-sys/src/utils.rs +0 -89
  175. data/vendor/rb-sys/src/value_type.rs +0 -7
@@ -1,650 +0,0 @@
1
- //! Native Rust Typst document extractor.
2
- //!
3
- //! This extractor provides Typst document parsing and text extraction.
4
- //! It uses a hybrid approach combining regex patterns and character-level parsing
5
- //! to extract text while preserving document structure.
6
- //!
7
- //! Features:
8
- //! - Metadata extraction: title, author, date, subject, keywords from `#set document()`
9
- //! - Section hierarchy: `=`, `==`, `===`, etc. heading levels
10
- //! - Inline formatting: `*bold*`, `_italic_`, `` `code` ``
11
- //! - Lists: extraction of list content (both `+` and `-` markers)
12
- //! - Links: extraction of URLs and link text from `#link("url")[text]` syntax
13
- //! - Math: inline (`$...$`) and display math preservation
14
- //! - Code blocks: triple-backtick code blocks with language specifiers
15
- //! - Tables: extraction of `#table()` function content
16
- //! - Complex formatting: handling of nested and combined formatting
17
- //!
18
- //! Requires the `office` feature.
19
-
20
- #[cfg(feature = "office")]
21
- use crate::Result;
22
- #[cfg(feature = "office")]
23
- use crate::core::config::ExtractionConfig;
24
- #[cfg(feature = "office")]
25
- use crate::plugins::{DocumentExtractor, Plugin};
26
- #[cfg(feature = "office")]
27
- use crate::types::{ExtractionResult, Metadata};
28
- #[cfg(feature = "office")]
29
- use async_trait::async_trait;
30
- #[cfg(feature = "office")]
31
- use regex::Regex;
32
-
33
- /// Typst document extractor
34
- #[cfg(feature = "office")]
35
- pub struct TypstExtractor;
36
-
37
- #[cfg(feature = "office")]
38
- impl TypstExtractor {
39
- /// Create a new Typst extractor.
40
- pub fn new() -> Self {
41
- Self
42
- }
43
-
44
- /// Parse Typst content and extract text.
45
- fn extract_from_typst(content: &str) -> (String, Metadata) {
46
- let mut extractor = TypstParser::new(content);
47
- let text = extractor.parse();
48
- let metadata = extractor.metadata;
49
-
50
- (text, metadata)
51
- }
52
- }
53
-
54
- #[cfg(feature = "office")]
55
- impl Default for TypstExtractor {
56
- fn default() -> Self {
57
- Self::new()
58
- }
59
- }
60
-
61
#[cfg(feature = "office")]
impl Plugin for TypstExtractor {
    /// Identifier this plugin reports for itself.
    fn name(&self) -> &str {
        "typst-extractor"
    }

    /// Crate version, captured from `CARGO_PKG_VERSION` at compile time.
    fn version(&self) -> String {
        String::from(env!("CARGO_PKG_VERSION"))
    }

    /// Nothing to set up; always succeeds.
    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    /// Nothing to tear down; always succeeds.
    fn shutdown(&self) -> Result<()> {
        Ok(())
    }

    /// One-line human-readable summary of the plugin.
    fn description(&self) -> &str {
        "Native Rust Typst document extractor with metadata support"
    }

    /// Name of the plugin's author.
    fn author(&self) -> &str {
        "Kreuzberg Team"
    }
}
87
-
88
#[cfg(feature = "office")]
#[async_trait]
impl DocumentExtractor for TypstExtractor {
    /// Extract text and metadata from raw Typst bytes.
    ///
    /// The bytes are decoded as UTF-8 with invalid sequences replaced, then
    /// handed to the Typst parser. `mime_type` is echoed back unchanged in the
    /// result and `_config` is not consulted. Table text is emitted inline in
    /// `content`; the structured `tables` list is always empty, and
    /// `detected_languages`, `chunks` and `images` are always `None`.
    #[cfg_attr(feature = "otel", tracing::instrument(
        skip(self, content, _config),
        fields(
            extractor.name = self.name(),
            content.size_bytes = content.len(),
        )
    ))]
    async fn extract_bytes(
        &self,
        content: &[u8],
        mime_type: &str,
        _config: &ExtractionConfig,
    ) -> Result<ExtractionResult> {
        // Borrow the `Cow` directly: valid UTF-8 input (the common case) is
        // parsed in place instead of being copied into a fresh `String`.
        let typst_source = String::from_utf8_lossy(content);
        let (text, metadata) = Self::extract_from_typst(&typst_source);

        Ok(ExtractionResult {
            content: text,
            mime_type: mime_type.to_string(),
            metadata,
            tables: Vec::new(),
            detected_languages: None,
            chunks: None,
            images: None,
        })
    }

    /// MIME types this extractor declares for Typst documents.
    fn supported_mime_types(&self) -> &[&str] {
        &["application/x-typst", "text/x-typst"]
    }

    /// Priority reported for this extractor.
    // NOTE(review): how 50 ranks against other extractors is decided by the
    // plugin registry, which is not visible here; confirm before changing.
    fn priority(&self) -> i32 {
        50
    }
}
126
-
127
/// Internal line-oriented Typst parser.
///
/// Borrows the document source and collects metadata into `metadata` while
/// `parse` produces the extracted text.
#[cfg(feature = "office")]
struct TypstParser<'a> {
    /// Raw Typst source being parsed (borrowed; never modified).
    content: &'a str,
    /// Metadata gathered by `extract_metadata`.
    metadata: Metadata,
}

#[cfg(feature = "office")]
impl<'a> TypstParser<'a> {
    /// Create a parser over `content` with empty metadata.
    fn new(content: &'a str) -> Self {
        Self {
            content,
            metadata: Metadata::default(),
        }
    }

    /// Collect metadata, then return the extracted text.
    fn parse(&mut self) -> String {
        self.extract_metadata();
        self.extract_content()
    }

    /// Populate `self.metadata` from `field: "value"` pairs found in the source.
    ///
    /// `title`, `author`, `subject` and `keywords` are stored in
    /// `metadata.additional`; `date` is stored in `metadata.date`. Fields that
    /// are not found are left untouched.
    fn extract_metadata(&mut self) {
        for field in ["title", "author"] {
            if let Some(value) = self.extract_quoted_value(field) {
                self.metadata.additional.insert(field.to_string(), value.into());
            }
        }

        if let Some(date) = self.extract_quoted_value("date") {
            self.metadata.date = Some(date);
        }

        if let Some(subject) = self.extract_quoted_value("subject") {
            self.metadata.additional.insert("subject".to_string(), subject.into());
        }

        if let Some(keywords) = self.extract_keywords() {
            self.metadata.additional.insert("keywords".to_string(), keywords.into());
        }
    }

    /// Return the first double-quoted value assigned to `field`
    /// (`field: "value"`), searching the whole source.
    ///
    /// The field name must start at a word boundary so that, for example,
    /// `date` does not match inside `update:` and `title` does not match
    /// inside `subtitle:`.
    fn extract_quoted_value(&self, field: &str) -> Option<String> {
        let pattern = format!(r#"\b{}:\s*"([^"]*)""#, regex::escape(field));
        let re = Regex::new(&pattern).ok()?;
        let caps = re.captures(self.content)?;
        caps.get(1).map(|m| m.as_str().to_string())
    }

    /// Return the document keywords as a single string.
    ///
    /// Accepts either a quoted string (`keywords: "a, b"`, returned as-is) or
    /// a parenthesised array of quoted strings (`keywords: ("a", "b")`, joined
    /// with `", "`). Returns `None` when no keywords are present or the array
    /// contains no quoted items.
    fn extract_keywords(&self) -> Option<String> {
        let re = Regex::new(r#"\bkeywords:\s*(?:"([^"]*)"|(\([^)]*\)))"#).ok()?;
        let caps = re.captures(self.content)?;

        if let Some(single) = caps.get(1) {
            return Some(single.as_str().to_string());
        }

        let array = caps.get(2)?.as_str();
        let item_re = Regex::new(r#""([^"]*)""#).ok()?;
        let items: Vec<&str> = item_re
            .captures_iter(array)
            .filter_map(|item| item.get(1))
            .map(|m| m.as_str())
            .collect();

        if items.is_empty() {
            None
        } else {
            Some(items.join(", "))
        }
    }

    /// Walk the source line by line and build the extracted text.
    ///
    /// Per line, in order of precedence:
    /// - fenced code blocks (```` ``` ````) are copied through verbatim;
    /// - lines starting with `#set `, `#let `, `#import ` or `#include ` are dropped;
    /// - `#table(` starts a table, emitted as `TABLE:` plus its cells;
    /// - `=`-prefixed lines are emitted as normalised headings;
    /// - `+` / `-` list items are emitted as `- item`;
    /// - any other `#function` line contributes only the text inside its
    ///   outermost `[...]`;
    /// - everything else is emitted after link rewriting.
    ///
    /// NOTE: only the first line of a directive is dropped. The continuation
    /// lines of a multi-line `#set document(` call are not recognised as part
    /// of the directive and are emitted like ordinary text.
    fn extract_content(&self) -> String {
        const SKIPPED_DIRECTIVES: [&str; 4] = ["#set ", "#let ", "#import ", "#include "];
        const DIRECTIVE_KEYWORDS: [&str; 4] = ["#set", "#let", "#import", "#include"];

        let mut output = String::with_capacity(self.content.len());
        let mut lines = self.content.lines().peekable();
        let mut in_code_block = false;

        // `while let` instead of `for`: the table branch hands the iterator to
        // `extract_table_content`, which consumes the table's remaining lines.
        while let Some(line) = lines.next() {
            let trimmed = line.trim();

            if in_code_block {
                if trimmed == "```" {
                    in_code_block = false;
                    output.push_str("```\n");
                } else {
                    // Code is kept untrimmed so its indentation survives.
                    output.push_str(line);
                    output.push('\n');
                }
                continue;
            }

            if let Some(lang) = trimmed.strip_prefix("```") {
                in_code_block = true;
                output.push_str("```");
                output.push_str(lang.trim());
                output.push('\n');
                continue;
            }

            if SKIPPED_DIRECTIVES.iter().any(|d| trimmed.starts_with(*d)) {
                continue;
            }

            if trimmed.starts_with("#table(") {
                output.push_str("TABLE:\n");
                output.push_str(&self.extract_table_content(trimmed, &mut lines));
                output.push('\n');
                continue;
            }

            // A heading needs at least one character after the `=` run; a line
            // made only of `=` falls through and is treated as plain text.
            if trimmed.starts_with('=') && trimmed.chars().any(|c| c != '=') {
                let level = trimmed.chars().take_while(|&c| c == '=').count();
                // `=` is one byte, so `level` is also a valid byte offset.
                output.push_str(&trimmed[..level]);
                output.push(' ');
                output.push_str(trimmed[level..].trim());
                output.push('\n');
                continue;
            }

            if (trimmed.starts_with('+') || trimmed.starts_with('-'))
                && trimmed.chars().nth(1).is_some_and(|c| !c.is_alphanumeric())
            {
                output.push_str("- ");
                output.push_str(trimmed[1..].trim());
                output.push('\n');
                continue;
            }

            if trimmed.starts_with('#') && !DIRECTIVE_KEYWORDS.iter().any(|d| trimmed.starts_with(*d)) {
                // Other function calls: keep only the bracketed body, if any.
                if let Some(inner) = self.extract_text_from_brackets(trimmed) {
                    let processed = self.process_line(&inner);
                    if !processed.is_empty() {
                        output.push_str(&processed);
                        output.push('\n');
                    }
                }
                continue;
            }

            if trimmed.is_empty() {
                output.push('\n');
                continue;
            }

            let processed = self.process_line(trimmed);
            if !processed.is_empty() {
                output.push_str(&processed);
                output.push('\n');
            }
        }

        output
    }

    /// Net change in `(parenthesis, bracket)` nesting depth caused by `line`.
    ///
    /// Delimiters are counted naively; string literals and comments are not
    /// treated specially.
    fn delimiter_balance(line: &str) -> (i32, i32) {
        line.chars().fold((0, 0), |(parens, brackets), ch| match ch {
            '(' => (parens + 1, brackets),
            ')' => (parens - 1, brackets),
            '[' => (parens, brackets + 1),
            ']' => (parens, brackets - 1),
            _ => (parens, brackets),
        })
    }

    /// Extract the cells of a `#table(...)` call as `cell | cell | ...`.
    ///
    /// `first_line` is the line containing `#table(`. Further lines are pulled
    /// from `lines` until the call's parentheses and brackets are balanced (or
    /// the input ends), so the caller resumes right after the table. Each
    /// non-empty `[...]` body becomes one cell; an inner `[` restarts the
    /// current cell.
    fn extract_table_content<'l, I>(&self, first_line: &str, lines: &mut std::iter::Peekable<I>) -> String
    where
        I: Iterator<Item = &'l str>,
    {
        let mut source = first_line.to_string();

        // Depth starts from the delimiters actually present on the first line.
        // Seeding it with an extra 1 would leave the depth above zero forever
        // and swallow the rest of the document into the table.
        let (mut paren_depth, mut bracket_depth) = Self::delimiter_balance(first_line);

        while paren_depth > 0 || bracket_depth > 0 {
            let Some(next_line) = lines.next() else {
                break;
            };
            source.push('\n');
            source.push_str(next_line);

            let (parens, brackets) = Self::delimiter_balance(next_line);
            paren_depth += parens;
            bracket_depth += brackets;
        }

        let mut cells: Vec<&str> = Vec::new();
        let mut cell_start: Option<usize> = None;
        for (idx, ch) in source.char_indices() {
            match ch {
                // `[` is one byte, so the cell body starts at `idx + 1`.
                '[' => cell_start = Some(idx + 1),
                ']' => {
                    if let Some(start) = cell_start.take() {
                        let cell = source[start..idx].trim();
                        if !cell.is_empty() {
                            cells.push(cell);
                        }
                    }
                }
                _ => {}
            }
        }

        cells.join(" | ")
    }

    /// Prepare one line of body text for output.
    ///
    /// Inline markup (`` `code` ``, `$math$`, `*bold*`, `_italic_`) is kept
    /// verbatim; the only transformation is link rewriting.
    fn process_line(&self, line: &str) -> String {
        self.extract_link_text(line)
    }

    /// Rewrite every `#link("url")[text]` in `line` to Markdown `[text](url)`.
    ///
    /// The leading `#` is consumed together with the call so it does not leak
    /// into the output. A bare `link("url")[text]` is rewritten as well.
    fn extract_link_text(&self, line: &str) -> String {
        // Compiled once: this runs for every line of body text.
        static LINK_RE: std::sync::LazyLock<Regex> = std::sync::LazyLock::new(|| {
            Regex::new(r#"#?link\("([^"]*)"\)\[([^\]]*)\]"#).expect("link pattern is a valid regex")
        });

        LINK_RE
            .replace_all(line, |caps: &regex::Captures| {
                let url = caps.get(1).map_or("", |m| m.as_str());
                let text = caps.get(2).map_or("", |m| m.as_str());
                format!("[{}]({})", text, url)
            })
            .into_owned()
    }

    /// Return the text between the first `[` and the last `]` of `line`, or
    /// `None` when there is no such pair in that order.
    fn extract_text_from_brackets(&self, line: &str) -> Option<String> {
        let open = line.find('[')?;
        let close = line.rfind(']')?;
        (close > open).then(|| line[open + 1..close].to_string())
    }
}
471
-
472
// Gated on the `office` feature as well as `test`: every item under test is
// only compiled with that feature, so `cfg(test)` alone breaks the build of
// the test target when the feature is disabled.
#[cfg(all(test, feature = "office"))]
mod tests {
    use super::*;

    /// `title` and `author` are read from `#set document(...)`.
    #[test]
    fn test_extract_metadata() {
        let content = r#"#set document(
title: "Test Document",
author: "Test Author"
)

= Heading
Some text
"#;

        let (_, metadata) = TypstExtractor::extract_from_typst(content);

        assert!(
            metadata
                .additional
                .get("title")
                .map(|v| v.to_string().contains("Test Document"))
                .unwrap_or(false),
            "Title should be extracted"
        );
        assert!(
            metadata
                .additional
                .get("author")
                .map(|v| v.to_string().contains("Test Author"))
                .unwrap_or(false),
            "Author should be extracted"
        );
    }

    /// Heading markers and text are preserved for each level.
    #[test]
    fn test_extract_headings() {
        let content = r#"= Level 1
Content

== Level 2
More content
"#;

        let (output, _) = TypstExtractor::extract_from_typst(content);

        assert!(output.contains("= Level 1"));
        assert!(output.contains("== Level 2"));
    }

    /// Inline markup is passed through with its delimiters intact.
    #[test]
    fn test_extract_formatting() {
        let content = r#"Some *bold* and _italic_ text with `code`."#;

        let (output, _) = TypstExtractor::extract_from_typst(content);

        assert!(output.contains("*bold*"));
        assert!(output.contains("_italic_"));
        assert!(output.contains("`code`"));
    }

    /// Fenced code blocks keep their language tag and body, and text after
    /// the closing fence is still extracted.
    #[test]
    fn test_extract_code_blocks() {
        let content = r#"Here is code:

```python
def hello():
print("world")
```

Done."#;

        let (output, _) = TypstExtractor::extract_from_typst(content);

        assert!(output.contains("```python\n"));
        assert!(output.contains("def hello():"));
        assert!(output.contains("print(\"world\")"));
        assert!(output.contains("Done."));
    }

    /// Inline `#link` calls are rewritten to Markdown links.
    #[test]
    fn test_extract_links() {
        let content = r#"Visit #link("https://example.com")[example site] for info."#;

        let (output, _) = TypstExtractor::extract_from_typst(content);

        assert!(output.contains("[example site](https://example.com)"));
    }

    /// `+` list items are emitted as `- item`.
    #[test]
    fn test_extract_list_items() {
        let content = r#"= Lists

+ First item
+ Second item
+ Third item"#;

        let (output, _) = TypstExtractor::extract_from_typst(content);

        assert!(output.contains("- First item"));
        assert!(output.contains("- Second item"));
        assert!(output.contains("- Third item"));
    }

    /// Table cells are emitted after a `TABLE:` marker, separated by ` | `.
    #[test]
    fn test_extract_tables() {
        let content = r#"== Tables

#table(
columns: 2,
[Name], [Age],
[Alice], [30],
)"#;

        let (output, _) = TypstExtractor::extract_from_typst(content);

        assert!(output.contains("TABLE:\nName | Age | Alice | 30"));
    }

    /// Inline and display math are preserved verbatim.
    #[test]
    fn test_extract_math() {
        let content = r#"The formula $E = mc^2$ is important.

Display:
$ a^2 + b^2 = c^2 $"#;

        let (output, _) = TypstExtractor::extract_from_typst(content);

        assert!(output.contains("$E = mc^2$"));
        assert!(output.contains("$ a^2 + b^2 = c^2 $"));
    }

    /// All supported metadata fields are extracted, including array keywords.
    #[test]
    fn test_metadata_extraction_comprehensive() {
        let content = r#"#set document(
title: "Advanced Document",
author: "John Doe",
date: "2024-12-06",
subject: "Test Subject",
keywords: ("test", "example", "rust")
)

Content here."#;

        let (_, metadata) = TypstExtractor::extract_from_typst(content);

        assert!(metadata.additional.contains_key("title"), "Title should be extracted");
        assert!(metadata.additional.contains_key("author"), "Author should be extracted");
        assert_eq!(metadata.date.as_deref(), Some("2024-12-06"), "Date should be extracted");
        assert!(
            metadata.additional.contains_key("subject"),
            "Subject should be extracted"
        );
        assert!(
            metadata
                .additional
                .get("keywords")
                .map(|v| v.to_string().contains("test, example, rust"))
                .unwrap_or(false),
            "Keywords should be joined with \", \""
        );
    }

    /// Single-line directives never reach the output.
    #[test]
    fn test_skip_directives() {
        let content = r#"#set heading(numbering: "1.")
#let x = 5
#import "@preview/foo:1.0"
#include "other.typ"

= Heading
Actual content"#;

        let (output, _) = TypstExtractor::extract_from_typst(content);

        assert!(!output.contains("#set"));
        assert!(!output.contains("#let"));
        assert!(!output.contains("#import"));
        assert!(!output.contains("#include"));
        assert!(output.contains("Heading"));
        assert!(output.contains("content"));
    }

    /// Nested emphasis is passed through unchanged.
    #[test]
    fn test_combined_formatting() {
        let content = r#"This is *bold with _nested italic_* and more."#;

        let (output, _) = TypstExtractor::extract_from_typst(content);

        assert!(output.contains("*bold with _nested italic_*"));
    }
}