kreuzberg 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (265) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +534 -0
  5. data/Gemfile +9 -0
  6. data/Gemfile.lock +157 -0
  7. data/README.md +421 -0
  8. data/Rakefile +25 -0
  9. data/Steepfile +47 -0
  10. data/examples/async_patterns.rb +340 -0
  11. data/ext/kreuzberg_rb/extconf.rb +35 -0
  12. data/ext/kreuzberg_rb/native/Cargo.toml +36 -0
  13. data/ext/kreuzberg_rb/native/README.md +425 -0
  14. data/ext/kreuzberg_rb/native/build.rs +17 -0
  15. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  16. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  17. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  18. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  19. data/ext/kreuzberg_rb/native/src/lib.rs +2939 -0
  20. data/extconf.rb +28 -0
  21. data/kreuzberg.gemspec +105 -0
  22. data/lib/kreuzberg/api_proxy.rb +142 -0
  23. data/lib/kreuzberg/cache_api.rb +45 -0
  24. data/lib/kreuzberg/cli.rb +55 -0
  25. data/lib/kreuzberg/cli_proxy.rb +127 -0
  26. data/lib/kreuzberg/config.rb +684 -0
  27. data/lib/kreuzberg/errors.rb +50 -0
  28. data/lib/kreuzberg/extraction_api.rb +84 -0
  29. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  30. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  31. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  32. data/lib/kreuzberg/result.rb +216 -0
  33. data/lib/kreuzberg/setup_lib_path.rb +79 -0
  34. data/lib/kreuzberg/validator_protocol.rb +89 -0
  35. data/lib/kreuzberg/version.rb +5 -0
  36. data/lib/kreuzberg.rb +82 -0
  37. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  38. data/sig/kreuzberg/internal.rbs +184 -0
  39. data/sig/kreuzberg.rbs +468 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +87 -0
  42. data/spec/binding/cli_spec.rb +54 -0
  43. data/spec/binding/config_spec.rb +345 -0
  44. data/spec/binding/config_validation_spec.rb +283 -0
  45. data/spec/binding/error_handling_spec.rb +213 -0
  46. data/spec/binding/errors_spec.rb +66 -0
  47. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  48. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  49. data/spec/binding/plugins/validator_spec.rb +274 -0
  50. data/spec/examples.txt +104 -0
  51. data/spec/fixtures/config.toml +39 -0
  52. data/spec/fixtures/config.yaml +42 -0
  53. data/spec/fixtures/invalid_config.toml +4 -0
  54. data/spec/smoke/package_spec.rb +178 -0
  55. data/spec/spec_helper.rb +42 -0
  56. data/vendor/kreuzberg/Cargo.toml +134 -0
  57. data/vendor/kreuzberg/README.md +175 -0
  58. data/vendor/kreuzberg/build.rs +460 -0
  59. data/vendor/kreuzberg/src/api/error.rs +81 -0
  60. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  61. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  62. data/vendor/kreuzberg/src/api/server.rs +353 -0
  63. data/vendor/kreuzberg/src/api/types.rs +170 -0
  64. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  65. data/vendor/kreuzberg/src/cache/mod.rs +1143 -0
  66. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  67. data/vendor/kreuzberg/src/core/batch_mode.rs +35 -0
  68. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  69. data/vendor/kreuzberg/src/core/extractor.rs +903 -0
  70. data/vendor/kreuzberg/src/core/io.rs +327 -0
  71. data/vendor/kreuzberg/src/core/mime.rs +615 -0
  72. data/vendor/kreuzberg/src/core/mod.rs +42 -0
  73. data/vendor/kreuzberg/src/core/pipeline.rs +906 -0
  74. data/vendor/kreuzberg/src/embeddings.rs +323 -0
  75. data/vendor/kreuzberg/src/error.rs +431 -0
  76. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  77. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  78. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  79. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  80. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  81. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  82. data/vendor/kreuzberg/src/extraction/libreoffice.rs +564 -0
  83. data/vendor/kreuzberg/src/extraction/mod.rs +77 -0
  84. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  85. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +128 -0
  88. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  89. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  90. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  91. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  92. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  93. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  94. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  95. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  96. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  97. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  98. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  99. data/vendor/kreuzberg/src/extractors/archive.rs +425 -0
  100. data/vendor/kreuzberg/src/extractors/docx.rs +479 -0
  101. data/vendor/kreuzberg/src/extractors/email.rs +129 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +344 -0
  103. data/vendor/kreuzberg/src/extractors/html.rs +410 -0
  104. data/vendor/kreuzberg/src/extractors/image.rs +195 -0
  105. data/vendor/kreuzberg/src/extractors/mod.rs +268 -0
  106. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  107. data/vendor/kreuzberg/src/extractors/pdf.rs +496 -0
  108. data/vendor/kreuzberg/src/extractors/pptx.rs +234 -0
  109. data/vendor/kreuzberg/src/extractors/structured.rs +126 -0
  110. data/vendor/kreuzberg/src/extractors/text.rs +242 -0
  111. data/vendor/kreuzberg/src/extractors/xml.rs +128 -0
  112. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  113. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  114. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  115. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  116. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  117. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  118. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  119. data/vendor/kreuzberg/src/keywords/rake.rs +294 -0
  120. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  121. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  122. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  123. data/vendor/kreuzberg/src/lib.rs +102 -0
  124. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  125. data/vendor/kreuzberg/src/mcp/server.rs +1966 -0
  126. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  127. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  128. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  129. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  130. data/vendor/kreuzberg/src/ocr/processor.rs +847 -0
  131. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  132. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  133. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  134. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  135. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  136. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  137. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  138. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  139. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  140. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  141. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  142. data/vendor/kreuzberg/src/pdf/table.rs +420 -0
  143. data/vendor/kreuzberg/src/pdf/text.rs +161 -0
  144. data/vendor/kreuzberg/src/plugins/extractor.rs +1010 -0
  145. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  146. data/vendor/kreuzberg/src/plugins/ocr.rs +629 -0
  147. data/vendor/kreuzberg/src/plugins/processor.rs +641 -0
  148. data/vendor/kreuzberg/src/plugins/registry.rs +1324 -0
  149. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  150. data/vendor/kreuzberg/src/plugins/validator.rs +955 -0
  151. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  152. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  153. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  154. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  155. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  156. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  157. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  158. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  159. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  160. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  161. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  162. data/vendor/kreuzberg/src/types.rs +873 -0
  163. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  164. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  165. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  166. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  167. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  168. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  169. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  170. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  171. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  172. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  173. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  174. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  175. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  176. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  177. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  178. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  179. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  180. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  181. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  182. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  183. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  184. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  185. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  186. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  187. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  188. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  189. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  190. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  191. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  192. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  193. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  194. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  195. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  196. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  197. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  198. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  199. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  200. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  201. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  202. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  203. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  204. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  205. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  206. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  207. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  208. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  209. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  210. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  211. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  212. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  213. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  214. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  215. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  216. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  217. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  218. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  219. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  220. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  221. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  222. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  223. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  224. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  225. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  226. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  227. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  228. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  229. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  230. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  231. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  232. data/vendor/kreuzberg/tests/batch_orchestration.rs +542 -0
  233. data/vendor/kreuzberg/tests/batch_processing.rs +304 -0
  234. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  235. data/vendor/kreuzberg/tests/concurrency_stress.rs +509 -0
  236. data/vendor/kreuzberg/tests/config_features.rs +580 -0
  237. data/vendor/kreuzberg/tests/config_loading_tests.rs +439 -0
  238. data/vendor/kreuzberg/tests/core_integration.rs +493 -0
  239. data/vendor/kreuzberg/tests/csv_integration.rs +424 -0
  240. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +124 -0
  241. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  242. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  243. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  244. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  245. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  246. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  247. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  248. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  249. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  250. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  251. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  252. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  253. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  254. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  255. data/vendor/kreuzberg/tests/pipeline_integration.rs +1412 -0
  256. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  257. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +561 -0
  258. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  259. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  260. data/vendor/kreuzberg/tests/registry_integration_tests.rs +607 -0
  261. data/vendor/kreuzberg/tests/security_validation.rs +404 -0
  262. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  263. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  264. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  265. metadata +471 -0
@@ -0,0 +1,328 @@
1
+ //! Table conversion utilities.
2
+ //!
3
+ //! This module provides functions for converting tabular data between different formats.
4
+ //! Currently supports converting Apache Arrow IPC format to Markdown tables using Polars.
5
+ //!
6
+ //! # Features
7
+ //!
8
+ //! - **Arrow IPC parsing**: Read tables from Arrow IPC binary format
9
+ //! - **Markdown generation**: Convert DataFrames to clean Markdown tables
10
+ //! - **Type-safe**: Handles all Polars data types safely
11
+ //! - **Empty table handling**: Gracefully handles empty DataFrames
12
+ //!
13
+ //! # Supported Conversions
14
+ //!
15
+ //! - Arrow IPC → Markdown table
16
+ //! - Polars DataFrame → Markdown table
17
+ //!
18
+ //! # Example
19
+ //!
20
+ //! ```rust,no_run
21
+ //! use kreuzberg::extraction::table::table_from_arrow_to_markdown;
22
+ //!
23
+ //! # fn example() -> kreuzberg::Result<()> {
24
+ //! // Convert Arrow IPC bytes to Markdown
25
+ //! let arrow_bytes = vec![/* Arrow IPC data */];
26
+ //! let markdown = table_from_arrow_to_markdown(&arrow_bytes)?;
27
+ //!
28
+ //! println!("Markdown table:\n{}", markdown);
29
+ //! # Ok(())
30
+ //! # }
31
+ //! ```
32
+ //!
33
+ //! # Output Format
34
+ //!
35
+ //! The generated Markdown follows GitHub Flavored Markdown table syntax:
36
+ //! ```markdown
37
+ //! | Column1 | Column2 | Column3 |
38
+ //! |------|------|------|
39
+ //! | value1 | value2 | value3 |
40
+ //! | value4 | value5 | value6 |
41
+ //! ```
42
+
43
+ use crate::error::{KreuzbergError, Result};
44
+ use polars::prelude::*;
45
+ use std::io::Cursor;
46
+
47
+ /// Convert Arrow IPC bytes to markdown table format
48
+ pub fn table_from_arrow_to_markdown(arrow_bytes: &[u8]) -> Result<String> {
49
+ let cursor = Cursor::new(arrow_bytes);
50
+ let df = IpcReader::new(cursor)
51
+ .finish()
52
+ .map_err(|e| KreuzbergError::parsing(format!("Failed to read Arrow IPC data: {}", e)))?;
53
+
54
+ dataframe_to_markdown(&df)
55
+ }
56
+
57
+ /// Convert a Polars DataFrame to markdown table format
58
+ fn dataframe_to_markdown(df: &DataFrame) -> Result<String> {
59
+ if df.is_empty() {
60
+ return Ok(String::new());
61
+ }
62
+
63
+ let mut markdown = String::new();
64
+
65
+ markdown.push_str("| ");
66
+ for col_name in df.get_column_names() {
67
+ markdown.push_str(col_name);
68
+ markdown.push_str(" | ");
69
+ }
70
+ markdown.push('\n');
71
+
72
+ markdown.push('|');
73
+ for _ in 0..df.width() {
74
+ markdown.push_str("------|");
75
+ }
76
+ markdown.push('\n');
77
+
78
+ for row_idx in 0..df.height() {
79
+ markdown.push_str("| ");
80
+ for col in df.get_columns() {
81
+ let series = col.as_materialized_series();
82
+ let value = format_cell_value(series, row_idx)?;
83
+ markdown.push_str(&value);
84
+ markdown.push_str(" | ");
85
+ }
86
+ markdown.push('\n');
87
+ }
88
+
89
+ Ok(markdown)
90
+ }
91
+
92
+ fn format_cell_value(series: &Series, idx: usize) -> Result<String> {
93
+ let is_null_array = series.is_null();
94
+ if is_null_array.get(idx).unwrap_or(false) {
95
+ return Ok(String::new());
96
+ }
97
+
98
+ let value_str = match series.dtype() {
99
+ DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 => {
100
+ let casted = series
101
+ .cast(&DataType::Int64)
102
+ .map_err(|e| KreuzbergError::parsing(format!("Failed to cast to i64: {}", e)))?;
103
+ casted
104
+ .i64()
105
+ .map_err(|e| KreuzbergError::parsing(format!("Failed to get i64 value: {}", e)))?
106
+ .get(idx)
107
+ .map(|v| v.to_string())
108
+ .unwrap_or_default()
109
+ }
110
+ DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64 => {
111
+ let casted = series
112
+ .cast(&DataType::UInt64)
113
+ .map_err(|e| KreuzbergError::parsing(format!("Failed to cast to u64: {}", e)))?;
114
+ casted
115
+ .u64()
116
+ .map_err(|e| KreuzbergError::parsing(format!("Failed to get u64 value: {}", e)))?
117
+ .get(idx)
118
+ .map(|v| v.to_string())
119
+ .unwrap_or_default()
120
+ }
121
+ DataType::Float32 | DataType::Float64 => {
122
+ let casted = series
123
+ .cast(&DataType::Float64)
124
+ .map_err(|e| KreuzbergError::parsing(format!("Failed to cast to f64: {}", e)))?;
125
+ casted
126
+ .f64()
127
+ .map_err(|e| KreuzbergError::parsing(format!("Failed to get f64 value: {}", e)))?
128
+ .get(idx)
129
+ .map(|v| format!("{:.2}", v))
130
+ .unwrap_or_default()
131
+ }
132
+ DataType::Boolean => series
133
+ .bool()
134
+ .map_err(|e| KreuzbergError::parsing(format!("Failed to get bool value: {}", e)))?
135
+ .get(idx)
136
+ .map(|v| v.to_string())
137
+ .unwrap_or_default(),
138
+ DataType::String => series
139
+ .str()
140
+ .map_err(|e| KreuzbergError::parsing(format!("Failed to get string value: {}", e)))?
141
+ .get(idx)
142
+ .map(|v| v.to_string())
143
+ .unwrap_or_default(),
144
+ _ => {
145
+ format!("{:?}", series.get(idx))
146
+ }
147
+ };
148
+
149
+ Ok(value_str)
150
+ }
151
+
152
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Three-row frame mixing a string, an integer and a float column.
    fn create_test_dataframe() -> DataFrame {
        df!(
            "name" => &["Alice", "Bob", "Charlie"],
            "age" => &[30, 25, 35],
            "score" => &[95.5, 87.3, 92.1]
        )
        .unwrap()
    }

    /// Serialize `df` to Arrow IPC bytes held in memory.
    fn dataframe_to_arrow_bytes(df: &DataFrame) -> Vec<u8> {
        let mut frame = df.clone();
        let mut bytes = Vec::new();
        {
            let mut sink = Cursor::new(&mut bytes);
            IpcWriter::new(&mut sink).finish(&mut frame).unwrap();
        }
        bytes
    }

    /// Assert that every fragment occurs somewhere in the rendered table.
    fn assert_contains_all(markdown: &str, fragments: &[&str]) {
        for fragment in fragments {
            assert!(
                markdown.contains(fragment),
                "expected fragment {fragment:?} in:\n{markdown}"
            );
        }
    }

    #[test]
    fn test_dataframe_to_markdown_basic() {
        let markdown = dataframe_to_markdown(&create_test_dataframe()).unwrap();

        assert_contains_all(
            &markdown,
            &[
                "| name | age | score |",
                "|------|------|------|",
                "| Alice | 30 | 95.50 |",
                "| Bob | 25 | 87.30 |",
                "| Charlie | 35 | 92.10 |",
            ],
        );
    }

    #[test]
    fn test_table_from_arrow_to_markdown() {
        let arrow_bytes = dataframe_to_arrow_bytes(&create_test_dataframe());

        let markdown = table_from_arrow_to_markdown(&arrow_bytes).unwrap();

        assert_contains_all(
            &markdown,
            &[
                "| name | age | score |",
                "| Alice | 30 | 95.50 |",
                "| Bob | 25 | 87.30 |",
                "| Charlie | 35 | 92.10 |",
            ],
        );
    }

    #[test]
    fn test_empty_dataframe() {
        let df = df!("col1" => Vec::<i32>::new()).unwrap();
        let markdown = dataframe_to_markdown(&df).unwrap();
        assert_eq!(markdown, "");
    }

    #[test]
    fn test_dataframe_with_nulls() {
        let names = Series::new("name".into(), &["Alice", "Bob", "Charlie"]);
        let values = Series::new("value".into(), &[Some(1), None, Some(3)]);
        let df = DataFrame::new(vec![names.into(), values.into()]).unwrap();

        let markdown = dataframe_to_markdown(&df).unwrap();

        // A null cell renders as an empty string between the " | " separators,
        // so the row reads "| Bob |  | " with two spaces between the pipes.
        assert_contains_all(
            &markdown,
            &["| name | value |", "| Alice | 1 |", "| Bob |  |", "| Charlie | 3 |"],
        );
    }

    #[test]
    fn test_dataframe_with_booleans() {
        let df = df!(
            "name" => &["Alice", "Bob"],
            "active" => &[true, false]
        )
        .unwrap();

        let markdown = dataframe_to_markdown(&df).unwrap();

        assert_contains_all(
            &markdown,
            &["| name | active |", "| Alice | true |", "| Bob | false |"],
        );
    }

    #[test]
    fn test_dataframe_with_integers() {
        let df = df!(
            "id" => &[1i64, 2i64, 3i64],
            "count" => &[100u64, 200u64, 300u64]
        )
        .unwrap();

        let markdown = dataframe_to_markdown(&df).unwrap();

        assert_contains_all(
            &markdown,
            &["| id | count |", "| 1 | 100 |", "| 2 | 200 |", "| 3 | 300 |"],
        );
    }

    #[test]
    fn test_single_column_dataframe() {
        let df = df!("name" => &["Alice", "Bob", "Charlie"]).unwrap();
        let markdown = dataframe_to_markdown(&df).unwrap();

        assert_contains_all(
            &markdown,
            &["| name |", "|------|", "| Alice |", "| Bob |", "| Charlie |"],
        );
    }

    #[test]
    fn test_single_row_dataframe() {
        let df = df!(
            "name" => &["Alice"],
            "age" => &[30]
        )
        .unwrap();

        let markdown = dataframe_to_markdown(&df).unwrap();

        assert_contains_all(&markdown, &["| name | age |", "| Alice | 30 |"]);
    }

    #[test]
    fn test_arrow_bytes_roundtrip() {
        let original_df = df!(
            "col1" => &[1, 2, 3],
            "col2" => &["a", "b", "c"]
        )
        .unwrap();

        let arrow_bytes = dataframe_to_arrow_bytes(&original_df);
        let markdown = table_from_arrow_to_markdown(&arrow_bytes).unwrap();

        assert_contains_all(
            &markdown,
            &["| col1 | col2 |", "| 1 | a |", "| 2 | b |", "| 3 | c |"],
        );
    }

    #[test]
    fn test_invalid_arrow_bytes() {
        let invalid_bytes = vec![0u8; 10];
        assert!(table_from_arrow_to_markdown(&invalid_bytes).is_err());
    }

    #[test]
    fn test_float_formatting() {
        let df = df!(
            "value" => &[1.234, 5.678, 9.012]
        )
        .unwrap();

        let markdown = dataframe_to_markdown(&df).unwrap();

        // Floats are fixed to two decimal places.
        assert_contains_all(&markdown, &["| 1.23 |", "| 5.68 |", "| 9.01 |"]);
    }

    #[test]
    fn test_special_characters_in_strings() {
        let df = df!(
            "text" => &["Hello | World", "A & B", "C > D"]
        )
        .unwrap();

        let markdown = dataframe_to_markdown(&df).unwrap();

        // Cell text is emitted verbatim; pipes inside values are not escaped.
        assert_contains_all(&markdown, &["| Hello | World |", "| A & B |", "| C > D |"]);
    }
}
@@ -0,0 +1,269 @@
1
+ //! Plain text and Markdown extraction functions.
2
+ //!
3
+ //! This module provides memory-efficient streaming parsers for plain text and Markdown files.
4
+ //! Key features:
5
+ //!
6
+ //! - **Streaming parsing**: Processes files line-by-line to handle multi-GB files
7
+ //! - **Markdown support**: Extracts headers, links, and code blocks from Markdown
8
+ //! - **Word/line counting**: Accurate statistics without loading entire file
9
+ //! - **CRLF support**: Handles both Unix and Windows line endings
10
+ //!
11
+ //! # Example
12
+ //!
13
+ //! ```rust
14
+ //! use kreuzberg::extraction::text::parse_text;
15
+ //!
16
+ //! # fn example() -> kreuzberg::Result<()> {
17
+ //! let text = b"# Hello\n\nThis is [a link](https://example.com).";
18
+ //! let result = parse_text(text, true)?; // true = is Markdown
19
+ //!
20
+ //! assert_eq!(result.line_count, 3);
21
+ //! assert!(result.headers.unwrap().contains(&"Hello".to_string()));
22
+ //! # Ok(())
23
+ //! # }
24
+ //! ```
25
+ use once_cell::sync::Lazy;
26
+ use regex::Regex;
27
+
28
+ use crate::error::Result;
29
+ use crate::types::TextExtractionResult;
30
+
31
+ static MARKDOWN_HEADER: Lazy<Regex> =
32
+ Lazy::new(|| Regex::new(r"^#{1,6}\s*(.+)$").expect("Markdown header regex pattern is valid and should compile"));
33
+ static MARKDOWN_LINK: Lazy<Regex> = Lazy::new(|| {
34
+ Regex::new(r"\[([^\]]+)\]\(([^)]+)\)").expect("Markdown link regex pattern is valid and should compile")
35
+ });
36
+ static CODE_BLOCK_DELIMITER: Lazy<Regex> = Lazy::new(|| {
37
+ Regex::new(r"^```(\w*)[\r]?$").expect("Code block delimiter regex pattern is valid and should compile")
38
+ });
39
+
40
+ pub fn parse_text(text_bytes: &[u8], is_markdown: bool) -> Result<TextExtractionResult> {
41
+ let text = String::from_utf8_lossy(text_bytes).into_owned();
42
+
43
+ let mut line_count = 0;
44
+ let mut word_count = 0;
45
+ let character_count = text.len();
46
+
47
+ let mut headers = Vec::new();
48
+ let mut links = Vec::new();
49
+ let mut code_blocks = Vec::new();
50
+ let mut in_code_block = false;
51
+ let mut current_code_lang = String::new();
52
+ let mut current_code = String::new();
53
+
54
+ for line in text.lines() {
55
+ line_count += 1;
56
+ word_count += line.split_whitespace().count();
57
+
58
+ if !is_markdown {
59
+ continue;
60
+ }
61
+
62
+ if CODE_BLOCK_DELIMITER.is_match(line) {
63
+ if in_code_block {
64
+ code_blocks.push((
65
+ if current_code_lang.is_empty() {
66
+ "plain".to_string()
67
+ } else {
68
+ current_code_lang.clone()
69
+ },
70
+ current_code.trim_end().to_string(),
71
+ ));
72
+ current_code.clear();
73
+ current_code_lang.clear();
74
+ in_code_block = false;
75
+ } else {
76
+ if let Some(caps) = CODE_BLOCK_DELIMITER.captures(line) {
77
+ current_code_lang = caps.get(1).map(|m| m.as_str()).unwrap_or("").to_string();
78
+ }
79
+ in_code_block = true;
80
+ }
81
+ continue;
82
+ }
83
+
84
+ if in_code_block {
85
+ current_code.push_str(line);
86
+ current_code.push('\n');
87
+ continue;
88
+ }
89
+
90
+ if let Some(caps) = MARKDOWN_HEADER.captures(line)
91
+ && let Some(header) = caps.get(1)
92
+ {
93
+ headers.push(header.as_str().to_string());
94
+ }
95
+
96
+ for caps in MARKDOWN_LINK.captures_iter(line) {
97
+ if let (Some(text), Some(url)) = (caps.get(1), caps.get(2)) {
98
+ links.push((text.as_str().to_string(), url.as_str().to_string()));
99
+ }
100
+ }
101
+ }
102
+
103
+ Ok(TextExtractionResult {
104
+ content: text,
105
+ line_count,
106
+ word_count,
107
+ character_count,
108
+ headers: if headers.is_empty() { None } else { Some(headers) },
109
+ links: if links.is_empty() { None } else { Some(links) },
110
+ code_blocks: if code_blocks.is_empty() {
111
+ None
112
+ } else {
113
+ Some(code_blocks)
114
+ },
115
+ })
116
+ }
117
+
118
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for an owned `(text, url)` link pair.
    fn link(text: &str, url: &str) -> (String, String) {
        (text.to_string(), url.to_string())
    }

    #[test]
    fn test_plain_text_basic() {
        let parsed = parse_text(b"Hello, World!\nThis is a test.\nThird line here.", false).unwrap();

        assert_eq!(parsed.content, "Hello, World!\nThis is a test.\nThird line here.");
        assert_eq!(parsed.line_count, 3);
        assert_eq!(parsed.word_count, 9);
        assert_eq!(parsed.character_count, parsed.content.len());
        // Markdown structures are never collected for plain text.
        assert!(parsed.headers.is_none());
        assert!(parsed.links.is_none());
        assert!(parsed.code_blocks.is_none());
    }

    #[test]
    fn test_plain_text_empty() {
        let parsed = parse_text(b"", false).unwrap();

        assert_eq!(parsed.content, "");
        assert_eq!(parsed.line_count, 0);
        assert_eq!(parsed.word_count, 0);
        assert_eq!(parsed.character_count, 0);
    }

    #[test]
    fn test_markdown_headers() {
        let input = b"# Header 1\n## Header 2\n### Header 3\n#NoSpace\n## Multiple spaces";
        let parsed = parse_text(input, true).unwrap();

        assert_eq!(parsed.line_count, 5);
        let headers = parsed.headers.unwrap();
        assert_eq!(headers.len(), 5);
        for expected in ["Header 1", "Header 2", "Header 3", "NoSpace", "Multiple spaces"] {
            assert!(headers.contains(&expected.to_string()), "missing header {expected:?}");
        }
    }

    #[test]
    fn test_markdown_links() {
        let input =
            b"Check [Google](https://google.com) and [GitHub](https://github.com).\n[Another](https://example.com)";
        let parsed = parse_text(input, true).unwrap();

        let links = parsed.links.unwrap();
        assert_eq!(links.len(), 3);
        assert!(links.contains(&link("Google", "https://google.com")));
        assert!(links.contains(&link("GitHub", "https://github.com")));
        assert!(links.contains(&link("Another", "https://example.com")));
    }

    #[test]
    fn test_markdown_code_blocks() {
        let input = b"```python\ndef hello():\n print(\"Hello\")\n```\n\n```javascript\nconsole.log(\"Hi\");\n```\n\n```\nplain code\n```";
        let parsed = parse_text(input, true).unwrap();

        let code_blocks = parsed.code_blocks.unwrap();
        assert_eq!(code_blocks.len(), 3);

        // Each block is looked up by language tag; untagged fences are labelled "plain".
        for (language, snippet) in [
            ("python", "def hello()"),
            ("javascript", "console.log"),
            ("plain", "plain code"),
        ] {
            let block = code_blocks.iter().find(|(lang, _)| lang == language).unwrap();
            assert!(block.1.contains(snippet));
        }
    }

    #[test]
    fn test_markdown_code_blocks_crlf() {
        let input = b"```python\r\ndef hello():\r\n print(\"Hello\")\r\n```\r\n";
        let parsed = parse_text(input, true).unwrap();

        let code_blocks = parsed.code_blocks.unwrap();
        assert_eq!(code_blocks.len(), 1);
        assert_eq!(code_blocks[0].0, "python");
        assert!(code_blocks[0].1.contains("def hello()"));
    }

    #[test]
    fn test_markdown_complex() {
        let input = b"# Documentation\n\n## Overview\nThis is a [test](https://example.com).\n\n```python\nx = 42\n```\n\n## Another\nMore [links](https://test.com).";
        let parsed = parse_text(input, true).unwrap();

        assert!(parsed.line_count > 0);
        assert!(parsed.word_count > 0);
        assert_eq!(parsed.headers.unwrap().len(), 3);
        assert_eq!(parsed.links.unwrap().len(), 2);
        assert_eq!(parsed.code_blocks.unwrap().len(), 1);
    }

    #[test]
    fn test_unicode_content() {
        // Multi-byte CJK characters and an emoji must survive decoding unchanged.
        let input = "Hello 世界 🌍\nUnicode test".as_bytes();
        let parsed = parse_text(input, false).unwrap();

        assert!(parsed.content.contains("世界"));
        assert!(parsed.content.contains("🌍"));
        assert_eq!(parsed.line_count, 2);
    }

    #[test]
    fn test_word_count_accuracy() {
        let parsed = parse_text(b"One two three four five.\nSix seven eight.\nNine.", false).unwrap();

        assert_eq!(parsed.line_count, 3);
        assert_eq!(parsed.word_count, 9);
    }

    #[test]
    fn test_headers_not_in_code_blocks() {
        let input = b"# Real Header\n```\n# Not a header\n```\n## Another Real";
        let parsed = parse_text(input, true).unwrap();

        let headers = parsed.headers.unwrap();
        assert_eq!(headers.len(), 2);
        assert!(headers.contains(&"Real Header".to_string()));
        assert!(headers.contains(&"Another Real".to_string()));
        assert!(!headers.iter().any(|h| h.contains("Not a header")));
    }

    #[test]
    fn test_links_not_in_code_blocks() {
        let input = b"[Real Link](https://real.com)\n```\n[Not Link](https://fake.com)\n```";
        let parsed = parse_text(input, true).unwrap();

        let links = parsed.links.unwrap();
        assert_eq!(links.len(), 1);
        assert_eq!(links[0].0, "Real Link");
        assert_eq!(links[0].1, "https://real.com");
    }

    #[test]
    fn test_empty_code_block_language() {
        let parsed = parse_text(b"```\ncode without language\n```", true).unwrap();

        let code_blocks = parsed.code_blocks.unwrap();
        assert_eq!(code_blocks.len(), 1);
        assert_eq!(code_blocks[0].0, "plain");
    }

    #[test]
    fn test_large_text_streaming() {
        let large_text = "Line\n".repeat(10000);
        let parsed = parse_text(large_text.as_bytes(), false).unwrap();

        assert_eq!(parsed.line_count, 10000);
        assert_eq!(parsed.word_count, 10000);
    }
}