kreuzberg 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (265) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +534 -0
  5. data/Gemfile +9 -0
  6. data/Gemfile.lock +157 -0
  7. data/README.md +421 -0
  8. data/Rakefile +25 -0
  9. data/Steepfile +47 -0
  10. data/examples/async_patterns.rb +340 -0
  11. data/ext/kreuzberg_rb/extconf.rb +35 -0
  12. data/ext/kreuzberg_rb/native/Cargo.toml +36 -0
  13. data/ext/kreuzberg_rb/native/README.md +425 -0
  14. data/ext/kreuzberg_rb/native/build.rs +17 -0
  15. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  16. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  17. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  18. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  19. data/ext/kreuzberg_rb/native/src/lib.rs +2939 -0
  20. data/extconf.rb +28 -0
  21. data/kreuzberg.gemspec +105 -0
  22. data/lib/kreuzberg/api_proxy.rb +142 -0
  23. data/lib/kreuzberg/cache_api.rb +45 -0
  24. data/lib/kreuzberg/cli.rb +55 -0
  25. data/lib/kreuzberg/cli_proxy.rb +127 -0
  26. data/lib/kreuzberg/config.rb +684 -0
  27. data/lib/kreuzberg/errors.rb +50 -0
  28. data/lib/kreuzberg/extraction_api.rb +84 -0
  29. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  30. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  31. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  32. data/lib/kreuzberg/result.rb +216 -0
  33. data/lib/kreuzberg/setup_lib_path.rb +79 -0
  34. data/lib/kreuzberg/validator_protocol.rb +89 -0
  35. data/lib/kreuzberg/version.rb +5 -0
  36. data/lib/kreuzberg.rb +82 -0
  37. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  38. data/sig/kreuzberg/internal.rbs +184 -0
  39. data/sig/kreuzberg.rbs +468 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +87 -0
  42. data/spec/binding/cli_spec.rb +54 -0
  43. data/spec/binding/config_spec.rb +345 -0
  44. data/spec/binding/config_validation_spec.rb +283 -0
  45. data/spec/binding/error_handling_spec.rb +213 -0
  46. data/spec/binding/errors_spec.rb +66 -0
  47. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  48. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  49. data/spec/binding/plugins/validator_spec.rb +274 -0
  50. data/spec/examples.txt +104 -0
  51. data/spec/fixtures/config.toml +39 -0
  52. data/spec/fixtures/config.yaml +42 -0
  53. data/spec/fixtures/invalid_config.toml +4 -0
  54. data/spec/smoke/package_spec.rb +178 -0
  55. data/spec/spec_helper.rb +42 -0
  56. data/vendor/kreuzberg/Cargo.toml +134 -0
  57. data/vendor/kreuzberg/README.md +175 -0
  58. data/vendor/kreuzberg/build.rs +460 -0
  59. data/vendor/kreuzberg/src/api/error.rs +81 -0
  60. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  61. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  62. data/vendor/kreuzberg/src/api/server.rs +353 -0
  63. data/vendor/kreuzberg/src/api/types.rs +170 -0
  64. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  65. data/vendor/kreuzberg/src/cache/mod.rs +1143 -0
  66. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  67. data/vendor/kreuzberg/src/core/batch_mode.rs +35 -0
  68. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  69. data/vendor/kreuzberg/src/core/extractor.rs +903 -0
  70. data/vendor/kreuzberg/src/core/io.rs +327 -0
  71. data/vendor/kreuzberg/src/core/mime.rs +615 -0
  72. data/vendor/kreuzberg/src/core/mod.rs +42 -0
  73. data/vendor/kreuzberg/src/core/pipeline.rs +906 -0
  74. data/vendor/kreuzberg/src/embeddings.rs +323 -0
  75. data/vendor/kreuzberg/src/error.rs +431 -0
  76. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  77. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  78. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  79. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  80. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  81. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  82. data/vendor/kreuzberg/src/extraction/libreoffice.rs +564 -0
  83. data/vendor/kreuzberg/src/extraction/mod.rs +77 -0
  84. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  85. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +128 -0
  88. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  89. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  90. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  91. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  92. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  93. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  94. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  95. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  96. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  97. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  98. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  99. data/vendor/kreuzberg/src/extractors/archive.rs +425 -0
  100. data/vendor/kreuzberg/src/extractors/docx.rs +479 -0
  101. data/vendor/kreuzberg/src/extractors/email.rs +129 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +344 -0
  103. data/vendor/kreuzberg/src/extractors/html.rs +410 -0
  104. data/vendor/kreuzberg/src/extractors/image.rs +195 -0
  105. data/vendor/kreuzberg/src/extractors/mod.rs +268 -0
  106. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  107. data/vendor/kreuzberg/src/extractors/pdf.rs +496 -0
  108. data/vendor/kreuzberg/src/extractors/pptx.rs +234 -0
  109. data/vendor/kreuzberg/src/extractors/structured.rs +126 -0
  110. data/vendor/kreuzberg/src/extractors/text.rs +242 -0
  111. data/vendor/kreuzberg/src/extractors/xml.rs +128 -0
  112. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  113. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  114. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  115. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  116. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  117. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  118. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  119. data/vendor/kreuzberg/src/keywords/rake.rs +294 -0
  120. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  121. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  122. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  123. data/vendor/kreuzberg/src/lib.rs +102 -0
  124. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  125. data/vendor/kreuzberg/src/mcp/server.rs +1966 -0
  126. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  127. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  128. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  129. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  130. data/vendor/kreuzberg/src/ocr/processor.rs +847 -0
  131. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  132. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  133. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  134. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  135. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  136. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  137. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  138. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  139. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  140. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  141. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  142. data/vendor/kreuzberg/src/pdf/table.rs +420 -0
  143. data/vendor/kreuzberg/src/pdf/text.rs +161 -0
  144. data/vendor/kreuzberg/src/plugins/extractor.rs +1010 -0
  145. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  146. data/vendor/kreuzberg/src/plugins/ocr.rs +629 -0
  147. data/vendor/kreuzberg/src/plugins/processor.rs +641 -0
  148. data/vendor/kreuzberg/src/plugins/registry.rs +1324 -0
  149. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  150. data/vendor/kreuzberg/src/plugins/validator.rs +955 -0
  151. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  152. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  153. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  154. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  155. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  156. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  157. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  158. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  159. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  160. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  161. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  162. data/vendor/kreuzberg/src/types.rs +873 -0
  163. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  164. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  165. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  166. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  167. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  168. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  169. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  170. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  171. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  172. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  173. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  174. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  175. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  176. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  177. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  178. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  179. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  180. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  181. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  182. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  183. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  184. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  185. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  186. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  187. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  188. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  189. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  190. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  191. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  192. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  193. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  194. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  195. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  196. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  197. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  198. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  199. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  200. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  201. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  202. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  203. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  204. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  205. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  206. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  207. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  208. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  209. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  210. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  211. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  212. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  213. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  214. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  215. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  216. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  217. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  218. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  219. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  220. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  221. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  222. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  223. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  224. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  225. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  226. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  227. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  228. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  229. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  230. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  231. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  232. data/vendor/kreuzberg/tests/batch_orchestration.rs +542 -0
  233. data/vendor/kreuzberg/tests/batch_processing.rs +304 -0
  234. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  235. data/vendor/kreuzberg/tests/concurrency_stress.rs +509 -0
  236. data/vendor/kreuzberg/tests/config_features.rs +580 -0
  237. data/vendor/kreuzberg/tests/config_loading_tests.rs +439 -0
  238. data/vendor/kreuzberg/tests/core_integration.rs +493 -0
  239. data/vendor/kreuzberg/tests/csv_integration.rs +424 -0
  240. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +124 -0
  241. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  242. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  243. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  244. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  245. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  246. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  247. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  248. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  249. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  250. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  251. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  252. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  253. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  254. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  255. data/vendor/kreuzberg/tests/pipeline_integration.rs +1412 -0
  256. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  257. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +561 -0
  258. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  259. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  260. data/vendor/kreuzberg/tests/registry_integration_tests.rs +607 -0
  261. data/vendor/kreuzberg/tests/security_validation.rs +404 -0
  262. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  263. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  264. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  265. metadata +471 -0
@@ -0,0 +1,410 @@
1
+ //! HTML document extractor.
2
+
3
+ use crate::Result;
4
+ use crate::core::config::ExtractionConfig;
5
+ use crate::plugins::{DocumentExtractor, Plugin};
6
+ use crate::types::{ExtractionResult, Metadata, Table};
7
+ use async_trait::async_trait;
8
+ use scraper::{Html, Selector};
9
+ use std::path::Path;
10
+
11
/// HTML document extractor using html-to-markdown.
///
/// The extractor is a stateless unit type, so it is free to copy and can be
/// constructed in `const` contexts. `Default` is derived and yields the same
/// value as [`HtmlExtractor::new`].
#[derive(Debug, Clone, Copy, Default)]
pub struct HtmlExtractor;

impl HtmlExtractor {
    /// Create a new HTML extractor.
    pub const fn new() -> Self {
        Self
    }
}
25
+
26
+ /// Extract all tables from HTML content.
27
+ ///
28
+ /// Parses HTML to find `<table>` elements and extracts their structure
29
+ /// into `Table` objects with cells and markdown representation.
30
+ fn extract_html_tables(html: &str) -> Result<Vec<Table>> {
31
+ let document = Html::parse_document(html);
32
+ let table_selector = Selector::parse("table")
33
+ .map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to parse table selector: {}", e)))?;
34
+ let row_selector = Selector::parse("tr")
35
+ .map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to parse row selector: {}", e)))?;
36
+ let header_selector = Selector::parse("th")
37
+ .map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to parse header selector: {}", e)))?;
38
+ let cell_selector = Selector::parse("td")
39
+ .map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to parse cell selector: {}", e)))?;
40
+
41
+ let mut tables = Vec::new();
42
+
43
+ for (table_index, table_elem) in document.select(&table_selector).enumerate() {
44
+ let mut cells: Vec<Vec<String>> = Vec::new();
45
+
46
+ for row in table_elem.select(&row_selector) {
47
+ let mut row_cells = Vec::new();
48
+
49
+ // Try headers first (th elements)
50
+ let headers: Vec<_> = row.select(&header_selector).collect();
51
+ if !headers.is_empty() {
52
+ for header in headers {
53
+ let text = header
54
+ .text()
55
+ .collect::<Vec<_>>()
56
+ .join(" ")
57
+ .split_whitespace()
58
+ .collect::<Vec<_>>()
59
+ .join(" ");
60
+ row_cells.push(text);
61
+ }
62
+ } else {
63
+ // Use data cells (td elements)
64
+ for cell in row.select(&cell_selector) {
65
+ let text = cell
66
+ .text()
67
+ .collect::<Vec<_>>()
68
+ .join(" ")
69
+ .split_whitespace()
70
+ .collect::<Vec<_>>()
71
+ .join(" ");
72
+ row_cells.push(text);
73
+ }
74
+ }
75
+
76
+ if !row_cells.is_empty() {
77
+ cells.push(row_cells);
78
+ }
79
+ }
80
+
81
+ // Only create a table if it has content
82
+ if !cells.is_empty() {
83
+ let markdown = cells_to_markdown(&cells);
84
+ tables.push(Table {
85
+ cells,
86
+ markdown,
87
+ page_number: table_index + 1, // 1-indexed
88
+ });
89
+ }
90
+ }
91
+
92
+ Ok(tables)
93
+ }
94
+
95
/// Render a grid of cell strings as a markdown table.
///
/// The first row supplies the header and fixes the column count; a separator
/// row follows it, then one line per remaining row. Literal `|` characters in
/// cells are escaped as `\|`. Rows longer than the header are truncated to the
/// header width and shorter rows are padded with empty cells.
///
/// # Arguments
/// * `cells` - 2D slice of cell strings (rows × columns)
///
/// # Returns
/// * `String` - Markdown formatted table, or an empty string when there are
///   no rows or the first row has no cells
fn cells_to_markdown(cells: &[Vec<String>]) -> String {
    let (header, body) = match cells.split_first() {
        Some(parts) => parts,
        None => return String::new(),
    };

    let width = header.len();
    if width == 0 {
        return String::new();
    }

    // Header line: every header cell is emitted, each followed by " | ".
    let mut out = String::from("| ");
    for title in header {
        out.push_str(&title.replace('|', "\\|"));
        out.push_str(" | ");
    }
    out.push('\n');

    // Separator line: one dash group per header column.
    out.push('|');
    out.push_str(&"------|".repeat(width));
    out.push('\n');

    // Body lines: clamp each row to the header width, then pad what is missing.
    for row in body {
        out.push_str("| ");
        for value in row.iter().take(width) {
            out.push_str(&value.replace('|', "\\|"));
            out.push_str(" | ");
        }
        out.push_str(&" | ".repeat(width.saturating_sub(row.len())));
        out.push('\n');
    }

    out
}
158
+
159
+ impl Plugin for HtmlExtractor {
160
+ fn name(&self) -> &str {
161
+ "html-extractor"
162
+ }
163
+
164
+ fn version(&self) -> String {
165
+ env!("CARGO_PKG_VERSION").to_string()
166
+ }
167
+
168
+ fn initialize(&self) -> Result<()> {
169
+ Ok(())
170
+ }
171
+
172
+ fn shutdown(&self) -> Result<()> {
173
+ Ok(())
174
+ }
175
+ }
176
+
177
+ #[async_trait]
178
+ impl DocumentExtractor for HtmlExtractor {
179
+ async fn extract_bytes(
180
+ &self,
181
+ content: &[u8],
182
+ mime_type: &str,
183
+ config: &ExtractionConfig,
184
+ ) -> Result<ExtractionResult> {
185
+ let html = std::str::from_utf8(content)
186
+ .map(|s| s.to_string())
187
+ .unwrap_or_else(|_| String::from_utf8_lossy(content).to_string());
188
+
189
+ // Extract tables from HTML
190
+ let tables = extract_html_tables(&html)?;
191
+
192
+ let markdown = crate::extraction::html::convert_html_to_markdown(&html, config.html_options.clone())?;
193
+
194
+ let (html_metadata, content_without_frontmatter) = crate::extraction::html::parse_html_metadata(&markdown)?;
195
+
196
+ Ok(ExtractionResult {
197
+ content: content_without_frontmatter,
198
+ mime_type: mime_type.to_string(),
199
+ metadata: Metadata {
200
+ format: html_metadata.map(|m| crate::types::FormatMetadata::Html(Box::new(m))),
201
+ ..Default::default()
202
+ },
203
+ tables,
204
+ detected_languages: None,
205
+ chunks: None,
206
+ images: None,
207
+ })
208
+ }
209
+
210
+ async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
211
+ let bytes = tokio::fs::read(path).await?;
212
+ self.extract_bytes(&bytes, mime_type, config).await
213
+ }
214
+
215
+ fn supported_mime_types(&self) -> &[&str] {
216
+ &["text/html", "application/xhtml+xml"]
217
+ }
218
+
219
+ fn priority(&self) -> i32 {
220
+ 50
221
+ }
222
+ }
223
+
224
#[cfg(test)]
mod tests {
    //! Unit tests for the HTML extractor: plugin surface, table extraction
    //! and markdown rendering.

    use super::*;

    /// Build an owned row of cells from string literals.
    fn row(values: &[&str]) -> Vec<String> {
        values.iter().map(|value| value.to_string()).collect()
    }

    #[test]
    fn test_html_extractor_plugin_interface() {
        let extractor = HtmlExtractor::new();
        assert_eq!(extractor.name(), "html-extractor");
        assert!(extractor.initialize().is_ok());
        assert!(extractor.shutdown().is_ok());
    }

    #[test]
    fn test_html_extractor_supported_mime_types() {
        let extractor = HtmlExtractor::new();
        let mime_types = extractor.supported_mime_types();
        assert_eq!(mime_types.len(), 2);
        assert!(mime_types.contains(&"text/html"));
        assert!(mime_types.contains(&"application/xhtml+xml"));
    }

    #[test]
    fn test_extract_html_tables_basic() {
        let html = r#"
            <table>
            <tr><th>Header1</th><th>Header2</th></tr>
            <tr><td>Row1Col1</td><td>Row1Col2</td></tr>
            <tr><td>Row2Col1</td><td>Row2Col2</td></tr>
            </table>
        "#;

        let tables = extract_html_tables(html).unwrap();
        assert_eq!(tables.len(), 1);

        let table = &tables[0];
        assert_eq!(table.cells.len(), 3);
        assert_eq!(table.cells[0], vec!["Header1", "Header2"]);
        assert_eq!(table.cells[1], vec!["Row1Col1", "Row1Col2"]);
        assert_eq!(table.cells[2], vec!["Row2Col1", "Row2Col2"]);
        assert_eq!(table.page_number, 1);

        // Check markdown format
        assert!(table.markdown.contains("| Header1 | Header2 |"));
        assert!(table.markdown.contains("|------|------|"));
        assert!(table.markdown.contains("| Row1Col1 | Row1Col2 |"));
    }

    #[test]
    fn test_extract_html_tables_multiple() {
        let html = r#"
            <table>
            <tr><th>Table1</th></tr>
            <tr><td>Data1</td></tr>
            </table>
            <p>Some text</p>
            <table>
            <tr><th>Table2</th></tr>
            <tr><td>Data2</td></tr>
            </table>
        "#;

        let tables = extract_html_tables(html).unwrap();
        assert_eq!(tables.len(), 2);
        assert_eq!(tables[0].page_number, 1);
        assert_eq!(tables[1].page_number, 2);
    }

    #[test]
    fn test_extract_html_tables_no_thead() {
        let html = r#"
            <table>
            <tr><td>Cell1</td><td>Cell2</td></tr>
            <tr><td>Cell3</td><td>Cell4</td></tr>
            </table>
        "#;

        let tables = extract_html_tables(html).unwrap();
        assert_eq!(tables.len(), 1);

        let table = &tables[0];
        assert_eq!(table.cells.len(), 2);
        assert_eq!(table.cells[0], vec!["Cell1", "Cell2"]);
        assert_eq!(table.cells[1], vec!["Cell3", "Cell4"]);
    }

    #[test]
    fn test_extract_html_tables_empty() {
        let html = "<p>No tables here</p>";
        let tables = extract_html_tables(html).unwrap();
        assert_eq!(tables.len(), 0);
    }

    #[test]
    fn test_extract_html_tables_with_nested_elements() {
        let html = r#"
            <table>
            <tr><th>Header <strong>Bold</strong></th></tr>
            <tr><td>Data with <em>emphasis</em></td></tr>
            </table>
        "#;

        let tables = extract_html_tables(html).unwrap();
        assert_eq!(tables.len(), 1);

        let table = &tables[0];
        // Whitespace is normalized during text extraction
        assert_eq!(table.cells[0][0], "Header Bold");
        assert_eq!(table.cells[1][0], "Data with emphasis");
    }

    #[test]
    fn test_cells_to_markdown_basic() {
        let cells = vec![
            row(&["Header1", "Header2"]),
            row(&["Row1Col1", "Row1Col2"]),
            row(&["Row2Col1", "Row2Col2"]),
        ];

        let markdown = cells_to_markdown(&cells);

        assert!(markdown.contains("| Header1 | Header2 |"));
        assert!(markdown.contains("|------|------|"));
        assert!(markdown.contains("| Row1Col1 | Row1Col2 |"));
        assert!(markdown.contains("| Row2Col1 | Row2Col2 |"));
    }

    #[test]
    fn test_cells_to_markdown_empty() {
        let cells: Vec<Vec<String>> = vec![];
        let markdown = cells_to_markdown(&cells);
        assert_eq!(markdown, "");
    }

    #[test]
    fn test_cells_to_markdown_escape_pipes() {
        let cells = vec![row(&["Header"]), row(&["Cell with | pipe"])];

        let markdown = cells_to_markdown(&cells);
        assert!(markdown.contains("Cell with \\| pipe"));
    }

    #[test]
    fn test_cells_to_markdown_irregular_rows() {
        let cells = vec![
            row(&["H1", "H2", "H3"]),
            row(&["R1C1", "R1C2"]), // Missing third column
            row(&["R2C1", "R2C2", "R2C3"]),
        ];

        let markdown = cells_to_markdown(&cells);

        // Should have 3 columns in header
        assert!(markdown.contains("| H1 | H2 | H3 |"));
        // Should pad short rows. Every real cell is followed by " | " and the
        // padding appends another " | ", so the empty cell shows up as two
        // spaces between the pipes.
        assert!(markdown.contains("| R1C1 | R1C2 |  |"));
    }

    #[tokio::test]
    async fn test_html_extractor_with_table() {
        let html = r#"
            <html>
            <body>
            <h1>Test Page</h1>
            <table>
            <tr><th>Name</th><th>Age</th></tr>
            <tr><td>Alice</td><td>30</td></tr>
            <tr><td>Bob</td><td>25</td></tr>
            </table>
            </body>
            </html>
        "#;

        let extractor = HtmlExtractor::new();
        let config = ExtractionConfig::default();
        let result = extractor
            .extract_bytes(html.as_bytes(), "text/html", &config)
            .await
            .unwrap();

        assert_eq!(result.tables.len(), 1);
        let table = &result.tables[0];
        assert_eq!(table.cells.len(), 3);
        assert_eq!(table.cells[0], vec!["Name", "Age"]);
        assert_eq!(table.cells[1], vec!["Alice", "30"]);
        assert_eq!(table.cells[2], vec!["Bob", "25"]);
    }
}
@@ -0,0 +1,195 @@
1
+ //! Image extractors for various image formats.
2
+
3
+ use crate::Result;
4
+ use crate::core::config::ExtractionConfig;
5
+ use crate::extraction::image::extract_image_metadata;
6
+ use crate::plugins::{DocumentExtractor, Plugin};
7
+ use crate::types::{ExtractionResult, Metadata};
8
+ use async_trait::async_trait;
9
+
10
/// Image extractor for various image formats.
///
/// Supports: PNG, JPEG, WebP, BMP, TIFF, GIF.
/// Extracts dimensions, format, and EXIF metadata.
/// Optionally runs OCR when configured.
///
/// The extractor is a stateless unit type, so it is `Copy` and implements
/// `Debug` for diagnostics.
#[derive(Debug, Clone, Copy)]
pub struct ImageExtractor;
16
+
17
+ impl ImageExtractor {
18
+ /// Create a new image extractor.
19
+ pub fn new() -> Self {
20
+ Self
21
+ }
22
+
23
+ /// Extract text from image using OCR.
24
+ #[cfg(feature = "ocr")]
25
+ async fn extract_with_ocr(&self, content: &[u8], config: &ExtractionConfig) -> Result<ExtractionResult> {
26
+ use crate::plugins::registry::get_ocr_backend_registry;
27
+
28
+ let ocr_config = config.ocr.as_ref().ok_or_else(|| crate::KreuzbergError::Parsing {
29
+ message: "OCR config required for image OCR".to_string(),
30
+ source: None,
31
+ })?;
32
+
33
+ let backend = {
34
+ let registry = get_ocr_backend_registry();
35
+ let registry = registry.read().map_err(|e| crate::KreuzbergError::Plugin {
36
+ message: format!("Failed to acquire read lock on OCR backend registry: {}", e),
37
+ plugin_name: "ocr-registry".to_string(),
38
+ })?;
39
+ registry.get(&ocr_config.backend)?
40
+ };
41
+
42
+ // Process image using the backend - returns full ExtractionResult with tables/metadata
43
+ backend.process_image(content, ocr_config).await
44
+ }
45
+ }
46
+
47
+ impl Default for ImageExtractor {
48
+ fn default() -> Self {
49
+ Self::new()
50
+ }
51
+ }
52
+
53
+ impl Plugin for ImageExtractor {
54
+ fn name(&self) -> &str {
55
+ "image-extractor"
56
+ }
57
+
58
+ fn version(&self) -> String {
59
+ env!("CARGO_PKG_VERSION").to_string()
60
+ }
61
+
62
+ fn initialize(&self) -> Result<()> {
63
+ Ok(())
64
+ }
65
+
66
+ fn shutdown(&self) -> Result<()> {
67
+ Ok(())
68
+ }
69
+
70
+ fn description(&self) -> &str {
71
+ "Extracts dimensions, format, and EXIF data from images (PNG, JPEG, WebP, BMP, TIFF, GIF)"
72
+ }
73
+
74
+ fn author(&self) -> &str {
75
+ "Kreuzberg Team"
76
+ }
77
+ }
78
+
79
+ #[async_trait]
80
+ impl DocumentExtractor for ImageExtractor {
81
+ async fn extract_bytes(
82
+ &self,
83
+ content: &[u8],
84
+ mime_type: &str,
85
+ config: &ExtractionConfig,
86
+ ) -> Result<ExtractionResult> {
87
+ let extraction_metadata = extract_image_metadata(content)?;
88
+
89
+ let image_metadata = crate::types::ImageMetadata {
90
+ width: extraction_metadata.width,
91
+ height: extraction_metadata.height,
92
+ format: extraction_metadata.format.clone(),
93
+ exif: extraction_metadata.exif_data,
94
+ };
95
+
96
+ // If OCR is enabled, use OCR result (which includes tables and OCR-specific metadata)
97
+ if config.ocr.is_some() {
98
+ #[cfg(feature = "ocr")]
99
+ {
100
+ let mut ocr_result = self.extract_with_ocr(content, config).await?;
101
+
102
+ // Add image metadata to the OCR result
103
+ ocr_result.metadata.format = Some(crate::types::FormatMetadata::Image(image_metadata));
104
+ ocr_result.mime_type = mime_type.to_string();
105
+
106
+ return Ok(ocr_result);
107
+ }
108
+ #[cfg(not(feature = "ocr"))]
109
+ {
110
+ let content_text = format!(
111
+ "Image: {} {}x{}",
112
+ extraction_metadata.format, extraction_metadata.width, extraction_metadata.height
113
+ );
114
+
115
+ return Ok(ExtractionResult {
116
+ content: content_text,
117
+ mime_type: mime_type.to_string(),
118
+ metadata: Metadata {
119
+ format: Some(crate::types::FormatMetadata::Image(image_metadata)),
120
+ ..Default::default()
121
+ },
122
+ tables: vec![],
123
+ detected_languages: None,
124
+ chunks: None,
125
+ images: None,
126
+ });
127
+ }
128
+ }
129
+
130
+ // No OCR - just return image dimensions
131
+ Ok(ExtractionResult {
132
+ content: format!(
133
+ "Image: {} {}x{}",
134
+ extraction_metadata.format, extraction_metadata.width, extraction_metadata.height
135
+ ),
136
+ mime_type: mime_type.to_string(),
137
+ metadata: Metadata {
138
+ format: Some(crate::types::FormatMetadata::Image(image_metadata)),
139
+ ..Default::default()
140
+ },
141
+ tables: vec![],
142
+ detected_languages: None,
143
+ chunks: None,
144
+ images: None,
145
+ })
146
+ }
147
+
148
+ fn supported_mime_types(&self) -> &[&str] {
149
+ &[
150
+ "image/png",
151
+ "image/jpeg",
152
+ "image/jpg",
153
+ "image/webp",
154
+ "image/bmp",
155
+ "image/tiff",
156
+ "image/gif",
157
+ ]
158
+ }
159
+
160
+ fn priority(&self) -> i32 {
161
+ 50
162
+ }
163
+ }
164
+
165
#[cfg(test)]
mod tests {
    //! Unit tests for the image extractor's plugin surface and error path.

    use super::*;

    #[tokio::test]
    async fn test_image_extractor_invalid_image() {
        let extractor = ImageExtractor::new();
        // Six arbitrary bytes are not a decodable image in any supported format.
        let invalid_bytes = vec![0, 1, 2, 3, 4, 5];
        let config = ExtractionConfig::default();

        let result = extractor.extract_bytes(&invalid_bytes, "image/png", &config).await;
        assert!(result.is_err());
    }

    #[test]
    fn test_image_plugin_interface() {
        let extractor = ImageExtractor::new();
        assert_eq!(extractor.name(), "image-extractor");
        assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));

        let mime_types = extractor.supported_mime_types();
        assert!(mime_types.contains(&"image/png"));
        assert!(mime_types.contains(&"image/jpeg"));
        assert!(mime_types.contains(&"image/webp"));
        assert_eq!(extractor.priority(), 50);
    }

    #[test]
    fn test_image_extractor_default() {
        // Go through `Default` so the test exercises the impl its name refers to.
        let extractor = ImageExtractor::default();
        assert_eq!(extractor.name(), "image-extractor");
    }
}
+ }