kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321)
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -34,7 +34,6 @@ use std::io::Cursor;
34
34
  use std::path::Path;
35
35
 
36
36
  use crate::error::{KreuzbergError, Result};
37
- use crate::extraction::capacity;
38
37
  use crate::types::{ExcelSheet, ExcelWorkbook};
39
38
 
40
39
  #[cfg(feature = "office")]
@@ -145,153 +144,110 @@ fn process_sheet(name: &str, range: &Range<Data>) -> ExcelSheet {
145
144
 
146
145
  let estimated_capacity = 50 + (cols * 20) + (rows * cols * 12);
147
146
 
148
- if rows == 0 || cols == 0 {
149
- let markdown = format!("## {}\n\n*Empty sheet*", name);
150
- ExcelSheet {
151
- name: name.to_owned(),
152
- markdown,
153
- row_count: rows,
154
- col_count: cols,
155
- cell_count,
156
- table_cells: None,
157
- }
147
+ let markdown = if rows == 0 || cols == 0 {
148
+ format!("## {}\n\n*Empty sheet*", name)
158
149
  } else {
159
- let (markdown, table_cells) = generate_markdown_and_cells(name, range, estimated_capacity);
160
- ExcelSheet {
161
- name: name.to_owned(),
162
- markdown,
163
- row_count: rows,
164
- col_count: cols,
165
- cell_count,
166
- table_cells: Some(table_cells),
167
- }
150
+ generate_markdown_from_range_optimized(name, range, estimated_capacity)
151
+ };
152
+
153
+ ExcelSheet {
154
+ name: name.to_owned(),
155
+ markdown,
156
+ row_count: rows,
157
+ col_count: cols,
158
+ cell_count,
168
159
  }
169
160
  }
170
161
 
171
- /// Generate both markdown and extracted cells in a single pass.
172
- ///
173
- /// This function produces both the markdown representation and the structured
174
- /// cell data simultaneously, avoiding the expensive markdown re-parsing that
175
- /// was previously done in `sheets_to_tables()`.
176
- ///
177
- /// Returns (markdown, table_cells) where table_cells is a 2D vector of strings.
178
- fn generate_markdown_and_cells(sheet_name: &str, range: &Range<Data>, capacity: usize) -> (String, Vec<Vec<String>>) {
162
+ fn generate_markdown_from_range_optimized(sheet_name: &str, range: &Range<Data>, capacity: usize) -> String {
163
+ let mut result = String::with_capacity(capacity);
164
+
165
+ write!(result, "## {}\n\n", sheet_name).unwrap();
166
+
179
167
  let rows: Vec<_> = range.rows().collect();
180
168
  if rows.is_empty() {
181
- let result_capacity = 50 + sheet_name.len();
182
- let mut result = String::with_capacity(result_capacity);
183
- write!(result, "## {}\n\n*No data*", sheet_name).unwrap();
184
- return (result, Vec::new());
169
+ result.push_str("*No data*");
170
+ return result;
185
171
  }
186
172
 
187
173
  let header = &rows[0];
188
174
  let header_len = header.len();
189
- let row_count = rows.len();
190
-
191
- let table_capacity = capacity::estimate_table_markdown_capacity(row_count, header_len);
192
-
193
- let mut exact_size = 16 + sheet_name.len();
194
-
195
- exact_size += 2 + (header_len * 2);
196
- exact_size += header_len * 10;
197
-
198
- exact_size += 5 + (header_len * 5);
199
-
200
- exact_size += (row_count - 1) * (5 + header_len * 15);
201
-
202
- let mut markdown = String::with_capacity(exact_size.max(table_capacity).max(capacity));
203
- let mut cells: Vec<Vec<String>> = Vec::with_capacity(row_count);
204
-
205
- write!(markdown, "## {}\n\n", sheet_name).unwrap();
206
175
 
207
- let mut header_cells = Vec::with_capacity(header_len);
208
- markdown.push_str("| ");
176
+ result.push_str("| ");
209
177
  for (i, cell) in header.iter().enumerate() {
210
178
  if i > 0 {
211
- markdown.push_str(" | ");
212
- }
213
- let cell_str = format_cell_to_string(cell);
214
- header_cells.push(cell_str.clone());
215
-
216
- if cell_str.contains('|') || cell_str.contains('\\') {
217
- escape_markdown_into(&mut markdown, &cell_str);
218
- } else {
219
- markdown.push_str(&cell_str);
179
+ result.push_str(" | ");
220
180
  }
181
+ format_cell_value_into(&mut result, cell);
221
182
  }
222
- markdown.push_str(" |\n");
223
- cells.push(header_cells);
183
+ result.push_str(" |\n");
224
184
 
225
- markdown.push_str("| ");
185
+ result.push_str("| ");
226
186
  for i in 0..header_len {
227
187
  if i > 0 {
228
- markdown.push_str(" | ");
188
+ result.push_str(" | ");
229
189
  }
230
- markdown.push_str("---");
190
+ result.push_str("---");
231
191
  }
232
- markdown.push_str(" |\n");
192
+ result.push_str(" |\n");
233
193
 
234
194
  for row in rows.iter().skip(1) {
235
- let mut row_cells = Vec::with_capacity(header_len);
236
- markdown.push_str("| ");
195
+ result.push_str("| ");
237
196
  for i in 0..header_len {
238
197
  if i > 0 {
239
- markdown.push_str(" | ");
198
+ result.push_str(" | ");
240
199
  }
241
200
  if let Some(cell) = row.get(i) {
242
- let cell_str = format_cell_to_string(cell);
243
- row_cells.push(cell_str.clone());
244
-
245
- if cell_str.contains('|') || cell_str.contains('\\') {
246
- escape_markdown_into(&mut markdown, &cell_str);
247
- } else {
248
- markdown.push_str(&cell_str);
249
- }
250
- } else {
251
- row_cells.push(String::new());
201
+ format_cell_value_into(&mut result, cell);
252
202
  }
253
203
  }
254
- markdown.push_str(" |\n");
255
- cells.push(row_cells);
204
+ result.push_str(" |\n");
256
205
  }
257
206
 
258
- (markdown, cells)
207
+ result
259
208
  }
260
209
 
261
- /// Convert a Data cell to its string representation.
262
- ///
263
- /// This helper function is shared between markdown generation and cell extraction
264
- /// to ensure byte-identical output.
265
210
  #[inline]
266
- fn format_cell_to_string(data: &Data) -> String {
211
+ fn format_cell_value_into(buffer: &mut String, data: &Data) {
267
212
  match data {
268
- Data::Empty => String::new(),
269
- Data::String(s) => s.clone(),
213
+ Data::Empty => {}
214
+ Data::String(s) => {
215
+ if s.contains('|') || s.contains('\\') {
216
+ escape_markdown_into(buffer, s);
217
+ } else {
218
+ buffer.push_str(s);
219
+ }
220
+ }
270
221
  Data::Float(f) => {
271
222
  if f.fract() == 0.0 {
272
- format!("{:.1}", f)
223
+ write!(buffer, "{:.1}", f).unwrap();
273
224
  } else {
274
- format!("{}", f)
225
+ write!(buffer, "{}", f).unwrap();
275
226
  }
276
227
  }
277
- Data::Int(i) => format!("{}", i),
228
+ Data::Int(i) => {
229
+ write!(buffer, "{}", i).unwrap();
230
+ }
278
231
  Data::Bool(b) => {
279
- if *b {
280
- "true".to_string()
281
- } else {
282
- "false".to_string()
283
- }
232
+ buffer.push_str(if *b { "true" } else { "false" });
284
233
  }
285
234
  Data::DateTime(dt) => {
286
235
  if let Some(datetime) = dt.as_datetime() {
287
- format!("{}", datetime.format("%Y-%m-%d %H:%M:%S"))
236
+ write!(buffer, "{}", datetime.format("%Y-%m-%d %H:%M:%S")).unwrap();
288
237
  } else {
289
- format!("{:?}", dt)
238
+ write!(buffer, "{:?}", dt).unwrap();
290
239
  }
291
240
  }
292
- Data::Error(e) => format!("#ERR: {:?}", e),
293
- Data::DateTimeIso(s) => s.clone(),
294
- Data::DurationIso(s) => format!("DURATION: {}", s),
241
+ Data::Error(e) => {
242
+ write!(buffer, "#ERR: {:?}", e).unwrap();
243
+ }
244
+ Data::DateTimeIso(s) => {
245
+ buffer.push_str(s);
246
+ }
247
+ Data::DurationIso(s) => {
248
+ buffer.push_str("DURATION: ");
249
+ buffer.push_str(s);
250
+ }
295
251
  }
296
252
  }
297
253
 
@@ -469,12 +425,31 @@ mod tests {
469
425
  use super::*;
470
426
 
471
427
  #[test]
472
- fn test_format_cell_to_string_basic() {
473
- assert_eq!(format_cell_to_string(&Data::Empty), "");
474
- assert_eq!(format_cell_to_string(&Data::String("test".to_owned())), "test");
475
- assert_eq!(format_cell_to_string(&Data::Float(42.0)), "42.0");
476
- assert_eq!(format_cell_to_string(&Data::Int(100)), "100");
477
- assert_eq!(format_cell_to_string(&Data::Bool(true)), "true");
428
+ fn test_format_cell_value_into() {
429
+ let mut buffer = String::with_capacity(100);
430
+
431
+ format_cell_value_into(&mut buffer, &Data::Empty);
432
+ assert_eq!(buffer, "");
433
+
434
+ buffer.clear();
435
+ format_cell_value_into(&mut buffer, &Data::String("test".to_owned()));
436
+ assert_eq!(buffer, "test");
437
+
438
+ buffer.clear();
439
+ format_cell_value_into(&mut buffer, &Data::Float(42.0));
440
+ assert_eq!(buffer, "42.0");
441
+
442
+ buffer.clear();
443
+ format_cell_value_into(&mut buffer, &Data::Float(std::f64::consts::PI));
444
+ assert_eq!(buffer, "3.141592653589793");
445
+
446
+ buffer.clear();
447
+ format_cell_value_into(&mut buffer, &Data::Int(100));
448
+ assert_eq!(buffer, "100");
449
+
450
+ buffer.clear();
451
+ format_cell_value_into(&mut buffer, &Data::Bool(true));
452
+ assert_eq!(buffer, "true");
478
453
  }
479
454
 
480
455
  #[test]
@@ -495,35 +470,43 @@ mod tests {
495
470
 
496
471
  #[test]
497
472
  fn test_capacity_optimization() {
498
- let buffer = String::with_capacity(100);
473
+ let mut buffer = String::with_capacity(100);
474
+ format_cell_value_into(&mut buffer, &Data::String("test".to_owned()));
475
+
499
476
  assert!(buffer.capacity() >= 100);
500
477
  }
501
478
 
502
479
  #[test]
503
480
  fn test_format_cell_value_datetime() {
504
481
  use calamine::{ExcelDateTime, ExcelDateTimeType};
482
+ let mut buffer = String::new();
483
+
505
484
  let dt = Data::DateTime(ExcelDateTime::new(49353.5, ExcelDateTimeType::DateTime, false));
506
- let result = format_cell_to_string(&dt);
507
- assert!(!result.is_empty());
485
+ format_cell_value_into(&mut buffer, &dt);
486
+ assert!(!buffer.is_empty());
508
487
  }
509
488
 
510
489
  #[test]
511
490
  fn test_format_cell_value_error() {
512
491
  use calamine::CellErrorType;
513
- let result = format_cell_to_string(&Data::Error(CellErrorType::Div0));
514
- assert!(result.contains("#ERR"));
492
+ let mut buffer = String::new();
493
+
494
+ format_cell_value_into(&mut buffer, &Data::Error(CellErrorType::Div0));
495
+ assert!(buffer.contains("#ERR"));
515
496
  }
516
497
 
517
498
  #[test]
518
499
  fn test_format_cell_value_datetime_iso() {
519
- let result = format_cell_to_string(&Data::DateTimeIso("2024-01-01T10:30:00".to_owned()));
520
- assert_eq!(result, "2024-01-01T10:30:00");
500
+ let mut buffer = String::new();
501
+ format_cell_value_into(&mut buffer, &Data::DateTimeIso("2024-01-01T10:30:00".to_owned()));
502
+ assert_eq!(buffer, "2024-01-01T10:30:00");
521
503
  }
522
504
 
523
505
  #[test]
524
506
  fn test_format_cell_value_duration_iso() {
525
- let result = format_cell_to_string(&Data::DurationIso("PT1H30M".to_owned()));
526
- assert_eq!(result, "DURATION: PT1H30M");
507
+ let mut buffer = String::new();
508
+ format_cell_value_into(&mut buffer, &Data::DurationIso("PT1H30M".to_owned()));
509
+ assert_eq!(buffer, "DURATION: PT1H30M");
527
510
  }
528
511
 
529
512
  #[test]
@@ -588,16 +571,16 @@ mod tests {
588
571
  }
589
572
 
590
573
  #[test]
591
- fn test_generate_markdown_and_cells_empty() {
592
- let range: Range<Data> = Range::empty();
593
- let (markdown, cells) = generate_markdown_and_cells("Test", &range, 100);
574
+ fn test_generate_markdown_empty_range() {
575
+ let range: Range<Data> = Range::new((0, 0), (0, 0));
576
+ let markdown = generate_markdown_from_range_optimized("Test", &range, 100);
594
577
 
595
578
  assert!(markdown.contains("## Test"));
596
- assert!(cells.is_empty());
579
+ assert!(markdown.contains("|"));
597
580
  }
598
581
 
599
582
  #[test]
600
- fn test_generate_markdown_and_cells_with_data() {
583
+ fn test_generate_markdown_with_headers() {
601
584
  let mut range: Range<Data> = Range::new((0, 0), (1, 2));
602
585
  range.set_value((0, 0), Data::String("Col1".to_owned()));
603
586
  range.set_value((0, 1), Data::String("Col2".to_owned()));
@@ -606,16 +589,20 @@ mod tests {
606
589
  range.set_value((1, 1), Data::String("B".to_owned()));
607
590
  range.set_value((1, 2), Data::String("C".to_owned()));
608
591
 
609
- let (markdown, cells) = generate_markdown_and_cells("Sheet1", &range, 200);
592
+ let markdown = generate_markdown_from_range_optimized("Sheet1", &range, 200);
610
593
 
611
594
  assert!(markdown.contains("## Sheet1"));
612
595
  assert!(markdown.contains("Col1"));
596
+ assert!(markdown.contains("Col2"));
597
+ assert!(markdown.contains("Col3"));
613
598
  assert!(markdown.contains("---"));
614
- assert_eq!(cells.len(), 2);
599
+ assert!(markdown.contains("A"));
600
+ assert!(markdown.contains("B"));
601
+ assert!(markdown.contains("C"));
615
602
  }
616
603
 
617
604
  #[test]
618
- fn test_generate_markdown_and_cells_sparse() {
605
+ fn test_generate_markdown_sparse_data() {
619
606
  let mut range: Range<Data> = Range::new((0, 0), (2, 2));
620
607
  range.set_value((0, 0), Data::String("A".to_owned()));
621
608
  range.set_value((0, 1), Data::String("B".to_owned()));
@@ -623,42 +610,46 @@ mod tests {
623
610
  range.set_value((1, 0), Data::String("X".to_owned()));
624
611
  range.set_value((1, 2), Data::String("Z".to_owned()));
625
612
 
626
- let (markdown, cells) = generate_markdown_and_cells("Sparse", &range, 200);
613
+ let markdown = generate_markdown_from_range_optimized("Sparse", &range, 200);
627
614
 
628
615
  assert!(markdown.contains("X"));
629
616
  assert!(markdown.contains("Z"));
630
- assert_eq!(cells.len(), 3);
617
+ let lines: Vec<&str> = markdown.lines().collect();
618
+ assert!(lines.iter().any(|line| line.contains("| |") || line.contains("| |")));
631
619
  }
632
620
 
633
621
  #[test]
634
622
  fn test_format_cell_value_float_integer() {
635
- let result = format_cell_to_string(&Data::Float(100.0));
636
- assert_eq!(result, "100.0");
623
+ let mut buffer = String::new();
624
+ format_cell_value_into(&mut buffer, &Data::Float(100.0));
625
+ assert_eq!(buffer, "100.0");
637
626
  }
638
627
 
639
628
  #[test]
640
629
  fn test_format_cell_value_float_decimal() {
641
- let result = format_cell_to_string(&Data::Float(12.3456));
642
- assert_eq!(result, "12.3456");
630
+ let mut buffer = String::new();
631
+ format_cell_value_into(&mut buffer, &Data::Float(12.3456));
632
+ assert_eq!(buffer, "12.3456");
643
633
  }
644
634
 
645
635
  #[test]
646
636
  fn test_format_cell_value_bool_false() {
647
- let result = format_cell_to_string(&Data::Bool(false));
648
- assert_eq!(result, "false");
637
+ let mut buffer = String::new();
638
+ format_cell_value_into(&mut buffer, &Data::Bool(false));
639
+ assert_eq!(buffer, "false");
649
640
  }
650
641
 
651
642
  #[test]
652
- fn test_format_cell_escape_pipe() {
643
+ fn test_format_cell_value_string_with_pipe() {
653
644
  let mut buffer = String::new();
654
- escape_markdown_into(&mut buffer, "value|with|pipes");
645
+ format_cell_value_into(&mut buffer, &Data::String("value|with|pipes".to_owned()));
655
646
  assert_eq!(buffer, "value\\|with\\|pipes");
656
647
  }
657
648
 
658
649
  #[test]
659
- fn test_format_cell_escape_backslash() {
650
+ fn test_format_cell_value_string_with_backslash() {
660
651
  let mut buffer = String::new();
661
- escape_markdown_into(&mut buffer, "path\\to\\file");
652
+ format_cell_value_into(&mut buffer, &Data::String("path\\to\\file".to_owned()));
662
653
  assert_eq!(buffer, "path\\\\to\\\\file");
663
654
  }
664
655
 
@@ -670,7 +661,7 @@ mod tests {
670
661
  range.set_value((1, 0), Data::String("A".to_owned()));
671
662
  range.set_value((1, 1), Data::String("B".to_owned()));
672
663
 
673
- let (markdown, _cells) = generate_markdown_and_cells("Test", &range, 100);
664
+ let markdown = generate_markdown_from_range_optimized("Test", &range, 100);
674
665
 
675
666
  let lines: Vec<&str> = markdown.lines().collect();
676
667
  assert!(lines[0].contains("## Test"));