kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (321)
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -6,7 +6,6 @@ use crate::text::token_reduction::{
6
6
  semantic::SemanticAnalyzer,
7
7
  simd_text::{SimdTextProcessor, chunk_text_for_parallel},
8
8
  };
9
- use ahash::AHashMap;
10
9
  use once_cell::sync::Lazy;
11
10
  use rayon::prelude::*;
12
11
  use regex::Regex;
@@ -99,12 +98,10 @@ impl TokenReducer {
99
98
  return text.to_string();
100
99
  }
101
100
 
102
- let nfc_string;
103
101
  let working_text = if text.is_ascii() {
104
102
  text
105
103
  } else {
106
- nfc_string = text.nfc().collect::<String>();
107
- &nfc_string
104
+ &text.nfc().collect::<String>()
108
105
  };
109
106
 
110
107
  match self.config.level {
@@ -185,21 +182,13 @@ impl TokenReducer {
185
182
  }
186
183
 
187
184
  fn clean_punctuation_optimized(&self, text: &str) -> String {
188
- use std::borrow::Cow;
185
+ let mut result = text.to_string();
189
186
 
190
- let mut result = Cow::Borrowed(text);
187
+ result = REPEATED_EXCLAMATION.replace_all(&result, "!").to_string();
188
+ result = REPEATED_QUESTION.replace_all(&result, "?").to_string();
189
+ result = REPEATED_COMMA.replace_all(&result, ",").to_string();
191
190
 
192
- if REPEATED_EXCLAMATION.is_match(&result) {
193
- result = Cow::Owned(REPEATED_EXCLAMATION.replace_all(&result, "!").into_owned());
194
- }
195
- if REPEATED_QUESTION.is_match(&result) {
196
- result = Cow::Owned(REPEATED_QUESTION.replace_all(&result, "?").into_owned());
197
- }
198
- if REPEATED_COMMA.is_match(&result) {
199
- result = Cow::Owned(REPEATED_COMMA.replace_all(&result, ",").into_owned());
200
- }
201
-
202
- result.into_owned()
191
+ result
203
192
  }
204
193
 
205
194
  fn remove_additional_common_words(&self, text: &str) -> String {
@@ -209,10 +198,8 @@ impl TokenReducer {
209
198
  return text.to_string();
210
199
  }
211
200
 
212
- let estimated_unique = (words.len() as f32 * 0.7).ceil() as usize;
213
- let mut word_freq = AHashMap::with_capacity(estimated_unique);
214
-
215
- let mut word_lengths = Vec::with_capacity(words.len());
201
+ let mut word_freq = std::collections::HashMap::new();
202
+ let mut word_lengths = Vec::new();
216
203
 
217
204
  for word in &words {
218
205
  let clean_word = if word.chars().all(|c| c.is_alphabetic()) {
@@ -237,34 +224,34 @@ impl TokenReducer {
237
224
  };
238
225
 
239
226
  let original_count = words.len();
240
- let has_cjk_content = text.chars().any(|c| c as u32 >= 0x4E00 && (c as u32) <= 0x9FFF);
241
227
 
242
- let mut filtered_words = Vec::with_capacity(words.len());
243
- for word in &words {
244
- let clean_word = if word.chars().all(|c| c.is_alphabetic()) {
245
- word.to_lowercase()
246
- } else {
247
- word.chars()
248
- .filter(|c| c.is_alphabetic())
249
- .collect::<String>()
250
- .to_lowercase()
251
- };
228
+ let filtered_words: Vec<String> = words
229
+ .iter()
230
+ .filter(|word| {
231
+ let clean_word = if word.chars().all(|c| c.is_alphabetic()) {
232
+ word.to_lowercase()
233
+ } else {
234
+ word.chars()
235
+ .filter(|c| c.is_alphabetic())
236
+ .collect::<String>()
237
+ .to_lowercase()
238
+ };
239
+
240
+ if clean_word.is_empty() {
241
+ return true;
242
+ }
252
243
 
253
- if clean_word.is_empty() {
254
- filtered_words.push(word.clone());
255
- } else {
256
244
  let freq = word_freq.get(&clean_word).unwrap_or(&0);
257
245
  let word_len = clean_word.chars().count() as f32;
258
246
 
259
- if self.has_important_characteristics(word)
247
+ self.has_important_characteristics(word)
260
248
  || (*freq <= 2 && word_len >= avg_length * 0.8)
261
249
  || (word_len >= avg_length * 1.5)
262
- {
263
- filtered_words.push(word.clone());
264
- }
265
- }
266
- }
250
+ })
251
+ .cloned()
252
+ .collect();
267
253
 
254
+ let has_cjk_content = text.chars().any(|c| c as u32 >= 0x4E00 && (c as u32) <= 0x9FFF);
268
255
  let fallback_threshold = if has_cjk_content {
269
256
  original_count / 5
270
257
  } else {
@@ -272,19 +259,19 @@ impl TokenReducer {
272
259
  };
273
260
 
274
261
  if filtered_words.len() < fallback_threshold {
275
- let mut fallback_words = Vec::with_capacity(words.len());
276
- for word in &words {
277
- let clean_word = if word.chars().all(|c| c.is_alphabetic()) {
278
- word.to_lowercase()
279
- } else {
280
- word.chars().filter(|c| c.is_alphabetic()).collect::<String>()
281
- };
282
-
283
- if clean_word.is_empty() || clean_word.chars().count() >= 3 || self.has_important_characteristics(word)
284
- {
285
- fallback_words.push(word.clone());
286
- }
287
- }
262
+ let fallback_words: Vec<String> = words
263
+ .iter()
264
+ .filter(|word| {
265
+ let clean_word = if word.chars().all(|c| c.is_alphabetic()) {
266
+ (*word).clone()
267
+ } else {
268
+ word.chars().filter(|c| c.is_alphabetic()).collect::<String>()
269
+ };
270
+
271
+ clean_word.is_empty() || clean_word.chars().count() >= 3 || self.has_important_characteristics(word)
272
+ })
273
+ .cloned()
274
+ .collect();
288
275
  self.smart_join(&fallback_words, has_cjk_content)
289
276
  } else {
290
277
  self.smart_join(&filtered_words, has_cjk_content)
@@ -445,37 +432,16 @@ impl TokenReducer {
445
432
  score += (long_word_count as f32 / words.len() as f32) * LONG_WORD_WEIGHT;
446
433
  score += (punct_density as f32 / sentence.len() as f32) * PUNCTUATION_DENSITY_WEIGHT;
447
434
 
448
- let estimated_unique = (words.len() as f32 * 0.6).ceil() as usize;
449
- let mut unique_words: ahash::AHashSet<String> = ahash::AHashSet::with_capacity(estimated_unique.max(10));
450
-
451
- for w in &words {
452
- let clean = w
453
- .chars()
454
- .filter(|c| c.is_alphabetic())
455
- .collect::<String>()
456
- .to_lowercase();
457
- unique_words.insert(clean);
458
-
459
- if unique_words.len() >= estimated_unique {
460
- break;
461
- }
462
- }
463
-
464
- let final_unique_count = if unique_words.len() >= estimated_unique {
465
- unique_words.len()
466
- } else {
467
- for w in &words {
468
- let clean = w
469
- .chars()
435
+ let unique_words: std::collections::HashSet<_> = words
436
+ .iter()
437
+ .map(|w| {
438
+ w.chars()
470
439
  .filter(|c| c.is_alphabetic())
471
440
  .collect::<String>()
472
- .to_lowercase();
473
- unique_words.insert(clean);
474
- }
475
- unique_words.len()
476
- };
477
-
478
- let diversity_ratio = final_unique_count as f32 / words.len() as f32;
441
+ .to_lowercase()
442
+ })
443
+ .collect();
444
+ let diversity_ratio = unique_words.len() as f32 / words.len() as f32;
479
445
  score += diversity_ratio * DIVERSITY_RATIO_WEIGHT;
480
446
 
481
447
  let char_entropy = self.calculate_char_entropy(sentence);
@@ -494,9 +460,7 @@ impl TokenReducer {
494
460
  return 0.0;
495
461
  }
496
462
 
497
- let estimated_unique = (chars.len() as f32 * 0.1).ceil() as usize;
498
- let mut char_freq = AHashMap::with_capacity(estimated_unique.max(26));
499
-
463
+ let mut char_freq = std::collections::HashMap::new();
500
464
  for &ch in &chars {
501
465
  let lowercase_ch = ch
502
466
  .to_lowercase()
@@ -1,7 +1,6 @@
1
1
  use crate::error::{KreuzbergError, Result};
2
2
  use crate::stopwords::STOPWORDS;
3
3
  use crate::text::token_reduction::config::TokenReductionConfig;
4
- use crate::text::utf8_validation;
5
4
  use ahash::{AHashMap, AHashSet};
6
5
  use once_cell::sync::Lazy;
7
6
  use regex::Regex;
@@ -64,48 +63,34 @@ impl FilterPipeline {
64
63
  }
65
64
 
66
65
  pub fn apply_light_filters(&self, text: &str) -> String {
67
- use std::borrow::Cow;
68
-
69
- let mut result = Cow::Borrowed(text);
66
+ let mut result = text.to_string();
70
67
 
71
- let mut preserved_blocks: Option<AHashMap<String, String>> = None;
68
+ let mut preserved_blocks = AHashMap::new();
72
69
  if self.config.preserve_markdown {
73
- let mut blocks = AHashMap::new();
74
- result = Cow::Owned(self.extract_and_preserve_code(result.as_ref(), &mut blocks));
75
- preserved_blocks = Some(blocks);
70
+ result = self.extract_and_preserve_code(&result, &mut preserved_blocks);
76
71
  }
77
72
 
78
- if HTML_COMMENT_REGEX.is_match(&result) {
79
- result = Cow::Owned(HTML_COMMENT_REGEX.replace_all(&result, "").into_owned());
80
- }
73
+ result = HTML_COMMENT_REGEX.replace_all(&result, "").to_string();
81
74
 
82
- if MULTIPLE_SPACES_REGEX.is_match(&result) {
83
- result = Cow::Owned(MULTIPLE_SPACES_REGEX.replace_all(&result, " ").into_owned());
84
- }
75
+ result = MULTIPLE_SPACES_REGEX.replace_all(&result, " ").to_string();
85
76
 
86
- if EXCESSIVE_NEWLINES_REGEX.is_match(&result) {
87
- result = Cow::Owned(EXCESSIVE_NEWLINES_REGEX.replace_all(&result, "\n\n").into_owned());
88
- }
77
+ result = EXCESSIVE_NEWLINES_REGEX.replace_all(&result, "\n\n").to_string();
89
78
 
90
79
  if self.config.preserve_markdown {
91
- result = Cow::Owned(self.preserve_markdown_structure(&result));
80
+ result = self.preserve_markdown_structure(&result);
92
81
  }
93
82
 
94
- if let Some(blocks) = &preserved_blocks {
95
- result = Cow::Owned(self.restore_preserved_blocks(&result, blocks));
96
- }
83
+ result = self.restore_preserved_blocks(&result, &preserved_blocks);
97
84
 
98
- result.into_owned()
85
+ result
99
86
  }
100
87
 
101
88
  pub fn apply_moderate_filters(&self, text: &str) -> String {
102
89
  let mut result = self.apply_light_filters(text);
103
90
 
104
- let mut preserved_blocks: Option<AHashMap<String, String>> = None;
91
+ let mut preserved_blocks = AHashMap::new();
105
92
  if self.config.preserve_code {
106
- let mut blocks = AHashMap::new();
107
- result = self.extract_and_preserve_code(&result, &mut blocks);
108
- preserved_blocks = Some(blocks);
93
+ result = self.extract_and_preserve_code(&result, &mut preserved_blocks);
109
94
  }
110
95
 
111
96
  if self.config.preserve_markdown {
@@ -114,16 +99,14 @@ impl FilterPipeline {
114
99
  result = self.remove_stopwords(&result);
115
100
  }
116
101
 
117
- if let Some(blocks) = &preserved_blocks {
118
- result = self.restore_preserved_blocks(&result, blocks);
119
- }
102
+ result = self.restore_preserved_blocks(&result, &preserved_blocks);
120
103
 
121
104
  result
122
105
  }
123
106
 
124
107
  fn remove_stopwords_preserving_markdown(&self, text: &str) -> String {
125
108
  let lines: Vec<&str> = text.lines().collect();
126
- let mut processed_lines = Vec::with_capacity(lines.len());
109
+ let mut processed_lines = Vec::new();
127
110
 
128
111
  for line in lines {
129
112
  if MARKDOWN_HEADERS_REGEX.is_match(line) {
@@ -150,7 +133,7 @@ impl FilterPipeline {
150
133
 
151
134
  fn remove_stopwords(&self, text: &str) -> String {
152
135
  let words: Vec<&str> = text.split_whitespace().collect();
153
- let mut filtered_words = Vec::with_capacity((words.len() as f32 * 0.7).ceil() as usize);
136
+ let mut filtered_words = Vec::with_capacity(words.len());
154
137
 
155
138
  for word in words {
156
139
  if word.is_empty() {
@@ -178,7 +161,7 @@ impl FilterPipeline {
178
161
  .filter(|&b| b.is_ascii_alphabetic())
179
162
  .map(|b| b.to_ascii_lowercase())
180
163
  .collect();
181
- utf8_validation::string_from_utf8(clean_bytes).unwrap_or_else(|_| {
164
+ String::from_utf8(clean_bytes).unwrap_or_else(|_| {
182
165
  word.chars()
183
166
  .filter(|c| c.is_alphabetic())
184
167
  .collect::<String>()
@@ -249,7 +232,7 @@ impl FilterPipeline {
249
232
 
250
233
  fn preserve_markdown_structure(&self, text: &str) -> String {
251
234
  let lines: Vec<&str> = text.lines().collect();
252
- let mut processed_lines = Vec::with_capacity(lines.len());
235
+ let mut processed_lines = Vec::new();
253
236
 
254
237
  for line in lines {
255
238
  if MARKDOWN_HEADERS_REGEX.is_match(line) {
@@ -297,10 +280,6 @@ impl FilterPipeline {
297
280
  }
298
281
 
299
282
  fn restore_preserved_blocks(&self, text: &str, preserved: &AHashMap<String, String>) -> String {
300
- if preserved.is_empty() {
301
- return text.to_string();
302
- }
303
-
304
283
  let mut result = text.to_string();
305
284
 
306
285
  for (placeholder, original_content) in preserved {
@@ -1,4 +1,3 @@
1
- use crate::text::utf8_validation;
2
1
  use memchr::{memchr, memchr3};
3
2
 
4
3
  pub struct SimdTextProcessor;
@@ -51,7 +50,7 @@ impl SimdTextProcessor {
51
50
  i = sequence_end;
52
51
  }
53
52
 
54
- utf8_validation::string_from_utf8(result).unwrap_or_else(|_| text.to_string())
53
+ String::from_utf8(result).unwrap_or_else(|_| text.to_string())
55
54
  }
56
55
 
57
56
  #[inline]