kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -47,7 +47,6 @@
47
47
  //! # detected_languages: None,
48
48
  //! # chunks: None,
49
49
  //! # images: None,
50
- //! # pages: None,
51
50
  //! # })
52
51
  //! # }
53
52
  //! # async fn extract_file(&self, _: &std::path::Path, _: &str, _: &kreuzberg::ExtractionConfig)
@@ -60,7 +59,6 @@
60
59
  //! # detected_languages: None,
61
60
  //! # chunks: None,
62
61
  //! # images: None,
63
- //! # pages: None,
64
62
  //! # })
65
63
  //! # }
66
64
  //! # fn supported_mime_types(&self) -> &[&str] { &[] }
@@ -122,7 +120,6 @@
122
120
  //! detected_languages: None,
123
121
  //! chunks: None,
124
122
  //! images: None,
125
- //! pages: None,
126
123
  //! })
127
124
  //! }
128
125
  //!
@@ -10,9 +10,6 @@ use async_trait::async_trait;
10
10
  use std::path::Path;
11
11
  use std::sync::Arc;
12
12
 
13
- #[cfg(not(feature = "tokio-runtime"))]
14
- use crate::KreuzbergError;
15
-
16
13
  /// OCR backend types.
17
14
  #[derive(Debug, Clone, Copy, PartialEq, Eq)]
18
15
  pub enum OcrBackendType {
@@ -67,7 +64,6 @@ pub enum OcrBackendType {
67
64
  /// detected_languages: None,
68
65
  /// chunks: None,
69
66
  /// images: None,
70
- /// pages: None,
71
67
  /// })
72
68
  /// }
73
69
  ///
@@ -85,8 +81,7 @@ pub enum OcrBackendType {
85
81
  /// }
86
82
  /// }
87
83
  /// ```
88
- #[cfg_attr(not(target_arch = "wasm32"), async_trait)]
89
- #[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
84
+ #[async_trait]
90
85
  pub trait OcrBackend: Plugin {
91
86
  /// Process an image and extract text via OCR.
92
87
  ///
@@ -146,7 +141,6 @@ pub trait OcrBackend: Plugin {
146
141
  /// detected_languages: None,
147
142
  /// chunks: None,
148
143
  /// images: None,
149
- /// pages: None,
150
144
  /// })
151
145
  /// }
152
146
  /// # }
@@ -167,19 +161,9 @@ pub trait OcrBackend: Plugin {
167
161
  ///
168
162
  /// Same as `process_image`, plus file I/O errors.
169
163
  async fn process_file(&self, path: &Path, config: &OcrConfig) -> Result<ExtractionResult> {
170
- #[cfg(feature = "tokio-runtime")]
171
- {
172
- use crate::core::io;
173
- let bytes = io::read_file_async(path).await?;
174
- self.process_image(&bytes, config).await
175
- }
176
- #[cfg(not(feature = "tokio-runtime"))]
177
- {
178
- let _ = (path, config);
179
- Err(KreuzbergError::Other(
180
- "File-based OCR processing requires the tokio-runtime feature".to_string(),
181
- ))
182
- }
164
+ use crate::core::io;
165
+ let bytes = io::read_file_async(path).await?;
166
+ self.process_image(&bytes, config).await
183
167
  }
184
168
 
185
169
  /// Check if this backend supports a given language code.
@@ -268,6 +252,8 @@ pub trait OcrBackend: Plugin {
268
252
  }
269
253
  }
270
254
 
255
+ // Public registration APIs
256
+
271
257
  /// Register an OCR backend with the global registry.
272
258
  ///
273
259
  /// The OCR backend will be registered with its name from the `name()` method
@@ -317,7 +303,6 @@ pub trait OcrBackend: Plugin {
317
303
  /// detected_languages: None,
318
304
  /// chunks: None,
319
305
  /// images: None,
320
- /// pages: None,
321
306
  /// })
322
307
  /// }
323
308
  /// fn supports_language(&self, _: &str) -> bool { true }
@@ -335,6 +320,8 @@ pub fn register_ocr_backend(backend: Arc<dyn OcrBackend>) -> crate::Result<()> {
335
320
 
336
321
  let registry = get_ocr_backend_registry();
337
322
  // ~keep: Lock poisoning indicates a panic in another thread holding the lock.
323
+ // This is a critical runtime error (similar to OOM) that should bubble up
324
+ // as it indicates the registry is in an inconsistent state.
338
325
  let mut registry = registry
339
326
  .write()
340
327
  .expect("OCR backend registry lock poisoned - critical runtime error");
@@ -370,6 +357,8 @@ pub fn unregister_ocr_backend(name: &str) -> crate::Result<()> {
370
357
 
371
358
  let registry = get_ocr_backend_registry();
372
359
  // ~keep: Lock poisoning indicates a panic in another thread holding the lock.
360
+ // This is a critical runtime error (similar to OOM) that should bubble up
361
+ // as it indicates the registry is in an inconsistent state.
373
362
  let mut registry = registry
374
363
  .write()
375
364
  .expect("OCR backend registry lock poisoned - critical runtime error");
@@ -403,6 +392,8 @@ pub fn list_ocr_backends() -> crate::Result<Vec<String>> {
403
392
 
404
393
  let registry = get_ocr_backend_registry();
405
394
  // ~keep: Lock poisoning indicates a panic in another thread holding the lock.
395
+ // This is a critical runtime error (similar to OOM) that should bubble up
396
+ // as it indicates the registry is in an inconsistent state.
406
397
  let registry = registry
407
398
  .read()
408
399
  .expect("OCR backend registry lock poisoned - critical runtime error");
@@ -434,6 +425,8 @@ pub fn clear_ocr_backends() -> crate::Result<()> {
434
425
 
435
426
  let registry = get_ocr_backend_registry();
436
427
  // ~keep: Lock poisoning indicates a panic in another thread holding the lock.
428
+ // This is a critical runtime error (similar to OOM) that should bubble up
429
+ // as it indicates the registry is in an inconsistent state.
437
430
  let mut registry = registry
438
431
  .write()
439
432
  .expect("OCR backend registry lock poisoned - critical runtime error");
@@ -478,7 +471,6 @@ mod tests {
478
471
  detected_languages: None,
479
472
  chunks: None,
480
473
  images: None,
481
- pages: None,
482
474
  })
483
475
  }
484
476
 
@@ -105,8 +105,7 @@ pub enum ProcessingStage {
105
105
  /// }
106
106
  /// }
107
107
  /// ```
108
- #[cfg_attr(not(target_arch = "wasm32"), async_trait)]
109
- #[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
108
+ #[async_trait]
110
109
  pub trait PostProcessor: Plugin {
111
110
  /// Process an extraction result.
112
111
  ///
@@ -373,7 +372,6 @@ mod tests {
373
372
  detected_languages: None,
374
373
  chunks: None,
375
374
  images: None,
376
- pages: None,
377
375
  };
378
376
 
379
377
  let config = ExtractionConfig::default();
@@ -423,7 +421,6 @@ mod tests {
423
421
  detected_languages: None,
424
422
  chunks: None,
425
423
  images: None,
426
- pages: None,
427
424
  };
428
425
 
429
426
  let config = ExtractionConfig::default();
@@ -490,7 +487,6 @@ mod tests {
490
487
  detected_languages: None,
491
488
  chunks: None,
492
489
  images: None,
493
- pages: None,
494
490
  };
495
491
 
496
492
  let config = ExtractionConfig::default();
@@ -516,7 +512,6 @@ mod tests {
516
512
  additional,
517
513
  ..Default::default()
518
514
  },
519
- pages: None,
520
515
  tables: vec![],
521
516
  detected_languages: None,
522
517
  chunks: None,
@@ -547,7 +542,6 @@ mod tests {
547
542
  detected_languages: None,
548
543
  chunks: None,
549
544
  images: None,
550
- pages: None,
551
545
  };
552
546
 
553
547
  assert_eq!(processor.estimated_duration_ms(&result), 0);
@@ -598,7 +592,6 @@ mod tests {
598
592
  detected_languages: None,
599
593
  chunks: None,
600
594
  images: None,
601
- pages: None,
602
595
  };
603
596
 
604
597
  let txt_result = ExtractionResult {
@@ -609,7 +602,6 @@ mod tests {
609
602
  detected_languages: None,
610
603
  chunks: None,
611
604
  images: None,
612
- pages: None,
613
605
  };
614
606
 
615
607
  assert!(processor.should_process(&pdf_result, &config));
@@ -638,7 +630,6 @@ mod tests {
638
630
  detected_languages: None,
639
631
  chunks: None,
640
632
  images: None,
641
- pages: None,
642
633
  };
643
634
 
644
635
  let config = ExtractionConfig::default();
@@ -264,19 +264,10 @@ impl DocumentExtractorRegistry {
264
264
  /// # Returns
265
265
  ///
266
266
  /// The highest priority extractor, or an error if none found.
267
- #[cfg_attr(feature = "otel", tracing::instrument(
268
- skip(self),
269
- fields(
270
- registry.mime_type = %mime_type,
271
- registry.found = tracing::field::Empty,
272
- )
273
- ))]
274
267
  pub fn get(&self, mime_type: &str) -> Result<Arc<dyn DocumentExtractor>> {
275
268
  if let Some(priority_map) = self.extractors.get(mime_type)
276
269
  && let Some((_priority, extractor)) = priority_map.iter().next_back()
277
270
  {
278
- #[cfg(feature = "otel")]
279
- tracing::Span::current().record("registry.found", true);
280
271
  return Ok(Arc::clone(extractor));
281
272
  }
282
273
 
@@ -302,13 +293,9 @@ impl DocumentExtractorRegistry {
302
293
  }
303
294
 
304
295
  if let Some((_priority, extractor)) = best_match {
305
- #[cfg(feature = "otel")]
306
- tracing::Span::current().record("registry.found", true);
307
296
  return Ok(extractor);
308
297
  }
309
298
 
310
- #[cfg(feature = "otel")]
311
- tracing::Span::current().record("registry.found", false);
312
299
  Err(KreuzbergError::UnsupportedFormat(mime_type.to_string()))
313
300
  }
314
301
 
@@ -661,7 +648,6 @@ mod tests {
661
648
  detected_languages: None,
662
649
  chunks: None,
663
650
  images: None,
664
- pages: None,
665
651
  })
666
652
  }
667
653
 
@@ -706,7 +692,6 @@ mod tests {
706
692
  detected_languages: None,
707
693
  chunks: None,
708
694
  images: None,
709
- pages: None,
710
695
  })
711
696
  }
712
697
 
@@ -68,8 +68,7 @@ use std::sync::Arc;
68
68
  /// }
69
69
  /// }
70
70
  /// ```
71
- #[cfg_attr(not(target_arch = "wasm32"), async_trait)]
72
- #[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
71
+ #[async_trait]
73
72
  pub trait Validator: Plugin {
74
73
  /// Validate an extraction result.
75
74
  ///
@@ -276,6 +275,8 @@ pub trait Validator: Plugin {
276
275
  }
277
276
  }
278
277
 
278
+ // Public registration APIs
279
+
279
280
  /// Register a validator with the global registry.
280
281
  ///
281
282
  /// The validator will be registered with its default priority and will be called
@@ -489,7 +490,6 @@ mod tests {
489
490
  detected_languages: None,
490
491
  chunks: None,
491
492
  images: None,
492
- pages: None,
493
493
  };
494
494
 
495
495
  let config = ExtractionConfig::default();
@@ -508,7 +508,6 @@ mod tests {
508
508
  detected_languages: None,
509
509
  chunks: None,
510
510
  images: None,
511
- pages: None,
512
511
  };
513
512
 
514
513
  let config = ExtractionConfig::default();
@@ -529,7 +528,6 @@ mod tests {
529
528
  detected_languages: None,
530
529
  chunks: None,
531
530
  images: None,
532
- pages: None,
533
531
  };
534
532
 
535
533
  let config = ExtractionConfig::default();
@@ -565,7 +563,6 @@ mod tests {
565
563
  detected_languages: None,
566
564
  chunks: None,
567
565
  images: None,
568
- pages: None,
569
566
  };
570
567
 
571
568
  let config = ExtractionConfig::default();
@@ -613,7 +610,6 @@ mod tests {
613
610
  detected_languages: None,
614
611
  chunks: None,
615
612
  images: None,
616
- pages: None,
617
613
  };
618
614
 
619
615
  let txt_result = ExtractionResult {
@@ -624,7 +620,6 @@ mod tests {
624
620
  detected_languages: None,
625
621
  chunks: None,
626
622
  images: None,
627
- pages: None,
628
623
  };
629
624
 
630
625
  assert!(validator.should_validate(&pdf_result, &config));
@@ -708,7 +703,6 @@ mod tests {
708
703
  detected_languages: None,
709
704
  chunks: None,
710
705
  images: None,
711
- pages: None,
712
706
  };
713
707
 
714
708
  let config = ExtractionConfig::default();
@@ -736,7 +730,6 @@ mod tests {
736
730
  additional,
737
731
  ..Default::default()
738
732
  },
739
- pages: None,
740
733
  tables: vec![],
741
734
  detected_languages: None,
742
735
  chunks: None,
@@ -767,7 +760,6 @@ mod tests {
767
760
  detected_languages: None,
768
761
  chunks: None,
769
762
  images: None,
770
- pages: None,
771
763
  };
772
764
 
773
765
  let config = ExtractionConfig::default();
@@ -796,7 +788,6 @@ mod tests {
796
788
  detected_languages: None,
797
789
  chunks: None,
798
790
  images: None,
799
- pages: None,
800
791
  };
801
792
 
802
793
  assert!(validator.validate(&result, &config).await.is_ok());
@@ -815,15 +806,15 @@ mod tests {
815
806
  detected_languages: None,
816
807
  chunks: None,
817
808
  images: None,
818
- pages: None,
819
809
  };
820
810
 
821
811
  let config = ExtractionConfig::default();
822
812
  assert!(validator.validate(&result, &config).await.is_ok());
823
813
  }
824
814
 
815
+ // Tests for public registration APIs
816
+
825
817
  #[test]
826
- #[serial_test::serial]
827
818
  fn test_register_validator() {
828
819
  use std::sync::Arc;
829
820
 
@@ -835,7 +826,6 @@ mod tests {
835
826
  }
836
827
 
837
828
  #[test]
838
- #[serial_test::serial]
839
829
  fn test_unregister_validator() {
840
830
  use std::sync::Arc;
841
831
 
@@ -847,20 +837,19 @@ mod tests {
847
837
  }
848
838
 
849
839
  #[test]
850
- #[serial_test::serial]
851
840
  fn test_unregister_nonexistent_validator() {
852
841
  let result = super::unregister_validator("nonexistent-validator-xyz");
853
842
  assert!(result.is_ok());
854
843
  }
855
844
 
856
845
  #[test]
857
- #[serial_test::serial]
858
846
  fn test_list_validators() {
859
847
  use std::sync::Arc;
860
848
 
861
849
  super::clear_validators().unwrap();
862
850
 
863
851
  let validator1 = Arc::new(MockValidator { should_fail: false });
852
+ // Both validators have the same name, so only one will be registered
864
853
  let validator2 = Arc::new(MockValidator { should_fail: false });
865
854
 
866
855
  let list_before = super::list_validators().unwrap();
@@ -870,6 +859,7 @@ mod tests {
870
859
  super::register_validator(validator2).unwrap();
871
860
 
872
861
  let list = super::list_validators().unwrap();
862
+ // Only 1 validator registered since they have the same name
873
863
  assert_eq!(list.len(), 1);
874
864
  assert!(list.contains(&"mock-validator".to_string()));
875
865
 
@@ -877,7 +867,6 @@ mod tests {
877
867
  }
878
868
 
879
869
  #[test]
880
- #[serial_test::serial]
881
870
  fn test_clear_validators() {
882
871
  use std::sync::Arc;
883
872
 
@@ -889,6 +878,7 @@ mod tests {
889
878
  super::register_validator(validator1).unwrap();
890
879
  super::register_validator(validator2).unwrap();
891
880
 
881
+ // Verify at least one validator is registered
892
882
  let list_before = super::list_validators().unwrap();
893
883
  assert!(!list_before.is_empty());
894
884
 
@@ -900,7 +890,6 @@ mod tests {
900
890
  }
901
891
 
902
892
  #[test]
903
- #[serial_test::serial]
904
893
  fn test_register_validator_with_invalid_name() {
905
894
  use std::sync::Arc;
906
895
 
@@ -933,7 +922,6 @@ mod tests {
933
922
  }
934
923
 
935
924
  #[test]
936
- #[serial_test::serial]
937
925
  fn test_register_validator_with_empty_name() {
938
926
  use std::sync::Arc;
939
927
 
@@ -100,7 +100,7 @@ macro_rules! embed_stopwords {
100
100
  panic!(
101
101
  "Failed to parse embedded stopwords for language '{}': {}. \
102
102
  This indicates corrupted or malformed JSON in the embedded stopwords data. \
103
- Please report this issue at https://github.com/kreuzberg-dev/kreuzberg/issues",
103
+ Please report this issue at https://github.com/Goldziher/kreuzberg/issues",
104
104
  $lang, e
105
105
  );
106
106
  }
@@ -1437,7 +1437,7 @@ mod tests {
1437
1437
  let duration = start.elapsed();
1438
1438
 
1439
1439
  assert!(
1440
- duration.as_millis() < 500,
1440
+ duration.as_millis() < 100,
1441
1441
  "30,000 lookups took too long: {:?}",
1442
1442
  duration
1443
1443
  );
@@ -1,5 +1,3 @@
1
- pub mod utf8_validation;
2
-
3
1
  #[cfg(feature = "quality")]
4
2
  pub mod quality;
5
3
 
@@ -9,15 +7,9 @@ pub mod string_utils;
9
7
  #[cfg(feature = "quality")]
10
8
  pub mod token_reduction;
11
9
 
12
- #[cfg(feature = "quality")]
13
- pub mod quality_processor;
14
-
15
10
  #[cfg(feature = "quality")]
16
11
  pub use quality::{calculate_quality_score, clean_extracted_text, normalize_spaces};
17
12
 
18
- #[cfg(feature = "quality")]
19
- pub use quality_processor::QualityProcessor;
20
-
21
13
  #[cfg(feature = "quality")]
22
14
  pub use string_utils::{calculate_text_confidence, fix_mojibake, get_encoding_cache_key, safe_decode};
23
15
 
@@ -39,23 +39,6 @@ static MALFORMED_WORDS_PATTERN: Lazy<Regex> = Lazy::new(|| {
39
39
  static EXCESSIVE_WHITESPACE_PATTERN: Lazy<Regex> =
40
40
  Lazy::new(|| Regex::new(r"\s{3,}").expect("Excessive whitespace regex pattern is valid and should compile"));
41
41
 
42
- /// Combined OCR artifact pattern for single-pass scanning (used in calculate_ocr_penalty).
43
- /// This pattern combines 5 of the 6 OCR patterns with alternation to reduce regex passes
44
- /// from 5 separate find_iter calls to 1. The dash pattern is handled separately due to
45
- /// line-based context checking.
46
- static COMBINED_OCR_ARTIFACTS_PATTERN: Lazy<Regex> = Lazy::new(|| {
47
- Regex::new(
48
- r"(?x)
49
- \b[a-zA-Z]\s{2,}[a-zA-Z]\s{2,}[a-zA-Z]\b | # Scattered chars
50
- [.]{3,}|[_]{3,} | # Repeated punctuation
51
- \s[.,;:!?]\s | # Isolated punctuation
52
- \b[a-zA-Z]+[0-9]+[a-zA-Z]+[a-zA-Z0-9]*\b | # Malformed words
53
- \s{3,} # Excessive whitespace
54
- ",
55
- )
56
- .expect("Combined OCR artifacts regex pattern is valid and should compile")
57
- });
58
-
59
42
  static JS_FUNCTION_PATTERN: Lazy<Regex> = Lazy::new(|| {
60
43
  Regex::new(r"(?i)function\s+\w+\s*\([^)]*\)\s*\{[^}]*\}")
61
44
  .expect("JavaScript function regex pattern is valid and should compile")
@@ -123,7 +106,7 @@ where
123
106
  }
124
107
  }
125
108
 
126
- pub fn calculate_quality_score(text: &str, metadata: Option<&HashMap<String, serde_json::Value>>) -> f64 {
109
+ pub fn calculate_quality_score(text: &str, metadata: Option<&HashMap<String, String>>) -> f64 {
127
110
  if text.is_empty() || text.trim().is_empty() {
128
111
  return 0.0;
129
112
  }
@@ -168,8 +151,12 @@ fn calculate_ocr_penalty(text: &str, total_chars: f64) -> f64 {
168
151
  return 0.0;
169
152
  }
170
153
 
171
- let artifact_chars =
172
- sum_match_lengths(text, &COMBINED_OCR_ARTIFACTS_PATTERN) + count_non_table_dash_artifacts(text);
154
+ let artifact_chars = sum_match_lengths(text, &SCATTERED_CHARS_PATTERN)
155
+ + sum_match_lengths(text, &REPEATED_PUNCT_PATTERN)
156
+ + count_non_table_dash_artifacts(text)
157
+ + sum_match_lengths(text, &ISOLATED_PUNCT_PATTERN)
158
+ + sum_match_lengths(text, &MALFORMED_WORDS_PATTERN)
159
+ + sum_match_lengths(text, &EXCESSIVE_WHITESPACE_PATTERN);
173
160
 
174
161
  (artifact_chars as f64 / total_chars).min(1.0)
175
162
  }
@@ -266,7 +253,7 @@ fn calculate_structure_bonus(text: &str) -> f64 {
266
253
  }
267
254
 
268
255
  #[inline]
269
- fn calculate_metadata_bonus(metadata: &HashMap<String, serde_json::Value>) -> f64 {
256
+ fn calculate_metadata_bonus(metadata: &HashMap<String, String>) -> f64 {
270
257
  const IMPORTANT_FIELDS: &[&str] = &["title", "author", "subject", "description", "keywords"];
271
258
 
272
259
  let present_fields = IMPORTANT_FIELDS
@@ -492,8 +479,8 @@ mod tests {
492
479
  fn test_calculate_quality_score_with_metadata() {
493
480
  let text = "This is a normal text with proper structure.";
494
481
  let mut metadata = HashMap::new();
495
- metadata.insert("title".to_string(), serde_json::json!("Test Title"));
496
- metadata.insert("author".to_string(), serde_json::json!("Test Author"));
482
+ metadata.insert("title".to_string(), "Test Title".to_string());
483
+ metadata.insert("author".to_string(), "Test Author".to_string());
497
484
 
498
485
  let score = calculate_quality_score(text, Some(&metadata));
499
486
  assert!(score > 0.0);
@@ -566,11 +553,11 @@ mod tests {
566
553
  #[test]
567
554
  fn test_calculate_metadata_bonus_full() {
568
555
  let mut metadata = HashMap::new();
569
- metadata.insert("title".to_string(), serde_json::json!("Title"));
570
- metadata.insert("author".to_string(), serde_json::json!("Author"));
571
- metadata.insert("subject".to_string(), serde_json::json!("Subject"));
572
- metadata.insert("description".to_string(), serde_json::json!("Description"));
573
- metadata.insert("keywords".to_string(), serde_json::json!("Keywords"));
556
+ metadata.insert("title".to_string(), "Title".to_string());
557
+ metadata.insert("author".to_string(), "Author".to_string());
558
+ metadata.insert("subject".to_string(), "Subject".to_string());
559
+ metadata.insert("description".to_string(), "Description".to_string());
560
+ metadata.insert("keywords".to_string(), "Keywords".to_string());
574
561
 
575
562
  let bonus = calculate_metadata_bonus(&metadata);
576
563
  assert_eq!(bonus, 1.0);
@@ -45,9 +45,7 @@ fn calculate_cache_key(data: &[u8]) -> String {
45
45
  let sample = if data.len() > 1024 { &data[..1024] } else { data };
46
46
  sample.hash(&mut hasher);
47
47
  data.len().hash(&mut hasher);
48
- let mut result = String::with_capacity(16);
49
- result.push_str(&format!("{:x}", hasher.finish()));
50
- result
48
+ format!("{:x}", hasher.finish())
51
49
  }
52
50
 
53
51
  pub fn safe_decode(byte_data: &[u8], encoding: Option<&str>) -> String {
@@ -59,7 +57,7 @@ pub fn safe_decode(byte_data: &[u8], encoding: Option<&str>) -> String {
59
57
  && let Some(enc) = Encoding::for_label(enc_name.as_bytes())
60
58
  {
61
59
  let (decoded, _, _) = enc.decode(byte_data);
62
- return fix_mojibake_internal(&decoded).into_owned();
60
+ return fix_mojibake_internal(&decoded);
63
61
  }
64
62
 
65
63
  let cache_key = calculate_cache_key(byte_data);
@@ -68,7 +66,7 @@ pub fn safe_decode(byte_data: &[u8], encoding: Option<&str>) -> String {
68
66
  && let Some(&cached_encoding) = cache.get(&cache_key)
69
67
  {
70
68
  let (decoded, _, _) = cached_encoding.decode(byte_data);
71
- return fix_mojibake_internal(&decoded).into_owned();
69
+ return fix_mojibake_internal(&decoded);
72
70
  }
73
71
 
74
72
  let mut detector = EncodingDetector::new();
@@ -95,23 +93,17 @@ pub fn safe_decode(byte_data: &[u8], encoding: Option<&str>) -> String {
95
93
  if let Some(enc) = Encoding::for_label(enc_name.as_bytes()) {
96
94
  let (test_decoded, _, test_errors) = enc.decode(byte_data);
97
95
  if !test_errors && calculate_text_confidence_internal(&test_decoded) > 0.5 {
98
- return fix_mojibake_internal(&test_decoded).into_owned();
96
+ return fix_mojibake_internal(&test_decoded);
99
97
  }
100
98
  }
101
99
  }
102
100
  }
103
101
 
104
- fix_mojibake_internal(&decoded).into_owned()
102
+ fix_mojibake_internal(&decoded)
105
103
  }
106
104
 
107
105
  pub fn get_encoding_cache_key(data_hash: &str, size: usize) -> String {
108
- let estimated_capacity = 16 + 1 + 20;
109
- let mut result = String::with_capacity(estimated_capacity);
110
- result.push_str(data_hash);
111
- result.push(':');
112
- use std::fmt::Write;
113
- let _ = write!(result, "{}", size);
114
- result
106
+ format!("{}:{}", data_hash, size)
115
107
  }
116
108
 
117
109
  pub fn calculate_text_confidence(text: &str) -> f64 {
@@ -149,16 +141,12 @@ fn calculate_text_confidence_internal(text: &str) -> f64 {
149
141
  }
150
142
 
151
143
  pub fn fix_mojibake(text: &str) -> String {
152
- fix_mojibake_internal(text).into_owned()
144
+ fix_mojibake_internal(text)
153
145
  }
154
146
 
155
- fn fix_mojibake_internal(text: &str) -> Cow<'_, str> {
147
+ fn fix_mojibake_internal(text: &str) -> String {
156
148
  if text.is_empty() {
157
- return Cow::Borrowed("");
158
- }
159
-
160
- if !CONTROL_CHARS.is_match(text) && !REPLACEMENT_CHARS.is_match(text) && !ISOLATED_COMBINING.is_match(text) {
161
- return Cow::Borrowed(text);
149
+ return text.to_string();
162
150
  }
163
151
 
164
152
  let replacements = [
@@ -167,7 +155,7 @@ fn fix_mojibake_internal(text: &str) -> Cow<'_, str> {
167
155
  (&*ISOLATED_COMBINING, ""),
168
156
  ];
169
157
 
170
- chain_replacements(Cow::Borrowed(text), &replacements)
158
+ chain_replacements(Cow::Borrowed(text), &replacements).into_owned()
171
159
  }
172
160
 
173
161
  #[cfg(test)]