kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -1,364 +0,0 @@
1
- //! Dynamic pool sizing heuristics based on document complexity.
2
- //!
3
- //! This module provides functions to estimate optimal pool sizes based on file size
4
- //! and document format (MIME type). By sizing pools to match actual document complexity,
5
- //! we reduce memory waste from pre-allocated but unused capacity.
6
- //!
7
- //! # Sizing Strategy
8
- //!
9
- //! Pool size is determined by a combination of:
10
- //! 1. **Format-specific ratio**: Extraction overhead varies by format
11
- //! - PDF: 25% (binary, compression overhead)
12
- //! - DOCX/XLSX/PPTX: 40-45% (compressed, XML-heavy)
13
- //! - HTML: 65% (markup overhead)
14
- //! - Text/Markdown: 95% (minimal overhead)
15
- //! - Default: 50% (conservative)
16
- //! 2. **File size scaling**: Larger documents benefit from more buffers
17
- //! - Small (< 100KB): Base allocation
18
- //! - Medium (100KB-1MB): +2 buffers
19
- //! - Large (1MB-10MB): +4 buffers
20
- //! - Huge (>10MB): +6 buffers
21
- //!
22
- //! # Example
23
- //!
24
- //! ```rust,ignore
25
- //! use kreuzberg::utils::pool_sizing::estimate_pool_size;
26
- //!
27
- //! // 5MB PDF → pool sized at ~1.25MB (5MB * 0.25)
28
- //! let hint = estimate_pool_size(5_000_000, "application/pdf");
29
- //! assert_eq!(hint.estimated_total_size, 1_250_000);
30
- //!
31
- //! // 2MB HTML → pool sized at ~1.3MB (2MB * 0.65)
32
- //! let hint = estimate_pool_size(2_000_000, "text/html");
33
- //! assert_eq!(hint.estimated_total_size, 1_300_000);
34
- //! ```
35
-
36
- /// Hint for optimal pool sizing based on document characteristics.
37
- ///
38
- /// This struct contains the estimated sizes for string and byte buffers
39
- /// that should be allocated in the pool to handle extraction without
40
- /// excessive reallocation.
41
- #[derive(Debug, Clone, Copy)]
42
- pub struct PoolSizeHint {
43
- /// Estimated total string buffer pool size in bytes
44
- pub estimated_total_size: usize,
45
- /// Recommended number of string buffers
46
- pub string_buffer_count: usize,
47
- /// Recommended capacity per string buffer in bytes
48
- pub string_buffer_capacity: usize,
49
- /// Recommended number of byte buffers
50
- pub byte_buffer_count: usize,
51
- /// Recommended capacity per byte buffer in bytes
52
- pub byte_buffer_capacity: usize,
53
- }
54
-
55
- impl PoolSizeHint {
56
- /// Calculate the estimated string pool memory in bytes.
57
- ///
58
- /// This is the total estimated memory for all string buffers.
59
- #[inline]
60
- pub fn estimated_string_pool_memory(&self) -> usize {
61
- self.string_buffer_count * self.string_buffer_capacity
62
- }
63
-
64
- /// Calculate the estimated byte pool memory in bytes.
65
- ///
66
- /// This is the total estimated memory for all byte buffers.
67
- #[inline]
68
- pub fn estimated_byte_pool_memory(&self) -> usize {
69
- self.byte_buffer_count * self.byte_buffer_capacity
70
- }
71
-
72
- /// Calculate the total estimated pool memory in bytes.
73
- ///
74
- /// This includes both string and byte buffer pools.
75
- #[inline]
76
- pub fn total_pool_memory(&self) -> usize {
77
- self.estimated_string_pool_memory() + self.estimated_byte_pool_memory()
78
- }
79
- }
80
-
81
- /// Get the format-specific extraction ratio.
82
- ///
83
- /// This ratio represents the approximate size of extracted content
84
- /// as a percentage of the original file size. Different formats have
85
- /// different overhead due to compression, binary structures, markup, etc.
86
- ///
87
- /// # Arguments
88
- ///
89
- /// * `mime_type` - The MIME type of the document (e.g., "application/pdf")
90
- ///
91
- /// # Returns
92
- ///
93
- /// A ratio between 0.0 and 1.0 representing the expected extraction ratio
94
- #[inline]
95
- fn get_format_ratio(mime_type: &str) -> f64 {
96
- match mime_type {
97
- "text/plain" | "text/markdown" | "text/x-markdown" => 0.95,
98
- "text/csv" | "text/tab-separated-values" => 0.90,
99
-
100
- "text/html" | "text/html; charset=utf-8" => 0.65,
101
- "application/xml" | "text/xml" => 0.60,
102
- "image/svg+xml" => 0.55,
103
-
104
- "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
105
- | "application/vnd.openxmlformats-officedocument.wordprocessingml.macro-enabled.document"
106
- | "application/msword" => 0.45,
107
- "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
108
- | "application/vnd.openxmlformats-officedocument.spreadsheetml.macro-enabled.sheet"
109
- | "application/vnd.ms-excel" => 0.40,
110
- "application/vnd.openxmlformats-officedocument.presentationml.presentation"
111
- | "application/vnd.openxmlformats-officedocument.presentationml.macro-enabled.presentation"
112
- | "application/vnd.ms-powerpoint" => 0.35,
113
-
114
- "application/vnd.oasis.opendocument.text" => 0.45,
115
- "application/vnd.oasis.opendocument.spreadsheet" => 0.40,
116
- "application/vnd.oasis.opendocument.presentation" => 0.35,
117
-
118
- "application/pdf" => 0.25,
119
-
120
- "application/json" | "text/json" => 0.80,
121
- "application/x-yaml" | "text/yaml" | "text/x-yaml" | "application/yaml" => 0.85,
122
-
123
- "application/zip" | "application/x-zip-compressed" => 0.30,
124
- "application/gzip" | "application/x-gzip" => 0.25,
125
- "application/x-rar-compressed" => 0.30,
126
- "application/x-7z-compressed" => 0.25,
127
-
128
- _ => 0.50,
129
- }
130
- }
131
-
132
- /// Get base pool configuration for a format type.
133
- ///
134
- /// The base configuration represents the minimum number of buffers
135
- /// needed for typical documents of that format.
136
- ///
137
- /// # Arguments
138
- ///
139
- /// * `mime_type` - The MIME type of the document
140
- ///
141
- /// # Returns
142
- ///
143
- /// A tuple of (base_buffer_count, base_buffer_capacity)
144
- #[inline]
145
- fn get_format_base_config(mime_type: &str) -> (usize, usize) {
146
- match mime_type {
147
- "text/plain" | "text/markdown" | "text/x-markdown" => (2, 4096),
148
- "text/csv" | "text/tab-separated-values" => (3, 8192),
149
-
150
- "text/html" | "text/html; charset=utf-8" => (8, 16384),
151
-
152
- "application/xml" | "text/xml" => (5, 8192),
153
- "image/svg+xml" => (4, 8192),
154
-
155
- "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
156
- | "application/vnd.openxmlformats-officedocument.wordprocessingml.macro-enabled.document"
157
- | "application/msword" => (5, 8192),
158
- "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
159
- | "application/vnd.openxmlformats-officedocument.spreadsheetml.macro-enabled.sheet"
160
- | "application/vnd.ms-excel" => (4, 8192),
161
- "application/vnd.openxmlformats-officedocument.presentationml.presentation"
162
- | "application/vnd.openxmlformats-officedocument.presentationml.macro-enabled.presentation"
163
- | "application/vnd.ms-powerpoint" => (4, 8192),
164
-
165
- "application/vnd.oasis.opendocument.text" => (5, 8192),
166
- "application/vnd.oasis.opendocument.spreadsheet" => (4, 8192),
167
- "application/vnd.oasis.opendocument.presentation" => (4, 8192),
168
-
169
- "application/pdf" => (6, 16384),
170
-
171
- "application/json" | "text/json" => (4, 8192),
172
- "application/x-yaml" | "text/yaml" | "text/x-yaml" | "application/yaml" => (4, 8192),
173
-
174
- _ => (3, 8192),
175
- }
176
- }
177
-
178
- /// Estimate optimal pool configuration based on document size.
179
- ///
180
- /// Adjusts the base configuration up for larger documents to provide
181
- /// adequate buffering for streaming extraction operations.
182
- ///
183
- /// # Arguments
184
- ///
185
- /// * `file_size` - Size of the file in bytes
186
- /// * `base_count` - Base buffer count from format config
187
- ///
188
- /// # Returns
189
- ///
190
- /// Adjusted buffer count considering file size
191
- #[inline]
192
- fn adjust_for_file_size(file_size: u64, base_count: usize) -> usize {
193
- match file_size {
194
- 0..=100_000 => base_count,
195
- 100_001..=1_000_000 => base_count.saturating_add(2),
196
- 1_000_001..=10_000_000 => base_count.saturating_add(4),
197
- _ => base_count.saturating_add(6),
198
- }
199
- }
200
-
201
- /// Estimate pool capacity based on file size.
202
- ///
203
- /// Larger files benefit from larger buffers to reduce reallocation cycles
204
- /// during extraction.
205
- ///
206
- /// # Arguments
207
- ///
208
- /// * `file_size` - Size of the file in bytes
209
- ///
210
- /// # Returns
211
- ///
212
- /// Recommended buffer capacity in bytes
213
- #[inline]
214
- fn estimate_buffer_capacity(file_size: u64) -> usize {
215
- match file_size {
216
- 0..=10_000 => 1024,
217
- 10_001..=100_000 => 4096,
218
- 100_001..=1_000_000 => 16384,
219
- 1_000_001..=10_000_000 => 65536,
220
- _ => 262144,
221
- }
222
- }
223
-
224
- /// Estimate optimal pool sizing based on file size and document type.
225
- ///
226
- /// This function uses the file size and MIME type to estimate how many
227
- /// buffers and what capacity they should have. The estimates are conservative
228
- /// to avoid starving large document processing.
229
- ///
230
- /// # Arguments
231
- ///
232
- /// * `file_size` - Size of the file in bytes
233
- /// * `mime_type` - MIME type of the document (e.g., "application/pdf")
234
- ///
235
- /// # Returns
236
- ///
237
- /// A `PoolSizeHint` with recommended pool configuration
238
- ///
239
- /// # Example
240
- ///
241
- /// ```rust,ignore
242
- /// use kreuzberg::utils::pool_sizing::estimate_pool_size;
243
- ///
244
- /// let hint = estimate_pool_size(5_000_000, "application/pdf");
245
- /// // PDF at 5MB gets 10 string buffers (base 6 + 4 for size)
246
- /// // of 65KB each (for 1-10MB files)
247
- /// ```
248
- #[inline]
249
- pub fn estimate_pool_size(file_size: u64, mime_type: &str) -> PoolSizeHint {
250
- let format_ratio = get_format_ratio(mime_type);
251
- let (base_count, _base_capacity) = get_format_base_config(mime_type);
252
-
253
- let adjusted_string_buffer_count = adjust_for_file_size(file_size, base_count);
254
-
255
- let buffer_capacity = estimate_buffer_capacity(file_size);
256
-
257
- let estimated_total_size = (file_size as f64 * format_ratio).ceil() as usize;
258
-
259
- let byte_buffer_count = (adjusted_string_buffer_count / 2).max(1);
260
- let byte_buffer_capacity = buffer_capacity * 8;
261
-
262
- PoolSizeHint {
263
- estimated_total_size,
264
- string_buffer_count: adjusted_string_buffer_count,
265
- string_buffer_capacity: buffer_capacity,
266
- byte_buffer_count,
267
- byte_buffer_capacity,
268
- }
269
- }
270
-
271
- #[cfg(test)]
272
- mod tests {
273
- use super::*;
274
-
275
- #[test]
276
- fn test_format_ratio_pdf() {
277
- let ratio = get_format_ratio("application/pdf");
278
- assert_eq!(ratio, 0.25);
279
- }
280
-
281
- #[test]
282
- fn test_format_ratio_html() {
283
- let ratio = get_format_ratio("text/html");
284
- assert_eq!(ratio, 0.65);
285
- }
286
-
287
- #[test]
288
- fn test_format_ratio_docx() {
289
- let ratio = get_format_ratio("application/vnd.openxmlformats-officedocument.wordprocessingml.document");
290
- assert_eq!(ratio, 0.45);
291
- }
292
-
293
- #[test]
294
- fn test_format_ratio_default() {
295
- let ratio = get_format_ratio("application/unknown-format");
296
- assert_eq!(ratio, 0.50);
297
- }
298
-
299
- #[test]
300
- fn test_small_file_sizing() {
301
- let hint = estimate_pool_size(5_000, "application/pdf");
302
- assert_eq!(hint.string_buffer_count, 6);
303
- assert_eq!(hint.string_buffer_capacity, 1024);
304
- }
305
-
306
- #[test]
307
- fn test_medium_file_sizing() {
308
- let hint = estimate_pool_size(500_000, "application/pdf");
309
- assert_eq!(hint.string_buffer_count, 8);
310
- assert_eq!(hint.string_buffer_capacity, 16384);
311
- }
312
-
313
- #[test]
314
- fn test_large_file_sizing() {
315
- let hint = estimate_pool_size(5_000_000, "application/pdf");
316
- assert_eq!(hint.string_buffer_count, 10);
317
- assert_eq!(hint.string_buffer_capacity, 65536);
318
- }
319
-
320
- #[test]
321
- fn test_huge_file_sizing() {
322
- let hint = estimate_pool_size(50_000_000, "application/pdf");
323
- assert_eq!(hint.string_buffer_count, 12);
324
- assert_eq!(hint.string_buffer_capacity, 262144);
325
- }
326
-
327
- #[test]
328
- fn test_html_sizing() {
329
- let hint = estimate_pool_size(1_000_000, "text/html");
330
- assert_eq!(hint.string_buffer_count, 10);
331
- assert_eq!(hint.string_buffer_capacity, 16384);
332
- assert_eq!(hint.estimated_total_size, 650_000);
333
- }
334
-
335
- #[test]
336
- fn test_text_sizing() {
337
- let hint = estimate_pool_size(1_000_000, "text/plain");
338
- assert_eq!(hint.string_buffer_count, 4);
339
- assert_eq!(hint.estimated_total_size, 950_000);
340
- }
341
-
342
- #[test]
343
- fn test_byte_buffer_sizing() {
344
- let hint = estimate_pool_size(5_000_000, "application/pdf");
345
- assert!(hint.byte_buffer_count < hint.string_buffer_count);
346
- assert_eq!(hint.byte_buffer_capacity, hint.string_buffer_capacity * 8);
347
- }
348
-
349
- #[test]
350
- fn test_total_size_estimation() {
351
- let hint = estimate_pool_size(10_000_000, "application/pdf");
352
- assert_eq!(hint.estimated_total_size, 2_500_000);
353
- }
354
-
355
- #[test]
356
- fn test_xlsx_sizing() {
357
- let hint = estimate_pool_size(
358
- 2_000_000,
359
- "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
360
- );
361
- assert_eq!(hint.estimated_total_size, 800_000);
362
- assert_eq!(hint.string_buffer_count, 8);
363
- }
364
- }