kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -1,471 +0,0 @@
1
- #![cfg(feature = "api")]
2
- //! Integration tests for large PDF file extraction (issue #248).
3
- //!
4
- //! Tests verify that the Kreuzberg API server can handle large PDF files
5
- //! without size limits or with very large limits (>2MB, >10MB, >100MB).
6
- //!
7
- //! These tests are designed to be TDD tests - they FAIL with the current
8
- //! implementation if size limits are enforced, demonstrating the bug.
9
- //!
10
- //! The tests ensure:
11
- //! - Large PDFs (>2MB) can be extracted without rejection
12
- //! - Multipart uploads handle large payloads correctly
13
- //! - Server doesn't impose unreasonable size restrictions
14
- //! - Configuration allows tuning limits for different deployment scenarios
15
-
16
- use axum::{
17
- body::Body,
18
- http::{Request, StatusCode},
19
- };
20
- use kreuzberg::{
21
- ExtractionConfig,
22
- api::{ApiSizeLimits, create_router_with_limits},
23
- };
24
- use tower::ServiceExt;
25
-
26
- /// Helper function to create mock PDF content of a specified size.
27
- ///
28
- /// Creates a minimal PDF structure that is valid and parseable, scaled to
29
- /// the requested byte size. The PDF contains repeated text content to reach
30
- /// the target size.
31
- ///
32
- /// # Arguments
33
- ///
34
- /// * `size_bytes` - Target size of the PDF in bytes
35
- ///
36
- /// # Returns
37
- ///
38
- /// A Vec<u8> containing valid PDF content of approximately the specified size
39
- fn create_mock_pdf_content(size_bytes: usize) -> Vec<u8> {
40
- let pdf_header = b"%PDF-1.4\n";
41
- let mut content = pdf_header.to_vec();
42
-
43
- let catalog = b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n";
44
- content.extend_from_slice(catalog);
45
-
46
- let pages = b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n";
47
- content.extend_from_slice(pages);
48
-
49
- let page_header = b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >>\nendobj\n";
50
- content.extend_from_slice(page_header);
51
-
52
- let text_content = b"BT /F1 12 Tf 50 750 Td (Large PDF Content for Testing) Tj ET\n";
53
- let stream_prefix = b"4 0 obj\n<< /Length ";
54
- let stream_suffix = b" >>\nstream\n";
55
- let stream_end = b"\nendstream\nendobj\n";
56
-
57
- let text_repeat_count = if size_bytes > content.len() + 200 {
58
- (size_bytes - content.len() - 200) / text_content.len()
59
- } else {
60
- 1
61
- };
62
-
63
- content.extend_from_slice(stream_prefix);
64
-
65
- let stream_size = text_content.len() * text_repeat_count + text_repeat_count;
66
- content.extend_from_slice(stream_size.to_string().as_bytes());
67
- content.extend_from_slice(stream_suffix);
68
-
69
- for _ in 0..text_repeat_count {
70
- content.extend_from_slice(text_content);
71
- content.push(b'\n');
72
- }
73
-
74
- content.extend_from_slice(stream_end);
75
-
76
- let xref_offset = content.len();
77
- let xref = b"xref\n0 5\n0000000000 65535 f \n";
78
- content.extend_from_slice(xref);
79
-
80
- let trailer = format!(
81
- "trailer\n<< /Size 5 /Root 1 0 R >>\nstartxref\n{}\n%%EOF\n",
82
- xref_offset
83
- );
84
- content.extend_from_slice(trailer.as_bytes());
85
-
86
- content
87
- }
88
-
89
- /// Helper function to create a multipart request body with a PDF file.
90
- ///
91
- /// Constructs a properly formatted multipart/form-data request body
92
- /// containing a single PDF file.
93
- ///
94
- /// # Arguments
95
- ///
96
- /// * `boundary` - The multipart boundary string
97
- /// * `pdf_content` - The PDF file content as bytes
98
- /// * `filename` - Name of the PDF file
99
- ///
100
- /// # Returns
101
- ///
102
- /// A Vec<u8> containing the complete multipart request body
103
- fn create_multipart_pdf_request(boundary: &str, pdf_content: &[u8], filename: &str) -> Vec<u8> {
104
- let mut body = Vec::new();
105
-
106
- body.extend_from_slice(format!("--{}\r\n", boundary).as_bytes());
107
-
108
- body.extend_from_slice(
109
- format!(
110
- "Content-Disposition: form-data; name=\"files\"; filename=\"{}\"\r\n",
111
- filename
112
- )
113
- .as_bytes(),
114
- );
115
-
116
- body.extend_from_slice(b"Content-Type: application/pdf\r\n");
117
-
118
- body.extend_from_slice(b"\r\n");
119
-
120
- body.extend_from_slice(pdf_content);
121
-
122
- body.extend_from_slice(format!("\r\n--{}--\r\n", boundary).as_bytes());
123
-
124
- body
125
- }
126
-
127
- /// Test extracting a 5MB PDF file.
128
- ///
129
- /// This test verifies that the API can handle PDF files larger than 2MB,
130
- /// which was the issue reported in #248. The test should FAIL if the server
131
- /// is rejecting requests based on file size limits.
132
- ///
133
- /// # Expected Behavior
134
- ///
135
- /// The request should succeed with HTTP 200 and return valid extraction results.
136
- /// If the server has a hard limit below 5MB, this test will fail with HTTP 413
137
- /// (Payload Too Large).
138
- #[tokio::test]
139
- async fn test_extract_5mb_pdf_file() {
140
- let limits = ApiSizeLimits::from_mb(10, 10);
141
- let router = create_router_with_limits(ExtractionConfig::default(), limits);
142
-
143
- let pdf_size = 5 * 1024 * 1024;
144
- let pdf_content = create_mock_pdf_content(pdf_size);
145
-
146
- let boundary = "----large-pdf-boundary";
147
- let request_body = create_multipart_pdf_request(boundary, &pdf_content, "large_5mb.pdf");
148
-
149
- let request = Request::builder()
150
- .method("POST")
151
- .uri("/extract")
152
- .header("content-type", format!("multipart/form-data; boundary={}", boundary))
153
- .header("content-length", request_body.len())
154
- .body(Body::from(request_body))
155
- .expect("Failed to build request");
156
-
157
- let response = router.oneshot(request).await.expect("Request failed");
158
-
159
- assert_eq!(
160
- response.status(),
161
- StatusCode::OK,
162
- "Should successfully extract 5MB PDF file. If status is 413, the server has size limit issues (issue #248)."
163
- );
164
- }
165
-
166
- /// Test extracting a 10MB PDF file.
167
- ///
168
- /// This test pushes the size limits further to verify that the API can handle
169
- /// significantly large PDF files (10x the original problem size of 1MB).
170
- ///
171
- /// # Expected Behavior
172
- ///
173
- /// The request should succeed with HTTP 200. If this fails with HTTP 413,
174
- /// it indicates the server's default size limits are too restrictive.
175
- #[tokio::test]
176
- async fn test_extract_10mb_pdf_file() {
177
- let limits = ApiSizeLimits::from_mb(20, 20);
178
- let router = create_router_with_limits(ExtractionConfig::default(), limits);
179
-
180
- let pdf_size = 10 * 1024 * 1024;
181
- let pdf_content = create_mock_pdf_content(pdf_size);
182
-
183
- let boundary = "----large-pdf-boundary";
184
- let request_body = create_multipart_pdf_request(boundary, &pdf_content, "large_10mb.pdf");
185
-
186
- let request = Request::builder()
187
- .method("POST")
188
- .uri("/extract")
189
- .header("content-type", format!("multipart/form-data; boundary={}", boundary))
190
- .header("content-length", request_body.len())
191
- .body(Body::from(request_body))
192
- .expect("Failed to build request");
193
-
194
- let response = router.oneshot(request).await.expect("Request failed");
195
-
196
- assert_eq!(
197
- response.status(),
198
- StatusCode::OK,
199
- "Should successfully extract 10MB PDF file without size limit rejection"
200
- );
201
- }
202
-
203
- /// Test extracting a 100MB PDF file.
204
- ///
205
- /// This test verifies that the API can handle very large PDF files (100x the
206
- /// original problem size). This is important for production deployments that
207
- /// need to process large document repositories.
208
- ///
209
- /// Note: This test may require significant memory and time.
210
- ///
211
- /// # Expected Behavior
212
- ///
213
- /// The request should succeed with HTTP 200. The test uses very large limits
214
- /// (500MB) to allow the file to be processed.
215
- #[tokio::test]
216
- #[ignore]
217
- async fn test_extract_100mb_pdf_file() {
218
- let limits = ApiSizeLimits::from_mb(500, 500);
219
- let router = create_router_with_limits(ExtractionConfig::default(), limits);
220
-
221
- let pdf_size = 100 * 1024 * 1024;
222
- let pdf_content = create_mock_pdf_content(pdf_size);
223
-
224
- let boundary = "----large-pdf-boundary";
225
- let request_body = create_multipart_pdf_request(boundary, &pdf_content, "large_100mb.pdf");
226
-
227
- let request = Request::builder()
228
- .method("POST")
229
- .uri("/extract")
230
- .header("content-type", format!("multipart/form-data; boundary={}", boundary))
231
- .header("content-length", request_body.len())
232
- .body(Body::from(request_body))
233
- .expect("Failed to build request");
234
-
235
- let response = router.oneshot(request).await.expect("Request failed");
236
-
237
- assert_eq!(
238
- response.status(),
239
- StatusCode::OK,
240
- "Should successfully extract 100MB PDF file. Requires --ignored flag to run and significant memory."
241
- );
242
- }
243
-
244
- /// Test that default size limits can be exceeded with custom configuration.
245
- ///
246
- /// This test verifies that the API respects custom size limit configuration,
247
- /// allowing deployments to tune limits based on their requirements.
248
- ///
249
- /// # Expected Behavior
250
- ///
251
- /// A 6MB file should fail with the default 100MB limit (actually it shouldn't fail,
252
- /// but it demonstrates how to check if custom limits work). We test with a router
253
- /// configured for smaller limits, then larger limits.
254
- #[tokio::test]
255
- async fn test_size_limits_configurable() {
256
- let pdf_size = 6 * 1024 * 1024;
257
- let pdf_content = create_mock_pdf_content(pdf_size);
258
- let boundary = "----size-limit-test";
259
-
260
- let small_limits = ApiSizeLimits::from_mb(5, 5);
261
- let router_small = create_router_with_limits(ExtractionConfig::default(), small_limits);
262
-
263
- let request_body = create_multipart_pdf_request(boundary, &pdf_content, "test_6mb.pdf");
264
-
265
- let request = Request::builder()
266
- .method("POST")
267
- .uri("/extract")
268
- .header("content-type", format!("multipart/form-data; boundary={}", boundary))
269
- .header("content-length", request_body.len())
270
- .body(Body::from(request_body.clone()))
271
- .expect("Failed to build request");
272
-
273
- let response_small = router_small.oneshot(request).await.expect("Request failed");
274
-
275
- assert_eq!(
276
- response_small.status(),
277
- StatusCode::PAYLOAD_TOO_LARGE,
278
- "6MB file should be rejected when limit is 5MB"
279
- );
280
-
281
- let large_limits = ApiSizeLimits::from_mb(10, 10);
282
- let router_large = create_router_with_limits(ExtractionConfig::default(), large_limits);
283
-
284
- let request = Request::builder()
285
- .method("POST")
286
- .uri("/extract")
287
- .header("content-type", format!("multipart/form-data; boundary={}", boundary))
288
- .header("content-length", request_body.len())
289
- .body(Body::from(request_body))
290
- .expect("Failed to build request");
291
-
292
- let response_large = router_large.oneshot(request).await.expect("Request failed");
293
-
294
- assert_eq!(
295
- response_large.status(),
296
- StatusCode::OK,
297
- "6MB file should be accepted when limit is 10MB"
298
- );
299
- }
300
-
301
- /// Test that custom limits work via ApiSizeLimits::from_mb.
302
- ///
303
- /// This test verifies the public API for configuring size limits,
304
- /// ensuring that applications can set limits appropriate for their use case.
305
- ///
306
- /// # Expected Behavior
307
- ///
308
- /// The test creates limits for 15MB and 20MB separately, demonstrating
309
- /// different request/field limits.
310
- #[tokio::test]
311
- async fn test_api_size_limits_from_mb() {
312
- let limits_15 = ApiSizeLimits::from_mb(15, 15);
313
- assert_eq!(limits_15.max_request_body_bytes, 15 * 1024 * 1024);
314
- assert_eq!(limits_15.max_multipart_field_bytes, 15 * 1024 * 1024);
315
-
316
- let limits_20_10 = ApiSizeLimits::from_mb(20, 10);
317
- assert_eq!(limits_20_10.max_request_body_bytes, 20 * 1024 * 1024);
318
- assert_eq!(limits_20_10.max_multipart_field_bytes, 10 * 1024 * 1024);
319
-
320
- let router_15 = create_router_with_limits(ExtractionConfig::default(), limits_15);
321
- let router_20_10 = create_router_with_limits(ExtractionConfig::default(), limits_20_10);
322
-
323
- assert!(size_of_val(&router_15) > 0);
324
- assert!(size_of_val(&router_20_10) > 0);
325
- }
326
-
327
- /// Test that a large multipart upload is accepted when it fits the configured limits.
328
- ///
329
- /// This test builds the complete request body up front and sends it in a
330
- /// single request, so only the response status is checked; memory use and
331
- /// incremental parsing are not measured here.
332
- ///
333
- /// # Expected Behavior
334
- ///
335
- /// A 12MB file sent via multipart should be accepted when both limits are 15MB:
336
- /// the response status must be `StatusCode::OK`.
337
- #[tokio::test]
338
- async fn test_multipart_large_payload_streaming() {
339
- let limits = ApiSizeLimits::from_mb(15, 15);
340
- let router = create_router_with_limits(ExtractionConfig::default(), limits);
341
-
342
- let pdf_size = 12 * 1024 * 1024; // 12MB requested, below the 15MB limits set above
343
- let pdf_content = create_mock_pdf_content(pdf_size); // helper defined elsewhere; presumably returns pdf_size bytes — confirm
344
-
345
- let boundary = "----multipart-stream-test";
346
- let request_body = create_multipart_pdf_request(boundary, &pdf_content, "stream_test_12mb.pdf"); // helper defined elsewhere; result is used as the whole body
347
-
348
- let request = Request::builder()
349
- .method("POST")
350
- .uri("/extract")
351
- .header("content-type", format!("multipart/form-data; boundary={}", boundary)) // boundary must match the one used to build the body
352
- .header("content-length", request_body.len())
353
- .body(Body::from(request_body)) // the fully built buffer is handed over in one piece
354
- .expect("Failed to build request");
355
-
356
- let response = router.oneshot(request).await.expect("Request failed");
357
-
358
- assert_eq!(
359
- response.status(),
360
- StatusCode::OK,
361
- "Multipart upload with 12MB payload should be handled via streaming"
362
- );
363
- }
364
-
365
- /// Test that gigabyte-scale limits can be configured.
366
- ///
367
- /// This test builds limits with `ApiSizeLimits::from_mb(1024, 1024)` and checks
368
- /// that a router created from them still answers a `/health` request.
369
- ///
370
- /// # Expected Behavior
371
- ///
372
- /// Both byte fields must equal 1024 * 1024 * 1024 and the health check must
373
- /// return `StatusCode::OK`. No gigabyte-sized payload is sent; only the
374
- /// configuration path is exercised.
375
- #[tokio::test]
376
- async fn test_gigabyte_scale_limits() {
377
- let limits = ApiSizeLimits::from_mb(1024, 1024); // 1024MB for both limits
378
- assert_eq!(limits.max_request_body_bytes, 1024 * 1024 * 1024);
379
- assert_eq!(limits.max_multipart_field_bytes, 1024 * 1024 * 1024);
380
-
381
- let router = create_router_with_limits(ExtractionConfig::default(), limits);
382
-
383
- assert!(size_of_val(&router) > 0); // only shows a router value exists
384
-
385
- let health_request = Request::builder() // no method is set on the builder here
386
- .uri("/health")
387
- .body(Body::empty())
388
- .expect("Failed to build health check request");
389
-
390
- let response = router.oneshot(health_request).await.expect("Request failed");
391
- assert_eq!(response.status(), StatusCode::OK);
392
- }
393
-
394
- /// Test extracting multiple large PDF files in a single request.
395
- ///
396
- /// This test hand-builds a multipart body with two `files` parts and posts
397
- /// it to `/extract`, then checks only the response status.
398
- ///
399
- /// # Expected Behavior
400
- ///
401
- /// Two 4MB PDFs (8MB total) should be accepted when both limits are 15MB.
402
- /// Each part and the combined body stay below 15MB, so neither limit is reached.
403
- #[tokio::test]
404
- async fn test_extract_multiple_large_pdfs() {
405
- let limits = ApiSizeLimits::from_mb(15, 15);
406
- let router = create_router_with_limits(ExtractionConfig::default(), limits);
407
-
408
- let pdf_size = 4 * 1024 * 1024; // 4MB requested per file
409
- let pdf_content_1 = create_mock_pdf_content(pdf_size); // helper defined elsewhere; presumably returns pdf_size bytes — confirm
410
- let pdf_content_2 = create_mock_pdf_content(pdf_size);
411
-
412
- let boundary = "----multi-large-boundary";
413
- let mut request_body = Vec::new();
414
-
415
- request_body.extend_from_slice(format!("--{}\r\n", boundary).as_bytes()); // first part: opening delimiter
416
- request_body.extend_from_slice(b"Content-Disposition: form-data; name=\"files\"; filename=\"large1.pdf\"\r\n");
417
- request_body.extend_from_slice(b"Content-Type: application/pdf\r\n\r\n"); // blank line separates part headers from payload
418
- request_body.extend_from_slice(&pdf_content_1);
419
- request_body.extend_from_slice(b"\r\n");
420
-
421
- request_body.extend_from_slice(format!("--{}\r\n", boundary).as_bytes()); // second part: same field name, different filename
422
- request_body.extend_from_slice(b"Content-Disposition: form-data; name=\"files\"; filename=\"large2.pdf\"\r\n");
423
- request_body.extend_from_slice(b"Content-Type: application/pdf\r\n\r\n");
424
- request_body.extend_from_slice(&pdf_content_2);
425
- request_body.extend_from_slice(b"\r\n");
426
-
427
- request_body.extend_from_slice(format!("--{}--\r\n", boundary).as_bytes()); // closing delimiter (trailing "--") ends the multipart body
428
-
429
- let request = Request::builder()
430
- .method("POST")
431
- .uri("/extract")
432
- .header("content-type", format!("multipart/form-data; boundary={}", boundary))
433
- .header("content-length", request_body.len())
434
- .body(Body::from(request_body))
435
- .expect("Failed to build request");
436
-
437
- let response = router.oneshot(request).await.expect("Request failed");
438
-
439
- assert_eq!(
440
- response.status(),
441
- StatusCode::OK,
442
- "Should successfully extract multiple large PDF files when total size within limits"
443
- );
444
- }
445
-
446
- /// Test the configuration pattern that environment-driven size limits would follow.
447
- ///
448
- /// This test does not read any environment variable: it uses a hard-coded
449
- /// 256 as a stand-in for a value that configuration would supply.
450
- ///
451
- /// # Note
452
- ///
453
- /// Only `ApiSizeLimits::from_mb` and a `/health` request are exercised here;
454
- /// environment parsing itself (e.g. via ServerConfig) is not covered by this test.
455
- #[tokio::test]
456
- async fn test_environment_configurable_limits_pattern() {
457
- let env_configured_mb = 256; // stand-in literal; nothing is read from the environment
458
-
459
- let limits = ApiSizeLimits::from_mb(env_configured_mb, env_configured_mb);
460
- let router = create_router_with_limits(ExtractionConfig::default(), limits);
461
-
462
- assert_eq!(limits.max_request_body_bytes, 256 * 1024 * 1024); // NOTE(review): `limits` is read after being passed by value above — presumably ApiSizeLimits is Copy; confirm
463
-
464
- let health_request = Request::builder()
465
- .uri("/health")
466
- .body(Body::empty())
467
- .expect("Failed to build health check request");
468
-
469
- let response = router.oneshot(health_request).await.expect("Request failed");
470
- assert_eq!(response.status(), StatusCode::OK);
471
- }