kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -1,977 +0,0 @@
1
- //! C FFI bindings for Kreuzberg document intelligence library.
2
- //!
3
- //! Provides a C-compatible API that can be consumed by Java (Panama FFI),
4
- //! Go (cgo), C# (P/Invoke), Zig, and other languages with C FFI support.
5
-
6
- mod batch_streaming;
7
- mod config;
8
- mod error;
9
- mod extraction;
10
- mod helpers;
11
- mod memory;
12
- mod mime;
13
- mod panic_shield;
14
- mod plugins;
15
- mod result;
16
- mod result_pool;
17
- mod result_view;
18
- mod string_intern;
19
- mod types;
20
- mod util;
21
- mod validation;
22
-
23
- pub use batch_streaming::{
24
- ErrorCallback, ResultCallback, kreuzberg_extract_batch_parallel, kreuzberg_extract_batch_streaming,
25
- };
26
- pub use config::{
27
- kreuzberg_config_discover, kreuzberg_config_free, kreuzberg_config_from_file, kreuzberg_config_from_json,
28
- kreuzberg_config_get_field, kreuzberg_config_is_valid, kreuzberg_config_merge, kreuzberg_config_to_json,
29
- kreuzberg_get_embedding_preset, kreuzberg_list_embedding_presets, kreuzberg_load_extraction_config_from_file,
30
- };
31
- pub use error::ErrorCode as KreuzbergErrorCode;
32
- pub use error::{
33
- CErrorDetails, kreuzberg_classify_error, kreuzberg_error_code_count, kreuzberg_error_code_description,
34
- kreuzberg_error_code_internal, kreuzberg_error_code_io, kreuzberg_error_code_missing_dependency,
35
- kreuzberg_error_code_name, kreuzberg_error_code_ocr, kreuzberg_error_code_parsing, kreuzberg_error_code_plugin,
36
- kreuzberg_error_code_unsupported_format, kreuzberg_error_code_validation, kreuzberg_get_error_details,
37
- };
38
- pub use extraction::{
39
- kreuzberg_batch_extract_bytes_sync, kreuzberg_batch_extract_files_sync, kreuzberg_extract_bytes_sync,
40
- kreuzberg_extract_bytes_sync_with_config, kreuzberg_extract_file_sync, kreuzberg_extract_file_sync_with_config,
41
- };
42
- pub use helpers::*;
43
- pub use memory::{kreuzberg_clone_string, kreuzberg_free_batch_result, kreuzberg_free_result, kreuzberg_free_string};
44
- pub use mime::{
45
- kreuzberg_detect_mime_type, kreuzberg_detect_mime_type_from_bytes, kreuzberg_detect_mime_type_from_path,
46
- kreuzberg_get_extensions_for_mime, kreuzberg_validate_mime_type,
47
- };
48
- pub use panic_shield::{
49
- ErrorCode, StructuredError, clear_structured_error, get_last_error_code, get_last_error_message,
50
- get_last_panic_context, set_structured_error,
51
- };
52
- pub use plugins::*;
53
- pub use result::{
54
- CMetadataField, kreuzberg_result_get_chunk_count, kreuzberg_result_get_detected_language,
55
- kreuzberg_result_get_metadata_field, kreuzberg_result_get_page_count,
56
- };
57
- pub use result_pool::{
58
- CResultPoolStats, ResultPool, kreuzberg_extract_file_into_pool, kreuzberg_extract_file_into_pool_view,
59
- kreuzberg_result_pool_free, kreuzberg_result_pool_new, kreuzberg_result_pool_reset, kreuzberg_result_pool_stats,
60
- };
61
- pub use result_view::{
62
- CExtractionResultView, kreuzberg_get_result_view, kreuzberg_view_get_content, kreuzberg_view_get_mime_type,
63
- };
64
- pub use string_intern::{
65
- CStringInternStats, kreuzberg_free_interned_string, kreuzberg_intern_string, kreuzberg_string_intern_reset,
66
- kreuzberg_string_intern_stats,
67
- };
68
- pub use types::*;
69
- pub use util::{kreuzberg_last_error, kreuzberg_last_error_code, kreuzberg_last_panic_context, kreuzberg_version};
70
- pub use validation::*;
71
-
72
- #[cfg(test)]
73
- mod tests {
74
- use super::*;
75
- use std::ffi::{CStr, CString};
76
- use std::os::raw::c_char;
77
- use std::ptr;
78
-
79
- #[test]
80
- fn test_version() {
81
- unsafe {
82
- let version = kreuzberg_version();
83
- assert!(!version.is_null());
84
- let version_str = CStr::from_ptr(version).to_str().unwrap();
85
- assert!(!version_str.is_empty());
86
- }
87
- }
88
-
89
- #[test]
90
- fn test_null_path() {
91
- unsafe {
92
- let result = kreuzberg_extract_file_sync(ptr::null());
93
- assert!(result.is_null());
94
-
95
- let error = kreuzberg_last_error();
96
- assert!(!error.is_null());
97
- let error_str = CStr::from_ptr(error).to_str().unwrap();
98
- assert!(error_str.contains("NULL"));
99
- }
100
- }
101
-
102
- #[test]
103
- fn test_nonexistent_file() {
104
- unsafe {
105
- let path = CString::new("/nonexistent/file.pdf").unwrap();
106
- let result = kreuzberg_extract_file_sync(path.as_ptr());
107
- assert!(result.is_null());
108
-
109
- let error = kreuzberg_last_error();
110
- assert!(!error.is_null());
111
- }
112
- }
113
-
114
- // ==================== Struct Layout Tests ====================
115
-
116
- #[test]
117
- fn test_cextraction_result_layout() {
118
- // Test size
119
- assert_eq!(
120
- std::mem::size_of::<CExtractionResult>(),
121
- 104,
122
- "CExtractionResult must be exactly 104 bytes"
123
- );
124
-
125
- // Test alignment
126
- assert_eq!(
127
- std::mem::align_of::<CExtractionResult>(),
128
- 8,
129
- "CExtractionResult must be 8-byte aligned"
130
- );
131
- }
132
-
133
- #[test]
134
- fn test_cbatch_result_layout() {
135
- // Test size
136
- assert_eq!(
137
- std::mem::size_of::<CBatchResult>(),
138
- 24,
139
- "CBatchResult must be exactly 24 bytes"
140
- );
141
-
142
- // Test alignment
143
- assert_eq!(
144
- std::mem::align_of::<CBatchResult>(),
145
- 8,
146
- "CBatchResult must be 8-byte aligned"
147
- );
148
- }
149
-
150
- #[test]
151
- fn test_cbytes_with_mime_layout() {
152
- // Test size
153
- assert_eq!(
154
- std::mem::size_of::<CBytesWithMime>(),
155
- 24,
156
- "CBytesWithMime must be exactly 24 bytes"
157
- );
158
-
159
- // Test alignment
160
- assert_eq!(
161
- std::mem::align_of::<CBytesWithMime>(),
162
- 8,
163
- "CBytesWithMime must be 8-byte aligned"
164
- );
165
- }
166
-
167
- // ==================== Memory Safety Tests ====================
168
-
169
- /// Helper function to create a mock CExtractionResult for testing
170
- fn create_mock_extraction_result() -> *mut CExtractionResult {
171
- Box::into_raw(Box::new(CExtractionResult {
172
- content: CString::new("test content").unwrap().into_raw(),
173
- mime_type: CString::new("text/plain").unwrap().into_raw(),
174
- language: CString::new("en").unwrap().into_raw(),
175
- date: ptr::null_mut(),
176
- subject: ptr::null_mut(),
177
- tables_json: ptr::null_mut(),
178
- detected_languages_json: ptr::null_mut(),
179
- metadata_json: ptr::null_mut(),
180
- chunks_json: ptr::null_mut(),
181
- images_json: ptr::null_mut(),
182
- page_structure_json: ptr::null_mut(),
183
- pages_json: ptr::null_mut(),
184
- success: true,
185
- _padding1: [0u8; 7],
186
- }))
187
- }
188
-
189
- #[test]
190
- fn test_batch_result_allocation_deallocation() {
191
- unsafe {
192
- // Simulate the exact allocation pattern from kreuzberg_batch_extract_files_sync
193
- let c_results = vec![
194
- create_mock_extraction_result(),
195
- create_mock_extraction_result(),
196
- create_mock_extraction_result(),
197
- ];
198
-
199
- let actual_count = c_results.len();
200
-
201
- // This is the exact pattern used in kreuzberg_batch_extract_files_sync
202
- let results_array = c_results.into_boxed_slice();
203
- let results_ptr = Box::into_raw(results_array) as *mut *mut CExtractionResult;
204
-
205
- let batch_result = Box::into_raw(Box::new(CBatchResult {
206
- results: results_ptr,
207
- count: actual_count,
208
- success: true,
209
- _padding2: [0u8; 7],
210
- }));
211
-
212
- // Verify the batch result is valid
213
- assert!(!batch_result.is_null());
214
- assert_eq!((*batch_result).count, 3);
215
- assert!((*batch_result).success);
216
-
217
- // Now free it using the public API
218
- kreuzberg_free_batch_result(batch_result);
219
-
220
- // If we got here without crashing, the allocation/deallocation pattern is correct
221
- }
222
- }
223
-
224
- #[test]
225
- fn test_free_null_batch() {
226
- unsafe {
227
- // Freeing NULL batch should not crash
228
- kreuzberg_free_batch_result(ptr::null_mut());
229
- }
230
- }
231
-
232
- #[test]
233
- fn test_free_null_result() {
234
- unsafe {
235
- // Freeing NULL result should not crash
236
- kreuzberg_free_result(ptr::null_mut());
237
- }
238
- }
239
-
240
- #[test]
241
- fn test_free_null_string() {
242
- unsafe {
243
- // Freeing NULL string should not crash
244
- kreuzberg_free_string(ptr::null_mut());
245
- }
246
- }
247
-
248
- #[test]
249
- fn test_batch_result_with_empty_results() {
250
- unsafe {
251
- // Test batch result with zero results
252
- let c_results: Vec<*mut CExtractionResult> = Vec::new();
253
- let results_array = c_results.into_boxed_slice();
254
- let results_ptr = Box::into_raw(results_array) as *mut *mut CExtractionResult;
255
-
256
- let batch_result = Box::into_raw(Box::new(CBatchResult {
257
- results: results_ptr,
258
- count: 0,
259
- success: true,
260
- _padding2: [0u8; 7],
261
- }));
262
-
263
- assert!(!batch_result.is_null());
264
- assert_eq!((*batch_result).count, 0);
265
-
266
- // Free should handle empty batch gracefully
267
- kreuzberg_free_batch_result(batch_result);
268
- }
269
- }
270
-
271
- #[test]
272
- fn test_batch_result_with_null_elements() {
273
- unsafe {
274
- // Test batch result where some elements are NULL
275
- let c_results = vec![
276
- create_mock_extraction_result(),
277
- ptr::null_mut(), // NULL element
278
- create_mock_extraction_result(),
279
- ];
280
-
281
- let actual_count = c_results.len();
282
- let results_array = c_results.into_boxed_slice();
283
- let results_ptr = Box::into_raw(results_array) as *mut *mut CExtractionResult;
284
-
285
- let batch_result = Box::into_raw(Box::new(CBatchResult {
286
- results: results_ptr,
287
- count: actual_count,
288
- success: true,
289
- _padding2: [0u8; 7],
290
- }));
291
-
292
- // Free should handle NULL elements gracefully
293
- kreuzberg_free_batch_result(batch_result);
294
- }
295
- }
296
-
297
- #[test]
298
- fn test_batch_result_single_element() {
299
- unsafe {
300
- // Test batch allocation/deallocation with exactly 1 result
301
- // This is important for boundary condition testing
302
- let c_results = vec![create_mock_extraction_result()];
303
-
304
- let actual_count = c_results.len();
305
- let results_array = c_results.into_boxed_slice();
306
- let results_ptr = Box::into_raw(results_array) as *mut *mut CExtractionResult;
307
-
308
- let batch_result = Box::into_raw(Box::new(CBatchResult {
309
- results: results_ptr,
310
- count: actual_count,
311
- success: true,
312
- _padding2: [0u8; 7],
313
- }));
314
-
315
- // Verify the batch result is valid
316
- assert!(!batch_result.is_null());
317
- assert_eq!((*batch_result).count, 1);
318
- assert!((*batch_result).success);
319
-
320
- // Free should handle single-element batch correctly
321
- kreuzberg_free_batch_result(batch_result);
322
- }
323
- }
324
-
325
- #[test]
326
- fn test_batch_result_large_size() {
327
- unsafe {
328
- // Test batch with 100 elements to catch boundary conditions
329
- // This verifies the system can handle larger batches without memory corruption
330
- let mut c_results = Vec::with_capacity(100);
331
-
332
- for _ in 0..100 {
333
- c_results.push(create_mock_extraction_result());
334
- }
335
-
336
- let actual_count = c_results.len();
337
- let results_array = c_results.into_boxed_slice();
338
- let results_ptr = Box::into_raw(results_array) as *mut *mut CExtractionResult;
339
-
340
- let batch_result = Box::into_raw(Box::new(CBatchResult {
341
- results: results_ptr,
342
- count: actual_count,
343
- success: true,
344
- _padding2: [0u8; 7],
345
- }));
346
-
347
- // Verify the batch result is valid
348
- assert!(!batch_result.is_null());
349
- assert_eq!((*batch_result).count, 100);
350
- assert!((*batch_result).success);
351
-
352
- // Free should handle large batch correctly without memory issues
353
- kreuzberg_free_batch_result(batch_result);
354
- }
355
- }
356
-
357
- #[test]
358
- fn test_repeated_allocation_deallocation() {
359
- unsafe {
360
- // Stress test: 1000 iterations of allocation/deallocation
361
- // This catches memory leaks, corruption, and use-after-free issues
362
- for _ in 0..1000 {
363
- let result = create_mock_extraction_result();
364
-
365
- // Verify the result is valid
366
- assert!(!result.is_null());
367
- assert!((*result).success);
368
-
369
- // Free the result
370
- kreuzberg_free_result(result);
371
- }
372
-
373
- // If we got here without crashing or leaking, the memory management is sound
374
- }
375
- }
376
-
377
- // ==================== Box/Vec Symmetry Test ====================
378
-
379
- #[test]
380
- fn test_box_vec_symmetry() {
381
- unsafe {
382
- // This test verifies the fix for the allocation/deallocation bug
383
- // Create a Vec, convert to boxed slice, then verify correct deallocation
384
-
385
- let mut vec = Vec::with_capacity(5);
386
- vec.push(42u32);
387
- vec.push(100u32);
388
- vec.push(255u32);
389
-
390
- let len = vec.len();
391
-
392
- // Convert to boxed slice (this is what kreuzberg_batch_extract_files_sync does)
393
- let boxed_slice = vec.into_boxed_slice();
394
- let raw_ptr = Box::into_raw(boxed_slice) as *mut u32;
395
-
396
- // Verify we can read the values
397
- assert_eq!(*raw_ptr.add(0), 42);
398
- assert_eq!(*raw_ptr.add(1), 100);
399
- assert_eq!(*raw_ptr.add(2), 255);
400
-
401
- // Now deallocate using the correct method (from kreuzberg_free_batch_result)
402
- // IMPORTANT: Must use Box::from_raw with slice pointer, not Vec::from_raw_parts
403
- let _boxed_slice = Box::from_raw(std::ptr::slice_from_raw_parts_mut(raw_ptr, len));
404
-
405
- // If we got here without crashing, the symmetry is correct
406
- }
407
- }
408
-
409
- #[test]
410
- fn test_box_vec_symmetry_pointers() {
411
- unsafe {
412
- // Test with pointer types (as used in CBatchResult)
413
- let vec: Vec<*mut CExtractionResult> = vec![
414
- create_mock_extraction_result(),
415
- create_mock_extraction_result(),
416
- create_mock_extraction_result(),
417
- ];
418
-
419
- let len = vec.len();
420
-
421
- // Convert to boxed slice
422
- let boxed_slice = vec.into_boxed_slice();
423
- let raw_ptr = Box::into_raw(boxed_slice) as *mut *mut CExtractionResult;
424
-
425
- // Free individual results first
426
- for i in 0..len {
427
- let result_ptr = *raw_ptr.add(i);
428
- if !result_ptr.is_null() {
429
- kreuzberg_free_result(result_ptr);
430
- }
431
- }
432
-
433
- // Now free the array itself
434
- let _boxed_slice = Box::from_raw(std::ptr::slice_from_raw_parts_mut(raw_ptr, len));
435
-
436
- // If we got here without crashing, the symmetry is correct
437
- }
438
- }
439
-
440
- // ==================== FFI Function Smoke Tests ====================
441
-
442
- #[test]
443
- fn test_version_not_null() {
444
- unsafe {
445
- let version = kreuzberg_version();
446
- assert!(!version.is_null(), "Version string should not be NULL");
447
-
448
- let version_str = CStr::from_ptr(version).to_str().unwrap();
449
- assert!(!version_str.is_empty(), "Version string should not be empty");
450
-
451
- // Verify it looks like a version string (has dots or numbers)
452
- assert!(
453
- version_str.contains('.') || version_str.chars().any(|c| c.is_numeric()),
454
- "Version string should contain version info"
455
- );
456
- }
457
- }
458
-
459
- #[test]
460
- fn test_null_config_handling() {
461
- unsafe {
462
- // Test that functions handle NULL config gracefully
463
- // kreuzberg_batch_extract_files_sync with NULL config should use defaults
464
-
465
- // Create a valid file paths array
466
- let path1 = CString::new("/tmp/test1.txt").unwrap();
467
- let path2 = CString::new("/tmp/test2.txt").unwrap();
468
- let paths = [path1.as_ptr(), path2.as_ptr()];
469
-
470
- // This should not crash with NULL config (though it may fail due to missing files)
471
- let result = kreuzberg_batch_extract_files_sync(paths.as_ptr(), 2, ptr::null());
472
-
473
- // Result might be NULL due to file not existing, but it shouldn't crash
474
- if !result.is_null() {
475
- kreuzberg_free_batch_result(result);
476
- }
477
- }
478
- }
479
-
480
- #[test]
481
- fn test_extraction_result_free_with_null_fields() {
482
- unsafe {
483
- // Test freeing a result where most fields are NULL
484
- let result = Box::into_raw(Box::new(CExtractionResult {
485
- content: CString::new("content").unwrap().into_raw(),
486
- mime_type: CString::new("text/plain").unwrap().into_raw(),
487
- language: ptr::null_mut(),
488
- date: ptr::null_mut(),
489
- subject: ptr::null_mut(),
490
- tables_json: ptr::null_mut(),
491
- detected_languages_json: ptr::null_mut(),
492
- metadata_json: ptr::null_mut(),
493
- chunks_json: ptr::null_mut(),
494
- images_json: ptr::null_mut(),
495
- page_structure_json: ptr::null_mut(),
496
- pages_json: ptr::null_mut(),
497
- success: true,
498
- _padding1: [0u8; 7],
499
- }));
500
-
501
- // Should not crash when freeing result with NULL fields
502
- kreuzberg_free_result(result);
503
- }
504
- }
505
-
506
- #[test]
507
- fn test_extraction_result_free_all_fields_allocated() {
508
- unsafe {
509
- // Test freeing a result where ALL 12 string fields are allocated
510
- // This verifies that kreuzberg_free_result properly frees all fields
511
- let result = Box::into_raw(Box::new(CExtractionResult {
512
- content: CString::new("test content").unwrap().into_raw(),
513
- mime_type: CString::new("application/pdf").unwrap().into_raw(),
514
- language: CString::new("en").unwrap().into_raw(),
515
- date: CString::new("2024-01-01").unwrap().into_raw(),
516
- subject: CString::new("Test Subject").unwrap().into_raw(),
517
- tables_json: CString::new("[]").unwrap().into_raw(),
518
- detected_languages_json: CString::new("[\"en\"]").unwrap().into_raw(),
519
- metadata_json: CString::new("{}").unwrap().into_raw(),
520
- chunks_json: CString::new("[{\"text\":\"chunk1\"}]").unwrap().into_raw(),
521
- images_json: CString::new("[{\"data\":\"base64\"}]").unwrap().into_raw(),
522
- page_structure_json: CString::new("{\"pages\":1}").unwrap().into_raw(),
523
- pages_json: CString::new("[{\"page\":1,\"content\":\"test\"}]").unwrap().into_raw(),
524
- success: true,
525
- _padding1: [0u8; 7],
526
- }));
527
-
528
- // Should properly free all 12 allocated string fields without leaking memory
529
- kreuzberg_free_result(result);
530
- }
531
- }
532
-
533
- #[test]
534
- fn test_string_allocation_deallocation() {
535
- unsafe {
536
- // Test string cloning and freeing
537
- let original = CString::new("test string").unwrap();
538
- let cloned = kreuzberg_clone_string(original.as_ptr());
539
-
540
- assert!(!cloned.is_null(), "Cloned string should not be NULL");
541
-
542
- let cloned_str = CStr::from_ptr(cloned).to_str().unwrap();
543
- assert_eq!(cloned_str, "test string", "Cloned string should match original");
544
-
545
- // Free the cloned string
546
- kreuzberg_free_string(cloned);
547
- }
548
- }
549
-
550
- #[test]
551
- fn test_clone_null_string() {
552
- unsafe {
553
- // Cloning NULL should return NULL and set error
554
- clear_last_error();
555
- let cloned = kreuzberg_clone_string(ptr::null());
556
-
557
- assert!(cloned.is_null(), "Cloning NULL should return NULL");
558
-
559
- let error = kreuzberg_last_error();
560
- assert!(!error.is_null(), "Error should be set");
561
- let error_str = CStr::from_ptr(error).to_str().unwrap();
562
- assert!(error_str.contains("NULL"), "Error should mention NULL");
563
- }
564
- }
565
-
566
- #[test]
567
- fn test_batch_result_success_field() {
568
- unsafe {
569
- // Test that success field is properly set
570
- let c_results: Vec<*mut CExtractionResult> = Vec::new();
571
- let results_array = c_results.into_boxed_slice();
572
- let results_ptr = Box::into_raw(results_array) as *mut *mut CExtractionResult;
573
-
574
- let batch_result = Box::into_raw(Box::new(CBatchResult {
575
- results: results_ptr,
576
- count: 0,
577
- success: true,
578
- _padding2: [0u8; 7],
579
- }));
580
-
581
- assert!((*batch_result).success, "Success field should be true");
582
-
583
- kreuzberg_free_batch_result(batch_result);
584
- }
585
- }
586
-
587
- #[test]
588
- fn test_last_error_cleared() {
589
- unsafe {
590
- // Test that clear_last_error works
591
- set_last_error("test error".to_string());
592
-
593
- let error = kreuzberg_last_error();
594
- assert!(!error.is_null());
595
-
596
- clear_last_error();
597
-
598
- let error_after = kreuzberg_last_error();
599
- assert!(error_after.is_null(), "Error should be cleared");
600
- }
601
- }
602
-
603
- // ==================== Additional Safety Net Tests (PR #1) ====================
604
-
605
- /// Test CExtractionResult size exactly matches FFI contract
606
- #[test]
607
- fn test_c_extraction_result_size() {
608
- assert_eq!(std::mem::size_of::<CExtractionResult>(), 104);
609
- assert_eq!(std::mem::align_of::<CExtractionResult>(), 8);
610
- }
611
-
612
- /// Test CBatchResult size exactly matches FFI contract
613
- #[test]
614
- fn test_c_batch_result_size() {
615
- assert_eq!(std::mem::size_of::<CBatchResult>(), 24);
616
- assert_eq!(std::mem::align_of::<CBatchResult>(), 8);
617
- }
618
-
619
- /// Test CBytesWithMime size exactly matches FFI contract
620
- #[test]
621
- fn test_c_bytes_with_mime_size() {
622
- assert_eq!(std::mem::size_of::<CBytesWithMime>(), 24);
623
- assert_eq!(std::mem::align_of::<CBytesWithMime>(), 8);
624
- }
625
-
626
- /// Test that kreuzberg_extract_bytes_sync handles NULL data pointer
627
- #[test]
628
- fn test_extract_bytes_null_data() {
629
- unsafe {
630
- let mime = CString::new("text/plain").unwrap();
631
- let result = kreuzberg_extract_bytes_sync(ptr::null(), 0, mime.as_ptr());
632
- assert!(result.is_null(), "Should return NULL for NULL data pointer");
633
- }
634
- }
635
-
636
- /// Test that kreuzberg_extract_bytes_sync handles NULL mime type
637
- #[test]
638
- fn test_extract_bytes_null_mime() {
639
- unsafe {
640
- let data = b"test data";
641
- let result = kreuzberg_extract_bytes_sync(data.as_ptr(), data.len(), ptr::null());
642
- assert!(result.is_null(), "Should return NULL for NULL mime type");
643
- }
644
- }
645
-
646
- /// Test that kreuzberg_batch_extract_files_sync handles NULL paths pointer
647
- #[test]
648
- fn test_batch_extract_null_paths() {
649
- unsafe {
650
- let result = kreuzberg_batch_extract_files_sync(ptr::null(), 0, ptr::null());
651
- assert!(result.is_null(), "Should return NULL for NULL paths pointer");
652
- }
653
- }
654
-
655
- /// Test that kreuzberg_batch_extract_bytes_sync handles NULL bytes pointer
656
- #[test]
657
- fn test_batch_extract_bytes_null() {
658
- unsafe {
659
- let result = kreuzberg_batch_extract_bytes_sync(ptr::null(), 0, ptr::null());
660
- assert!(result.is_null(), "Should return NULL for NULL bytes pointer");
661
- }
662
- }
663
-
664
- /// Test that kreuzberg_register_ocr_backend handles NULL name
665
- #[test]
666
- fn test_register_ocr_backend_null_name() {
667
- unsafe {
668
- extern "C" fn dummy_callback(_: *const u8, _: usize, _: *const c_char) -> *mut c_char {
669
- ptr::null_mut()
670
- }
671
- let result = kreuzberg_register_ocr_backend(ptr::null(), dummy_callback);
672
- assert!(!result, "Should return false for NULL backend name");
673
- }
674
- }
675
-
676
- /// Test that kreuzberg_unregister_ocr_backend handles NULL name
677
- #[test]
678
- fn test_unregister_ocr_backend_null_name() {
679
- unsafe {
680
- let result = kreuzberg_unregister_ocr_backend(ptr::null());
681
- assert!(!result, "Should return false for NULL backend name");
682
- }
683
- }
684
-
685
- /// Test that kreuzberg_register_post_processor handles NULL name
686
- #[test]
687
- fn test_register_post_processor_null_name() {
688
- unsafe {
689
- extern "C" fn dummy_callback(_: *const c_char) -> *mut c_char {
690
- ptr::null_mut()
691
- }
692
- let result = kreuzberg_register_post_processor(ptr::null(), dummy_callback, 0);
693
- assert!(!result, "Should return false for NULL processor name");
694
- }
695
- }
696
-
697
- /// Test that kreuzberg_unregister_post_processor handles NULL name
698
- #[test]
699
- fn test_unregister_post_processor_null_name() {
700
- unsafe {
701
- let result = kreuzberg_unregister_post_processor(ptr::null());
702
- assert!(!result, "Should return false for NULL processor name");
703
- }
704
- }
705
-
706
- /// Test that kreuzberg_register_validator handles NULL name
707
- #[test]
708
- fn test_register_validator_null_name() {
709
- unsafe {
710
- extern "C" fn dummy_callback(_: *const c_char) -> *mut c_char {
711
- ptr::null_mut()
712
- }
713
- let result = kreuzberg_register_validator(ptr::null(), dummy_callback, 0);
714
- assert!(!result, "Should return false for NULL validator name");
715
- }
716
- }
717
-
718
- /// Test that kreuzberg_unregister_validator handles NULL name
719
- #[test]
720
- fn test_unregister_validator_null_name() {
721
- unsafe {
722
- let result = kreuzberg_unregister_validator(ptr::null());
723
- assert!(!result, "Should return false for NULL validator name");
724
- }
725
- }
726
-
727
- /// Test that kreuzberg_get_ocr_languages handles NULL backend
728
- #[test]
729
- fn test_get_ocr_languages_null_backend() {
730
- unsafe {
731
- let result = kreuzberg_get_ocr_languages(ptr::null());
732
- assert!(result.is_null(), "Should return NULL for NULL backend name");
733
- }
734
- }
735
-
736
- /// Test that kreuzberg_is_language_supported handles NULL backend
737
- #[test]
738
- fn test_is_language_supported_null_backend() {
739
- unsafe {
740
- let lang = CString::new("en").unwrap();
741
- let result = kreuzberg_is_language_supported(ptr::null(), lang.as_ptr());
742
- assert_eq!(result, 0, "Should return 0 (false) for NULL backend");
743
- }
744
- }
745
-
746
- /// Test that kreuzberg_is_language_supported handles NULL language
747
- #[test]
748
- fn test_is_language_supported_null_language() {
749
- unsafe {
750
- let backend = CString::new("tesseract").unwrap();
751
- let result = kreuzberg_is_language_supported(backend.as_ptr(), ptr::null());
752
- assert_eq!(result, 0, "Should return 0 (false) for NULL language");
753
- }
754
- }
755
-
756
- /// Test that kreuzberg_validate_binarization_method handles NULL
757
- #[test]
758
- fn test_validate_binarization_method_null() {
759
- unsafe {
760
- let result = kreuzberg_validate_binarization_method(ptr::null());
761
- assert_eq!(result, 0, "Should return 0 (invalid) for NULL method");
762
- }
763
- }
764
-
765
- /// Test that kreuzberg_validate_token_reduction_level handles NULL
766
- #[test]
767
- fn test_validate_token_reduction_level_null() {
768
- unsafe {
769
- let result = kreuzberg_validate_token_reduction_level(ptr::null());
770
- assert_eq!(result, 0, "Should return 0 (invalid) for NULL level");
771
- }
772
- }
773
-
774
- /// Test that kreuzberg_validate_ocr_backend handles NULL
775
- #[test]
776
- fn test_validate_ocr_backend_null() {
777
- unsafe {
778
- let result = kreuzberg_validate_ocr_backend(ptr::null());
779
- assert_eq!(result, 0, "Should return 0 (invalid) for NULL backend");
780
- }
781
- }
782
-
783
- /// Test that kreuzberg_validate_language_code handles NULL
784
- #[test]
785
- fn test_validate_language_code_null() {
786
- unsafe {
787
- let result = kreuzberg_validate_language_code(ptr::null());
788
- assert_eq!(result, 0, "Should return 0 (invalid) for NULL language code");
789
- }
790
- }
791
-
792
- /// Test that kreuzberg_validate_output_format handles NULL
793
- #[test]
794
- fn test_validate_output_format_null() {
795
- unsafe {
796
- let result = kreuzberg_validate_output_format(ptr::null());
797
- assert_eq!(result, 0, "Should return 0 (invalid) for NULL format");
798
- }
799
- }
800
-
801
- /// Test that kreuzberg_version returns non-null
802
- #[test]
803
- fn test_version_returns_non_null() {
804
- unsafe {
805
- let version = kreuzberg_version();
806
- assert!(!version.is_null(), "kreuzberg_version should never return NULL");
807
- let version_str = CStr::from_ptr(version).to_str().unwrap();
808
- assert!(!version_str.is_empty(), "Version string should not be empty");
809
- }
810
- }
811
-
812
- /// Test that kreuzberg_last_error returns NULL when no error
813
- #[test]
814
- fn test_last_error_null_when_no_error() {
815
- unsafe {
816
- clear_last_error();
817
- let error = kreuzberg_last_error();
818
- assert!(error.is_null(), "Should return NULL when no error is set");
819
- }
820
- }
821
-
822
- /// Test that kreuzberg_clone_string returns non-null for valid input
823
- #[test]
824
- fn test_clone_string_returns_non_null() {
825
- unsafe {
826
- let input = CString::new("test").unwrap();
827
- let cloned = kreuzberg_clone_string(input.as_ptr());
828
- assert!(!cloned.is_null(), "Clone should return non-NULL for valid input");
829
- kreuzberg_free_string(cloned);
830
- }
831
- }
832
-
833
- /// Test clearing OCR backends doesn't crash
834
- #[test]
835
- fn test_clear_ocr_backends_doesnt_crash() {
836
- unsafe {
837
- // This should not crash even if called multiple times
838
- kreuzberg_clear_ocr_backends();
839
- kreuzberg_clear_ocr_backends();
840
- }
841
- }
842
-
843
- /// Test clearing post processors doesn't crash
844
- #[test]
845
- fn test_clear_post_processors_doesnt_crash() {
846
- unsafe {
847
- // This should not crash even if called multiple times
848
- kreuzberg_clear_post_processors();
849
- kreuzberg_clear_post_processors();
850
- }
851
- }
852
-
853
- /// Test clearing validators doesn't crash
854
- #[test]
855
- fn test_clear_validators_doesnt_crash() {
856
- unsafe {
857
- // This should not crash even if called multiple times
858
- kreuzberg_clear_validators();
859
- kreuzberg_clear_validators();
860
- }
861
- }
862
-
863
- /// Test clearing document extractors doesn't crash
864
- #[test]
865
- fn test_clear_document_extractors_doesnt_crash() {
866
- unsafe {
867
- // This should not crash even if called multiple times
868
- kreuzberg_clear_document_extractors();
869
- kreuzberg_clear_document_extractors();
870
- }
871
- }
872
-
873
- /// Test that list functions return non-null JSON arrays
874
- #[test]
875
- fn test_list_functions_return_non_null() {
876
- unsafe {
877
- // All list functions should return non-NULL JSON arrays (even if empty)
878
- let ocr = kreuzberg_list_ocr_backends();
879
- assert!(!ocr.is_null(), "list_ocr_backends should return non-NULL");
880
- kreuzberg_free_string(ocr);
881
-
882
- let processors = kreuzberg_list_post_processors();
883
- assert!(!processors.is_null(), "list_post_processors should return non-NULL");
884
- kreuzberg_free_string(processors);
885
-
886
- let validators = kreuzberg_list_validators();
887
- assert!(!validators.is_null(), "list_validators should return non-NULL");
888
- kreuzberg_free_string(validators);
889
-
890
- let extractors = kreuzberg_list_document_extractors();
891
- assert!(!extractors.is_null(), "list_document_extractors should return non-NULL");
892
- kreuzberg_free_string(extractors);
893
-
894
- let backends_with_langs = kreuzberg_list_ocr_backends_with_languages();
895
- assert!(
896
- !backends_with_langs.is_null(),
897
- "list_ocr_backends_with_languages should return non-NULL"
898
- );
899
- kreuzberg_free_string(backends_with_langs);
900
- }
901
- }
902
-
903
- /// Test numeric validation functions with edge cases
904
- #[test]
905
- fn test_numeric_validation_edge_cases() {
906
- // Test Tesseract PSM validation with invalid values
907
- assert_eq!(
908
- kreuzberg_validate_tesseract_psm(-1),
909
- 0,
910
- "Negative PSM should be invalid"
911
- );
912
- assert_eq!(kreuzberg_validate_tesseract_psm(0), 1, "PSM 0 should be valid");
913
- assert_eq!(kreuzberg_validate_tesseract_psm(13), 1, "PSM 13 should be valid");
914
- assert_eq!(kreuzberg_validate_tesseract_psm(14), 0, "PSM 14 should be invalid");
915
-
916
- // Test Tesseract OEM validation
917
- assert_eq!(
918
- kreuzberg_validate_tesseract_oem(-1),
919
- 0,
920
- "Negative OEM should be invalid"
921
- );
922
- assert_eq!(kreuzberg_validate_tesseract_oem(0), 1, "OEM 0 should be valid");
923
- assert_eq!(kreuzberg_validate_tesseract_oem(3), 1, "OEM 3 should be valid");
924
- assert_eq!(kreuzberg_validate_tesseract_oem(4), 0, "OEM 4 should be invalid");
925
-
926
- // Test confidence validation
927
- assert_eq!(
928
- kreuzberg_validate_confidence(-0.1),
929
- 0,
930
- "Negative confidence should be invalid"
931
- );
932
- assert_eq!(kreuzberg_validate_confidence(0.0), 1, "0.0 confidence should be valid");
933
- assert_eq!(kreuzberg_validate_confidence(0.5), 1, "0.5 confidence should be valid");
934
- assert_eq!(kreuzberg_validate_confidence(1.0), 1, "1.0 confidence should be valid");
935
- assert_eq!(
936
- kreuzberg_validate_confidence(1.1),
937
- 0,
938
- "1.1 confidence should be invalid"
939
- );
940
-
941
- // Test DPI validation
942
- assert_eq!(kreuzberg_validate_dpi(0), 0, "0 DPI should be invalid");
943
- assert_eq!(kreuzberg_validate_dpi(-1), 0, "-1 DPI should be invalid");
944
- assert_eq!(kreuzberg_validate_dpi(1), 1, "1 DPI should be valid");
945
- assert_eq!(kreuzberg_validate_dpi(72), 1, "72 DPI should be valid");
946
- assert_eq!(kreuzberg_validate_dpi(300), 1, "300 DPI should be valid");
947
- assert_eq!(kreuzberg_validate_dpi(2400), 1, "2400 DPI should be valid");
948
- assert_eq!(kreuzberg_validate_dpi(2401), 0, "2401 DPI should be invalid");
949
-
950
- // Test chunking params validation
951
- assert_eq!(
952
- kreuzberg_validate_chunking_params(0, 0),
953
- 0,
954
- "0 max_chars should be invalid"
955
- );
956
- assert_eq!(
957
- kreuzberg_validate_chunking_params(100, 0),
958
- 1,
959
- "Valid params should pass"
960
- );
961
- assert_eq!(
962
- kreuzberg_validate_chunking_params(100, 50),
963
- 1,
964
- "Valid overlap should pass"
965
- );
966
- assert_eq!(
967
- kreuzberg_validate_chunking_params(100, 100),
968
- 0,
969
- "Overlap >= max_chars should be invalid"
970
- );
971
- assert_eq!(
972
- kreuzberg_validate_chunking_params(100, 101),
973
- 0,
974
- "Overlap > max_chars should be invalid"
975
- );
976
- }
977
- }