kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321)
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
data/Rakefile CHANGED
@@ -6,13 +6,6 @@ require 'rspec/core/rake_task'
6
6
 
7
7
  GEMSPEC = Gem::Specification.load(File.expand_path('kreuzberg.gemspec', __dir__))
8
8
 
9
- # Vendor kreuzberg core crates before compilation
10
- task :vendor do
11
- vendor_script = File.expand_path('../../scripts/ci/ruby/vendor-kreuzberg-core.sh', __dir__)
12
- puts 'Vendoring kreuzberg core crates...'
13
- sh "bash #{vendor_script}"
14
- end
15
-
16
9
  Rake::ExtensionTask.new('kreuzberg_rb', GEMSPEC) do |ext|
17
10
  ext.lib_dir = 'lib'
18
11
  ext.ext_dir = 'ext/kreuzberg_rb'
@@ -23,12 +16,10 @@ Rake::ExtensionTask.new('kreuzberg_rb', GEMSPEC) do |ext|
23
16
  x86_64-darwin
24
17
  arm64-darwin
25
18
  x64-mingw32
26
- x64-mingw-ucrt
27
19
  ]
28
20
  end
29
21
 
30
22
  RSpec::Core::RakeTask.new(:spec)
31
23
 
32
- task compile: :vendor
33
24
  task spec: :compile
34
25
  task default: :spec
data/Steepfile CHANGED
@@ -15,23 +15,19 @@ target :lib do
15
15
 
16
16
  # Strategic ignores for steep limitations (not fixable, safe to ignore):
17
17
 
18
- # 1. Sorbet type annotations - Steep doesn't recognize Sorbet's T::Struct and T::Sig
19
- # This file uses Sorbet exclusively for type definitions
20
- ignore 'lib/kreuzberg/types.rb'
21
-
22
- # 2. Struct.new with keyword_init - steep cannot understand implicit attr_readers
18
+ # 1. Struct.new with keyword_init - steep cannot understand implicit attr_readers
23
19
  # defined by Struct.new in blocks (Table and Chunk classes)
24
20
  ignore 'lib/kreuzberg/result.rb'
25
21
 
26
- # 3. Generic type parameters in normalize_config - steep has difficulty with
22
+ # 2. Generic type parameters in normalize_config - steep has difficulty with
27
23
  # methods that take Class as parameter and return instances
28
24
  ignore 'lib/kreuzberg/config.rb'
29
25
 
30
- # 4. Interface types - steep doesn't recognize that all Ruby objects have nil? and is_a?
26
+ # 3. Interface types - steep doesn't recognize that all Ruby objects have nil? and is_a?
31
27
  # even for interface types like _ToH
32
28
  ignore 'lib/kreuzberg/extraction_api.rb'
33
29
 
34
- # 5. Open3 methods - steep's built-in Open3 RBS signatures incomplete
30
+ # 4. Open3 methods - steep's built-in Open3 RBS signatures incomplete
35
31
  # (capture2, capture3, popen3 are standard library methods)
36
32
  ignore 'lib/kreuzberg/setup_lib_path.rb'
37
33
  ignore 'lib/kreuzberg/cli_proxy.rb'
@@ -1,5 +1,10 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ # Async Patterns for Kreuzberg Ruby Bindings
4
+ #
5
+ # This example demonstrates async patterns and concurrency approaches for Ruby,
6
+ # with comparison to the underlying Rust implementation.
7
+
3
8
  require 'kreuzberg'
4
9
 
5
10
  # NOTE: Ruby bindings use Tokio runtime with block_on() internally.
@@ -21,6 +26,8 @@ end
21
26
  # ============================================================================
22
27
 
23
28
  def basic_async_extraction
29
+ # This LOOKS async but actually blocks the Ruby thread
30
+ # Internally uses: runtime.block_on(async { ... })
24
31
  result = Kreuzberg.extract_file('document.pdf')
25
32
  puts "Content: #{result[:content]}"
26
33
  end
@@ -32,6 +39,8 @@ end
32
39
  def concurrent_with_threads
33
40
  files = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf']
34
41
 
42
+ # Use Ruby threads to achieve parallelism
43
+ # Each thread calls the synchronous API
35
44
  threads = files.map do |file|
36
45
  Thread.new do
37
46
  Kreuzberg.extract_file_sync(file)
@@ -51,6 +60,8 @@ end
51
60
  def batch_processing
52
61
  files = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf']
53
62
 
63
+ # The batch API handles concurrency internally via Rust/Tokio
64
+ # This is more efficient than Ruby threads
54
65
  results = Kreuzberg.batch_extract_files_sync(files)
55
66
 
56
67
  puts "Processed #{results.length} files"
@@ -64,6 +75,7 @@ end
64
75
  # ============================================================================
65
76
 
66
77
  def extraction_with_config
78
+ # Configure OCR
67
79
  config = {
68
80
  ocr: {
69
81
  backend: 'tesseract',
@@ -129,8 +141,11 @@ end
129
141
  # Example ActiveJob for async processing in Rails
130
142
  # < ApplicationJob
131
143
  class DocumentExtractionJob
144
+ # queue_as :default
145
+
132
146
  def perform(file_path)
133
147
  result = Kreuzberg.extract_file_sync(file_path)
148
+ # Store result in database or process further
134
149
  puts "Background extraction complete: #{result[:content][0..100]}"
135
150
  end
136
151
  end
@@ -147,6 +162,7 @@ def concurrent_with_parallel_gem
147
162
 
148
163
  files = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf', 'doc4.pdf']
149
164
 
165
+ # Process files in parallel using multiple CPU cores
150
166
  results = Parallel.map(files, in_processes: 4) do |file|
151
167
  Kreuzberg.extract_file_sync(file)
152
168
  end
@@ -176,6 +192,7 @@ end
176
192
  # ============================================================================
177
193
 
178
194
  def register_postprocessor
195
+ # Register a Ruby-based post-processor
179
196
  uppercase_processor = lambda do |result|
180
197
  result[:content] = result[:content].upcase
181
198
  result
@@ -183,9 +200,11 @@ def register_postprocessor
183
200
 
184
201
  Kreuzberg.register_post_processor('uppercase', uppercase_processor, 100)
185
202
 
203
+ # Now all extractions will use the uppercase processor
186
204
  result = Kreuzberg.extract_file_sync('document.pdf')
187
205
  puts "Uppercase content: #{result[:content]}"
188
206
 
207
+ # Clean up
189
208
  Kreuzberg.unregister_post_processor('uppercase')
190
209
  end
191
210
 
@@ -194,12 +213,14 @@ end
194
213
  # ============================================================================
195
214
 
196
215
  def register_validator
216
+ # Register a Ruby-based validator
197
217
  min_length_validator = lambda do |result|
198
218
  raise 'Content too short' if result[:content].length < 100
199
219
  end
200
220
 
201
221
  Kreuzberg.register_validator('min_length', min_length_validator, 100)
202
222
 
223
+ # Validation will run automatically during extraction
203
224
  begin
204
225
  result = Kreuzberg.extract_file_sync('short_document.pdf')
205
226
  puts "Validation passed: #{result[:content]}"
@@ -207,6 +228,7 @@ def register_validator
207
228
  puts "Validation failed: #{e.message}"
208
229
  end
209
230
 
231
+ # Clean up
210
232
  Kreuzberg.unregister_validator('min_length')
211
233
  end
212
234
 
@@ -214,9 +236,12 @@ end
214
236
  # Pattern 15: Custom Ruby OCR Backend Plugin
215
237
  # ============================================================================
216
238
 
217
- # Example OCR backend implementation for custom processing.
218
239
  class CustomOcrBackend
219
240
  def process_image(image_bytes, language)
241
+ # In a real implementation, you would:
242
+ # 1. Call an external OCR service
243
+ # 2. Use an HTTP API
244
+ # 3. Process with a Ruby gem
220
245
  "Extracted text from #{image_bytes.length} bytes using #{language}"
221
246
  end
222
247
 
@@ -229,6 +254,7 @@ def register_ocr_backend
229
254
  backend = CustomOcrBackend.new
230
255
  Kreuzberg.register_ocr_backend('custom', backend)
231
256
 
257
+ # Now you can use the custom backend
232
258
  config = {
233
259
  ocr: {
234
260
  backend: 'custom',
@@ -280,4 +306,35 @@ def main
280
306
  register_validator
281
307
  end
282
308
 
309
+ # Run if executed directly
283
310
  main if __FILE__ == $PROGRAM_NAME
311
+
312
+ # ============================================================================
313
+ # Key Takeaways:
314
+ #
315
+ # 1. Ruby bindings use Tokio runtime with block_on() internally
316
+ # 2. "Async" functions block the Ruby GVL - no concurrency benefit
317
+ # 3. Use _sync variants for clarity (same performance)
318
+ # 4. Use Ruby threads or Parallel gem for concurrent processing
319
+ # 5. Batch API is most efficient for multiple files
320
+ # 6. ActiveJob for background processing in Rails
321
+ # 7. Ruby plugins (PostProcessor, Validator, OCR) are fully supported
322
+ #
323
+ # Performance Comparison:
324
+ # - Magnus: Blocks GVL, same overhead as sync (~Xms per call)
325
+ # - PyO3 (optimized): ~0.17ms overhead, GIL released during await
326
+ # - NAPI-RS: ~0ms overhead, automatic Promise conversion
327
+ #
328
+ # When to Use Ruby Bindings:
329
+ # ✅ Rails applications (ActiveJob for background processing)
330
+ # ✅ Ruby scripts (existing Ruby codebases)
331
+ # ✅ Simple extraction (single-file processing)
332
+ # ✅ Batch processing (batch API handles concurrency)
333
+ #
334
+ # Consider Other Bindings For:
335
+ # ❌ High concurrency (use Node.js/NAPI-RS instead)
336
+ # ❌ Real-time processing (use Node.js/NAPI-RS instead)
337
+ # ❌ I/O-bound workloads (use Python/PyO3 or Node.js/NAPI-RS)
338
+ #
339
+ # See packages/ruby/ext/kreuzberg_rb/native/README.md for detailed async runtime documentation.
340
+ # ============================================================================
@@ -3,57 +3,27 @@
3
3
  require 'mkmf'
4
4
  require 'rb_sys/mkmf'
5
5
  require 'rbconfig'
6
- require 'fileutils'
7
-
8
- if Gem.win_platform?
9
- # Use CI-provided CARGO_TARGET_DIR if available, otherwise use a short path
10
- # GitHub Actions sets CARGO_TARGET_DIR=C:\t for MAX_PATH mitigation
11
- if ENV['CARGO_TARGET_DIR']
12
- puts "Windows detected: Using existing CARGO_TARGET_DIR=#{ENV['CARGO_TARGET_DIR']}"
13
- else
14
- # Try C:\t first (CI convention), fall back to D:/kz-build
15
- short_target_dir = Dir.exist?('C:/t') ? 'C:/t' : 'C:/kz-build'
16
- begin
17
- FileUtils.mkdir_p(short_target_dir)
18
- ENV['CARGO_TARGET_DIR'] = short_target_dir
19
- ENV['OUT_DIR'] = short_target_dir
20
- puts "Windows detected: Using short build path #{short_target_dir}"
21
- rescue StandardError => e
22
- puts "Warning: Could not create short path #{short_target_dir}: #{e.message}"
23
- # Fall back to relative path which rb_sys will handle
24
- end
25
- end
26
- end
27
6
 
28
7
  if /mswin|mingw/.match?(RbConfig::CONFIG['host_os'])
29
8
  devkit = ENV.fetch('RI_DEVKIT', nil)
30
9
  prefix = ENV['MSYSTEM_PREFIX'] || '/ucrt64'
31
-
32
- # Set up include paths for MSVC compatibility headers
33
- native_include = File.expand_path('native/include', __dir__).tr('\\', '/')
34
10
  compat_include = File.expand_path('native/include/msvc_compat', __dir__).tr('\\', '/')
35
11
 
36
12
  extra_args = []
37
- extra_args << "-I#{native_include}"
38
13
  extra_args << "-I#{compat_include}"
39
14
 
40
- # Add Windows-specific flags for better compatibility
41
- extra_args << '-fms-extensions'
42
- extra_args << '-fno-omit-frame-pointer'
43
-
44
15
  if devkit
45
- sysroot = "#{devkit}#{prefix}".tr('\\', '/')
46
- extra_args.push('--target=x86_64-pc-windows-gnu', "--sysroot=#{sysroot}")
16
+ sysroot = "#{devkit}#{prefix}".tr('\\\\', '/')
17
+ extra_args.concat([
18
+ '--target=x86_64-pc-windows-gnu',
19
+ "--sysroot=#{sysroot}"
20
+ ])
47
21
  end
48
22
 
49
23
  unless extra_args.empty?
50
24
  existing = ENV['BINDGEN_EXTRA_CLANG_ARGS'].to_s.split(/\s+/).reject(&:empty?)
51
25
  ENV['BINDGEN_EXTRA_CLANG_ARGS'] = (existing + extra_args).uniq.join(' ')
52
- puts "BINDGEN_EXTRA_CLANG_ARGS set to: #{ENV.fetch('BINDGEN_EXTRA_CLANG_ARGS', nil)}"
53
26
  end
54
-
55
- # Set target for Windows GNU toolchain if not already set
56
- ENV['CARGO_BUILD_TARGET'] ||= 'x86_64-pc-windows-gnu' if devkit || ENV['MSYSTEM']
57
27
  end
58
28
 
59
29
  default_profile = ENV.fetch('CARGO_PROFILE', 'release')
@@ -1,75 +1,36 @@
1
- [workspace]
2
-
3
- [workspace.lints.clippy]
4
- collapsible_if = "allow"
5
-
6
1
  [package]
7
2
  name = "kreuzberg-rb"
8
- version = "4.0.0-rc.28"
3
+ version = "4.0.0-rc.1"
9
4
  edition = "2024"
10
- rust-version = "1.91"
5
+ rust-version = "1.85"
11
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
12
7
  license = "MIT"
13
- repository = "https://github.com/kreuzberg-dev/kreuzberg"
8
+ repository = "https://github.com/Goldziher/kreuzberg"
14
9
  homepage = "https://kreuzberg.dev"
15
10
  documentation = "https://docs.rs/kreuzberg"
16
11
  readme = "README.md"
17
12
  description = "Ruby bindings (Magnus) for Kreuzberg - high-performance document intelligence framework"
18
- keywords = ["ruby", "magnus", "document", "extraction", "bindings"]
19
- categories = ["api-bindings", "text-processing"]
20
-
21
- [lints]
22
- workspace = true
13
+ keywords = ["document", "extraction", "ocr", "pdf", "ruby"]
14
+ categories = ["api-bindings"]
23
15
 
24
16
  [lib]
25
17
  name = "kreuzberg_rb"
26
18
  crate-type = ["cdylib", "rlib"]
27
19
 
28
20
  [features]
29
- default = ["embeddings"]
30
- embeddings = ["kreuzberg/embeddings"]
21
+ default = []
31
22
 
32
23
  [dependencies]
33
- async-trait = "0.1.89"
34
- kreuzberg = { path = "../../../vendor/kreuzberg", default-features = false, features = [
35
- "pdf",
36
- "excel",
37
- "office",
38
- "email",
39
- "html",
40
- "xml",
41
- "archives",
42
- "ocr",
43
- "language-detection",
44
- "chunking",
45
- "embeddings",
46
- "quality",
47
- "keywords",
48
- "api",
49
- "mcp",
50
- "otel",
51
- "bundled-pdfium",
52
- "tokio-runtime",
53
- ] }
54
- kreuzberg-ffi = { path = "../../../vendor/kreuzberg-ffi" }
55
- magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = [
56
- "rb-sys",
57
- ] }
58
- rb-sys = { version = "0.9.119", default-features = false, features = [
59
- "stable-api-compiled-fallback",
60
- ] }
61
- serde_json = "1.0.145"
62
- tokio = { version = "1.48.0", features = [
63
- "rt",
64
- "rt-multi-thread",
65
- "macros",
66
- "sync",
67
- "process",
68
- "fs",
69
- "time",
70
- "io-util",
71
- ] }
72
- html-to-markdown-rs = { version = "2.14.2", default-features = false }
24
+ async-trait = "0.1"
25
+ kreuzberg = { version = "4.0.0-rc.1", features = ["full", "embeddings"] }
26
+ magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = ["rb-sys"] }
27
+ rb-sys = { version = "0.9.117", default-features = false, features = ["stable-api-compiled-fallback"] }
28
+ serde_json = "1.0"
29
+ tokio = { version = "1.48", features = ["rt", "macros"] }
30
+ html-to-markdown-rs = { version = "2.9.1", default-features = false }
73
31
 
74
32
  [dev-dependencies]
75
33
  pretty_assertions = "1.4"
34
+
35
+ [patch.crates-io]
36
+ kreuzberg = { path = "../../../../vendor/kreuzberg" }
@@ -1,15 +1,17 @@
1
+ #[cfg(target_os = "macos")]
1
2
  fn main() {
2
- let target = std::env::var("TARGET").unwrap();
3
-
4
- if target.contains("darwin") {
5
- println!("cargo:rustc-link-arg=-Wl,-undefined,dynamic_lookup");
6
- println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path");
7
- } else if target.contains("linux") {
8
- println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN");
9
- } else if target.contains("windows") {
10
- // Windows doesn't need rpath or dynamic_lookup equivalents
11
- // The linker flags are already configured in .cargo/config.toml
12
- }
3
+ println!("cargo:rustc-link-arg=-Wl,-undefined,dynamic_lookup");
4
+ // Set rpath to look for libpdfium.dylib in the same directory as the Ruby extension
5
+ println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path");
6
+ println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path/.");
7
+ }
13
8
 
14
- println!("cargo:rerun-if-changed=build.rs");
9
+ #[cfg(target_os = "linux")]
10
+ fn main() {
11
+ // Set rpath to look for libpdfium.so in the same directory as the Ruby extension
12
+ println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN");
13
+ println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN/.");
15
14
  }
15
+
16
+ #[cfg(not(any(target_os = "macos", target_os = "linux")))]
17
+ fn main() {}
@@ -8,4 +8,4 @@
8
8
  #include_next <ieeefp.h>
9
9
  #endif
10
10
 
11
- #endif
11
+ #endif // KREUZBERG_RUBY_IEEFP_H
@@ -11,4 +11,4 @@
11
11
  #define strncasecmp _strnicmp
12
12
  #endif
13
13
 
14
- #endif
14
+ #endif /* KREUZBERG_RB_MSVC_COMPAT_STRINGS_H */
@@ -15,6 +15,6 @@
15
15
  #ifndef bzero
16
16
  #define bzero(ptr, size) memset((ptr), 0, (size))
17
17
  #endif
18
- #endif
18
+ #endif // _MSC_VER
19
19
 
20
- #endif
20
+ #endif // KREUZBERG_RUBY_STRINGS_H
@@ -44,4 +44,4 @@ typedef long ssize_t;
44
44
  #include_next <unistd.h>
45
45
  #endif
46
46
 
47
- #endif
47
+ #endif // KREUZBERG_RUBY_UNISTD_H