kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321)
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
data/extconf.rb CHANGED
@@ -3,53 +3,21 @@
3
3
  require 'mkmf'
4
4
  require 'rb_sys/mkmf'
5
5
  require 'rbconfig'
6
- require 'fileutils'
7
-
8
- if Gem.win_platform?
9
- # Use CI-provided CARGO_TARGET_DIR if available, otherwise use a short path
10
- # GitHub Actions sets CARGO_TARGET_DIR=C:\t for MAX_PATH mitigation
11
- if ENV['CARGO_TARGET_DIR']
12
- puts "Windows detected: Using existing CARGO_TARGET_DIR=#{ENV['CARGO_TARGET_DIR']}"
13
- else
14
- short_target_dir = Dir.exist?('C:/t') ? 'C:/t' : 'C:/kz-build'
15
- begin
16
- FileUtils.mkdir_p(short_target_dir)
17
- ENV['CARGO_TARGET_DIR'] = short_target_dir
18
- ENV['OUT_DIR'] = short_target_dir
19
- puts "Windows detected: Using short build path #{short_target_dir}"
20
- rescue StandardError => e
21
- puts "Warning: Could not create short path #{short_target_dir}: #{e.message}"
22
- end
23
- end
24
- end
25
6
 
26
7
  if /mswin|mingw/.match?(RbConfig::CONFIG['host_os'])
27
8
  devkit = ENV.fetch('RI_DEVKIT', nil)
28
9
  prefix = ENV['MSYSTEM_PREFIX'] || '/ucrt64'
29
10
 
30
- # Set up include paths for MSVC compatibility headers
31
- native_include = File.expand_path('ext/kreuzberg_rb/native/include', __dir__).tr('\\', '/')
32
- compat_include = File.expand_path('ext/kreuzberg_rb/native/include/msvc_compat', __dir__).tr('\\', '/')
33
-
34
- extra_args = []
35
- extra_args << "-I#{native_include}"
36
- extra_args << "-I#{compat_include}"
37
- extra_args << '-fms-extensions'
38
- extra_args << '-fno-omit-frame-pointer'
39
-
40
11
  if devkit
41
- sysroot = "#{devkit}#{prefix}".tr('\\', '/')
42
- extra_args.push('--target=x86_64-pc-windows-gnu', "--sysroot=#{sysroot}")
43
- end
12
+ sysroot = "#{devkit}#{prefix}".tr('\\\\', '/')
13
+ extra_args = [
14
+ '--target=x86_64-pc-windows-gnu',
15
+ "--sysroot=#{sysroot}"
16
+ ]
44
17
 
45
- unless extra_args.empty?
46
- existing = ENV['BINDGEN_EXTRA_CLANG_ARGS'].to_s.split(/\s+/).reject(&:empty?)
18
+ existing = ENV['BINDGEN_EXTRA_CLANG_ARGS'].to_s.split(/\s+/)
47
19
  ENV['BINDGEN_EXTRA_CLANG_ARGS'] = (existing + extra_args).uniq.join(' ')
48
- puts "BINDGEN_EXTRA_CLANG_ARGS set to: #{ENV.fetch('BINDGEN_EXTRA_CLANG_ARGS', nil)}"
49
20
  end
50
-
51
- # Set target for Windows GNU toolchain if not already set
52
- ENV['CARGO_BUILD_TARGET'] ||= 'x86_64-pc-windows-gnu' if devkit || ENV['MSYSTEM']
53
21
  end
54
22
 
55
23
  default_profile = ENV.fetch('CARGO_PROFILE', 'release')
data/kreuzberg.gemspec CHANGED
@@ -4,26 +4,20 @@ require_relative 'lib/kreuzberg/version'
4
4
 
5
5
  repo_root = File.expand_path('../..', __dir__)
6
6
 
7
+ # Include files from packages/ruby
7
8
  ruby_prefix = 'packages/ruby/'
8
9
  ruby_cmd = %(git -C "#{repo_root}" ls-files -z #{ruby_prefix})
9
10
  ruby_files =
10
11
  `#{ruby_cmd}`.split("\x0")
11
- .select { |path| path.start_with?(ruby_prefix) }
12
- .map { |path| path.delete_prefix(ruby_prefix) }
12
+ .select { |path| path.start_with?(ruby_prefix) }
13
+ .map { |path| path.delete_prefix(ruby_prefix) }
13
14
 
15
+ # Include the kreuzberg core crate (needed for path patch in Cargo.toml)
14
16
  core_prefix = 'crates/kreuzberg/'
15
17
  core_cmd = %(git -C "#{repo_root}" ls-files -z #{core_prefix})
16
18
  core_files =
17
19
  `#{core_cmd}`.split("\x0")
18
- .select { |path| path.start_with?(core_prefix) }
19
- .map { |path| path.delete_prefix('crates/') }
20
- .map { |path| "vendor/#{path}" }
21
-
22
- ffi_prefix = 'crates/kreuzberg-ffi/'
23
- ffi_cmd = %(git -C "#{repo_root}" ls-files -z #{ffi_prefix})
24
- ffi_files =
25
- `#{ffi_cmd}`.split("\x0")
26
- .select { |path| path.start_with?(ffi_prefix) }
20
+ .select { |path| path.start_with?(core_prefix) }
27
21
  .map { |path| path.delete_prefix('crates/') }
28
22
  .map { |path| "vendor/#{path}" }
29
23
 
@@ -47,109 +41,17 @@ fallback_files = Dir.chdir(__dir__) do
47
41
  File::FNM_DOTMATCH
48
42
  )
49
43
 
44
+ # Fallback for core crate - copy from repo root
50
45
  core_fallback = Dir.chdir(repo_root) do
51
46
  Dir.glob('crates/kreuzberg/**/*', File::FNM_DOTMATCH)
52
- .reject { |f| File.directory?(f) }
53
- .reject { |f| f.include?('/.fastembed_cache/') }
54
- .reject { |f| f.include?('/target/') }
55
- .grep_v(/\.(swp|bak|tmp)$/)
56
- .grep_v(/~$/)
57
- .map { |path| "vendor/#{path.delete_prefix('crates/')}" }
58
- end
59
-
60
- ffi_fallback = Dir.chdir(repo_root) do
61
- Dir.glob('crates/kreuzberg-ffi/**/*', File::FNM_DOTMATCH)
62
- .reject { |f| File.directory?(f) }
63
- .reject { |f| f.include?('/target/') }
64
- .grep_v(/\.(swp|bak|tmp)$/)
65
- .grep_v(/~$/)
66
- .map { |path| "vendor/#{path.delete_prefix('crates/')}" }
67
- end
68
-
69
- tesseract_fallback = Dir.chdir(repo_root) do
70
- Dir.glob('crates/kreuzberg-tesseract/**/*', File::FNM_DOTMATCH)
71
- .reject { |f| File.directory?(f) }
72
- .reject { |f| f.include?('/target/') }
73
- .grep_v(/\.(swp|bak|tmp)$/)
74
- .grep_v(/~$/)
75
- .map { |path| "vendor/#{path.delete_prefix('crates/')}" }
47
+ .reject { |f| File.directory?(f) }
48
+ .map { |path| "vendor/#{path.delete_prefix('crates/')}" }
76
49
  end
77
50
 
78
- ruby_fallback + core_fallback + ffi_fallback + tesseract_fallback
79
- end
80
-
81
- vendor_files = Dir.chdir(__dir__) do
82
- kreuzberg_files = if Dir.exist?('vendor/kreuzberg')
83
- Dir.glob('vendor/kreuzberg/**/*', File::FNM_DOTMATCH)
84
- .reject { |f| File.directory?(f) }
85
- .reject { |f| f.include?('/.fastembed_cache/') }
86
- .reject { |f| f.include?('/.kreuzberg/') }
87
- .reject { |f| f.include?('/target/') }
88
- .grep_v(/\.(swp|bak|tmp)$/)
89
- .grep_v(/~$/)
90
- else
91
- []
92
- end
93
-
94
- kreuzberg_ffi_files = if Dir.exist?('vendor/kreuzberg-ffi')
95
- Dir.glob('vendor/kreuzberg-ffi/**/*', File::FNM_DOTMATCH)
96
- .reject { |f| File.directory?(f) }
97
- .reject { |f| f.include?('/target/') }
98
- .grep_v(/\.(swp|bak|tmp)$/)
99
- .grep_v(/~$/)
100
- else
101
- []
102
- end
103
-
104
- kreuzberg_tesseract_files = if Dir.exist?('vendor/kreuzberg-tesseract')
105
- Dir.glob('vendor/kreuzberg-tesseract/**/*', File::FNM_DOTMATCH)
106
- .reject { |f| File.directory?(f) }
107
- .reject { |f| f.include?('/target/') }
108
- .grep_v(/\.(swp|bak|tmp)$/)
109
- .grep_v(/~$/)
110
- else
111
- []
112
- end
113
-
114
- rb_sys_files = if Dir.exist?('vendor/rb-sys')
115
- Dir.glob('vendor/rb-sys/**/*', File::FNM_DOTMATCH)
116
- .reject { |f| File.directory?(f) }
117
- .reject { |f| f.include?('/target/') }
118
- .grep_v(/\.(swp|bak|tmp)$/)
119
- .grep_v(/~$/)
120
- else
121
- []
122
- end
123
-
124
- workspace_toml = if File.exist?('vendor/Cargo.toml')
125
- ['vendor/Cargo.toml']
126
- else
127
- []
128
- end
129
-
130
- kreuzberg_files + kreuzberg_ffi_files + kreuzberg_tesseract_files + rb_sys_files + workspace_toml
51
+ ruby_fallback + core_fallback
131
52
  end
132
53
 
133
- files = if (ruby_files + core_files + ffi_files).empty?
134
- fallback_files
135
- elsif vendor_files.any?
136
- ruby_files + vendor_files
137
- else
138
- ruby_files + core_files + ffi_files
139
- end
140
-
141
- native_artifacts = Dir.chdir(__dir__) do
142
- Dir.glob(%w[
143
- lib/**/*.bundle
144
- lib/**/*.so
145
- lib/**/*.dll
146
- lib/**/*.dylib
147
- ])
148
- end
149
- files.concat(native_artifacts)
150
-
151
- files = files.select { |f| File.exist?(f) }
152
- files = files.uniq
54
+ files = (ruby_files + core_files).empty? ? fallback_files : (ruby_files + core_files)
153
55
 
154
56
  Gem::Specification.new do |spec|
155
57
  spec.name = 'kreuzberg'
@@ -163,16 +65,16 @@ Gem::Specification.new do |spec|
163
65
  Rust core. Supports extraction, OCR, chunking, and language detection for 30+ file formats
164
66
  including PDF, DOCX, PPTX, XLSX, images, and more.
165
67
  DESC
166
- spec.homepage = 'https://github.com/kreuzberg-dev/kreuzberg'
68
+ spec.homepage = 'https://github.com/Goldziher/kreuzberg'
167
69
  spec.license = 'MIT'
168
70
  spec.required_ruby_version = '>= 3.2.0'
169
71
 
170
72
  spec.metadata = {
171
73
  'homepage_uri' => spec.homepage,
172
- 'source_code_uri' => 'https://github.com/kreuzberg-dev/kreuzberg',
173
- 'changelog_uri' => 'https://github.com/kreuzberg-dev/kreuzberg/blob/main/CHANGELOG.md',
74
+ 'source_code_uri' => 'https://github.com/Goldziher/kreuzberg',
75
+ 'changelog_uri' => 'https://github.com/Goldziher/kreuzberg/blob/main/CHANGELOG.md',
174
76
  'documentation_uri' => 'https://docs.kreuzberg.dev',
175
- 'bug_tracker_uri' => 'https://github.com/kreuzberg-dev/kreuzberg/issues',
77
+ 'bug_tracker_uri' => 'https://github.com/Goldziher/kreuzberg/issues',
176
78
  'rubygems_mfa_required' => 'true',
177
79
  'keywords' => 'document-intelligence,document-extraction,ocr,rust,bindings'
178
80
  }
@@ -183,10 +85,14 @@ Gem::Specification.new do |spec|
183
85
  spec.require_paths = ['lib']
184
86
  spec.extensions = ['ext/kreuzberg_rb/extconf.rb']
185
87
 
186
- spec.add_development_dependency 'bundler', '~> 4.0'
88
+ # Runtime dependencies
89
+ # None - the gem is self-contained with the Rust extension
90
+
91
+ # Development dependencies
92
+ spec.add_development_dependency 'bundler', '~> 2.0'
187
93
  spec.add_development_dependency 'rake', '~> 13.0'
188
94
  spec.add_development_dependency 'rake-compiler', '~> 1.2'
189
- spec.add_development_dependency 'rb_sys', '0.9.119'
95
+ spec.add_development_dependency 'rb_sys', '~> 0.9'
190
96
  spec.add_development_dependency 'rspec', '~> 3.12'
191
97
  unless Gem.win_platform?
192
98
  spec.add_development_dependency 'rbs', '~> 3.0'
data/lib/kreuzberg/api_proxy.rb CHANGED
@@ -4,8 +4,22 @@ require 'open3'
4
4
  require 'pathname'
5
5
 
6
6
  module Kreuzberg
7
+ # API server proxy
8
+ #
9
+ # Starts and manages the Kreuzberg API server (Litestar/Python-based or Rust-based).
10
+ #
7
11
  # @example Start the server
12
+ # server = Kreuzberg::APIProxy.new(port: 8000)
13
+ # server.start
14
+ # # Server runs in background
15
+ # server.stop
16
+ #
8
17
  # @example With block
18
+ # Kreuzberg::APIProxy.run(port: 8000) do |server|
19
+ # # Server runs while block executes
20
+ # response = Net::HTTP.get(URI('http://localhost:8000/health'))
21
+ # end
22
+ #
9
23
  module APIProxy
10
24
  Error = Class.new(Kreuzberg::Errors::Error)
11
25
  MissingBinaryError = Class.new(Error)
@@ -43,7 +57,7 @@ module Kreuzberg
43
57
  err: $stderr
44
58
  )
45
59
  Process.detach(@pid)
46
- sleep 1
60
+ sleep 1 # Give server time to start
47
61
  @pid
48
62
  end
49
63
 
@@ -56,7 +70,8 @@ module Kreuzberg
56
70
 
57
71
  Process.kill('TERM', @pid)
58
72
  Process.wait(@pid)
59
- rescue Errno::ESRCH, Errno::ECHILD # rubocop:disable Lint/SuppressedException
73
+ rescue Errno::ESRCH, Errno::ECHILD
74
+ # Process already dead
60
75
  ensure
61
76
  @pid = nil
62
77
  end
@@ -103,6 +118,7 @@ module Kreuzberg
103
118
  # @raise [MissingBinaryError] If not found
104
119
  #
105
120
  def find_api_binary
121
+ # API might be served by kreuzberg CLI or a separate binary
106
122
  binary_name = Gem.win_platform? ? 'kreuzberg.exe' : 'kreuzberg'
107
123
  found = CLIProxy.search_paths(binary_name).find(&:file?)
108
124
  return found if found
data/lib/kreuzberg/cache_api.rb CHANGED
@@ -2,33 +2,11 @@
2
2
 
3
3
  module Kreuzberg
4
4
  module CacheAPI
5
- # @return [void] No meaningful return value
6
- # @example Clear cache
7
5
  def clear_cache
8
6
  native_clear_cache
9
7
  reset_cache_tracker!
10
8
  end
11
9
 
12
- # Retrieve cache statistics.
13
- #
14
- # Returns information about the current state of the extraction result cache,
15
- # including the number of cached entries and total memory used. Statistics include
16
- # both native Rust cache metrics and local tracker metrics.
17
- #
18
- # @return [Hash{Symbol | String => Integer}] Cache statistics hash containing:
19
- # - :total_entries [Integer] Total number of cached extraction results
20
- # - :total_size_bytes [Integer] Total memory used by cached results in bytes
21
- #
22
- # @example Get cache statistics
23
- # stats = Kreuzberg.cache_stats
24
- # puts "Cached entries: #{stats[:total_entries]}"
25
- # puts "Cache size: #{stats[:total_size_bytes]} bytes"
26
- #
27
- # @example Check if cache is full
28
- # stats = Kreuzberg.cache_stats
29
- # if stats[:total_size_bytes] > 1_000_000_000 # 1GB
30
- # Kreuzberg.clear_cache
31
- # end
32
10
  def cache_stats
33
11
  stats = native_cache_stats
34
12
  total_entries = (stats['total_entries'] || stats[:total_entries] || 0) + @__cache_tracker[:entries]
data/lib/kreuzberg/cli.rb CHANGED
@@ -1,8 +1,16 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
+ # Command-line interface wrapper
5
+ #
6
+ # Provides a Ruby API for the Kreuzberg CLI commands.
7
+ #
4
8
  # @example Extract a file
9
+ # Kreuzberg::CLI.extract('document.pdf', output: 'text')
10
+ #
5
11
  # @example Detect file type
12
+ # mime_type = Kreuzberg::CLI.detect('document.pdf')
13
+ #
6
14
  module CLI
7
15
  module_function
8
16
 
@@ -14,8 +22,8 @@ module Kreuzberg
14
22
  # @return [String] Extracted content
15
23
  #
16
24
  def extract(path, output: 'text', ocr: false)
17
- args = ['extract', path, '--format', output]
18
- args.push('--ocr', ocr ? 'true' : 'false')
25
+ args = ['extract', path, '--output', output]
26
+ args << '--ocr' if ocr
19
27
  CLIProxy.call(args)
20
28
  end
21
29
 
data/lib/kreuzberg/cli_proxy.rb CHANGED
@@ -4,7 +4,14 @@ require 'open3'
4
4
  require 'pathname'
5
5
 
6
6
  module Kreuzberg
7
+ # CLI binary proxy
8
+ #
9
+ # Provides access to the Kreuzberg CLI binary built from crates/kreuzberg-cli.
10
+ #
7
11
  # @example
12
+ # output = Kreuzberg::CLIProxy.call(['extract', 'document.pdf'])
13
+ # puts output
14
+ #
8
15
  module CLIProxy
9
16
  Error = Class.new(Kreuzberg::Errors::Error)
10
17
  MissingBinaryError = Class.new(Error)
@@ -89,12 +96,15 @@ module Kreuzberg
89
96
  #
90
97
  def search_paths(binary_name)
91
98
  paths = [
99
+ # In lib/bin (for packaged gems)
92
100
  lib_path.join('bin', binary_name),
93
101
  lib_path.join(binary_name),
102
+ # In local development (packages/ruby)
94
103
  root_path.join('../../crates/kreuzberg-cli/target/release', binary_name),
95
104
  root_path.join('../../target/release', binary_name)
96
105
  ]
97
106
 
107
+ # Try workspace root
98
108
  workspace_root = root_path.parent&.parent
99
109
  paths << workspace_root.join('target', 'release', binary_name) if workspace_root
100
110