kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321)
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -7,58 +7,66 @@ rescue LoadError
7
7
  end
8
8
 
9
9
  module Kreuzberg
10
+ # Extraction result wrapper
11
+ #
12
+ # Provides structured access to extraction results from the native extension.
13
+ #
10
14
  # @example
15
+ # result = Kreuzberg.extract_file_sync("document.pdf")
16
+ # puts result.content
17
+ # puts "MIME type: #{result.mime_type}"
18
+ # puts "Metadata: #{result.metadata.inspect}"
19
+ # result.tables.each { |table| puts table.inspect }
20
+ #
11
21
  # rubocop:disable Metrics/ClassLength
12
22
  class Result
13
23
  attr_reader :content, :mime_type, :metadata, :metadata_json, :tables,
14
- :detected_languages, :chunks, :images, :pages
24
+ :detected_languages, :chunks, :images
15
25
 
26
+ # Table structure
27
+ #
16
28
  # @!attribute [r] cells
17
29
  # @return [Array<Array<String>>] Table cells (2D array)
18
30
  # @!attribute [r] markdown
19
31
  # @return [String] Markdown representation
20
32
  # @!attribute [r] page_number
21
33
  # @return [Integer] Page number where table was found
34
+ #
22
35
  Table = Struct.new(:cells, :markdown, :page_number, keyword_init: true) do
23
36
  def to_h
24
37
  { cells: cells, markdown: markdown, page_number: page_number }
25
38
  end
26
39
  end
27
40
 
41
+ # Text chunk
42
+ #
28
43
  # @!attribute [r] content
29
44
  # @return [String] Chunk content
30
- # @!attribute [r] byte_start
31
- # @return [Integer] Starting byte offset (UTF-8)
32
- # @!attribute [r] byte_end
33
- # @return [Integer] Ending byte offset (UTF-8)
45
+ # @!attribute [r] char_start
46
+ # @return [Integer] Starting character index
47
+ # @!attribute [r] char_end
48
+ # @return [Integer] Ending character index
34
49
  # @!attribute [r] token_count
35
50
  # @return [Integer, nil] Approximate token count (may be nil)
36
- # @!attribute [r] first_page
37
- # @return [Integer, nil] First page number (1-indexed)
38
- # @!attribute [r] last_page
39
- # @return [Integer, nil] Last page number (1-indexed)
51
+ #
40
52
  Chunk = Struct.new(
41
53
  :content,
42
- :byte_start,
43
- :byte_end,
54
+ :char_start,
55
+ :char_end,
44
56
  :token_count,
45
57
  :chunk_index,
46
58
  :total_chunks,
47
- :first_page,
48
- :last_page,
49
59
  :embedding,
50
60
  keyword_init: true
51
61
  ) do
52
62
  def to_h
53
63
  {
54
64
  content: content,
55
- byte_start: byte_start,
56
- byte_end: byte_end,
65
+ char_start: char_start,
66
+ char_end: char_end,
57
67
  token_count: token_count,
58
68
  chunk_index: chunk_index,
59
69
  total_chunks: total_chunks,
60
- first_page: first_page,
61
- last_page: last_page,
62
70
  embedding: embedding
63
71
  }
64
72
  end
@@ -95,30 +103,12 @@ module Kreuzberg
95
103
  end
96
104
  end
97
105
 
98
- # @!attribute [r] page_number
99
- # @return [Integer] Page number (1-indexed)
100
- # @!attribute [r] content
101
- # @return [String] Text content for this page
102
- # @!attribute [r] tables
103
- # @return [Array<Table>] Tables on this page
104
- # @!attribute [r] images
105
- # @return [Array<Image>] Images on this page
106
- PageContent = Struct.new(:page_number, :content, :tables, :images, keyword_init: true) do
107
- def to_h
108
- {
109
- page_number: page_number,
110
- content: content,
111
- tables: tables.map(&:to_h),
112
- images: images.map(&:to_h)
113
- }
114
- end
115
- end
116
-
117
106
  # Initialize from native hash result
118
107
  #
119
108
  # @param hash [Hash] Hash returned from native extension
120
109
  #
121
110
  def initialize(hash)
111
+ # Handle both string and symbol keys for flexibility
122
112
  @content = get_value(hash, 'content', '')
123
113
  @mime_type = get_value(hash, 'mime_type', '')
124
114
  @metadata_json = get_value(hash, 'metadata_json', '{}')
@@ -127,7 +117,6 @@ module Kreuzberg
127
117
  @detected_languages = parse_detected_languages(get_value(hash, 'detected_languages'))
128
118
  @chunks = parse_chunks(get_value(hash, 'chunks'))
129
119
  @images = parse_images(get_value(hash, 'images'))
130
- @pages = parse_pages(get_value(hash, 'pages'))
131
120
  end
132
121
 
133
122
  # Convert to hash
@@ -139,11 +128,10 @@ module Kreuzberg
139
128
  content: @content,
140
129
  mime_type: @mime_type,
141
130
  metadata: @metadata,
142
- tables: serialize_tables,
131
+ tables: @tables.map(&:to_h),
143
132
  detected_languages: @detected_languages,
144
- chunks: serialize_chunks,
145
- images: serialize_images,
146
- pages: serialize_pages
133
+ chunks: @chunks&.map(&:to_h),
134
+ images: @images&.map(&:to_h)
147
135
  }
148
136
  end
149
137
 
@@ -155,100 +143,8 @@ module Kreuzberg
155
143
  to_h.to_json(*)
156
144
  end
157
145
 
158
- # Get the total number of pages in the document
159
- #
160
- # @return [Integer] Total page count (>= 0), or -1 on error
161
- #
162
- # @example
163
- # result = Kreuzberg.extract_file_sync("document.pdf")
164
- # puts "Document has #{result.page_count} pages"
165
- #
166
- def page_count
167
- if @metadata.is_a?(Hash) && @metadata['pages'].is_a?(Hash)
168
- @metadata['pages']['total_count'] || 0
169
- else
170
- 0
171
- end
172
- end
173
-
174
- # Get the total number of text chunks
175
- #
176
- # Returns 0 if chunking was not performed.
177
- #
178
- # @return [Integer] Total chunk count (>= 0), or -1 on error
179
- #
180
- # @example
181
- # result = Kreuzberg.extract_file_sync("document.pdf")
182
- # puts "Document has #{result.chunk_count} chunks"
183
- #
184
- def chunk_count
185
- @chunks&.length || 0
186
- end
187
-
188
- # Get the primary detected language
189
- #
190
- # @return [String, nil] ISO 639 language code (e.g., "en", "de"), or nil if not detected
191
- #
192
- # @example
193
- # result = Kreuzberg.extract_file_sync("document.pdf")
194
- # lang = result.detected_language
195
- # puts "Language: #{lang}" if lang
196
- #
197
- def detected_language
198
- return @metadata['language'] if @metadata.is_a?(Hash) && @metadata['language']
199
- return @detected_languages&.first if @detected_languages&.any?
200
-
201
- nil
202
- end
203
-
204
- # Get a metadata field by name
205
- #
206
- # Supports dot notation for nested fields (e.g., "format.pages").
207
- #
208
- # @param name [String, Symbol] Field name
209
- # @return [Object, nil] Field value, or nil if field doesn't exist
210
- #
211
- # @example Get a top-level field
212
- # result = Kreuzberg.extract_file_sync("document.pdf")
213
- # title = result.metadata_field("title")
214
- # puts "Title: #{title}" if title
215
- #
216
- # @example Get a nested field
217
- # format_info = result.metadata_field("format.pages")
218
- #
219
- def metadata_field(name)
220
- return nil unless @metadata.is_a?(Hash)
221
-
222
- parts = name.to_s.split('.')
223
- value = @metadata
224
-
225
- parts.each do |part|
226
- return nil unless value.is_a?(Hash)
227
-
228
- value = value[part]
229
- end
230
-
231
- value
232
- end
233
-
234
146
  private
235
147
 
236
- def serialize_tables
237
- @tables.map(&:to_h)
238
- end
239
-
240
- def serialize_chunks
241
- @chunks&.map(&:to_h)
242
- end
243
-
244
- def serialize_images
245
- @images&.map(&:to_h)
246
- end
247
-
248
- def serialize_pages
249
- @pages&.map(&:to_h)
250
- end
251
-
252
148
  def get_value(hash, key, default = nil)
253
149
  hash[key] || hash[key.to_sym] || default
254
150
  end
@@ -274,22 +170,21 @@ module Kreuzberg
274
170
  def parse_detected_languages(langs_data)
275
171
  return nil if langs_data.nil?
276
172
 
173
+ # Detected languages is now just an array of strings
277
174
  langs_data.is_a?(Array) ? langs_data : []
278
175
  end
279
176
 
280
177
  def parse_chunks(chunks_data)
281
- return [] if chunks_data.nil? || chunks_data.empty?
178
+ return nil if chunks_data.nil?
282
179
 
283
180
  chunks_data.map do |chunk_hash|
284
181
  Chunk.new(
285
182
  content: chunk_hash['content'],
286
- byte_start: chunk_hash['byte_start'],
287
- byte_end: chunk_hash['byte_end'],
183
+ char_start: chunk_hash['char_start'],
184
+ char_end: chunk_hash['char_end'],
288
185
  token_count: chunk_hash['token_count'],
289
186
  chunk_index: chunk_hash['chunk_index'],
290
187
  total_chunks: chunk_hash['total_chunks'],
291
- first_page: chunk_hash['first_page'],
292
- last_page: chunk_hash['last_page'],
293
188
  embedding: chunk_hash['embedding']
294
189
  )
295
190
  end
@@ -316,19 +211,6 @@ module Kreuzberg
316
211
  )
317
212
  end
318
213
  end
319
-
320
- def parse_pages(pages_data)
321
- return nil if pages_data.nil?
322
-
323
- pages_data.map do |page_hash|
324
- PageContent.new(
325
- page_number: page_hash['page_number'],
326
- content: page_hash['content'],
327
- tables: parse_tables(page_hash['tables']),
328
- images: parse_images(page_hash['images'])
329
- )
330
- end
331
- end
332
214
  end
333
215
  # rubocop:enable Metrics/ClassLength
334
216
  end
@@ -19,31 +19,10 @@ module Kreuzberg
19
19
  when /linux/
20
20
  prepend_env('LD_LIBRARY_PATH', lib_dir)
21
21
  when /mswin|mingw|cygwin/
22
- # Windows uses PATH to locate DLLs
23
22
  prepend_env('PATH', lib_dir, separator: ';')
24
- # Also check common locations for PDFium on Windows
25
- setup_windows_library_paths(lib_dir)
26
23
  end
27
24
  end
28
25
 
29
- def setup_windows_library_paths(lib_dir)
30
- # Add target/release to PATH for DLL lookup during development
31
- target_release = File.expand_path('../../target/release', lib_dir)
32
- prepend_env('PATH', target_release, separator: ';') if Dir.exist?(target_release)
33
-
34
- # Check for short path CARGO_TARGET_DIR (CI uses C:\t)
35
- cargo_target_dir = ENV.fetch('CARGO_TARGET_DIR', nil)
36
- return unless cargo_target_dir
37
-
38
- target_release_alt = File.join(cargo_target_dir, 'release')
39
- prepend_env('PATH', target_release_alt, separator: ';') if Dir.exist?(target_release_alt)
40
-
41
- # Also check for target-specific subdirectory (Windows GNU builds)
42
- gnu_release = File.join(cargo_target_dir, 'x86_64-pc-windows-gnu', 'release')
43
- prepend_env('PATH', gnu_release, separator: ';') if Dir.exist?(gnu_release)
44
- end
45
- private_class_method :setup_windows_library_paths
46
-
47
26
  def prepend_env(key, value, separator: ':')
48
27
  current = ENV.fetch(key, nil)
49
28
  return if current&.split(separator)&.include?(value)
@@ -58,7 +37,8 @@ module Kreuzberg
58
37
 
59
38
  ensure_install_name(bundle)
60
39
  ensure_loader_rpath(bundle)
61
- rescue Errno::ENOENT, IOError # rubocop:disable Lint/SuppressedException
40
+ rescue Errno::ENOENT, IOError
41
+ # Tool not available (e.g., on CI). The dynamic loader can still use the updated env vars.
62
42
  end
63
43
  private_class_method :fix_macos_install_name
64
44
 
@@ -1,14 +1,87 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
+ # Validator protocol interface.
5
+ #
6
+ # This module defines the protocol that all Ruby validators must implement
7
+ # to be registered with the Rust core via the FFI bridge.
8
+ #
9
+ # Validators are called during extraction to validate results. If validation fails,
10
+ # the validator should raise a Kreuzberg::Errors::ValidationError, which will
11
+ # cause the extraction to fail.
12
+ #
4
13
  # @example Implementing a minimum length validator
14
+ # class MinimumLengthValidator
15
+ # include Kreuzberg::ValidatorProtocol
16
+ #
17
+ # def initialize(min_length = 10)
18
+ # @min_length = min_length
19
+ # end
20
+ #
21
+ # def call(result)
22
+ # if result["content"].length < @min_length
23
+ # raise Kreuzberg::Errors::ValidationError.new(
24
+ # "Content too short: #{result["content"].length} < #{@min_length}"
25
+ # )
26
+ # end
27
+ # end
28
+ # end
29
+ #
30
+ # Kreuzberg.register_validator("min_length", MinimumLengthValidator.new(100))
31
+ #
5
32
  # @example Implementing a content quality validator
33
+ # class QualityValidator
34
+ # include Kreuzberg::ValidatorProtocol
35
+ #
36
+ # def call(result)
37
+ # # Check if content has sufficient quality
38
+ # if result["content"].strip.empty?
39
+ # raise Kreuzberg::Errors::ValidationError.new("Empty content extracted")
40
+ # end
41
+ #
42
+ # # Check if metadata is present
43
+ # if result["metadata"].empty?
44
+ # raise Kreuzberg::Errors::ValidationError.new("No metadata extracted")
45
+ # end
46
+ # end
47
+ # end
48
+ #
49
+ # Kreuzberg.register_validator("quality", QualityValidator.new)
50
+ #
6
51
  # @example Using a Proc as a validator
52
+ # Kreuzberg.register_validator("not_empty", ->(result) {
53
+ # if result["content"].strip.empty?
54
+ # raise Kreuzberg::Errors::ValidationError.new("Content cannot be empty")
55
+ # end
56
+ # })
57
+ #
7
58
  module ValidatorProtocol
59
+ # Validate an extraction result.
60
+ #
61
+ # This method is called during extraction to validate results. If validation fails,
62
+ # raise a Kreuzberg::Errors::ValidationError with a descriptive message explaining
63
+ # why validation failed. If validation passes, return without raising.
64
+ #
65
+ # The validator receives the extraction result as a hash with the same structure
66
+ # as post-processors (see PostProcessorProtocol for details).
67
+ #
8
68
  # @param result [Hash] Extraction result to validate with the following structure:
69
+ # - "content" [String] - Extracted text content
70
+ # - "mime_type" [String] - MIME type of the source document
71
+ # - "metadata" [Hash] - Document metadata (title, author, etc.)
72
+ # - "tables" [Array<Hash>] - Extracted tables
73
+ # - "detected_languages" [Array<String>, nil] - Detected language codes
74
+ # - "chunks" [Array<String>, nil] - Content chunks (if chunking enabled)
75
+ #
9
76
  # @return [void]
10
77
  # @raise [Kreuzberg::Errors::ValidationError] if validation fails
78
+ #
11
79
  # @example
80
+ # def call(result)
81
+ # if result["content"].length < 10
82
+ # raise Kreuzberg::Errors::ValidationError.new("Content too short")
83
+ # end
84
+ # end
12
85
  def call(result)
13
86
  raise NotImplementedError, "#{self.class} must implement #call(result)"
14
87
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.0.0-rc.29'
4
+ VERSION = '4.0.0.rc1'
5
5
  end
data/lib/kreuzberg.rb CHANGED
@@ -6,8 +6,6 @@ Kreuzberg::SetupLibPath.configure
6
6
  require_relative 'kreuzberg/version'
7
7
  require 'kreuzberg_rb'
8
8
 
9
- # Kreuzberg is a Ruby binding for the Rust core library providing document extraction,
10
- # text extraction, and OCR capabilities.
11
9
  module Kreuzberg
12
10
  autoload :Config, 'kreuzberg/config'
13
11
  autoload :Result, 'kreuzberg/result'
@@ -16,28 +14,17 @@ module Kreuzberg
16
14
  autoload :APIProxy, 'kreuzberg/api_proxy'
17
15
  autoload :MCPProxy, 'kreuzberg/mcp_proxy'
18
16
  autoload :Errors, 'kreuzberg/errors'
19
- autoload :ErrorContext, 'kreuzberg/error_context'
20
17
  autoload :PostProcessorProtocol, 'kreuzberg/post_processor_protocol'
21
18
  autoload :ValidatorProtocol, 'kreuzberg/validator_protocol'
22
19
  autoload :OcrBackendProtocol, 'kreuzberg/ocr_backend_protocol'
23
20
 
24
- autoload :HtmlMetadata, 'kreuzberg/types'
25
- autoload :HeaderMetadata, 'kreuzberg/types'
26
- autoload :LinkMetadata, 'kreuzberg/types'
27
- autoload :ImageMetadata, 'kreuzberg/types'
28
- autoload :StructuredData, 'kreuzberg/types'
29
-
21
+ # Alias for API consistency with other language bindings
30
22
  ExtractionConfig = Config::Extraction
31
- PageConfig = Config::PageConfig
32
-
33
- module KeywordAlgorithm
34
- YAKE = :yake
35
- RAKE = :rake
36
- end
37
23
 
38
24
  @__cache_tracker = { entries: 0, bytes: 0 }
39
25
 
40
26
  class << self
27
+ # Store native methods as private methods
41
28
  alias native_extract_file_sync extract_file_sync
42
29
  alias native_extract_bytes_sync extract_bytes_sync
43
30
  alias native_batch_extract_files_sync batch_extract_files_sync
@@ -54,39 +41,38 @@ module Kreuzberg
54
41
  private :native_batch_extract_bytes_sync, :native_batch_extract_bytes
55
42
  end
56
43
 
44
+ # Register a Ruby post-processor that conforms to PostProcessorProtocol.
57
45
  module_function :register_post_processor
58
46
 
47
+ # Remove a post-processor by name.
59
48
  module_function :unregister_post_processor
60
49
 
50
+ # Purge all registered post-processors.
61
51
  module_function :clear_post_processors
62
52
 
53
+ # Register a validator that follows ValidatorProtocol.
63
54
  module_function :register_validator
64
55
 
56
+ # Remove a validator by name.
65
57
  module_function :unregister_validator
66
58
 
59
+ # Purge all validators.
67
60
  module_function :clear_validators
68
61
 
62
+ # List all registered validators.
69
63
  module_function :list_validators
70
64
 
65
+ # List all registered post-processors.
71
66
  module_function :list_post_processors
72
67
 
68
+ # Register an OCR backend instance implementing OcrBackendProtocol.
73
69
  module_function :register_ocr_backend
74
70
 
71
+ # Unregister an OCR backend by name.
75
72
  module_function :unregister_ocr_backend
76
73
 
74
+ # List all registered OCR backends.
77
75
  module_function :list_ocr_backends
78
-
79
- module_function :detect_mime_type
80
-
81
- module_function :detect_mime_type_from_path
82
-
83
- module_function :validate_mime_type
84
-
85
- module_function :get_extensions_for_mime
86
-
87
- module_function :list_embedding_presets
88
-
89
- module_function :get_embedding_preset
90
76
  end
91
77
 
92
78
  require_relative 'kreuzberg/cache_api'
Binary file