kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321)
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -1,10 +1,12 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'json'
4
-
5
3
  module Kreuzberg
6
4
  module Config
5
+ # OCR configuration
6
+ #
7
7
  # @example
8
+ # ocr = OCR.new(backend: "tesseract", language: "eng")
9
+ #
8
10
  class OCR
9
11
  attr_reader :backend, :language, :tesseract_config
10
12
 
@@ -37,7 +39,6 @@ module Kreuzberg
37
39
  end
38
40
  end
39
41
 
40
- # Tesseract OCR engine configuration
41
42
  class Tesseract
42
43
  attr_reader :options
43
44
 
@@ -71,7 +72,6 @@ module Kreuzberg
71
72
  class Chunking
72
73
  attr_reader :max_chars, :max_overlap, :preset, :embedding, :enabled
73
74
 
74
- # rubocop:disable Metrics/CyclomaticComplexity
75
75
  def initialize(
76
76
  max_chars: nil,
77
77
  max_overlap: nil,
@@ -81,17 +81,11 @@ module Kreuzberg
81
81
  chunk_overlap: nil,
82
82
  enabled: true
83
83
  )
84
- # rubocop:enable Metrics/CyclomaticComplexity
85
84
  resolved_size = chunk_size || max_chars || 1000
86
85
  resolved_overlap = chunk_overlap || max_overlap || 200
87
86
 
88
87
  @max_chars = resolved_size.to_i
89
88
  @max_overlap = resolved_overlap.to_i
90
-
91
- # Validate positive values
92
- raise ArgumentError, "max_chars must be a positive integer, got #{@max_chars}" if @max_chars.negative?
93
- raise ArgumentError, "max_overlap must be a positive integer, got #{@max_overlap}" if @max_overlap.negative?
94
-
95
89
  @preset = preset&.to_s
96
90
  @embedding = normalize_embedding(embedding)
97
91
  @enabled = boolean_or_nil(enabled)
@@ -126,7 +120,6 @@ module Kreuzberg
126
120
  end
127
121
  end
128
122
 
129
- # Embedding model configuration for document chunking
130
123
  class Embedding
131
124
  attr_reader :model, :normalize, :batch_size, :show_download_progress, :cache_dir
132
125
 
@@ -197,86 +190,18 @@ module Kreuzberg
197
190
  end
198
191
  end
199
192
 
200
- # Font configuration for PDF rendering
201
- #
202
- # @example
203
- # font_config = FontConfig.new(enabled: true, custom_font_dirs: ["/usr/share/fonts"])
204
- #
205
- class FontConfig
206
- attr_accessor :enabled, :custom_font_dirs
207
-
208
- def initialize(enabled: true, custom_font_dirs: nil)
209
- @enabled = enabled ? true : false
210
- @custom_font_dirs = custom_font_dirs
211
- end
212
-
213
- def to_h
214
- {
215
- enabled: @enabled,
216
- custom_font_dirs: @custom_font_dirs
217
- }.compact
218
- end
219
- end
220
-
221
- # Hierarchy detection configuration
222
- #
223
- # @example
224
- # hierarchy = Hierarchy.new(enabled: true, k_clusters: 6, include_bbox: true)
225
- #
226
- class Hierarchy
227
- attr_reader :enabled, :k_clusters, :include_bbox, :ocr_coverage_threshold
228
-
229
- def initialize(
230
- enabled: true,
231
- k_clusters: 6,
232
- include_bbox: true,
233
- ocr_coverage_threshold: nil
234
- )
235
- @enabled = enabled ? true : false
236
- @k_clusters = k_clusters&.to_i || 6
237
- @include_bbox = include_bbox ? true : false
238
- @ocr_coverage_threshold = ocr_coverage_threshold&.to_f
239
- end
240
-
241
- def to_h
242
- {
243
- enabled: @enabled,
244
- k_clusters: @k_clusters,
245
- include_bbox: @include_bbox,
246
- ocr_coverage_threshold: @ocr_coverage_threshold
247
- }.compact
248
- end
249
-
250
- def self.from_h(hash)
251
- return nil if hash.nil?
252
- return hash if hash.is_a?(self)
253
-
254
- new(**hash.transform_keys(&:to_sym)) if hash.is_a?(Hash)
255
- end
256
- end
257
-
258
193
  # PDF-specific options
259
194
  #
260
195
  # @example
261
196
  # pdf = PDF.new(extract_images: true, passwords: ["secret", "backup"])
262
197
  #
263
- # @example With font configuration
264
- # font_config = FontConfig.new(enabled: true, custom_font_dirs: ["/usr/share/fonts"])
265
- # pdf = PDF.new(extract_images: true, font_config: font_config)
266
- #
267
- # @example With hierarchy configuration
268
- # hierarchy = Hierarchy.new(enabled: true, k_clusters: 6)
269
- # pdf = PDF.new(extract_images: true, hierarchy: hierarchy)
270
- #
271
198
  class PDF
272
- attr_reader :extract_images, :passwords, :extract_metadata, :font_config, :hierarchy
199
+ attr_reader :extract_images, :passwords, :extract_metadata
273
200
 
274
201
  def initialize(
275
202
  extract_images: false,
276
203
  passwords: nil,
277
- extract_metadata: true,
278
- font_config: nil,
279
- hierarchy: nil
204
+ extract_metadata: true
280
205
  )
281
206
  @extract_images = extract_images ? true : false
282
207
  @passwords = if passwords.is_a?(Array)
@@ -285,45 +210,15 @@ module Kreuzberg
285
210
  (passwords ? [passwords.to_s] : nil)
286
211
  end
287
212
  @extract_metadata = extract_metadata ? true : false
288
- @font_config = normalize_font_config(font_config)
289
- @hierarchy = normalize_hierarchy(hierarchy)
290
213
  end
291
214
 
292
215
  def to_h
293
216
  {
294
217
  extract_images: @extract_images,
295
218
  passwords: @passwords,
296
- extract_metadata: @extract_metadata,
297
- font_config: @font_config&.to_h,
298
- hierarchy: @hierarchy&.to_h
219
+ extract_metadata: @extract_metadata
299
220
  }.compact
300
221
  end
301
-
302
- def font_config=(value)
303
- @font_config = normalize_font_config(value)
304
- end
305
-
306
- def hierarchy=(value)
307
- @hierarchy = normalize_hierarchy(value)
308
- end
309
-
310
- private
311
-
312
- def normalize_font_config(value)
313
- return nil if value.nil?
314
- return value if value.is_a?(FontConfig)
315
- return FontConfig.new(**value.transform_keys(&:to_sym)) if value.is_a?(Hash)
316
-
317
- raise ArgumentError, "Expected #{FontConfig}, Hash, or nil, got #{value.class}"
318
- end
319
-
320
- def normalize_hierarchy(value)
321
- return nil if value.nil?
322
- return value if value.is_a?(Hierarchy)
323
- return Hierarchy.new(**value.transform_keys(&:to_sym)) if value.is_a?(Hash)
324
-
325
- raise ArgumentError, "Expected #{Hierarchy}, Hash, or nil, got #{value.class}"
326
- end
327
222
  end
328
223
 
329
224
  # Image extraction configuration
@@ -394,8 +289,6 @@ module Kreuzberg
394
289
  attr_reader :target_dpi, :auto_rotate, :deskew, :denoise,
395
290
  :contrast_enhance, :binarization_method, :invert_colors
396
291
 
397
- VALID_BINARIZATION_METHODS = %w[otsu sauvola niblack wolf bradley adaptive].freeze
398
-
399
292
  def initialize(
400
293
  target_dpi: 300,
401
294
  auto_rotate: true,
@@ -413,12 +306,10 @@ module Kreuzberg
413
306
  @binarization_method = binarization_method.to_s
414
307
  @invert_colors = invert_colors ? true : false
415
308
 
416
- # Validate binarization method
417
- return if VALID_BINARIZATION_METHODS.include?(@binarization_method)
309
+ valid_methods = %w[otsu sauvola adaptive]
310
+ return if valid_methods.include?(@binarization_method)
418
311
 
419
- valid_methods = VALID_BINARIZATION_METHODS.join(', ')
420
- raise ArgumentError,
421
- "Invalid binarization_method: #{@binarization_method}. Valid methods are: #{valid_methods}"
312
+ raise ArgumentError, "binarization_method must be one of: #{valid_methods.join(', ')}"
422
313
  end
423
314
 
424
315
  def to_h
@@ -448,16 +339,14 @@ module Kreuzberg
448
339
  class TokenReduction
449
340
  attr_reader :mode, :preserve_important_words
450
341
 
451
- VALID_MODES = %w[off light moderate aggressive maximum].freeze
452
-
453
342
  def initialize(mode: 'off', preserve_important_words: true)
454
343
  @mode = mode.to_s
455
344
  @preserve_important_words = preserve_important_words ? true : false
456
345
 
457
- # Validate mode against known valid modes
458
- return if VALID_MODES.include?(@mode)
346
+ valid_modes = %w[off light moderate aggressive maximum]
347
+ return if valid_modes.include?(@mode)
459
348
 
460
- raise ArgumentError, "Invalid token reduction mode: #{@mode}. Valid modes are: #{VALID_MODES.join(', ')}"
349
+ raise ArgumentError, "mode must be one of: #{valid_modes.join(', ')}"
461
350
  end
462
351
 
463
352
  def to_h
@@ -468,7 +357,6 @@ module Kreuzberg
468
357
  end
469
358
  end
470
359
 
471
- # HTML preprocessing configuration for content extraction
472
360
  class HtmlPreprocessing
473
361
  attr_reader :enabled, :preset, :remove_navigation, :remove_forms
474
362
 
@@ -497,7 +385,6 @@ module Kreuzberg
497
385
  end
498
386
  end
499
387
 
500
- # HTML rendering options for document conversion
501
388
  class HtmlOptions
502
389
  attr_reader :options
503
390
 
@@ -525,7 +412,6 @@ module Kreuzberg
525
412
  end
526
413
  end
527
414
 
528
- # YAKE keyword extraction parameters
529
415
  class KeywordYakeParams
530
416
  attr_reader :window_size
531
417
 
@@ -538,7 +424,6 @@ module Kreuzberg
538
424
  end
539
425
  end
540
426
 
541
- # RAKE keyword extraction parameters
542
427
  class KeywordRakeParams
543
428
  attr_reader :min_word_length, :max_words_per_phrase
544
429
 
@@ -555,7 +440,6 @@ module Kreuzberg
555
440
  end
556
441
  end
557
442
 
558
- # Keyword extraction configuration for document analysis
559
443
  class Keywords
560
444
  attr_reader :algorithm, :max_keywords, :min_score, :ngram_range,
561
445
  :language, :yake_params, :rake_params
@@ -601,36 +485,6 @@ module Kreuzberg
601
485
  end
602
486
  end
603
487
 
604
- # Page tracking configuration for multi-page documents
605
- #
606
- # @example Enable page extraction
607
- # pages = PageConfig.new(extract_pages: true)
608
- #
609
- # @example Enable page markers in content
610
- # pages = PageConfig.new(insert_page_markers: true, marker_format: "--- PAGE {page_num} ---")
611
- #
612
- class PageConfig
613
- attr_reader :extract_pages, :insert_page_markers, :marker_format
614
-
615
- def initialize(
616
- extract_pages: false,
617
- insert_page_markers: false,
618
- marker_format: "\n\n<!-- PAGE {page_num} -->\n\n"
619
- )
620
- @extract_pages = extract_pages ? true : false
621
- @insert_page_markers = insert_page_markers ? true : false
622
- @marker_format = marker_format.to_s
623
- end
624
-
625
- def to_h
626
- {
627
- extract_pages: @extract_pages,
628
- insert_page_markers: @insert_page_markers,
629
- marker_format: @marker_format
630
- }
631
- end
632
- end
633
-
634
488
  # Post-processor configuration
635
489
  #
636
490
  # @example Enable all post-processors
@@ -715,7 +569,7 @@ module Kreuzberg
715
569
  attr_reader :use_cache, :enable_quality_processing, :force_ocr,
716
570
  :ocr, :chunking, :language_detection, :pdf_options,
717
571
  :image_extraction, :image_preprocessing, :postprocessor,
718
- :token_reduction, :keywords, :html_options, :pages,
572
+ :token_reduction, :keywords, :html_options,
719
573
  :max_concurrent_extractions
720
574
 
721
575
  # Load configuration from a file.
@@ -734,6 +588,7 @@ module Kreuzberg
734
588
  #
735
589
  def self.from_file(path)
736
590
  hash = Kreuzberg._config_from_file_native(path)
591
+ # Convert string keys to symbols for keyword arguments
737
592
  new(**hash.transform_keys(&:to_sym))
738
593
  end
739
594
 
@@ -754,6 +609,7 @@ module Kreuzberg
754
609
  hash = Kreuzberg._config_discover_native
755
610
  return nil if hash.nil?
756
611
 
612
+ # Convert string keys to symbols for keyword arguments
757
613
  new(**hash.transform_keys(&:to_sym))
758
614
  end
759
615
 
@@ -771,7 +627,6 @@ module Kreuzberg
771
627
  token_reduction: nil,
772
628
  keywords: nil,
773
629
  html_options: nil,
774
- pages: nil,
775
630
  max_concurrent_extractions: nil
776
631
  )
777
632
  @use_cache = use_cache ? true : false
@@ -787,11 +642,10 @@ module Kreuzberg
787
642
  @token_reduction = normalize_config(token_reduction, TokenReduction)
788
643
  @keywords = normalize_config(keywords, Keywords)
789
644
  @html_options = normalize_config(html_options, HtmlOptions)
790
- @pages = normalize_config(pages, PageConfig)
791
645
  @max_concurrent_extractions = max_concurrent_extractions&.to_i
792
646
  end
793
647
 
794
- # rubocop:disable Metrics/CyclomaticComplexity
648
+ # rubocop:disable Metrics/PerceivedComplexity
795
649
  def to_h
796
650
  {
797
651
  use_cache: @use_cache,
@@ -807,130 +661,24 @@ module Kreuzberg
807
661
  token_reduction: @token_reduction&.to_h,
808
662
  keywords: @keywords&.to_h,
809
663
  html_options: @html_options&.to_h,
810
- pages: @pages&.to_h,
811
664
  max_concurrent_extractions: @max_concurrent_extractions
812
665
  }.compact
813
666
  end
814
- # rubocop:enable Metrics/CyclomaticComplexity
815
-
816
- # Serialize configuration to JSON string
817
- #
818
- # @return [String] JSON representation of the configuration
819
- #
820
- # @example
821
- # config = Extraction.new(use_cache: true)
822
- # json = config.to_json
823
- # puts json # => "{\"use_cache\":true,...}"
824
- #
825
- def to_json(*_args)
826
- json_hash = to_h
827
- # Convert to JSON directly - the native function has issues
828
- JSON.generate(json_hash)
829
- end
830
-
831
- # Get a field from the configuration
832
- #
833
- # Supports dot notation for nested fields (e.g., "ocr.backend")
834
- #
835
- # @param field_name [String, Symbol] Field name to retrieve
836
- # @return [Object, nil] Parsed field value, or nil if field doesn't exist
837
- #
838
- # @example Get a top-level field
839
- # config = Extraction.new(use_cache: true)
840
- # config.get_field("use_cache") # => true
841
- #
842
- # @example Get a nested field
843
- # config = Extraction.new(ocr: OCR.new(backend: "tesseract"))
844
- # config.get_field("ocr.backend") # => "tesseract"
845
- #
846
- def get_field(field_name)
847
- json_hash = to_h
848
- field_path = field_name.to_s.split('.')
849
-
850
- # Navigate the nested hash using the field path
851
- field_path.reduce(json_hash) do |current, key|
852
- case current
853
- when Hash
854
- # Check both symbol and string keys, prefer symbol if exists
855
- if current.key?(key.to_sym)
856
- current[key.to_sym]
857
- elsif current.key?(key.to_s)
858
- current[key.to_s]
859
- end
860
- end
861
- end
862
- end
863
-
864
- # Merge another configuration into this one
865
- #
866
- # Returns a new configuration with fields from the other config overriding
867
- # fields from this config (shallow merge).
868
- #
869
- # @param other [Extraction, Hash] Configuration to merge
870
- # @return [Extraction] New merged configuration
871
- #
872
- # @example
873
- # base = Extraction.new(use_cache: true, force_ocr: false)
874
- # override = Extraction.new(force_ocr: true)
875
- # merged = base.merge(override)
876
- # merged.use_cache # => true
877
- # merged.force_ocr # => true
878
- #
879
- def merge(other)
880
- other_config = other.is_a?(Extraction) ? other : Extraction.new(**other)
881
- # Merge the two config hashes
882
- merged_hash = to_h.merge(other_config.to_h)
883
- Extraction.new(**merged_hash)
884
- end
885
-
886
- # Merge another configuration into this one (mutating)
887
- #
888
- # Modifies this configuration in-place by merging fields from another config.
889
- #
890
- # @param other [Extraction, Hash] Configuration to merge
891
- # @return [self]
892
- #
893
- # @example
894
- # base = Extraction.new(use_cache: true, force_ocr: false)
895
- # override = Extraction.new(force_ocr: true)
896
- # base.merge!(override)
897
- # base.use_cache # => true
898
- # base.force_ocr # => true
899
- #
900
- def merge!(other)
901
- other_config = other.is_a?(Extraction) ? other : Extraction.new(**other)
902
- merged = merge(other_config)
903
- update_from_merged(merged)
904
- self
905
- end
667
+ # rubocop:enable Metrics/PerceivedComplexity
906
668
 
907
669
  private
908
670
 
909
671
  def normalize_config(value, klass)
910
672
  return nil if value.nil?
911
673
  return value if value.is_a?(klass)
674
+ # Convert string keys to symbols for keyword arguments
912
675
  return klass.new(**value.transform_keys(&:to_sym)) if value.is_a?(Hash)
913
676
 
914
677
  raise ArgumentError, "Expected #{klass}, Hash, or nil, got #{value.class}"
915
678
  end
916
-
917
- def update_from_merged(merged)
918
- @use_cache = merged.use_cache
919
- @enable_quality_processing = merged.enable_quality_processing
920
- @force_ocr = merged.force_ocr
921
- @ocr = merged.ocr
922
- @chunking = merged.chunking
923
- @language_detection = merged.language_detection
924
- @pdf_options = merged.pdf_options
925
- @image_extraction = merged.image_extraction
926
- @image_preprocessing = merged.image_preprocessing
927
- @postprocessor = merged.postprocessor
928
- @token_reduction = merged.token_reduction
929
- @keywords = merged.keywords
930
- @html_options = merged.html_options
931
- @pages = merged.pages
932
- @max_concurrent_extractions = merged.max_concurrent_extractions
933
- end
934
679
  end
680
+
681
+ # Backwards compatibility aliases
682
+ Ocr = OCR
935
683
  end
936
684
  end
@@ -1,75 +1,9 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'json'
4
-
5
3
  module Kreuzberg
6
- ERROR_CODE_SUCCESS = 0
7
- ERROR_CODE_GENERIC = 1
8
- ERROR_CODE_PANIC = 2
9
- ERROR_CODE_INVALID_ARGUMENT = 3
10
- ERROR_CODE_IO = 4
11
- ERROR_CODE_PARSING = 5
12
- ERROR_CODE_OCR = 6
13
- ERROR_CODE_MISSING_DEPENDENCY = 7
14
-
15
4
  module Errors
16
- class PanicContext
17
- attr_reader :file, :line, :function, :message, :timestamp_secs
18
-
19
- def initialize(file:, line:, function:, message:, timestamp_secs:)
20
- @file = file
21
- @line = line
22
- @function = function
23
- @message = message
24
- @timestamp_secs = timestamp_secs
25
- end
26
-
27
- def to_s
28
- "#{file}:#{line}:#{function}: #{message}"
29
- end
30
-
31
- def to_h
32
- {
33
- file:,
34
- line:,
35
- function:,
36
- message:,
37
- timestamp_secs:
38
- }
39
- end
40
-
41
- def self.from_json(json_string)
42
- return nil if json_string.nil? || json_string.empty?
43
-
44
- data = JSON.parse(json_string, symbolize_names: true)
45
- sliced = data.slice(:file, :line, :function, :message, :timestamp_secs)
46
- new(**with_defaults(sliced))
47
- rescue JSON::ParserError
48
- nil
49
- end
50
-
51
- def self.with_defaults(sliced)
52
- {
53
- file: sliced[:file] || '',
54
- line: sliced[:line] || 0,
55
- function: sliced[:function] || '',
56
- message: sliced[:message] || '',
57
- timestamp_secs: sliced[:timestamp_secs] || 0
58
- }
59
- end
60
- private_class_method :with_defaults
61
- end
62
-
63
5
  # Base error class for all Kreuzberg errors
64
- class Error < StandardError
65
- attr_reader :panic_context, :error_code
66
-
67
- def initialize(message, panic_context: nil, error_code: nil)
68
- super(message)
69
- @panic_context = panic_context
70
- @error_code = error_code
71
- end
72
- end
6
+ class Error < StandardError; end
73
7
 
74
8
  # Raised when validation fails
75
9
  class ValidationError < Error; end
@@ -78,8 +12,8 @@ module Kreuzberg
78
12
  class ParsingError < Error
79
13
  attr_reader :context
80
14
 
81
- def initialize(message, context: nil, panic_context: nil, error_code: nil)
82
- super(message, panic_context:, error_code:)
15
+ def initialize(message, context: nil)
16
+ super(message)
83
17
  @context = context
84
18
  end
85
19
  end
@@ -88,8 +22,8 @@ module Kreuzberg
88
22
  class OCRError < Error
89
23
  attr_reader :context
90
24
 
91
- def initialize(message, context: nil, panic_context: nil, error_code: nil)
92
- super(message, panic_context:, error_code:)
25
+ def initialize(message, context: nil)
26
+ super(message)
93
27
  @context = context
94
28
  end
95
29
  end
@@ -98,8 +32,8 @@ module Kreuzberg
98
32
  class MissingDependencyError < Error
99
33
  attr_reader :dependency
100
34
 
101
- def initialize(message, dependency: nil, panic_context: nil, error_code: nil)
102
- super(message, panic_context:, error_code:)
35
+ def initialize(message, dependency: nil)
36
+ super(message)
103
37
  @dependency = dependency
104
38
  end
105
39
  end