kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -1,595 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'spec_helper'
4
- require 'tempfile'
5
- require 'fileutils'
6
- require 'securerandom'
7
-
8
- RSpec.describe 'Batch Operations' do
9
- describe 'batch_extract_files with multiple file types' do
10
- it 'processes mixed file types in single batch' do
11
- paths = []
12
-
13
- # Create text file
14
- txt_file = Tempfile.new(['batch_test', '.txt'])
15
- txt_file.write('Text file content: Machine learning transforms technology.')
16
- txt_file.close
17
- paths << txt_file.path
18
-
19
- # Create markdown file
20
- md_file = Tempfile.new(['batch_test', '.md'])
21
- md_file.write('# Markdown Header\n\nContent about artificial intelligence.')
22
- md_file.close
23
- paths << md_file.path
24
-
25
- config = Kreuzberg::Config::Extraction.new
26
- results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
27
-
28
- expect(results).to be_a(Array)
29
- expect(results.length).to eq(2)
30
- results.each do |result|
31
- expect(result).to be_a(Kreuzberg::Result)
32
- expect(result.content).not_to be_empty
33
- end
34
-
35
- paths.each { |p| FileUtils.rm_f(p) }
36
- end
37
-
38
- it 'maintains file order through batch processing' do
39
- paths = []
40
- unique_markers = []
41
-
42
- 3.times do |i|
43
- file = Tempfile.new(["ordered_#{i}", '.txt'])
44
- marker = "MARKER_#{SecureRandom.hex(4)}"
45
- file.write("File #{i}: #{marker}")
46
- file.close
47
- paths << file.path
48
- unique_markers << marker
49
- end
50
-
51
- config = Kreuzberg::Config::Extraction.new
52
- results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
53
-
54
- expect(results.length).to eq(paths.length)
55
- results.each_with_index do |result, idx|
56
- expect(result.content).to include(unique_markers[idx])
57
- end
58
-
59
- paths.each { |p| FileUtils.rm_f(p) }
60
- end
61
-
62
- it 'processes large batch operations efficiently' do
63
- paths = []
64
-
65
- # Create 20 test files
66
- 20.times do |i|
67
- file = Tempfile.new(["large_batch_#{i}", '.txt'])
68
- file.write("Content #{i}: Machine learning technology")
69
- file.close
70
- paths << file.path
71
- end
72
-
73
- config = Kreuzberg::Config::Extraction.new
74
- results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
75
-
76
- expect(results.length).to eq(20)
77
- expect(results).to all(be_a(Kreuzberg::Result))
78
-
79
- paths.each { |p| FileUtils.rm_f(p) }
80
- end
81
-
82
- it 'handles batch with different file sizes' do
83
- paths = []
84
-
85
- # Small file
86
- small = Tempfile.new(['small', '.txt'])
87
- small.write('AI')
88
- small.close
89
- paths << small.path
90
-
91
- # Medium file
92
- medium = Tempfile.new(['medium', '.txt'])
93
- medium.write('Machine learning is a subset of artificial intelligence.')
94
- medium.close
95
- paths << medium.path
96
-
97
- # Large file
98
- large = Tempfile.new(['large', '.txt'])
99
- large.write('Machine learning ' * 100)
100
- large.close
101
- paths << large.path
102
-
103
- config = Kreuzberg::Config::Extraction.new
104
- results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
105
-
106
- expect(results.length).to eq(3)
107
- expect(results).to all(be_a(Kreuzberg::Result))
108
- expect(results[2].content.length).to be >= results[0].content.length
109
-
110
- paths.each { |p| FileUtils.rm_f(p) }
111
- end
112
- end
113
-
114
- describe 'batch extraction with configuration options' do
115
- it 'applies consistent configuration across batch' do
116
- paths = []
117
-
118
- 3.times do |i|
119
- file = Tempfile.new(["config_batch_#{i}", '.txt'])
120
- file.write("Machine learning content #{i}. Artificial intelligence advances.")
121
- file.close
122
- paths << file.path
123
- end
124
-
125
- config = Kreuzberg::Config::Extraction.new(
126
- keywords: Kreuzberg::Config::Keywords.new(
127
- algorithm: 'yake',
128
- max_keywords: 5
129
- )
130
- )
131
-
132
- results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
133
-
134
- expect(results.length).to eq(3)
135
- results.each do |result|
136
- expect(result).to be_a(Kreuzberg::Result)
137
- expect(result.content).not_to be_nil
138
- end
139
-
140
- paths.each { |p| FileUtils.rm_f(p) }
141
- end
142
-
143
- it 'batch respects caching configuration' do
144
- path = Tempfile.new(['cache_test', '.txt']).tap do |f|
145
- f.write('Cache test content')
146
- f.close
147
- end
148
-
149
- config_no_cache = Kreuzberg::Config::Extraction.new(use_cache: false)
150
- results1 = Kreuzberg.batch_extract_files_sync(paths: [path.path], config: config_no_cache)
151
-
152
- config_with_cache = Kreuzberg::Config::Extraction.new(use_cache: true)
153
- results2 = Kreuzberg.batch_extract_files_sync(paths: [path.path], config: config_with_cache)
154
-
155
- expect(results1.length).to eq(1)
156
- expect(results2.length).to eq(1)
157
- expect(results1[0].content).to eq(results2[0].content)
158
-
159
- FileUtils.rm_f(path.path)
160
- end
161
-
162
- it 'supports keyword extraction configuration in batch' do
163
- paths = []
164
-
165
- 2.times do |i|
166
- file = Tempfile.new(["keywords_batch_#{i}", '.txt'])
167
- file.write('Machine learning and deep learning enable artificial intelligence.')
168
- file.close
169
- paths << file.path
170
- end
171
-
172
- algorithms = %w[yake rake]
173
-
174
- algorithms.each do |algo|
175
- config = Kreuzberg::Config::Extraction.new(
176
- keywords: Kreuzberg::Config::Keywords.new(
177
- algorithm: algo,
178
- max_keywords: 5
179
- )
180
- )
181
-
182
- results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
183
- expect(results.length).to eq(2)
184
- end
185
-
186
- paths.each { |p| FileUtils.rm_f(p) }
187
- end
188
- end
189
-
190
- describe 'batch error handling and resilience' do
191
- it 'processes batch with some invalid paths gracefully' do
192
- valid_file = Tempfile.new(['valid_batch', '.txt']).tap do |f|
193
- f.write('Valid content')
194
- f.close
195
- end
196
-
197
- valid_path = valid_file.path
198
- config = Kreuzberg::Config::Extraction.new
199
-
200
- # Process just the valid path
201
- results = Kreuzberg.batch_extract_files_sync(paths: [valid_path], config: config)
202
- expect(results.length).to eq(1)
203
- expect(results[0]).to be_a(Kreuzberg::Result)
204
-
205
- FileUtils.rm_f(valid_path)
206
- end
207
-
208
- it 'handles empty file list in batch' do
209
- config = Kreuzberg::Config::Extraction.new
210
- results = Kreuzberg.batch_extract_files_sync(paths: [], config: config)
211
-
212
- expect(results).to be_a(Array)
213
- expect(results).to be_empty
214
- end
215
-
216
- it 'processes batch with single file' do
217
- file = Tempfile.new(['single_batch', '.txt']).tap do |f|
218
- f.write('Single file batch processing')
219
- f.close
220
- end
221
-
222
- config = Kreuzberg::Config::Extraction.new
223
- results = Kreuzberg.batch_extract_files_sync(paths: [file.path], config: config)
224
-
225
- expect(results.length).to eq(1)
226
- expect(results[0]).to be_a(Kreuzberg::Result)
227
-
228
- FileUtils.rm_f(file.path)
229
- end
230
-
231
- it 'maintains batch execution on partial failures' do
232
- valid_file = Tempfile.new(['valid', '.txt']).tap do |f|
233
- f.write('Valid content')
234
- f.close
235
- end
236
-
237
- config = Kreuzberg::Config::Extraction.new
238
- results = Kreuzberg.batch_extract_files_sync(paths: [valid_file.path], config: config)
239
-
240
- expect(results).to be_a(Array)
241
- expect(results).to all(be_a(Kreuzberg::Result))
242
-
243
- FileUtils.rm_f(valid_file.path)
244
- end
245
- end
246
-
247
- describe 'batch enumerable processing' do
248
- it 'iterates over batch results with each' do
249
- paths = []
250
-
251
- 3.times do |i|
252
- file = Tempfile.new(["enum_#{i}", '.txt'])
253
- file.write("Enumerable test #{i}")
254
- file.close
255
- paths << file.path
256
- end
257
-
258
- config = Kreuzberg::Config::Extraction.new
259
- results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
260
-
261
- count = 0
262
- results.each do |result|
263
- expect(result).to be_a(Kreuzberg::Result)
264
- count += 1
265
- end
266
-
267
- expect(count).to eq(3)
268
-
269
- paths.each { |p| FileUtils.rm_f(p) }
270
- end
271
-
272
- it 'maps batch results to extract content' do
273
- paths = []
274
-
275
- 3.times do |i|
276
- file = Tempfile.new(["map_#{i}", '.txt'])
277
- file.write("Mapping #{i}")
278
- file.close
279
- paths << file.path
280
- end
281
-
282
- config = Kreuzberg::Config::Extraction.new
283
- results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
284
-
285
- contents = results.map(&:content)
286
- expect(contents).to be_a(Array)
287
- expect(contents.length).to eq(3)
288
- expect(contents).to all(be_a(String))
289
-
290
- paths.each { |p| FileUtils.rm_f(p) }
291
- end
292
-
293
- it 'filters batch results by content length' do
294
- paths = []
295
-
296
- # Small file
297
- small = Tempfile.new(['small', '.txt']).tap do |f|
298
- f.write('x')
299
- f.close
300
- end
301
- paths << small.path
302
-
303
- # Large file
304
- large = Tempfile.new(['large', '.txt']).tap do |f|
305
- f.write('content ' * 50)
306
- f.close
307
- end
308
- paths << large.path
309
-
310
- config = Kreuzberg::Config::Extraction.new
311
- results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
312
-
313
- large_results = results.select { |r| r.content.length > 20 }
314
- expect(large_results.length).to be >= 1
315
-
316
- paths.each { |p| FileUtils.rm_f(p) }
317
- end
318
-
319
- it 'reduces batch results to combined content' do
320
- paths = []
321
-
322
- 3.times do |i|
323
- file = Tempfile.new(["reduce_#{i}", '.txt'])
324
- file.write("Part #{i} ")
325
- file.close
326
- paths << file.path
327
- end
328
-
329
- config = Kreuzberg::Config::Extraction.new
330
- results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
331
-
332
- combined = results.reduce('') { |acc, r| acc + r.content }
333
- expect(combined).not_to be_empty
334
- expect(combined).to include('Part')
335
-
336
- paths.each { |p| FileUtils.rm_f(p) }
337
- end
338
- end
339
-
340
- describe 'batch with chunking and embeddings' do
341
- it 'processes batch with chunking enabled' do
342
- paths = []
343
-
344
- 2.times do |i|
345
- file = Tempfile.new(["chunking_batch_#{i}", '.txt'])
346
- file.write('Machine learning ' * 50)
347
- file.close
348
- paths << file.path
349
- end
350
-
351
- config = Kreuzberg::Config::Extraction.new(
352
- chunking: Kreuzberg::Config::Chunking.new(
353
- enabled: true,
354
- max_chars: 100
355
- )
356
- )
357
-
358
- results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
359
-
360
- expect(results.length).to eq(2)
361
- expect(results).to all(be_a(Kreuzberg::Result))
362
-
363
- paths.each { |p| FileUtils.rm_f(p) }
364
- end
365
-
366
- it 'batch processes with embedding generation' do
367
- paths = []
368
-
369
- 2.times do |i|
370
- file = Tempfile.new(["embedding_batch_#{i}", '.txt'])
371
- file.write('Artificial intelligence transforms technology development.')
372
- file.close
373
- paths << file.path
374
- end
375
-
376
- # Use basic chunking without embeddings to avoid ONNX dependency
377
- config = Kreuzberg::Config::Extraction.new(
378
- chunking: Kreuzberg::Config::Chunking.new(
379
- enabled: true,
380
- max_chars: 100
381
- )
382
- )
383
-
384
- results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
385
-
386
- expect(results.length).to eq(2)
387
- expect(results).to all(be_a(Kreuzberg::Result))
388
-
389
- paths.each { |p| FileUtils.rm_f(p) }
390
- end
391
- end
392
-
393
- describe 'batch result properties and validation' do
394
- it 'each batch result has required properties' do
395
- paths = []
396
-
397
- 2.times do |i|
398
- file = Tempfile.new(["props_#{i}", '.txt'])
399
- file.write("Result properties test #{i}")
400
- file.close
401
- paths << file.path
402
- end
403
-
404
- config = Kreuzberg::Config::Extraction.new
405
- results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
406
-
407
- results.each do |result|
408
- expect(result).to respond_to(:content)
409
- expect(result).to respond_to(:mime_type)
410
- expect(result.content).to be_a(String)
411
- expect(result.mime_type).to be_a(String)
412
- end
413
-
414
- paths.each { |p| FileUtils.rm_f(p) }
415
- end
416
-
417
- it 'batch results maintain independence' do
418
- file1 = Tempfile.new(['indep1', '.txt']).tap do |f|
419
- f.write('First file content')
420
- f.close
421
- end
422
-
423
- file2 = Tempfile.new(['indep2', '.txt']).tap do |f|
424
- f.write('Second file content')
425
- f.close
426
- end
427
-
428
- paths = [file1.path, file2.path]
429
-
430
- config = Kreuzberg::Config::Extraction.new
431
- results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
432
-
433
- expect(results[0].content).not_to eq(results[1].content)
434
- expect(results[0].content).to include('First')
435
- expect(results[1].content).to include('Second')
436
-
437
- paths.each { |p| FileUtils.rm_f(p) }
438
- end
439
-
440
- it 'batch results have consistent structure' do
441
- paths = []
442
-
443
- 3.times do |i|
444
- file = Tempfile.new(["struct_#{i}", '.txt'])
445
- file.write("Structure test #{i}")
446
- file.close
447
- paths << file.path
448
- end
449
-
450
- config = Kreuzberg::Config::Extraction.new
451
- results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
452
-
453
- first_keys = results.first.respond_to?(:to_h) ? results.first.to_h.keys : []
454
-
455
- results.each do |result|
456
- if result.respond_to?(:to_h)
457
- result_keys = result.to_h.keys
458
- expect(result_keys).to match_array(first_keys) if first_keys.any?
459
- end
460
- end
461
-
462
- paths.each { |p| FileUtils.rm_f(p) }
463
- end
464
- end
465
-
466
- describe 'batch performance characteristics' do
467
- it 'completes batch faster than sequential processing' do
468
- paths = []
469
-
470
- 5.times do |i|
471
- file = Tempfile.new(["perf_#{i}", '.txt'])
472
- file.write("Performance test #{i}")
473
- file.close
474
- paths << file.path
475
- end
476
-
477
- config = Kreuzberg::Config::Extraction.new
478
-
479
- # Batch time
480
- batch_start = Time.now
481
- batch_results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
482
- batch_time = Time.now - batch_start
483
-
484
- # Sequential time
485
- seq_start = Time.now
486
- seq_results = paths.map { |p| Kreuzberg.extract_file_sync(path: p, config: config) }
487
- seq_time = Time.now - seq_start
488
-
489
- expect(batch_results.length).to eq(seq_results.length)
490
- # Batch should be faster or comparable
491
- expect(batch_time).to be <= seq_time + 1.0
492
-
493
- paths.each { |p| FileUtils.rm_f(p) }
494
- end
495
- end
496
-
497
- describe 'batch with special configurations' do
498
- it 'batch processes with language detection' do
499
- paths = []
500
-
501
- file = Tempfile.new(['lang_batch', '.txt']).tap do |f|
502
- f.write('Machine learning is transforming industries worldwide.')
503
- f.close
504
- end
505
- paths << file.path
506
-
507
- config = Kreuzberg::Config::Extraction.new(
508
- language_detection: Kreuzberg::Config::LanguageDetection.new(enabled: true)
509
- )
510
-
511
- results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
512
- expect(results.length).to eq(1)
513
-
514
- paths.each { |p| FileUtils.rm_f(p) }
515
- end
516
-
517
- it 'batch with mixed keyword algorithms' do
518
- paths = []
519
-
520
- 2.times do |i|
521
- file = Tempfile.new(["mixed_algo_#{i}", '.txt'])
522
- file.write('Machine learning neural networks artificial intelligence')
523
- file.close
524
- paths << file.path
525
- end
526
-
527
- # First batch with YAKE
528
- config_yake = Kreuzberg::Config::Extraction.new(
529
- keywords: Kreuzberg::Config::Keywords.new(algorithm: 'yake', max_keywords: 3)
530
- )
531
- results_yake = Kreuzberg.batch_extract_files_sync(paths: paths, config: config_yake)
532
- expect(results_yake.length).to eq(2)
533
-
534
- # Second batch with RAKE
535
- config_rake = Kreuzberg::Config::Extraction.new(
536
- keywords: Kreuzberg::Config::Keywords.new(algorithm: 'rake', max_keywords: 3)
537
- )
538
- results_rake = Kreuzberg.batch_extract_files_sync(paths: paths, config: config_rake)
539
- expect(results_rake.length).to eq(2)
540
-
541
- paths.each { |p| FileUtils.rm_f(p) }
542
- end
543
- end
544
-
545
- describe 'batch with result aggregation' do
546
- it 'aggregates batch results into statistics' do
547
- paths = []
548
-
549
- 3.times do |i|
550
- file = Tempfile.new(["stats_#{i}", '.txt'])
551
- file.write("Content #{i} " * 10)
552
- file.close
553
- paths << file.path
554
- end
555
-
556
- config = Kreuzberg::Config::Extraction.new
557
- results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
558
-
559
- # Create aggregated statistics
560
- stats = {
561
- total_files: results.length,
562
- total_content_length: results.sum { |r| r.content.length },
563
- avg_content_length: results.sum { |r| r.content.length } / results.length,
564
- mime_types: results.map(&:mime_type).uniq
565
- }
566
-
567
- expect(stats[:total_files]).to eq(3)
568
- expect(stats[:total_content_length]).to be > 0
569
- expect(stats[:avg_content_length]).to be > 0
570
- expect(stats[:mime_types]).to be_a(Array)
571
-
572
- paths.each { |p| FileUtils.rm_f(p) }
573
- end
574
-
575
- it 'batch results support JSON serialization' do
576
- paths = []
577
-
578
- file = Tempfile.new(['json_batch', '.txt']).tap do |f|
579
- f.write('JSON serialization test')
580
- f.close
581
- end
582
- paths << file.path
583
-
584
- config = Kreuzberg::Config::Extraction.new
585
- results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
586
-
587
- expect(results.first).to respond_to(:to_json)
588
- json_str = results.first.to_json
589
- expect(json_str).to be_a(String)
590
- expect(json_str.length).to be > 0
591
-
592
- paths.each { |p| FileUtils.rm_f(p) }
593
- end
594
- end
595
- end