kreuzberg 4.0.0.rc1 → 4.0.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (321)
  1. checksums.yaml +4 -4
  2. data/.gitignore +6 -0
  3. data/.rubocop.yaml +1 -534
  4. data/.rubocop.yml +543 -0
  5. data/Gemfile +1 -2
  6. data/Gemfile.lock +116 -28
  7. data/README.md +274 -299
  8. data/Rakefile +9 -0
  9. data/Steepfile +8 -4
  10. data/examples/async_patterns.rb +1 -58
  11. data/ext/kreuzberg_rb/extconf.rb +35 -5
  12. data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
  13. data/ext/kreuzberg_rb/native/Cargo.lock +7619 -0
  14. data/ext/kreuzberg_rb/native/Cargo.toml +55 -16
  15. data/ext/kreuzberg_rb/native/build.rs +12 -14
  16. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  17. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  18. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  19. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  20. data/ext/kreuzberg_rb/native/src/lib.rs +897 -34
  21. data/extconf.rb +38 -6
  22. data/kreuzberg.gemspec +114 -20
  23. data/lib/kreuzberg/api_proxy.rb +2 -18
  24. data/lib/kreuzberg/cache_api.rb +22 -0
  25. data/lib/kreuzberg/cli.rb +2 -10
  26. data/lib/kreuzberg/cli_proxy.rb +0 -10
  27. data/lib/kreuzberg/config.rb +274 -22
  28. data/lib/kreuzberg/error_context.rb +136 -0
  29. data/lib/kreuzberg/errors.rb +73 -7
  30. data/lib/kreuzberg/extraction_api.rb +237 -8
  31. data/lib/kreuzberg/mcp_proxy.rb +2 -11
  32. data/lib/kreuzberg/ocr_backend_protocol.rb +0 -73
  33. data/lib/kreuzberg/post_processor_protocol.rb +0 -71
  34. data/lib/kreuzberg/result.rb +151 -33
  35. data/lib/kreuzberg/setup_lib_path.rb +22 -2
  36. data/lib/kreuzberg/types.rb +170 -0
  37. data/lib/kreuzberg/validator_protocol.rb +0 -73
  38. data/lib/kreuzberg/version.rb +1 -1
  39. data/lib/kreuzberg.rb +27 -13
  40. data/lib/libpdfium.so +0 -0
  41. data/sig/kreuzberg.rbs +105 -12
  42. data/spec/binding/async_operations_spec.rb +473 -0
  43. data/spec/binding/batch_operations_spec.rb +595 -0
  44. data/spec/binding/batch_spec.rb +359 -0
  45. data/spec/binding/cache_spec.rb +22 -22
  46. data/spec/binding/cli_proxy_spec.rb +2 -4
  47. data/spec/binding/cli_spec.rb +12 -11
  48. data/spec/binding/config_result_spec.rb +377 -0
  49. data/spec/binding/config_spec.rb +74 -0
  50. data/spec/binding/config_validation_spec.rb +100 -6
  51. data/spec/binding/embeddings_spec.rb +816 -0
  52. data/spec/binding/error_handling_spec.rb +283 -97
  53. data/spec/binding/error_recovery_spec.rb +488 -0
  54. data/spec/binding/font_config_spec.rb +220 -0
  55. data/spec/binding/images_spec.rb +738 -0
  56. data/spec/binding/keywords_extraction_spec.rb +600 -0
  57. data/spec/binding/metadata_types_spec.rb +1228 -0
  58. data/spec/binding/pages_extraction_spec.rb +471 -0
  59. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  60. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  61. data/spec/binding/plugins/validator_spec.rb +12 -13
  62. data/spec/binding/tables_spec.rb +641 -0
  63. data/spec/fixtures/config.toml +0 -1
  64. data/spec/fixtures/config.yaml +0 -1
  65. data/spec/fixtures/invalid_config.toml +0 -1
  66. data/spec/smoke/package_spec.rb +2 -3
  67. data/spec/spec_helper.rb +1 -3
  68. data/spec/unit/config/chunking_config_spec.rb +213 -0
  69. data/spec/unit/config/embedding_config_spec.rb +343 -0
  70. data/spec/unit/config/extraction_config_spec.rb +438 -0
  71. data/spec/unit/config/font_config_spec.rb +285 -0
  72. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  73. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  74. data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
  75. data/spec/unit/config/keyword_config_spec.rb +229 -0
  76. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  77. data/spec/unit/config/ocr_config_spec.rb +171 -0
  78. data/spec/unit/config/page_config_spec.rb +221 -0
  79. data/spec/unit/config/pdf_config_spec.rb +267 -0
  80. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  81. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  82. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  83. data/test/metadata_types_test.rb +959 -0
  84. data/vendor/Cargo.toml +61 -0
  85. data/vendor/kreuzberg/Cargo.toml +192 -67
  86. data/vendor/kreuzberg/README.md +98 -10
  87. data/vendor/kreuzberg/build.rs +516 -194
  88. data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
  89. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
  90. data/vendor/kreuzberg/src/api/handlers.rs +130 -9
  91. data/vendor/kreuzberg/src/api/mod.rs +18 -3
  92. data/vendor/kreuzberg/src/api/server.rs +236 -71
  93. data/vendor/kreuzberg/src/api/types.rs +43 -7
  94. data/vendor/kreuzberg/src/cache/mod.rs +27 -3
  95. data/vendor/kreuzberg/src/chunking/mod.rs +1705 -79
  96. data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
  97. data/vendor/kreuzberg/src/core/batch_mode.rs +60 -0
  98. data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
  99. data/vendor/kreuzberg/src/core/config.rs +905 -23
  100. data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
  101. data/vendor/kreuzberg/src/core/extractor.rs +403 -106
  102. data/vendor/kreuzberg/src/core/formats.rs +235 -0
  103. data/vendor/kreuzberg/src/core/io.rs +4 -2
  104. data/vendor/kreuzberg/src/core/mime.rs +2 -12
  105. data/vendor/kreuzberg/src/core/mod.rs +22 -3
  106. data/vendor/kreuzberg/src/core/pipeline.rs +395 -78
  107. data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
  108. data/vendor/kreuzberg/src/embeddings.rs +169 -21
  109. data/vendor/kreuzberg/src/error.rs +2 -2
  110. data/vendor/kreuzberg/src/extraction/archive.rs +36 -31
  111. data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
  112. data/vendor/kreuzberg/src/extraction/docx.rs +365 -1
  113. data/vendor/kreuzberg/src/extraction/email.rs +12 -11
  114. data/vendor/kreuzberg/src/extraction/excel.rs +138 -129
  115. data/vendor/kreuzberg/src/extraction/html.rs +1447 -170
  116. data/vendor/kreuzberg/src/extraction/image.rs +138 -14
  117. data/vendor/kreuzberg/src/extraction/libreoffice.rs +13 -3
  118. data/vendor/kreuzberg/src/extraction/markdown.rs +216 -0
  119. data/vendor/kreuzberg/src/extraction/mod.rs +21 -5
  120. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +2 -0
  121. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -0
  122. data/vendor/kreuzberg/src/extraction/pptx.rs +196 -94
  123. data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
  124. data/vendor/kreuzberg/src/extraction/table.rs +2 -1
  125. data/vendor/kreuzberg/src/extraction/text.rs +18 -10
  126. data/vendor/kreuzberg/src/extractors/archive.rs +22 -0
  127. data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -0
  128. data/vendor/kreuzberg/src/extractors/docbook.rs +504 -0
  129. data/vendor/kreuzberg/src/extractors/docx.rs +69 -148
  130. data/vendor/kreuzberg/src/extractors/email.rs +37 -9
  131. data/vendor/kreuzberg/src/extractors/epub.rs +696 -0
  132. data/vendor/kreuzberg/src/extractors/excel.rs +81 -40
  133. data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -0
  134. data/vendor/kreuzberg/src/extractors/html.rs +182 -173
  135. data/vendor/kreuzberg/src/extractors/image.rs +32 -8
  136. data/vendor/kreuzberg/src/extractors/jats.rs +1054 -0
  137. data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -0
  138. data/vendor/kreuzberg/src/extractors/latex.rs +653 -0
  139. data/vendor/kreuzberg/src/extractors/markdown.rs +701 -0
  140. data/vendor/kreuzberg/src/extractors/mod.rs +171 -10
  141. data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
  142. data/vendor/kreuzberg/src/extractors/opml.rs +635 -0
  143. data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -0
  144. data/vendor/kreuzberg/src/extractors/pdf.rs +329 -64
  145. data/vendor/kreuzberg/src/extractors/pptx.rs +79 -34
  146. data/vendor/kreuzberg/src/extractors/rst.rs +577 -0
  147. data/vendor/kreuzberg/src/extractors/rtf.rs +809 -0
  148. data/vendor/kreuzberg/src/extractors/security.rs +484 -0
  149. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
  150. data/vendor/kreuzberg/src/extractors/structured.rs +16 -0
  151. data/vendor/kreuzberg/src/extractors/text.rs +30 -7
  152. data/vendor/kreuzberg/src/extractors/typst.rs +651 -0
  153. data/vendor/kreuzberg/src/extractors/xml.rs +27 -8
  154. data/vendor/kreuzberg/src/keywords/processor.rs +9 -1
  155. data/vendor/kreuzberg/src/keywords/rake.rs +0 -1
  156. data/vendor/kreuzberg/src/language_detection/mod.rs +94 -51
  157. data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
  158. data/vendor/kreuzberg/src/lib.rs +17 -5
  159. data/vendor/kreuzberg/src/mcp/mod.rs +4 -1
  160. data/vendor/kreuzberg/src/mcp/server.rs +145 -21
  161. data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
  162. data/vendor/kreuzberg/src/ocr/mod.rs +2 -0
  163. data/vendor/kreuzberg/src/ocr/processor.rs +19 -8
  164. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +56 -50
  165. data/vendor/kreuzberg/src/panic_context.rs +154 -0
  166. data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
  167. data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
  168. data/vendor/kreuzberg/src/pdf/error.rs +93 -1
  169. data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
  170. data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
  171. data/vendor/kreuzberg/src/pdf/metadata.rs +263 -100
  172. data/vendor/kreuzberg/src/pdf/mod.rs +33 -2
  173. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  174. data/vendor/kreuzberg/src/pdf/table.rs +61 -64
  175. data/vendor/kreuzberg/src/pdf/text.rs +416 -24
  176. data/vendor/kreuzberg/src/plugins/extractor.rs +40 -8
  177. data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
  178. data/vendor/kreuzberg/src/plugins/ocr.rs +22 -14
  179. data/vendor/kreuzberg/src/plugins/processor.rs +10 -1
  180. data/vendor/kreuzberg/src/plugins/registry.rs +15 -0
  181. data/vendor/kreuzberg/src/plugins/validator.rs +20 -8
  182. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  183. data/vendor/kreuzberg/src/text/mod.rs +8 -0
  184. data/vendor/kreuzberg/src/text/quality.rs +28 -15
  185. data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
  186. data/vendor/kreuzberg/src/text/string_utils.rs +22 -10
  187. data/vendor/kreuzberg/src/text/token_reduction/core.rs +86 -50
  188. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +37 -16
  189. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +2 -1
  190. data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
  191. data/vendor/kreuzberg/src/types.rs +907 -67
  192. data/vendor/kreuzberg/src/utils/mod.rs +14 -0
  193. data/vendor/kreuzberg/src/utils/pool.rs +503 -0
  194. data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
  195. data/vendor/kreuzberg/src/utils/quality.rs +12 -3
  196. data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
  197. data/vendor/kreuzberg/tests/api_embed.rs +360 -0
  198. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
  199. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
  200. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
  201. data/vendor/kreuzberg/tests/api_tests.rs +506 -0
  202. data/vendor/kreuzberg/tests/archive_integration.rs +2 -0
  203. data/vendor/kreuzberg/tests/batch_orchestration.rs +57 -12
  204. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
  205. data/vendor/kreuzberg/tests/batch_processing.rs +32 -8
  206. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
  207. data/vendor/kreuzberg/tests/concurrency_stress.rs +40 -8
  208. data/vendor/kreuzberg/tests/config_features.rs +33 -1
  209. data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
  210. data/vendor/kreuzberg/tests/config_loading_tests.rs +16 -39
  211. data/vendor/kreuzberg/tests/core_integration.rs +35 -9
  212. data/vendor/kreuzberg/tests/csv_integration.rs +71 -81
  213. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
  214. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -0
  215. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +23 -25
  216. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
  217. data/vendor/kreuzberg/tests/email_integration.rs +3 -1
  218. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
  219. data/vendor/kreuzberg/tests/error_handling.rs +43 -34
  220. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
  221. data/vendor/kreuzberg/tests/format_integration.rs +7 -1
  222. data/vendor/kreuzberg/tests/helpers/mod.rs +60 -0
  223. data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
  224. data/vendor/kreuzberg/tests/image_integration.rs +2 -0
  225. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
  226. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
  227. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
  228. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
  229. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
  230. data/vendor/kreuzberg/tests/mime_detection.rs +17 -16
  231. data/vendor/kreuzberg/tests/ocr_configuration.rs +4 -0
  232. data/vendor/kreuzberg/tests/ocr_errors.rs +22 -0
  233. data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
  234. data/vendor/kreuzberg/tests/ocr_quality.rs +2 -0
  235. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -0
  236. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
  237. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
  238. data/vendor/kreuzberg/tests/page_markers.rs +297 -0
  239. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
  240. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
  241. data/vendor/kreuzberg/tests/pdf_integration.rs +2 -0
  242. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
  243. data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
  244. data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
  245. data/vendor/kreuzberg/tests/pipeline_integration.rs +36 -2
  246. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +5 -0
  247. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +17 -1
  248. data/vendor/kreuzberg/tests/plugin_system.rs +6 -0
  249. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -22
  250. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -0
  251. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -0
  252. data/vendor/kreuzberg/tests/security_validation.rs +13 -1
  253. data/vendor/kreuzberg/tests/test_fastembed.rs +45 -23
  254. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -0
  255. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -0
  256. data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
  257. data/vendor/kreuzberg-ffi/README.md +851 -0
  258. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
  259. data/vendor/kreuzberg-ffi/build.rs +168 -0
  260. data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
  261. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
  262. data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
  263. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
  264. data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
  265. data/vendor/kreuzberg-ffi/src/error.rs +901 -0
  266. data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
  267. data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
  268. data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
  269. data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
  270. data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
  271. data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
  272. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
  273. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
  274. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
  275. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
  276. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
  277. data/vendor/kreuzberg-ffi/src/result.rs +510 -0
  278. data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
  279. data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
  280. data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
  281. data/vendor/kreuzberg-ffi/src/types.rs +363 -0
  282. data/vendor/kreuzberg-ffi/src/util.rs +210 -0
  283. data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
  284. data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
  285. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
  286. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
  287. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
  288. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
  289. data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
  290. data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
  291. data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
  292. data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
  293. data/vendor/kreuzberg-tesseract/LICENSE +22 -0
  294. data/vendor/kreuzberg-tesseract/README.md +399 -0
  295. data/vendor/kreuzberg-tesseract/build.rs +1127 -0
  296. data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
  297. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
  298. data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
  299. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
  300. data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
  301. data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
  302. data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
  303. data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
  304. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
  305. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
  306. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
  307. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
  308. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
  309. metadata +171 -25
  310. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  311. data/spec/examples.txt +0 -104
  312. data/vendor/kreuzberg/src/bin/profile_extract.rs +0 -455
  313. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +0 -275
  314. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +0 -178
  315. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +0 -491
  316. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +0 -496
  317. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +0 -1188
  318. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +0 -162
  319. data/vendor/kreuzberg/src/extractors/pandoc.rs +0 -201
  320. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +0 -92
  321. data/vendor/kreuzberg/tests/pandoc_integration.rs +0 -503
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: bfb7dc6c3685285b053af795140387ad8292c698cbba4cfda3411390dfe3cd48
4
- data.tar.gz: 7688722b9a8d99564044a8bca31c6fadf107208aabaa336d450069b636a02726
3
+ metadata.gz: 2aefdaf524125ecb2c2b849055c11c33552fab92ba509549d3c1407d3796b7c2
4
+ data.tar.gz: aadb510951d087fbf4c34a7c058e80e64252b01e1c3b1989bda56f710e24fe8d
5
5
  SHA512:
6
- metadata.gz: 3c5b66fc2c0457f670c39e53f038a5fa040760ac3f36b8731c35210d2ca98b1cf65ae41411bd6788411f18090ffd269878cf814d4b93b4201098f5e5950782e0
7
- data.tar.gz: 1a02c5c0ffb830454e20e03e937aa4d80e73d9e6a6ae4913ed9d35d6cdb84ffbb2141711fa3b2ce5fcd821b109613f5ec01cbfdcface5615071727b26c772347
6
+ metadata.gz: e754397146324c9bc897a084be3406b81cced652425064e81c8e09cf0a3ea6ca56cdb9187350baf768a582f6403b1ebd10e33b5fd16fc00230410ac08f9baa67
7
+ data.tar.gz: 721b76aab0dc590416051a6d214ed6630294fbcbbafd72fdaa3ade646b09c04da2d41026872ad501ef079a58232f77bf2f492de0d8d9af1d314a80a95971fd1f
data/.gitignore CHANGED
@@ -6,3 +6,9 @@ lib/*.dll
6
6
  lib/*.dylib
7
7
  lib/*.so
8
8
  lib/*.dll
9
+
10
+ # Vendor directory for local development (symlink to ../../crates/kreuzberg)
11
+ # In CI, this is replaced by the actual vendored crate files
12
+ # Exception: vendor/rb-sys is patched and committed
13
+ !vendor/rb-sys/
14
+ vendor/
data/.rubocop.yaml CHANGED
@@ -1,534 +1 @@
1
-
2
- plugins:
3
- - rubocop-performance
4
- - rubocop-rspec
5
-
6
- AllCops:
7
- TargetRubyVersion: 3.2
8
- NewCops: enable
9
- SuggestExtensions: false
10
- Exclude:
11
- - 'vendor/**/*'
12
- - 'tmp/**/*'
13
- - 'lib/**/*.bundle'
14
- - 'ext/**/*'
15
-
16
- Style/StringLiterals:
17
- Enabled: true
18
- EnforcedStyle: single_quotes
19
-
20
- Style/StringLiteralsInInterpolation:
21
- Enabled: true
22
- EnforcedStyle: single_quotes
23
-
24
- Style/FrozenStringLiteralComment:
25
- Enabled: true
26
- EnforcedStyle: always
27
-
28
- Style/Documentation:
29
- Enabled: false
30
-
31
- Layout/LineLength:
32
- Max: 120
33
- AllowedPatterns:
34
- - '\A\s*#'
35
- Exclude:
36
- - 'spec/**/*'
37
-
38
- Metrics/BlockLength:
39
- Max: 350
40
- Exclude:
41
- - '*.gemspec'
42
- - 'examples/**/*'
43
-
44
- Metrics/MethodLength:
45
- Max: 18
46
- Exclude:
47
- - 'spec/**/*'
48
- - 'examples/**/*'
49
-
50
- Metrics/AbcSize:
51
- Max: 20
52
- Exclude:
53
- - 'spec/**/*'
54
- - 'examples/**/*'
55
-
56
- Naming/FileName:
57
- Enabled: true
58
- Exclude:
59
- - 'Gemfile'
60
- - 'Rakefile'
61
-
62
- RSpec/SpecFilePathFormat:
63
- Enabled: false
64
- RSpec/SpecFilePathSuffix:
65
- Enabled: false
66
-
67
- RSpec/PendingWithoutReason:
68
- Enabled: false
69
-
70
- RSpec/LeakyConstantDeclaration:
71
- Enabled: false
72
-
73
- RSpec/UnspecifiedException:
74
- Enabled: false
75
-
76
- Lint/ConstantDefinitionInBlock:
77
- Exclude:
78
- - 'spec/**/*'
79
-
80
- RSpec/InstanceVariable:
81
- Enabled: false
82
-
83
- Gemspec/DevelopmentDependencies:
84
- Enabled: false
85
-
86
- Metrics/ParameterLists:
87
- Max: 7
88
- Exclude:
89
- - 'lib/kreuzberg/config.rb'
90
-
91
- Metrics/CyclomaticComplexity:
92
- Max: 7
93
- Exclude:
94
- - 'lib/kreuzberg/config.rb'
95
-
96
- RSpec/RepeatedExampleGroupBody:
97
- Enabled: false
98
-
99
- RSpec/DescribeClass:
100
- Enabled: false
101
-
102
- RSpec/PredicateMatcher:
103
- Enabled: false
104
-
105
- # New cops from RuboCop 1.65+
106
- Gemspec/AddRuntimeDependency:
107
- Enabled: true
108
-
109
- Gemspec/AttributeAssignment:
110
- Enabled: true
111
-
112
- Gemspec/DeprecatedAttributeAssignment:
113
- Enabled: true
114
-
115
- Gemspec/RequireMFA:
116
- Enabled: false
117
-
118
- Layout/EmptyLinesAfterModuleInclusion:
119
- Enabled: true
120
-
121
- Layout/LineContinuationLeadingSpace:
122
- Enabled: true
123
-
124
- Layout/LineContinuationSpacing:
125
- Enabled: true
126
-
127
- Layout/LineEndStringConcatenationIndentation:
128
- Enabled: true
129
-
130
- Layout/SpaceBeforeBrackets:
131
- Enabled: true
132
-
133
- Lint/AmbiguousAssignment:
134
- Enabled: true
135
-
136
- Lint/AmbiguousOperatorPrecedence:
137
- Enabled: true
138
-
139
- Lint/AmbiguousRange:
140
- Enabled: true
141
-
142
- Lint/ArrayLiteralInRegexp:
143
- Enabled: true
144
-
145
- Lint/ConstantOverwrittenInRescue:
146
- Enabled: true
147
-
148
- Lint/ConstantReassignment:
149
- Enabled: true
150
-
151
- Lint/CopDirectiveSyntax:
152
- Enabled: true
153
-
154
- Lint/DeprecatedConstants:
155
- Enabled: true
156
-
157
- Lint/DuplicateBranch:
158
- Enabled: true
159
-
160
- Lint/DuplicateMagicComment:
161
- Enabled: true
162
-
163
- Lint/DuplicateMatchPattern:
164
- Enabled: true
165
-
166
- Lint/DuplicateRegexpCharacterClassElement:
167
- Enabled: true
168
-
169
- Lint/DuplicateSetElement:
170
- Enabled: true
171
-
172
- Lint/EmptyBlock:
173
- Enabled: true
174
-
175
- Lint/EmptyClass:
176
- Enabled: true
177
-
178
- Lint/EmptyInPattern:
179
- Enabled: true
180
-
181
- Lint/HashNewWithKeywordArgumentsAsDefault:
182
- Enabled: true
183
-
184
- Lint/IncompatibleIoSelectWithFiberScheduler:
185
- Enabled: true
186
-
187
- Lint/ItWithoutArgumentsInBlock:
188
- Enabled: true
189
-
190
- Lint/LambdaWithoutLiteralBlock:
191
- Enabled: true
192
-
193
- Lint/LiteralAssignmentInCondition:
194
- Enabled: true
195
-
196
- Lint/MixedCaseRange:
197
- Enabled: true
198
-
199
- Lint/NoReturnInBeginEndBlocks:
200
- Enabled: true
201
-
202
- Lint/NonAtomicFileOperation:
203
- Enabled: true
204
-
205
- Lint/NumberedParameterAssignment:
206
- Enabled: true
207
-
208
- Lint/NumericOperationWithConstantResult:
209
- Enabled: true
210
-
211
- Lint/OrAssignmentToConstant:
212
- Enabled: true
213
-
214
- Lint/RedundantDirGlobSort:
215
- Enabled: true
216
-
217
- Lint/RedundantRegexpQuantifiers:
218
- Enabled: true
219
-
220
- Lint/RedundantTypeConversion:
221
- Enabled: true
222
-
223
- Lint/RefinementImportMethods:
224
- Enabled: true
225
-
226
- Lint/RequireRangeParentheses:
227
- Enabled: true
228
-
229
- Lint/RequireRelativeSelfPath:
230
- Enabled: true
231
-
232
- Lint/SharedMutableDefault:
233
- Enabled: true
234
-
235
- Lint/SuppressedExceptionInNumberConversion:
236
- Enabled: true
237
-
238
- Lint/SymbolConversion:
239
- Enabled: true
240
-
241
- Lint/ToEnumArguments:
242
- Enabled: true
243
-
244
- Lint/TripleQuotes:
245
- Enabled: true
246
-
247
- Lint/UnescapedBracketInRegexp:
248
- Enabled: true
249
-
250
- Lint/UnexpectedBlockArity:
251
- Enabled: true
252
-
253
- Lint/UnmodifiedReduceAccumulator:
254
- Enabled: true
255
-
256
- Lint/UselessConstantScoping:
257
- Enabled: true
258
-
259
- Lint/UselessDefaultValueArgument:
260
- Enabled: true
261
-
262
- Lint/UselessDefined:
263
- Enabled: true
264
-
265
- Lint/UselessNumericOperation:
266
- Enabled: true
267
-
268
- Lint/UselessOr:
269
- Enabled: true
270
-
271
- Lint/UselessRescue:
272
- Enabled: true
273
-
274
- Lint/UselessRuby2Keywords:
275
- Enabled: true
276
-
277
- Metrics/CollectionLiteralLength:
278
- Enabled: true
279
-
280
- Naming/BlockForwarding:
281
- Enabled: true
282
-
283
- Naming/PredicateMethod:
284
- Enabled: true
285
-
286
- Security/CompoundHash:
287
- Enabled: true
288
-
289
- Security/IoMethods:
290
- Enabled: true
291
-
292
- Style/AmbiguousEndlessMethodDefinition:
293
- Enabled: true
294
-
295
- Style/ArgumentsForwarding:
296
- Enabled: true
297
-
298
- Style/ArrayIntersect:
299
- Enabled: true
300
-
301
- Style/ArrayIntersectWithSingleElement:
302
- Enabled: true
303
-
304
- Style/BitwisePredicate:
305
- Enabled: true
306
-
307
- Style/CollectionCompact:
308
- Enabled: true
309
-
310
- Style/CollectionQuerying:
311
- Enabled: true
312
-
313
- Style/CombinableDefined:
314
- Enabled: true
315
-
316
- Style/ComparableBetween:
317
- Enabled: true
318
-
319
- Style/ComparableClamp:
320
- Enabled: true
321
-
322
- Style/ConcatArrayLiterals:
323
- Enabled: true
324
-
325
- Style/DataInheritance:
326
- Enabled: true
327
-
328
- Style/DigChain:
329
- Enabled: true
330
-
331
- Style/DirEmpty:
332
- Enabled: true
333
-
334
- Style/DocumentDynamicEvalDefinition:
335
- Enabled: true
336
-
337
- Style/EmptyHeredoc:
338
- Enabled: true
339
-
340
- Style/EmptyStringInsideInterpolation:
341
- Enabled: true
342
-
343
- Style/EndlessMethod:
344
- Enabled: true
345
-
346
- Style/EnvHome:
347
- Enabled: true
348
-
349
- Style/ExactRegexpMatch:
350
- Enabled: true
351
-
352
- Style/FetchEnvVar:
353
- Enabled: true
354
-
355
- Style/FileEmpty:
356
- Enabled: true
357
-
358
- Style/FileNull:
359
- Enabled: true
360
-
361
- Style/FileRead:
362
- Enabled: true
363
-
364
- Style/FileTouch:
365
- Enabled: true
366
-
367
- Style/FileWrite:
368
- Enabled: true
369
-
370
- Style/HashConversion:
371
- Enabled: true
372
-
373
- Style/HashExcept:
374
- Enabled: true
375
-
376
- Style/HashFetchChain:
377
- Enabled: true
378
-
379
- Style/HashSlice:
380
- Enabled: true
381
-
382
- Style/IfWithBooleanLiteralBranches:
383
- Enabled: true
384
-
385
- Style/InPatternThen:
386
- Enabled: true
387
-
388
- Style/ItAssignment:
389
- Enabled: true
390
-
391
- Style/ItBlockParameter:
392
- Enabled: true
393
-
394
- Style/KeywordArgumentsMerging:
395
- Enabled: true
396
-
397
- Style/MagicCommentFormat:
398
- Enabled: true
399
-
400
- Style/MapCompactWithConditionalBlock:
401
- Enabled: true
402
-
403
- Style/MapIntoArray:
404
- Enabled: true
405
-
406
- Style/MapToHash:
407
- Enabled: true
408
-
409
- Style/MapToSet:
410
- Enabled: true
411
-
412
- Style/MinMaxComparison:
413
- Enabled: true
414
-
415
- Style/MultilineInPatternThen:
416
- Enabled: true
417
-
418
- Style/NegatedIfElseCondition:
419
- Enabled: true
420
-
421
- Style/NestedFileDirname:
422
- Enabled: true
423
-
424
- Style/NilLambda:
425
- Enabled: true
426
-
427
- Style/NumberedParameters:
428
- Enabled: true
429
-
430
- Style/NumberedParametersLimit:
431
- Enabled: true
432
-
433
- Style/ObjectThen:
434
- Enabled: true
435
-
436
- Style/OpenStructUse:
437
- Enabled: true
438
-
439
- Style/OperatorMethodCall:
440
- Enabled: true
441
-
442
- Style/QuotedSymbols:
443
- Enabled: true
444
-
445
- Style/RedundantArgument:
446
- Enabled: true
447
-
448
- Style/RedundantArrayConstructor:
449
- Enabled: true
450
-
451
- Style/RedundantArrayFlatten:
452
- Enabled: true
453
-
454
- Style/RedundantConstantBase:
455
- Enabled: true
456
-
457
- Style/RedundantCurrentDirectoryInPath:
458
- Enabled: true
459
-
460
- Style/RedundantDoubleSplatHashBraces:
461
- Enabled: true
462
-
463
- Style/RedundantEach:
464
- Enabled: true
465
-
466
- Style/RedundantFilterChain:
467
- Enabled: true
468
-
469
- Style/RedundantFormat:
470
- Enabled: true
471
-
472
- Style/RedundantHeredocDelimiterQuotes:
473
- Enabled: true
474
-
475
- Style/RedundantInitialize:
476
- Enabled: true
477
-
478
- Style/RedundantInterpolationUnfreeze:
479
- Enabled: true
480
-
481
- Style/RedundantLineContinuation:
482
- Enabled: true
483
-
484
- Style/RedundantRegexpArgument:
485
- Enabled: true
486
-
487
- Style/RedundantRegexpConstructor:
488
- Enabled: true
489
-
490
- Style/RedundantSelfAssignmentBranch:
491
- Enabled: true
492
-
493
- Style/RedundantStringEscape:
494
- Enabled: true
495
-
496
- Style/ReturnNilInPredicateMethodDefinition:
497
- Enabled: true
498
-
499
- Style/SafeNavigationChainLength:
500
- Enabled: true
501
-
502
- Style/SelectByRegexp:
503
- Enabled: true
504
-
505
- Style/SendWithLiteralMethodName:
506
- Enabled: true
507
-
508
- Style/SingleLineDoEndBlock:
509
- Enabled: true
510
-
511
- Style/StringChars:
512
- Enabled: true
513
-
514
- Style/SuperArguments:
515
- Enabled: true
516
-
517
- Style/SuperWithArgsParentheses:
518
- Enabled: true
519
-
520
- Style/SwapValues:
521
- Enabled: true
522
-
523
- Style/YAMLFileRead:
524
- Enabled: true
525
-
526
- # Adjust RSpec metrics for test blocks
527
- RSpec/ExampleLength:
528
- Max: 35
529
-
530
- RSpec/MultipleExpectations:
531
- Max: 15
532
-
533
- RSpec/NestedGroups:
534
- Max: 6
1
+ inherit_from: .rubocop.yml