kreuzberg 4.0.0.pre.rc.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +538 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +157 -0
  8. data/README.md +426 -0
  9. data/Rakefile +25 -0
  10. data/Steepfile +47 -0
  11. data/examples/async_patterns.rb +341 -0
  12. data/ext/kreuzberg_rb/extconf.rb +45 -0
  13. data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
  14. data/ext/kreuzberg_rb/native/Cargo.toml +44 -0
  15. data/ext/kreuzberg_rb/native/README.md +425 -0
  16. data/ext/kreuzberg_rb/native/build.rs +15 -0
  17. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  18. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  19. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  20. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  21. data/ext/kreuzberg_rb/native/src/lib.rs +2998 -0
  22. data/extconf.rb +28 -0
  23. data/kreuzberg.gemspec +148 -0
  24. data/lib/kreuzberg/api_proxy.rb +142 -0
  25. data/lib/kreuzberg/cache_api.rb +46 -0
  26. data/lib/kreuzberg/cli.rb +55 -0
  27. data/lib/kreuzberg/cli_proxy.rb +127 -0
  28. data/lib/kreuzberg/config.rb +691 -0
  29. data/lib/kreuzberg/error_context.rb +32 -0
  30. data/lib/kreuzberg/errors.rb +118 -0
  31. data/lib/kreuzberg/extraction_api.rb +85 -0
  32. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  33. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  34. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  35. data/lib/kreuzberg/result.rb +216 -0
  36. data/lib/kreuzberg/setup_lib_path.rb +80 -0
  37. data/lib/kreuzberg/validator_protocol.rb +89 -0
  38. data/lib/kreuzberg/version.rb +5 -0
  39. data/lib/kreuzberg.rb +103 -0
  40. data/sig/kreuzberg/internal.rbs +184 -0
  41. data/sig/kreuzberg.rbs +520 -0
  42. data/spec/binding/cache_spec.rb +227 -0
  43. data/spec/binding/cli_proxy_spec.rb +85 -0
  44. data/spec/binding/cli_spec.rb +55 -0
  45. data/spec/binding/config_spec.rb +345 -0
  46. data/spec/binding/config_validation_spec.rb +283 -0
  47. data/spec/binding/error_handling_spec.rb +213 -0
  48. data/spec/binding/errors_spec.rb +66 -0
  49. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  50. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  51. data/spec/binding/plugins/validator_spec.rb +274 -0
  52. data/spec/fixtures/config.toml +39 -0
  53. data/spec/fixtures/config.yaml +41 -0
  54. data/spec/fixtures/invalid_config.toml +4 -0
  55. data/spec/smoke/package_spec.rb +178 -0
  56. data/spec/spec_helper.rb +42 -0
  57. data/vendor/kreuzberg/Cargo.toml +204 -0
  58. data/vendor/kreuzberg/README.md +175 -0
  59. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
  60. data/vendor/kreuzberg/build.rs +474 -0
  61. data/vendor/kreuzberg/src/api/error.rs +81 -0
  62. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  63. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  64. data/vendor/kreuzberg/src/api/server.rs +353 -0
  65. data/vendor/kreuzberg/src/api/types.rs +170 -0
  66. data/vendor/kreuzberg/src/cache/mod.rs +1167 -0
  67. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  68. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -0
  69. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  70. data/vendor/kreuzberg/src/core/extractor.rs +1024 -0
  71. data/vendor/kreuzberg/src/core/io.rs +329 -0
  72. data/vendor/kreuzberg/src/core/mime.rs +605 -0
  73. data/vendor/kreuzberg/src/core/mod.rs +45 -0
  74. data/vendor/kreuzberg/src/core/pipeline.rs +984 -0
  75. data/vendor/kreuzberg/src/embeddings.rs +432 -0
  76. data/vendor/kreuzberg/src/error.rs +431 -0
  77. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  78. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  79. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  80. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  81. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  82. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  83. data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -0
  84. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
  85. data/vendor/kreuzberg/src/extraction/mod.rs +81 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  88. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  89. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -0
  90. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
  91. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  92. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  93. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  94. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  95. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  96. data/vendor/kreuzberg/src/extractors/archive.rs +446 -0
  97. data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
  98. data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
  99. data/vendor/kreuzberg/src/extractors/docx.rs +367 -0
  100. data/vendor/kreuzberg/src/extractors/email.rs +143 -0
  101. data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +343 -0
  103. data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
  104. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
  105. data/vendor/kreuzberg/src/extractors/html.rs +393 -0
  106. data/vendor/kreuzberg/src/extractors/image.rs +198 -0
  107. data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
  108. data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
  109. data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
  110. data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
  111. data/vendor/kreuzberg/src/extractors/mod.rs +365 -0
  112. data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
  113. data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
  114. data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
  115. data/vendor/kreuzberg/src/extractors/pdf.rs +493 -0
  116. data/vendor/kreuzberg/src/extractors/pptx.rs +248 -0
  117. data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
  119. data/vendor/kreuzberg/src/extractors/security.rs +484 -0
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
  121. data/vendor/kreuzberg/src/extractors/structured.rs +140 -0
  122. data/vendor/kreuzberg/src/extractors/text.rs +260 -0
  123. data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
  124. data/vendor/kreuzberg/src/extractors/xml.rs +135 -0
  125. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  126. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  127. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  128. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  129. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  130. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  131. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  132. data/vendor/kreuzberg/src/keywords/rake.rs +293 -0
  133. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  134. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  135. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  136. data/vendor/kreuzberg/src/lib.rs +105 -0
  137. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  138. data/vendor/kreuzberg/src/mcp/server.rs +1968 -0
  139. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  140. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  141. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  142. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  143. data/vendor/kreuzberg/src/ocr/processor.rs +863 -0
  144. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  145. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  146. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  147. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  148. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  149. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  150. data/vendor/kreuzberg/src/panic_context.rs +154 -0
  151. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  152. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  153. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  154. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  155. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  156. data/vendor/kreuzberg/src/pdf/table.rs +393 -0
  157. data/vendor/kreuzberg/src/pdf/text.rs +158 -0
  158. data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -0
  159. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  160. data/vendor/kreuzberg/src/plugins/ocr.rs +620 -0
  161. data/vendor/kreuzberg/src/plugins/processor.rs +642 -0
  162. data/vendor/kreuzberg/src/plugins/registry.rs +1337 -0
  163. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  164. data/vendor/kreuzberg/src/plugins/validator.rs +956 -0
  165. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  166. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  167. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  168. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  169. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  170. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  171. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  172. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  173. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  174. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  175. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  176. data/vendor/kreuzberg/src/types.rs +903 -0
  177. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  178. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  179. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  180. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  181. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  182. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  183. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  184. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  185. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  186. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  187. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  188. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  189. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  190. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  191. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  192. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  193. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  194. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  195. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  196. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  197. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  198. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  199. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  200. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  201. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  202. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  203. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  204. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  205. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  206. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  207. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  208. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  209. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  210. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  211. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  212. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  213. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  214. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  215. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  216. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  217. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  218. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  219. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  220. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  221. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  222. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  223. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  224. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  225. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  226. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  227. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  228. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  229. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  230. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  231. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  232. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  233. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  234. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  235. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  236. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  237. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  238. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  239. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  240. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  241. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  242. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  243. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  244. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
  245. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  246. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  247. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -0
  248. data/vendor/kreuzberg/tests/batch_processing.rs +316 -0
  249. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
  250. data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -0
  251. data/vendor/kreuzberg/tests/config_features.rs +598 -0
  252. data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -0
  253. data/vendor/kreuzberg/tests/core_integration.rs +510 -0
  254. data/vendor/kreuzberg/tests/csv_integration.rs +414 -0
  255. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
  256. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -0
  257. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
  258. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
  260. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  261. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
  262. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  263. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  264. data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
  265. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  266. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
  267. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
  268. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
  269. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  270. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  271. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
  272. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
  273. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  274. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  275. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  276. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  277. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  278. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
  279. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
  280. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
  281. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  282. data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -0
  283. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  284. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -0
  285. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  286. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  287. data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -0
  288. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
  289. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
  290. data/vendor/kreuzberg/tests/security_validation.rs +415 -0
  291. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  292. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  293. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
  294. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
  295. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  296. data/vendor/rb-sys/.cargo-ok +1 -0
  297. data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
  298. data/vendor/rb-sys/Cargo.lock +393 -0
  299. data/vendor/rb-sys/Cargo.toml +70 -0
  300. data/vendor/rb-sys/Cargo.toml.orig +57 -0
  301. data/vendor/rb-sys/LICENSE-APACHE +190 -0
  302. data/vendor/rb-sys/LICENSE-MIT +21 -0
  303. data/vendor/rb-sys/bin/release.sh +21 -0
  304. data/vendor/rb-sys/build/features.rs +108 -0
  305. data/vendor/rb-sys/build/main.rs +246 -0
  306. data/vendor/rb-sys/build/stable_api_config.rs +153 -0
  307. data/vendor/rb-sys/build/version.rs +48 -0
  308. data/vendor/rb-sys/readme.md +36 -0
  309. data/vendor/rb-sys/src/bindings.rs +21 -0
  310. data/vendor/rb-sys/src/hidden.rs +11 -0
  311. data/vendor/rb-sys/src/lib.rs +34 -0
  312. data/vendor/rb-sys/src/macros.rs +371 -0
  313. data/vendor/rb-sys/src/memory.rs +53 -0
  314. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
  315. data/vendor/rb-sys/src/special_consts.rs +31 -0
  316. data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
  317. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
  318. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
  319. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
  320. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
  321. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
  322. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
  323. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
  324. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
  325. data/vendor/rb-sys/src/stable_api.rs +261 -0
  326. data/vendor/rb-sys/src/symbol.rs +31 -0
  327. data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
  328. data/vendor/rb-sys/src/utils.rs +89 -0
  329. data/vendor/rb-sys/src/value_type.rs +7 -0
  330. metadata +536 -0
@@ -0,0 +1,650 @@
1
+ //! Native Rust Typst document extractor.
2
+ //!
3
+ //! This extractor provides Typst document parsing and text extraction.
4
+ //! It uses a hybrid approach combining regex patterns and character-level parsing
5
+ //! to extract text while preserving document structure.
6
+ //!
7
+ //! Features:
8
+ //! - Metadata extraction: title, author, date, subject, keywords from `#set document()`
9
+ //! - Section hierarchy: `=`, `==`, `===`, etc. heading levels
10
+ //! - Inline formatting: `*bold*`, `_italic_`, `` `code` ``
11
+ //! - Lists: extraction of list content (both `+` and `-` markers)
12
+ //! - Links: extraction of URLs and link text from `#link("url")[text]` syntax
13
+ //! - Math: inline (`$...$`) and display math preservation
14
+ //! - Code blocks: triple-backtick code blocks with language specifiers
15
+ //! - Tables: extraction of `#table()` function content
16
+ //! - Complex formatting: handling of nested and combined formatting
17
+ //!
18
+ //! Requires the `office` feature.
19
+
20
+ #[cfg(feature = "office")]
21
+ use crate::Result;
22
+ #[cfg(feature = "office")]
23
+ use crate::core::config::ExtractionConfig;
24
+ #[cfg(feature = "office")]
25
+ use crate::plugins::{DocumentExtractor, Plugin};
26
+ #[cfg(feature = "office")]
27
+ use crate::types::{ExtractionResult, Metadata};
28
+ #[cfg(feature = "office")]
29
+ use async_trait::async_trait;
30
+ #[cfg(feature = "office")]
31
+ use regex::Regex;
32
+
33
/// Typst document extractor.
///
/// A stateless unit type: all parsing state lives in the internal parser that
/// is created per extraction call. Deriving `Debug`, `Clone` and `Copy` is free
/// for a zero-sized type and lets the extractor be logged or embedded in types
/// that derive those traits themselves.
#[cfg(feature = "office")]
#[derive(Debug, Clone, Copy)]
pub struct TypstExtractor;
36
+
37
#[cfg(feature = "office")]
impl TypstExtractor {
    /// Build the (stateless) Typst extractor.
    pub fn new() -> Self {
        Self
    }

    /// Run the internal parser over `content`.
    ///
    /// Returns the extracted text together with the metadata the parser
    /// collected while producing it.
    fn extract_from_typst(content: &str) -> (String, Metadata) {
        let mut parser = TypstParser::new(content);
        let extracted = parser.parse();

        // The parser is done; move its metadata out instead of cloning it.
        (extracted, parser.metadata)
    }
}
53
+
54
#[cfg(feature = "office")]
impl Default for TypstExtractor {
    /// Same value as [`TypstExtractor::new`].
    fn default() -> Self {
        TypstExtractor::new()
    }
}
60
+
61
#[cfg(feature = "office")]
impl Plugin for TypstExtractor {
    /// Identifier reported for this plugin.
    fn name(&self) -> &str {
        "typst-extractor"
    }

    /// Version of the crate this extractor was compiled from
    /// (`CARGO_PKG_VERSION`, resolved at compile time).
    fn version(&self) -> String {
        String::from(env!("CARGO_PKG_VERSION"))
    }

    /// Nothing to set up; always succeeds.
    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    /// Nothing to tear down; always succeeds.
    fn shutdown(&self) -> Result<()> {
        Ok(())
    }

    /// One-line human-readable summary of the plugin.
    fn description(&self) -> &str {
        "Native Rust Typst document extractor with metadata support"
    }

    /// Attribution string for the plugin.
    fn author(&self) -> &str {
        "Kreuzberg Team"
    }
}
87
+
88
#[cfg(feature = "office")]
#[async_trait]
impl DocumentExtractor for TypstExtractor {
    /// Extract text and metadata from raw Typst source bytes.
    ///
    /// The input is decoded as UTF-8 lossily, so invalid byte sequences become
    /// U+FFFD instead of failing the extraction. `mime_type` is echoed back in
    /// the result unchanged and `_config` is not consulted. The returned
    /// result carries only `content` and `metadata`; `tables` is empty and the
    /// optional fields are `None`.
    #[cfg_attr(feature = "otel", tracing::instrument(
        skip(self, content, _config),
        fields(
            extractor.name = self.name(),
            content.size_bytes = content.len(),
        )
    ))]
    async fn extract_bytes(
        &self,
        content: &[u8],
        mime_type: &str,
        _config: &ExtractionConfig,
    ) -> Result<ExtractionResult> {
        // Keep the `Cow`: valid UTF-8 input is borrowed as-is, so the source is
        // not copied here just to be handed to the parser as a `&str`.
        let typst_str = String::from_utf8_lossy(content);
        let (text, metadata) = Self::extract_from_typst(&typst_str);

        Ok(ExtractionResult {
            content: text,
            mime_type: mime_type.to_string(),
            metadata,
            tables: Vec::new(),
            detected_languages: None,
            chunks: None,
            images: None,
        })
    }

    /// MIME types this extractor declares support for.
    fn supported_mime_types(&self) -> &[&str] {
        &["application/x-typst", "text/x-typst"]
    }

    /// Fixed priority value of this extractor.
    // NOTE(review): how the registry orders priorities is not visible in this file.
    fn priority(&self) -> i32 {
        50
    }
}
126
+
127
/// Internal Typst parser.
///
/// Holds its own copy of the source text and accumulates metadata while
/// parsing; the extractor reads `metadata` back once parsing is finished.
#[cfg(feature = "office")]
struct TypstParser {
    /// Full Typst source being parsed.
    content: String,
    /// Metadata collected by `extract_metadata`.
    metadata: Metadata,
}
133
+
134
+ #[cfg(feature = "office")]
135
+ impl TypstParser {
136
+ fn new(content: &str) -> Self {
137
+ Self {
138
+ content: content.to_string(),
139
+ metadata: Metadata::default(),
140
+ }
141
+ }
142
+
143
+ fn parse(&mut self) -> String {
144
+ self.extract_metadata();
145
+
146
+ self.extract_content()
147
+ }
148
+
149
+ fn extract_metadata(&mut self) {
150
+ if let Some(title) = self.extract_quoted_value("title") {
151
+ self.metadata.additional.insert("title".to_string(), title.into());
152
+ }
153
+
154
+ if let Some(author) = self.extract_quoted_value("author") {
155
+ self.metadata.additional.insert("author".to_string(), author.into());
156
+ }
157
+
158
+ if let Some(date) = self.extract_quoted_value("date") {
159
+ self.metadata.date = Some(date);
160
+ }
161
+
162
+ if let Some(subject) = self.extract_quoted_value("subject") {
163
+ self.metadata.additional.insert("subject".to_string(), subject.into());
164
+ }
165
+
166
+ if let Some(keywords) = self.extract_keywords() {
167
+ self.metadata.additional.insert("keywords".to_string(), keywords.into());
168
+ }
169
+ }
170
+
171
+ fn extract_quoted_value(&self, field: &str) -> Option<String> {
172
+ let pattern = format!(r#"{}:\s*"([^"]*)""#, regex::escape(field));
173
+ if let Ok(re) = Regex::new(&pattern)
174
+ && let Some(caps) = re.captures(&self.content)
175
+ {
176
+ return caps.get(1).map(|m| m.as_str().to_string());
177
+ }
178
+ None
179
+ }
180
+
181
+ fn extract_keywords(&self) -> Option<String> {
182
+ let pattern = r#"keywords:\s*(?:"([^"]*)"|(\([^)]*\)))"#;
183
+ if let Ok(re) = Regex::new(pattern)
184
+ && let Some(caps) = re.captures(&self.content)
185
+ {
186
+ if let Some(m) = caps.get(1) {
187
+ return Some(m.as_str().to_string());
188
+ }
189
+ if let Some(m) = caps.get(2) {
190
+ let array_str = m.as_str();
191
+ let mut keywords = Vec::new();
192
+ let item_pattern = r#""([^"]*)""#;
193
+ if let Ok(item_re) = Regex::new(item_pattern) {
194
+ for item_caps in item_re.captures_iter(array_str) {
195
+ if let Some(keyword) = item_caps.get(1) {
196
+ keywords.push(keyword.as_str().to_string());
197
+ }
198
+ }
199
+ }
200
+ if !keywords.is_empty() {
201
+ return Some(keywords.join(", "));
202
+ }
203
+ }
204
+ }
205
+ None
206
+ }
207
+
208
+ fn extract_content(&self) -> String {
209
+ let mut output = String::new();
210
+ let mut lines = self.content.lines().peekable();
211
+ let mut in_code_block = false;
212
+ let mut code_block_fence = String::new();
213
+
214
+ while let Some(line) = lines.next() {
215
+ let trimmed = line.trim();
216
+
217
+ if trimmed.starts_with("```") {
218
+ if in_code_block {
219
+ if trimmed == "```" {
220
+ in_code_block = false;
221
+ code_block_fence.clear();
222
+ output.push_str("```\n");
223
+ continue;
224
+ }
225
+ } else {
226
+ in_code_block = true;
227
+ code_block_fence = "```".to_string();
228
+ output.push_str("```");
229
+ if let Some(lang) = trimmed.strip_prefix("```") {
230
+ let lang = lang.trim();
231
+ if !lang.is_empty() {
232
+ output.push_str(lang);
233
+ }
234
+ }
235
+ output.push('\n');
236
+ continue;
237
+ }
238
+ }
239
+
240
+ if in_code_block {
241
+ output.push_str(line);
242
+ output.push('\n');
243
+ continue;
244
+ }
245
+
246
+ if trimmed.starts_with("#set ") || trimmed.starts_with("#let ") {
247
+ continue;
248
+ }
249
+
250
+ if trimmed.starts_with("#import ") || trimmed.starts_with("#include ") {
251
+ continue;
252
+ }
253
+
254
+ if trimmed.starts_with("#table(") {
255
+ output.push_str("TABLE:\n");
256
+ let table_content = self.extract_table_content(trimmed, &mut lines);
257
+ output.push_str(&table_content);
258
+ output.push('\n');
259
+ continue;
260
+ }
261
+
262
+ if trimmed.starts_with('=') {
263
+ let next_char_pos = trimmed.find(|c: char| c != '=');
264
+ if next_char_pos.is_some() {
265
+ let heading_level = trimmed.chars().take_while(|&c| c == '=').count();
266
+ let heading_text = trimmed[heading_level..].trim();
267
+
268
+ for _ in 0..heading_level {
269
+ output.push('=');
270
+ }
271
+ output.push(' ');
272
+ output.push_str(heading_text);
273
+ output.push('\n');
274
+ continue;
275
+ }
276
+ }
277
+
278
+ if (trimmed.starts_with('+') || trimmed.starts_with('-'))
279
+ && trimmed.len() > 1
280
+ && trimmed.chars().nth(1).is_some_and(|c| !c.is_alphanumeric())
281
+ {
282
+ output.push_str("- ");
283
+ output.push_str(trimmed[1..].trim());
284
+ output.push('\n');
285
+ continue;
286
+ }
287
+
288
+ if trimmed.starts_with('#')
289
+ && !trimmed.starts_with("#set")
290
+ && !trimmed.starts_with("#let")
291
+ && !trimmed.starts_with("#import")
292
+ && !trimmed.starts_with("#include")
293
+ {
294
+ if trimmed.contains('[')
295
+ && trimmed.contains(']')
296
+ && let Some(content) = self.extract_text_from_brackets(trimmed)
297
+ {
298
+ let processed = self.process_line(&content);
299
+ if !processed.is_empty() {
300
+ output.push_str(&processed);
301
+ output.push('\n');
302
+ }
303
+ }
304
+ continue;
305
+ }
306
+
307
+ if !trimmed.is_empty() {
308
+ let processed = self.process_line(trimmed);
309
+ if !processed.is_empty() {
310
+ output.push_str(&processed);
311
+ output.push('\n');
312
+ }
313
+ } else {
314
+ output.push('\n');
315
+ }
316
+ }
317
+
318
+ output
319
+ }
320
+
321
+ /// Extract content from #table() function calls
322
+ fn extract_table_content<'a, I>(&self, first_line: &str, lines: &mut std::iter::Peekable<I>) -> String
323
+ where
324
+ I: Iterator<Item = &'a str>,
325
+ {
326
+ let mut table_content = String::new();
327
+ let mut content = first_line.to_string();
328
+ let mut bracket_depth = 0;
329
+ let mut paren_depth = if first_line.contains('(') { 1 } else { 0 };
330
+
331
+ for ch in first_line.chars() {
332
+ match ch {
333
+ '(' => paren_depth += 1,
334
+ ')' => paren_depth -= 1,
335
+ '[' => bracket_depth += 1,
336
+ ']' => bracket_depth -= 1,
337
+ _ => {}
338
+ }
339
+ }
340
+
341
+ while paren_depth > 0 || bracket_depth > 0 {
342
+ if let Some(next_line) = lines.next() {
343
+ content.push('\n');
344
+ content.push_str(next_line);
345
+ for ch in next_line.chars() {
346
+ match ch {
347
+ '(' => paren_depth += 1,
348
+ ')' => paren_depth -= 1,
349
+ '[' => bracket_depth += 1,
350
+ ']' => bracket_depth -= 1,
351
+ _ => {}
352
+ }
353
+ }
354
+ } else {
355
+ break;
356
+ }
357
+ }
358
+
359
+ let mut in_bracket = false;
360
+ let mut cell = String::new();
361
+ for ch in content.chars() {
362
+ match ch {
363
+ '[' => {
364
+ in_bracket = true;
365
+ cell.clear();
366
+ }
367
+ ']' => {
368
+ if in_bracket {
369
+ let trimmed = cell.trim();
370
+ if !trimmed.is_empty() {
371
+ table_content.push_str(trimmed);
372
+ table_content.push_str(" | ");
373
+ }
374
+ in_bracket = false;
375
+ cell.clear();
376
+ }
377
+ }
378
+ _ if in_bracket => {
379
+ cell.push(ch);
380
+ }
381
+ _ => {}
382
+ }
383
+ }
384
+
385
+ if table_content.ends_with(" | ") {
386
+ table_content.truncate(table_content.len() - 3);
387
+ }
388
+
389
+ table_content
390
+ }
391
+
392
+ fn process_line(&self, line: &str) -> String {
393
+ let mut result = String::new();
394
+ let mut chars = line.chars().peekable();
395
+
396
+ while let Some(ch) = chars.next() {
397
+ match ch {
398
+ '`' => {
399
+ result.push('`');
400
+ for c in chars.by_ref() {
401
+ result.push(c);
402
+ if c == '`' {
403
+ break;
404
+ }
405
+ }
406
+ }
407
+ '$' => {
408
+ result.push('$');
409
+ for c in chars.by_ref() {
410
+ result.push(c);
411
+ if c == '$' {
412
+ break;
413
+ }
414
+ }
415
+ }
416
+ '*' => {
417
+ result.push('*');
418
+ for c in chars.by_ref() {
419
+ result.push(c);
420
+ if c == '*' {
421
+ break;
422
+ }
423
+ }
424
+ }
425
+ '_' => {
426
+ result.push('_');
427
+ for c in chars.by_ref() {
428
+ result.push(c);
429
+ if c == '_' {
430
+ break;
431
+ }
432
+ }
433
+ }
434
+ '#' if chars.peek() == Some(&'l') => {
435
+ result.push(ch);
436
+ }
437
+ _ => {
438
+ result.push(ch);
439
+ }
440
+ }
441
+ }
442
+
443
+ self.extract_link_text(&result)
444
+ }
445
+
446
+ fn extract_link_text(&self, line: &str) -> String {
447
+ let pattern = r#"link\("([^"]*)"\)\[([^\]]*)\]"#;
448
+ if let Ok(re) = Regex::new(pattern) {
449
+ return re
450
+ .replace_all(line, |caps: &regex::Captures| {
451
+ let url = caps.get(1).map(|m| m.as_str()).unwrap_or("");
452
+ let text = caps.get(2).map(|m| m.as_str()).unwrap_or("");
453
+ format!("[{}]({})", text, url)
454
+ })
455
+ .to_string();
456
+ }
457
+ line.to_string()
458
+ }
459
+
460
+ fn extract_text_from_brackets(&self, line: &str) -> Option<String> {
461
+ if let Some(start) = line.find('[')
462
+ && let Some(end) = line.rfind(']')
463
+ && end > start
464
+ {
465
+ let text = &line[start + 1..end];
466
+ return Some(text.to_string());
467
+ }
468
+ None
469
+ }
470
+ }
471
+
472
#[cfg(test)]
mod tests {
    //! Unit tests for Typst text and metadata extraction.
    //!
    //! Fixture lines inside raw strings are kept flush-left: leading
    //! whitespace in a raw string would become part of the Typst input.

    use super::*;

    /// Returns `true` when `haystack` contains at least one of `needles`.
    fn contains_any(haystack: &str, needles: &[&str]) -> bool {
        needles.iter().any(|needle| haystack.contains(needle))
    }

    /// Asserts that every needle occurs in `haystack`, reporting the missing
    /// needle together with the full output on failure.
    #[track_caller]
    fn assert_contains_all(haystack: &str, needles: &[&str]) {
        for needle in needles {
            assert!(
                haystack.contains(needle),
                "expected output to contain {needle:?}; output was:\n{haystack}"
            );
        }
    }

    /// Asserts that at least one needle occurs in `haystack`.
    #[track_caller]
    fn assert_contains_any(haystack: &str, needles: &[&str]) {
        assert!(
            contains_any(haystack, needles),
            "expected output to contain one of {needles:?}; output was:\n{haystack}"
        );
    }

    /// Asserts that none of the needles occur in `haystack`.
    #[track_caller]
    fn assert_contains_none(haystack: &str, needles: &[&str]) {
        for needle in needles {
            assert!(
                !haystack.contains(needle),
                "expected output not to contain {needle:?}; output was:\n{haystack}"
            );
        }
    }

    #[test]
    fn test_extract_metadata() {
        let content = r#"#set document(
title: "Test Document",
author: "Test Author"
)

= Heading
Some text
"#;

        let (_, metadata) = TypstExtractor::extract_from_typst(content);

        for key in ["title", "author"] {
            assert!(
                metadata.additional.contains_key(key),
                "metadata should contain key {key:?}"
            );
        }
    }

    #[test]
    fn test_extract_headings() {
        let content = r#"= Level 1
Content

== Level 2
More content
"#;

        let (output, _) = TypstExtractor::extract_from_typst(content);

        assert_contains_all(&output, &["= Level 1", "== Level 2"]);
    }

    #[test]
    fn test_extract_formatting() {
        let content = r#"Some *bold* and _italic_ text with `code`."#;

        let (output, _) = TypstExtractor::extract_from_typst(content);

        // Each check accepts the span with or without its markup delimiters.
        assert_contains_any(&output, &["*bold*", "bold"]);
        assert_contains_any(&output, &["_italic_", "italic"]);
        assert_contains_any(&output, &["`code`", "code"]);
    }

    #[test]
    fn test_extract_code_blocks() {
        let content = r#"Here is code:

```python
def hello():
print("world")
```

Done."#;

        let (output, _) = TypstExtractor::extract_from_typst(content);

        assert_contains_all(&output, &["```python", "def hello", "print"]);
    }

    #[test]
    fn test_extract_links() {
        let content = r#"Visit #link("https://example.com")[example site] for info."#;

        let (output, _) = TypstExtractor::extract_from_typst(content);

        // NOTE(review): permissive on purpose — passes if either the URL or
        // the link text survives, not only for the Markdown form.
        assert_contains_any(
            &output,
            &[
                "example.com",
                "example site",
                "[example site](https://example.com)",
            ],
        );
    }

    #[test]
    fn test_extract_list_items() {
        let content = r#"= Lists

+ First item
+ Second item
+ Third item"#;

        let (output, _) = TypstExtractor::extract_from_typst(content);

        assert_contains_all(&output, &["First item", "Second item", "Third item"]);
    }

    #[test]
    fn test_extract_tables() {
        let content = r#"== Tables

#table(
columns: 2,
[Name], [Age],
[Alice], [30],
)"#;

        let (output, _) = TypstExtractor::extract_from_typst(content);

        // NOTE(review): permissive — any one of the marker or cell texts suffices.
        assert_contains_any(&output, &["TABLE:", "Name", "Alice"]);
    }

    #[test]
    fn test_extract_math() {
        let content = r#"The formula $E = mc^2$ is important.

Display:
$ a^2 + b^2 = c^2 $"#;

        let (output, _) = TypstExtractor::extract_from_typst(content);

        assert_contains_all(&output, &["$", "mc"]);
    }

    #[test]
    fn test_metadata_extraction_comprehensive() {
        let content = r#"#set document(
title: "Advanced Document",
author: "John Doe",
date: "2024-12-06",
subject: "Test Subject",
keywords: ("test", "example", "rust")
)

Content here."#;

        let (_, metadata) = TypstExtractor::extract_from_typst(content);

        assert!(metadata.additional.contains_key("title"), "Title should be extracted");
        assert!(metadata.additional.contains_key("author"), "Author should be extracted");
        assert!(metadata.date.is_some(), "Date should be extracted");
        assert!(
            metadata.additional.contains_key("subject"),
            "Subject should be extracted"
        );

        let has_keywords = metadata
            .additional
            .get("keywords")
            .is_some_and(|value| !value.to_string().is_empty());
        assert!(has_keywords, "Keywords should be extracted");
    }

    #[test]
    fn test_skip_directives() {
        let content = r#"#set heading(numbering: "1.")
#let x = 5
#import "@preview/foo:1.0"
#include "other.typ"

= Heading
Actual content"#;

        let (output, _) = TypstExtractor::extract_from_typst(content);

        assert_contains_none(&output, &["#set", "#let", "#import", "#include"]);
        assert_contains_all(&output, &["Heading", "content"]);
    }

    #[test]
    fn test_combined_formatting() {
        let content = r#"This is *bold with _nested italic_* and more."#;

        let (output, _) = TypstExtractor::extract_from_typst(content);

        // NOTE(review): permissive — satisfied by any surviving delimiter or
        // by both words being present.
        let keeps_markup = contains_any(&output, &["*", "_"]);
        let keeps_words = output.contains("bold") && output.contains("italic");
        assert!(
            keeps_markup || keeps_words,
            "nested formatting should keep its delimiters or at least both words"
        );
    }
}