kreuzberg 4.6.3 → 4.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (336)
  1. checksums.yaml +4 -4
  2. data/README.md +26 -5
  3. data/ext/kreuzberg_rb/native/Cargo.lock +307 -154
  4. data/ext/kreuzberg_rb/native/Cargo.toml +4 -4
  5. data/ext/kreuzberg_rb/native/README.md +28 -5
  6. data/ext/kreuzberg_rb/native/src/config/types.rs +21 -15
  7. data/ext/kreuzberg_rb/native/src/lib.rs +20 -5
  8. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +3 -0
  9. data/ext/kreuzberg_rb/native/src/result.rs +34 -1
  10. data/lib/kreuzberg/config.rb +17 -13
  11. data/lib/kreuzberg/result.rb +43 -6
  12. data/lib/kreuzberg/types.rb +205 -15
  13. data/lib/kreuzberg/version.rb +1 -1
  14. data/sig/kreuzberg.rbs +303 -0
  15. data/vendor/Cargo.toml +12 -8
  16. data/vendor/kreuzberg/Cargo.toml +32 -10
  17. data/vendor/kreuzberg/README.md +50 -2
  18. data/vendor/kreuzberg/build.rs +12 -16
  19. data/vendor/kreuzberg/src/api/error.rs +7 -0
  20. data/vendor/kreuzberg/src/api/handlers.rs +127 -31
  21. data/vendor/kreuzberg/src/api/mod.rs +6 -2
  22. data/vendor/kreuzberg/src/api/openapi.rs +8 -1
  23. data/vendor/kreuzberg/src/api/openweb.rs +175 -0
  24. data/vendor/kreuzberg/src/api/router.rs +6 -2
  25. data/vendor/kreuzberg/src/api/startup.rs +0 -3
  26. data/vendor/kreuzberg/src/api/types.rs +46 -3
  27. data/vendor/kreuzberg/src/chunking/builder.rs +2 -0
  28. data/vendor/kreuzberg/src/chunking/classifier.rs +491 -0
  29. data/vendor/kreuzberg/src/chunking/core.rs +16 -1
  30. data/vendor/kreuzberg/src/chunking/mod.rs +3 -1
  31. data/vendor/kreuzberg/src/chunking/processor.rs +36 -0
  32. data/vendor/kreuzberg/src/chunking/tokenizer_cache.rs +3 -3
  33. data/vendor/kreuzberg/src/chunking/yaml_section.rs +13 -5
  34. data/vendor/kreuzberg/src/core/config/extraction/core.rs +36 -2
  35. data/vendor/kreuzberg/src/core/config/extraction/env.rs +26 -8
  36. data/vendor/kreuzberg/src/core/config/extraction/file_config.rs +9 -0
  37. data/vendor/kreuzberg/src/core/config/formats.rs +60 -26
  38. data/vendor/kreuzberg/src/core/config/layout.rs +138 -17
  39. data/vendor/kreuzberg/src/core/config/mod.rs +5 -1
  40. data/vendor/kreuzberg/src/core/config/tree_sitter.rs +161 -0
  41. data/vendor/kreuzberg/src/core/extractor/bytes.rs +7 -0
  42. data/vendor/kreuzberg/src/core/extractor/file.rs +11 -4
  43. data/vendor/kreuzberg/src/core/extractor/helpers.rs +4 -0
  44. data/vendor/kreuzberg/src/core/extractor/legacy.rs +2 -2
  45. data/vendor/kreuzberg/src/core/mime.rs +61 -2
  46. data/vendor/kreuzberg/src/core/mod.rs +1 -0
  47. data/vendor/kreuzberg/src/core/path_resolver.rs +279 -0
  48. data/vendor/kreuzberg/src/core/pipeline/cache.rs +0 -10
  49. data/vendor/kreuzberg/src/core/pipeline/execution.rs +1 -6
  50. data/vendor/kreuzberg/src/core/pipeline/features.rs +134 -66
  51. data/vendor/kreuzberg/src/core/pipeline/format.rs +55 -280
  52. data/vendor/kreuzberg/src/core/pipeline/mod.rs +127 -47
  53. data/vendor/kreuzberg/src/core/pipeline/tests.rs +91 -424
  54. data/vendor/kreuzberg/src/core/server_config/env.rs +0 -14
  55. data/vendor/kreuzberg/src/core/server_config/loader.rs +4 -13
  56. data/vendor/kreuzberg/src/core/server_config/mod.rs +0 -28
  57. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +2 -39
  58. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +0 -24
  59. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +0 -20
  60. data/vendor/kreuzberg/src/core/server_config/validation.rs +1 -14
  61. data/vendor/kreuzberg/src/embeddings/mod.rs +2 -2
  62. data/vendor/kreuzberg/src/extraction/archive/gzip.rs +8 -8
  63. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +5 -5
  64. data/vendor/kreuzberg/src/extraction/archive/tar.rs +5 -5
  65. data/vendor/kreuzberg/src/extraction/archive/zip.rs +5 -5
  66. data/vendor/kreuzberg/src/extraction/derive.rs +1052 -0
  67. data/vendor/kreuzberg/src/extraction/doc/mod.rs +2 -2
  68. data/vendor/kreuzberg/src/extraction/docx/parser.rs +209 -34
  69. data/vendor/kreuzberg/src/extraction/docx/styles.rs +2 -2
  70. data/vendor/kreuzberg/src/extraction/email.rs +3 -3
  71. data/vendor/kreuzberg/src/extraction/html/converter.rs +204 -86
  72. data/vendor/kreuzberg/src/extraction/html/mod.rs +2 -2
  73. data/vendor/kreuzberg/src/extraction/html/structure.rs +91 -7
  74. data/vendor/kreuzberg/src/extraction/image_ocr.rs +54 -2
  75. data/vendor/kreuzberg/src/extraction/mod.rs +4 -0
  76. data/vendor/kreuzberg/src/extraction/ooxml_embedded.rs +140 -0
  77. data/vendor/kreuzberg/src/extraction/ppt/mod.rs +11 -2
  78. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +30 -1
  79. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +16 -0
  80. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +21 -10
  81. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +149 -28
  82. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +104 -24
  83. data/vendor/kreuzberg/src/extraction/structured.rs +3 -3
  84. data/vendor/kreuzberg/src/extraction/transform/document_tree.rs +6 -0
  85. data/vendor/kreuzberg/src/extraction/transform/mod.rs +25 -0
  86. data/vendor/kreuzberg/src/extraction/xml.rs +2 -2
  87. data/vendor/kreuzberg/src/extractors/annotation_utils.rs +97 -0
  88. data/vendor/kreuzberg/src/extractors/archive.rs +73 -64
  89. data/vendor/kreuzberg/src/extractors/bibtex.rs +188 -158
  90. data/vendor/kreuzberg/src/extractors/citation.rs +129 -144
  91. data/vendor/kreuzberg/src/extractors/code.rs +208 -0
  92. data/vendor/kreuzberg/src/extractors/csv.rs +71 -137
  93. data/vendor/kreuzberg/src/extractors/dbf.rs +31 -85
  94. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +8 -0
  95. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +260 -106
  96. data/vendor/kreuzberg/src/extractors/doc.rs +59 -74
  97. data/vendor/kreuzberg/src/extractors/docbook.rs +115 -64
  98. data/vendor/kreuzberg/src/extractors/docx.rs +763 -199
  99. data/vendor/kreuzberg/src/extractors/email.rs +269 -119
  100. data/vendor/kreuzberg/src/extractors/epub/content.rs +6 -3
  101. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +14 -24
  102. data/vendor/kreuzberg/src/extractors/epub/mod.rs +381 -85
  103. data/vendor/kreuzberg/src/extractors/excel.rs +113 -353
  104. data/vendor/kreuzberg/src/extractors/fictionbook.rs +411 -861
  105. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +26 -62
  106. data/vendor/kreuzberg/src/extractors/html.rs +445 -57
  107. data/vendor/kreuzberg/src/extractors/hwp.rs +21 -38
  108. data/vendor/kreuzberg/src/extractors/image.rs +195 -152
  109. data/vendor/kreuzberg/src/extractors/iwork/keynote.rs +107 -80
  110. data/vendor/kreuzberg/src/extractors/iwork/mod.rs +109 -0
  111. data/vendor/kreuzberg/src/extractors/iwork/numbers.rs +105 -63
  112. data/vendor/kreuzberg/src/extractors/iwork/pages.rs +137 -67
  113. data/vendor/kreuzberg/src/extractors/jats/mod.rs +159 -87
  114. data/vendor/kreuzberg/src/extractors/jats/parser.rs +189 -0
  115. data/vendor/kreuzberg/src/extractors/jupyter.rs +188 -57
  116. data/vendor/kreuzberg/src/extractors/latex/commands.rs +58 -3
  117. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +10 -18
  118. data/vendor/kreuzberg/src/extractors/latex/mod.rs +675 -255
  119. data/vendor/kreuzberg/src/extractors/markdown.rs +320 -294
  120. data/vendor/kreuzberg/src/extractors/markdown_utils.rs +1 -0
  121. data/vendor/kreuzberg/src/extractors/mdx.rs +287 -252
  122. data/vendor/kreuzberg/src/extractors/mod.rs +20 -3
  123. data/vendor/kreuzberg/src/extractors/odt.rs +556 -866
  124. data/vendor/kreuzberg/src/extractors/opml/core.rs +40 -37
  125. data/vendor/kreuzberg/src/extractors/opml/parser.rs +82 -45
  126. data/vendor/kreuzberg/src/extractors/orgmode.rs +488 -116
  127. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +56 -34
  128. data/vendor/kreuzberg/src/extractors/pdf/mod.rs +324 -151
  129. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +58 -52
  130. data/vendor/kreuzberg/src/extractors/ppt.rs +79 -87
  131. data/vendor/kreuzberg/src/extractors/pptx.rs +301 -125
  132. data/vendor/kreuzberg/src/extractors/pst.rs +89 -124
  133. data/vendor/kreuzberg/src/extractors/rst.rs +641 -143
  134. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +8 -0
  135. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +192 -0
  136. data/vendor/kreuzberg/src/extractors/rtf/images.rs +81 -10
  137. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +582 -91
  138. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +1121 -175
  139. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +11 -2
  140. data/vendor/kreuzberg/src/extractors/structured.rs +48 -58
  141. data/vendor/kreuzberg/src/extractors/text.rs +42 -47
  142. data/vendor/kreuzberg/src/extractors/typst.rs +158 -139
  143. data/vendor/kreuzberg/src/extractors/xml.rs +42 -61
  144. data/vendor/kreuzberg/src/keywords/processor.rs +30 -7
  145. data/vendor/kreuzberg/src/language_detection/processor.rs +20 -0
  146. data/vendor/kreuzberg/src/layout/engine.rs +3 -62
  147. data/vendor/kreuzberg/src/layout/mod.rs +8 -47
  148. data/vendor/kreuzberg/src/layout/models/tatr.rs +171 -15
  149. data/vendor/kreuzberg/src/lib.rs +28 -5
  150. data/vendor/kreuzberg/src/mcp/format.rs +38 -0
  151. data/vendor/kreuzberg/src/mcp/params.rs +12 -0
  152. data/vendor/kreuzberg/src/mcp/server.rs +37 -6
  153. data/vendor/kreuzberg/src/ocr/hocr_parser.rs +826 -0
  154. data/vendor/kreuzberg/src/ocr/mod.rs +2 -2
  155. data/vendor/kreuzberg/src/ocr/processor/execution.rs +14 -4
  156. data/vendor/kreuzberg/src/ocr/table/mod.rs +2 -3
  157. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +3 -3
  158. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +10 -2
  159. data/vendor/kreuzberg/src/ocr/validation.rs +3 -3
  160. data/vendor/kreuzberg/src/paddle_ocr/backend.rs +7 -8
  161. data/vendor/kreuzberg/src/paddle_ocr/model_manager.rs +0 -23
  162. data/vendor/kreuzberg/src/pdf/bookmarks.rs +247 -0
  163. data/vendor/kreuzberg/src/pdf/embedded_files.rs +243 -0
  164. data/vendor/kreuzberg/src/pdf/fonts.rs +5 -5
  165. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +72 -2
  166. data/vendor/kreuzberg/src/pdf/images.rs +69 -0
  167. data/vendor/kreuzberg/src/pdf/mod.rs +6 -2
  168. data/vendor/kreuzberg/src/pdf/structure/assembly.rs +800 -0
  169. data/vendor/kreuzberg/src/pdf/structure/bridge.rs +1282 -0
  170. data/vendor/kreuzberg/src/pdf/structure/classify.rs +1957 -0
  171. data/vendor/kreuzberg/src/pdf/structure/columns.rs +182 -0
  172. data/vendor/kreuzberg/src/pdf/{markdown → structure}/constants.rs +0 -13
  173. data/vendor/kreuzberg/src/pdf/{markdown → structure}/content_convert.rs +17 -7
  174. data/vendor/kreuzberg/src/pdf/{markdown → structure}/layout_classify.rs +68 -61
  175. data/vendor/kreuzberg/src/pdf/structure/lines.rs +52 -0
  176. data/vendor/kreuzberg/src/pdf/structure/mod.rs +28 -0
  177. data/vendor/kreuzberg/src/pdf/structure/paragraphs.rs +487 -0
  178. data/vendor/kreuzberg/src/pdf/{markdown → structure}/pipeline.rs +167 -244
  179. data/vendor/kreuzberg/src/pdf/structure/regions/heading.rs +51 -0
  180. data/vendor/kreuzberg/src/pdf/{markdown → structure}/regions/layout_validation.rs +3 -3
  181. data/vendor/kreuzberg/src/pdf/structure/regions/mod.rs +17 -0
  182. data/vendor/kreuzberg/src/pdf/{markdown → structure}/regions/table_recognition.rs +76 -29
  183. data/vendor/kreuzberg/src/pdf/{markdown → structure}/regions/tables.rs +97 -16
  184. data/vendor/kreuzberg/src/pdf/{markdown → structure}/text_repair.rs +2 -1
  185. data/vendor/kreuzberg/src/pdf/{markdown → structure}/types.rs +14 -0
  186. data/vendor/kreuzberg/src/pdf/table_reconstruct.rs +943 -27
  187. data/vendor/kreuzberg/src/pdf/text_data.rs +1 -333
  188. data/vendor/kreuzberg/src/plugins/extractor/instrumented.rs +7 -8
  189. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +26 -48
  190. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +27 -65
  191. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +20 -235
  192. data/vendor/kreuzberg/src/plugins/mod.rs +5 -2
  193. data/vendor/kreuzberg/src/plugins/ocr.rs +4 -0
  194. data/vendor/kreuzberg/src/plugins/processor/mod.rs +32 -0
  195. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +15 -44
  196. data/vendor/kreuzberg/src/plugins/registry/mod.rs +14 -0
  197. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +78 -55
  198. data/vendor/kreuzberg/src/plugins/registry/renderer.rs +368 -0
  199. data/vendor/kreuzberg/src/plugins/renderer.rs +199 -0
  200. data/vendor/kreuzberg/src/plugins/validator/mod.rs +44 -0
  201. data/vendor/kreuzberg/src/rendering/common.rs +873 -0
  202. data/vendor/kreuzberg/src/rendering/comrak_bridge.rs +1168 -0
  203. data/vendor/kreuzberg/src/rendering/djot.rs +925 -0
  204. data/vendor/kreuzberg/src/rendering/html.rs +23 -0
  205. data/vendor/kreuzberg/src/rendering/json.rs +586 -0
  206. data/vendor/kreuzberg/src/rendering/markdown.rs +86 -605
  207. data/vendor/kreuzberg/src/rendering/mod.rs +16 -6
  208. data/vendor/kreuzberg/src/rendering/plain.rs +373 -210
  209. data/vendor/kreuzberg/src/table_core.rs +472 -0
  210. data/vendor/kreuzberg/src/text/quality_processor.rs +24 -10
  211. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +2 -21
  212. data/vendor/kreuzberg/src/types/builder.rs +18 -0
  213. data/vendor/kreuzberg/src/types/document_structure.rs +45 -0
  214. data/vendor/kreuzberg/src/types/extraction.rs +78 -0
  215. data/vendor/kreuzberg/src/types/formats.rs +9 -0
  216. data/vendor/kreuzberg/src/types/internal.rs +600 -0
  217. data/vendor/kreuzberg/src/types/internal_builder.rs +1163 -0
  218. data/vendor/kreuzberg/src/types/metadata.rs +162 -2
  219. data/vendor/kreuzberg/src/types/mod.rs +12 -2
  220. data/vendor/kreuzberg/src/types/uri.rs +173 -0
  221. data/vendor/kreuzberg/src/utils/mod.rs +30 -0
  222. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +0 -22
  223. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +1 -0
  224. data/vendor/kreuzberg/src/utils/string_utils.rs +2 -2
  225. data/vendor/kreuzberg/test_data/hocr/english_pdf_default.hocr +97 -0
  226. data/vendor/kreuzberg/test_data/hocr/german_pdf_default.hocr +492 -0
  227. data/vendor/kreuzberg/test_data/hocr/invoice_image_default.hocr +72 -0
  228. data/vendor/kreuzberg/test_data/hocr/utf8_encoding.hocr +12 -0
  229. data/vendor/kreuzberg/test_data/hocr/v4_code_formula.hocr +541 -0
  230. data/vendor/kreuzberg/test_data/hocr/v4_embedded_tables.hocr +420 -0
  231. data/vendor/kreuzberg/test_data/hocr/word_confidence.hocr +21 -0
  232. data/vendor/kreuzberg/tests/api_consistency.rs +212 -3
  233. data/vendor/kreuzberg/tests/api_openweb.rs +335 -0
  234. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +124 -89
  235. data/vendor/kreuzberg/tests/concurrency_stress.rs +9 -6
  236. data/vendor/kreuzberg/tests/config_features.rs +65 -0
  237. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -45
  238. data/vendor/kreuzberg/tests/cross_format_parity.rs +1345 -0
  239. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +15 -9
  240. data/vendor/kreuzberg/tests/epub_markdown_headings_tests.rs +7 -3
  241. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +15 -7
  242. data/vendor/kreuzberg/tests/epub_spine_semantics_tests.rs +31 -37
  243. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +40 -19
  244. data/vendor/kreuzberg/tests/html_output_quality.rs +226 -0
  245. data/vendor/kreuzberg/tests/iwork_integration.rs +5 -2
  246. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +92 -39
  247. data/vendor/kreuzberg/tests/json_schema_validation.rs +175 -0
  248. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +31 -15
  249. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +49 -24
  250. data/vendor/kreuzberg/tests/markdown_lint_quality.rs +226 -0
  251. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +4 -1
  252. data/vendor/kreuzberg/tests/path_resolution/fixtures/djot_with_images.djot +9 -0
  253. data/vendor/kreuzberg/tests/path_resolution/fixtures/images/subfolder/nested.png +0 -0
  254. data/vendor/kreuzberg/tests/path_resolution/fixtures/images/test_image.png +0 -0
  255. data/vendor/kreuzberg/tests/path_resolution/fixtures/latex_with_images.tex +15 -0
  256. data/vendor/kreuzberg/tests/path_resolution/fixtures/markdown_with_images.md +13 -0
  257. data/vendor/kreuzberg/tests/path_resolution/fixtures/orgmode_with_images.org +9 -0
  258. data/vendor/kreuzberg/tests/path_resolution/fixtures/rst_with_images.rst +12 -0
  259. data/vendor/kreuzberg/tests/path_resolution/fixtures/typst_with_images.typ +9 -0
  260. data/vendor/kreuzberg/tests/path_resolution.rs +133 -0
  261. data/vendor/kreuzberg/tests/pdf_markdown_quality.rs +1 -4
  262. data/vendor/kreuzberg/tests/pdf_markdown_regression.rs +1 -1
  263. data/vendor/kreuzberg/tests/pipeline_integration.rs +62 -150
  264. data/vendor/kreuzberg/tests/plugin_system.rs +12 -13
  265. data/vendor/kreuzberg/tests/registry_integration_tests.rs +11 -7
  266. data/vendor/kreuzberg/tests/rendering_integration_tests.rs +404 -0
  267. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +2 -2
  268. data/vendor/kreuzberg-ffi/Cargo.toml +7 -5
  269. data/vendor/kreuzberg-ffi/README.md +24 -6
  270. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +1 -0
  271. data/vendor/kreuzberg-ffi/kreuzberg.h +41 -5
  272. data/vendor/kreuzberg-ffi/src/config/html.rs +2 -21
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +5 -0
  274. data/vendor/kreuzberg-ffi/src/config_builder.rs +64 -0
  275. data/vendor/kreuzberg-ffi/src/helpers.rs +43 -1
  276. data/vendor/kreuzberg-ffi/src/lib.rs +16 -5
  277. data/vendor/kreuzberg-ffi/src/memory.rs +22 -1
  278. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +30 -12
  279. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +3 -0
  280. data/vendor/kreuzberg-ffi/src/result.rs +5 -0
  281. data/vendor/kreuzberg-ffi/src/result_view.rs +8 -0
  282. data/vendor/kreuzberg-ffi/src/string_intern.rs +2 -2
  283. data/vendor/kreuzberg-ffi/src/types.rs +32 -23
  284. data/vendor/kreuzberg-ffi/tests.disabled/README.md +1 -1
  285. data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
  286. data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
  287. data/vendor/kreuzberg-pdfium-render/build.rs +1 -1
  288. data/vendor/kreuzberg-pdfium-render/examples/artifact_check.rs +70 -0
  289. data/vendor/kreuzberg-pdfium-render/examples/char_order_check.rs +63 -0
  290. data/vendor/kreuzberg-pdfium-render/examples/ffi_bench.rs +207 -0
  291. data/vendor/kreuzberg-pdfium-render/examples/seg_dump.rs +54 -0
  292. data/vendor/kreuzberg-pdfium-render/src/bindings.rs +1 -1
  293. data/vendor/kreuzberg-tesseract/Cargo.toml +3 -3
  294. data/vendor/kreuzberg-tesseract/README.md +7 -3
  295. data/vendor/kreuzberg-tesseract/build.rs +141 -62
  296. data/vendor/kreuzberg-tesseract/patches/README.md +3 -0
  297. data/vendor/kreuzberg-tesseract/src/lib.rs +1 -0
  298. metadata +71 -49
  299. data/spec/binding/config_result_spec.rb +0 -377
  300. data/spec/binding/metadata_types_spec.rb +0 -1253
  301. data/spec/serialization_spec.rb +0 -134
  302. data/spec/smoke/package_spec.rb +0 -199
  303. data/spec/unit/config/chunking_config_spec.rb +0 -213
  304. data/spec/unit/config/embedding_config_spec.rb +0 -343
  305. data/spec/unit/config/extraction_config_spec.rb +0 -434
  306. data/spec/unit/config/font_config_spec.rb +0 -285
  307. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  308. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  309. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -230
  310. data/spec/unit/config/keyword_config_spec.rb +0 -229
  311. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  312. data/spec/unit/config/ocr_config_spec.rb +0 -171
  313. data/spec/unit/config/output_format_spec.rb +0 -380
  314. data/spec/unit/config/page_config_spec.rb +0 -221
  315. data/spec/unit/config/pdf_config_spec.rb +0 -267
  316. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  317. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  318. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  319. data/vendor/kreuzberg/src/ocr/hocr.rs +0 -258
  320. data/vendor/kreuzberg/src/pdf/markdown/assembly.rs +0 -504
  321. data/vendor/kreuzberg/src/pdf/markdown/bridge.rs +0 -2553
  322. data/vendor/kreuzberg/src/pdf/markdown/classify.rs +0 -899
  323. data/vendor/kreuzberg/src/pdf/markdown/columns.rs +0 -456
  324. data/vendor/kreuzberg/src/pdf/markdown/lines.rs +0 -216
  325. data/vendor/kreuzberg/src/pdf/markdown/mod.rs +0 -30
  326. data/vendor/kreuzberg/src/pdf/markdown/paragraphs.rs +0 -594
  327. data/vendor/kreuzberg/src/pdf/markdown/regions/assignment.rs +0 -538
  328. data/vendor/kreuzberg/src/pdf/markdown/regions/heading.rs +0 -478
  329. data/vendor/kreuzberg/src/pdf/markdown/regions/merge.rs +0 -282
  330. data/vendor/kreuzberg/src/pdf/markdown/regions/mod.rs +0 -1321
  331. data/vendor/kreuzberg/src/pdf/markdown/regions/reading_order.rs +0 -480
  332. data/vendor/kreuzberg/src/pdf/markdown/render.rs +0 -1108
  333. data/vendor/kreuzberg/tests/document_structure_tests.rs +0 -3911
  334. data/vendor/kreuzberg/src/pdf/{markdown → structure}/adapters.rs +0 -0
  335. data/vendor/kreuzberg/src/pdf/{markdown → structure}/content.rs +0 -0
  336. data/vendor/kreuzberg/src/pdf/{markdown → structure}/geometry.rs +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 26e800012598eeb04fe01d85d1ff8df63ac8a02a7c6345a4632bb2aae2981300
4
- data.tar.gz: 6207e53529cbde80bbacd4db9e9e0bd6f6640242e16e1a1cb1543ccfc8ab0291
3
+ metadata.gz: 86f862f2bccbe3ad535c3d392c8bd04fbe6976de1dfd296a264a8a9f413f4617
4
+ data.tar.gz: 15aa589f639a12a312d3827b6c392ac39f5a9d7a2c5ebb82174dea9f9e687a06
5
5
  SHA512:
6
- metadata.gz: 05e510db77e1154b51918b2804549a08a32ee0091bf17ef9f22580391d7eb03044e7e220f398d4f647746b79ec163258ef2862be577662cb542d92a684f37f07
7
- data.tar.gz: eff1441ce70ff97dec6cbf1ee995aa60fb91a072fee5df9475c6e02072e59aa151fc4a797ddcdebed80c3b53c90f3f8a274392bc8e39c7425cc8fc25d77b101a
6
+ metadata.gz: 6bfa0020321043200dde821eafbdca7e6ba1b0abeec4bbb41b43599d3e2cc36c74caf8174673e444e8b1e90c15ef3a8babbac43810529b109ec73c1878bbe6fa
7
+ data.tar.gz: ce6e20a7d4dba4617c426fa0589d9e22c74ef85fdec9229ae525d2357fa3134f6e1a52c61d83c8321dc1f8c8ac745dbea702a07544005bafbe847df387d67ab7
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.6.3" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.0.0" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -42,13 +42,16 @@
42
42
 
43
43
  <!-- Project Info -->
44
44
  <a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
45
- <img src="https://img.shields.io/badge/License-MIT-blue.svg" alt="License">
45
+ <img src="https://img.shields.io/badge/License-MIT-007ec6" alt="License">
46
46
  </a>
47
47
  <a href="https://docs.kreuzberg.dev">
48
- <img src="https://img.shields.io/badge/docs-kreuzberg.dev-blue" alt="Documentation">
48
+ <img src="https://img.shields.io/badge/docs-kreuzberg.dev-007ec6" alt="Documentation">
49
+ </a>
50
+ <a href="https://docs.kreuzberg.dev/demo.html">
51
+ <img src="https://img.shields.io/badge/%E2%96%B6%EF%B8%8F_Live_Demo-007ec6" alt="Live Demo">
49
52
  </a>
50
53
  <a href="https://huggingface.co/Kreuzberg">
51
- <img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Models-yellow" alt="Hugging Face">
54
+ <img src="https://img.shields.io/badge/%F0%9F%A4%97_Hugging_Face-007ec6" alt="Hugging Face">
52
55
  </a>
53
56
  </div>
54
57
 
@@ -61,7 +64,7 @@
61
64
  </div>
62
65
 
63
66
 
64
- Extract text, tables, images, and metadata from 91+ file formats including PDF, Office documents, and images. Ruby bindings with idiomatic Ruby API and native performance.
67
+ Extract text, tables, images, and metadata from 91+ file formats and 248 programming languages including PDF, Office documents, and images. Ruby bindings with idiomatic Ruby API and native performance.
65
68
 
66
69
 
67
70
  ## Installation
@@ -74,6 +77,7 @@ Install via one of the supported package managers:
74
77
 
75
78
 
76
79
  **gem:**
80
+
77
81
  ```bash
78
82
  gem install kreuzberg
79
83
  ```
@@ -82,6 +86,7 @@ gem install kreuzberg
82
86
 
83
87
 
84
88
  **Bundler:**
89
+
85
90
  ```ruby
86
91
  gem 'kreuzberg'
87
92
  ```
@@ -258,6 +263,19 @@ puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
258
263
  | **Scientific** | `.tex`, `.latex`, `.typst`, `.jats`, `.ipynb`, `.docbook` | LaTeX, Jupyter notebooks, PubMed JATS |
259
264
  | **Documentation** | `.opml`, `.pod`, `.mdoc`, `.troff` | Technical documentation formats |
260
265
 
266
+ #### Code Intelligence (248 Languages)
267
+
268
+ | Feature | Description |
269
+ |---------|-------------|
270
+ | **Structure Extraction** | Functions, classes, methods, structs, interfaces, enums |
271
+ | **Import/Export Analysis** | Module dependencies, re-exports, wildcard imports |
272
+ | **Symbol Extraction** | Variables, constants, type aliases, properties |
273
+ | **Docstring Parsing** | Google, NumPy, Sphinx, JSDoc, RustDoc, and 10+ formats |
274
+ | **Diagnostics** | Parse errors with line/column positions |
275
+ | **Syntax-Aware Chunking** | Split code by semantic boundaries, not arbitrary byte offsets |
276
+
277
+ Powered by [tree-sitter-language-pack](https://github.com/kreuzberg-dev/tree-sitter-language-pack) — [documentation](https://docs.tree-sitter-language-pack.kreuzberg.dev).
278
+
261
279
  **[Complete Format Reference](https://kreuzberg.dev/reference/formats/)**
262
280
 
263
281
  ### Key Capabilities
@@ -279,6 +297,9 @@ puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
279
297
  - **Batch Processing** - Efficiently process multiple documents in parallel
280
298
  - **Memory Efficient** - Stream large files without loading entirely into memory
281
299
  - **Language Detection** - Detect and support multiple languages in documents
300
+
301
+ - **Code Intelligence** - Extract structure, imports, exports, symbols, and docstrings from [248 programming languages](https://docs.tree-sitter-language-pack.kreuzberg.dev) via tree-sitter
302
+
282
303
  - **Configuration** - Fine-grained control over extraction behavior
283
304
 
284
305
  ### Performance Characteristics