kreuzberg 4.8.4 → 4.9.0

This diff shows the changes between two publicly released versions of the package, as they appear in the public registry to which they were published. It is provided for informational purposes only.
Files changed (141)
  1. checksums.yaml +4 -4
  2. data/README.md +6 -3
  3. data/ext/kreuzberg_rb/native/Cargo.lock +130 -109
  4. data/ext/kreuzberg_rb/native/Cargo.toml +3 -3
  5. data/ext/kreuzberg_rb/native/src/config/types.rs +22 -0
  6. data/ext/kreuzberg_rb/native/src/error_handling.rs +7 -0
  7. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -0
  8. data/ext/kreuzberg_rb/native/src/result.rs +46 -0
  9. data/lib/kreuzberg/errors.rb +3 -0
  10. data/lib/kreuzberg/result.rb +52 -5
  11. data/lib/kreuzberg/version.rb +1 -1
  12. data/sig/kreuzberg.rbs +111 -19
  13. data/vendor/Cargo.toml +8 -8
  14. data/vendor/kreuzberg/Cargo.toml +9 -9
  15. data/vendor/kreuzberg/README.md +1 -1
  16. data/vendor/kreuzberg/src/api/error.rs +1 -0
  17. data/vendor/kreuzberg/src/api/handlers.rs +75 -2
  18. data/vendor/kreuzberg/src/api/types.rs +11 -2
  19. data/vendor/kreuzberg/src/cancellation.rs +105 -0
  20. data/vendor/kreuzberg/src/chunking/boundary_detection.rs +496 -0
  21. data/vendor/kreuzberg/src/chunking/core.rs +122 -10
  22. data/vendor/kreuzberg/src/chunking/mod.rs +9 -10
  23. data/vendor/kreuzberg/src/chunking/semantic/merge.rs +477 -0
  24. data/vendor/kreuzberg/src/chunking/semantic/mod.rs +393 -0
  25. data/vendor/kreuzberg/src/chunking/semantic/topic.rs +224 -0
  26. data/vendor/kreuzberg/src/chunking/tokenizer_cache.rs +3 -3
  27. data/vendor/kreuzberg/src/core/config/extraction/core.rs +89 -1
  28. data/vendor/kreuzberg/src/core/config/layout.rs +8 -0
  29. data/vendor/kreuzberg/src/core/config/llm.rs +47 -1
  30. data/vendor/kreuzberg/src/core/config/ocr.rs +16 -1
  31. data/vendor/kreuzberg/src/core/config/processing.rs +63 -0
  32. data/vendor/kreuzberg/src/core/extractor/bytes.rs +1 -1
  33. data/vendor/kreuzberg/src/core/extractor/file.rs +1 -1
  34. data/vendor/kreuzberg/src/core/extractor/sync.rs +24 -21
  35. data/vendor/kreuzberg/src/core/formats.rs +2 -2
  36. data/vendor/kreuzberg/src/core/mime.rs +3 -3
  37. data/vendor/kreuzberg/src/core/pipeline/cache.rs +2 -2
  38. data/vendor/kreuzberg/src/core/pipeline/mod.rs +4 -1
  39. data/vendor/kreuzberg/src/doc_orientation.rs +22 -4
  40. data/vendor/kreuzberg/src/embeddings/mod.rs +253 -18
  41. data/vendor/kreuzberg/src/error.rs +6 -0
  42. data/vendor/kreuzberg/src/extraction/derive.rs +6 -1
  43. data/vendor/kreuzberg/src/extraction/docx/drawing.rs +2 -4
  44. data/vendor/kreuzberg/src/extraction/docx/mod.rs +185 -0
  45. data/vendor/kreuzberg/src/extraction/html/structure.rs +5 -7
  46. data/vendor/kreuzberg/src/extraction/image.rs +1 -0
  47. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +1 -0
  48. data/vendor/kreuzberg/src/extraction/pst.rs +6 -7
  49. data/vendor/kreuzberg/src/extraction/transform/document_tree.rs +3 -0
  50. data/vendor/kreuzberg/src/extraction/transform/mod.rs +5 -0
  51. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +38 -50
  52. data/vendor/kreuzberg/src/extractors/doc.rs +4 -1
  53. data/vendor/kreuzberg/src/extractors/docbook.rs +8 -12
  54. data/vendor/kreuzberg/src/extractors/docx.rs +16 -5
  55. data/vendor/kreuzberg/src/extractors/excel.rs +5 -2
  56. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +2 -4
  57. data/vendor/kreuzberg/src/extractors/html.rs +173 -1
  58. data/vendor/kreuzberg/src/extractors/image.rs +268 -37
  59. data/vendor/kreuzberg/src/extractors/iwork/keynote.rs +4 -1
  60. data/vendor/kreuzberg/src/extractors/iwork/mod.rs +4 -8
  61. data/vendor/kreuzberg/src/extractors/iwork/numbers.rs +4 -1
  62. data/vendor/kreuzberg/src/extractors/iwork/pages.rs +4 -1
  63. data/vendor/kreuzberg/src/extractors/markdown.rs +22 -32
  64. data/vendor/kreuzberg/src/extractors/mdx.rs +22 -32
  65. data/vendor/kreuzberg/src/extractors/mod.rs +7 -12
  66. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +4 -0
  67. data/vendor/kreuzberg/src/extractors/pdf/mod.rs +161 -49
  68. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +148 -13
  69. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +47 -1
  70. data/vendor/kreuzberg/src/extractors/ppt.rs +3 -0
  71. data/vendor/kreuzberg/src/extractors/pptx.rs +3 -0
  72. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +4 -5
  73. data/vendor/kreuzberg/src/keywords/mod.rs +6 -10
  74. data/vendor/kreuzberg/src/language_detection/mod.rs +6 -10
  75. data/vendor/kreuzberg/src/layout/engine.rs +9 -2
  76. data/vendor/kreuzberg/src/layout/mod.rs +17 -6
  77. data/vendor/kreuzberg/src/layout/models/rtdetr.rs +5 -2
  78. data/vendor/kreuzberg/src/layout/models/slanet.rs +5 -2
  79. data/vendor/kreuzberg/src/layout/models/table_classifier.rs +5 -2
  80. data/vendor/kreuzberg/src/layout/models/tatr.rs +5 -2
  81. data/vendor/kreuzberg/src/layout/models/yolo.rs +2 -1
  82. data/vendor/kreuzberg/src/layout/session.rs +4 -51
  83. data/vendor/kreuzberg/src/lib.rs +2 -0
  84. data/vendor/kreuzberg/src/llm/mod.rs +2 -0
  85. data/vendor/kreuzberg/src/llm/structured.rs +7 -3
  86. data/vendor/kreuzberg/src/llm/usage.rs +40 -0
  87. data/vendor/kreuzberg/src/llm/vlm_embeddings.rs +5 -3
  88. data/vendor/kreuzberg/src/llm/vlm_ocr.rs +6 -3
  89. data/vendor/kreuzberg/src/mcp/errors.rs +18 -0
  90. data/vendor/kreuzberg/src/mcp/params.rs +19 -1
  91. data/vendor/kreuzberg/src/mcp/server.rs +15 -4
  92. data/vendor/kreuzberg/src/ocr/processor/execution.rs +8 -16
  93. data/vendor/kreuzberg/src/ort_discovery.rs +75 -1
  94. data/vendor/kreuzberg/src/paddle_ocr/backend.rs +43 -5
  95. data/vendor/kreuzberg/src/pdf/bindings.rs +40 -15
  96. data/vendor/kreuzberg/src/pdf/error.rs +3 -0
  97. data/vendor/kreuzberg/src/pdf/fonts.rs +2 -2
  98. data/vendor/kreuzberg/src/pdf/images.rs +1 -1
  99. data/vendor/kreuzberg/src/pdf/layout_runner.rs +1 -0
  100. data/vendor/kreuzberg/src/pdf/metadata.rs +1 -1
  101. data/vendor/kreuzberg/src/pdf/oxide/table.rs +6 -0
  102. data/vendor/kreuzberg/src/pdf/oxide/text.rs +1 -0
  103. data/vendor/kreuzberg/src/pdf/rendering.rs +1 -1
  104. data/vendor/kreuzberg/src/pdf/structure/bridge.rs +1 -1
  105. data/vendor/kreuzberg/src/pdf/structure/layout_classify.rs +2 -3
  106. data/vendor/kreuzberg/src/pdf/structure/pipeline.rs +116 -15
  107. data/vendor/kreuzberg/src/pdf/text.rs +2 -1
  108. data/vendor/kreuzberg/src/plugins/registry/mod.rs +11 -11
  109. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +30 -31
  110. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  111. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +2 -27
  112. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +2 -1
  113. data/vendor/kreuzberg/src/types/extraction.rs +42 -1
  114. data/vendor/kreuzberg/src/types/internal.rs +18 -0
  115. data/vendor/kreuzberg/src/types/mod.rs +5 -0
  116. data/vendor/kreuzberg/src/types/page.rs +26 -1
  117. data/vendor/kreuzberg/src/utils/markdown_utils.rs +40 -0
  118. data/vendor/kreuzberg/src/utils/mod.rs +1 -0
  119. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +3 -3
  120. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +2 -2
  121. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +2 -2
  122. data/vendor/kreuzberg/src/utils/string_utils.rs +7 -7
  123. data/vendor/kreuzberg/tests/cross_format_parity.rs +9 -4
  124. data/vendor/kreuzberg/tests/llm_integration.rs +9 -7
  125. data/vendor/kreuzberg/tests/ocr_content_integrity.rs +154 -0
  126. data/vendor/kreuzberg/tests/pdf_image_extraction_tests.rs +42 -0
  127. data/vendor/kreuzberg-ffi/Cargo.toml +5 -5
  128. data/vendor/kreuzberg-ffi/kreuzberg.h +168 -15
  129. data/vendor/kreuzberg-ffi/src/cancellation.rs +167 -0
  130. data/vendor/kreuzberg-ffi/src/error.rs +32 -7
  131. data/vendor/kreuzberg-ffi/src/helpers.rs +13 -0
  132. data/vendor/kreuzberg-ffi/src/lib.rs +16 -7
  133. data/vendor/kreuzberg-ffi/src/memory.rs +30 -11
  134. data/vendor/kreuzberg-ffi/src/result.rs +71 -0
  135. data/vendor/kreuzberg-ffi/src/types.rs +19 -16
  136. data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
  137. data/vendor/kreuzberg-paddle-ocr/src/ocr_lite.rs +21 -0
  138. data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
  139. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  140. data/vendor/kreuzberg-tesseract/build.rs +11 -5
  141. metadata +15 -6
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 16deeaa47cb35ded0b844af72d43b74da5539084f21a79d17513be2da9ac2f0b
4
- data.tar.gz: 64715b14cffac78a796853e9f5d9a2d0969427de9d59a243c87a5d20699dcce3
3
+ metadata.gz: bb7b77bae36a5da34ce209fbf1ea7c0a68aef4b22f8b373b908f9c113f404ef5
4
+ data.tar.gz: a6c8667aee6ae2c9e11d45fc98fcb355561fec6e4a7d51d852664bd6367af8cc
5
5
  SHA512:
6
- metadata.gz: e362717e5db0fad6a9494737e53c2444a84cb76fd274c70283a6650eef0891e9ced2af424b2ed9501eb749f21fcfb2ca3b4f8c7b336d1a248bb99f4a7e69131e
7
- data.tar.gz: 5d05d862a170f0efe0f6f6a9867846bb3b000136f638b1efe6ddee5e94310dc92495a40a7dff204b4098f92989175e1691ba10897be884ca960477d48dcbc6ca
6
+ metadata.gz: 7569a4914ab4a4d440a0c74e622a9f26f7189b62bc9c2d05fc5e857a32c8fabde8eb854edef34e94bd95d5357e44137c1573e7ce68db45ed85c26dbe31e6972b
7
+ data.tar.gz: 9741106549d7bf79cc1ae34a07f686cca1bf6a4c19fcb01b40cd8f1372166c8e9c3a0321e1e26e416ebb98d413ee1c9093d247949292a7c07a85594ea1df508e
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.8.4" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.0" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -39,10 +39,13 @@
39
39
  <a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/kreuzberg">
40
40
  <img src="https://img.shields.io/badge/Docker-007ec6?logo=docker&logoColor=white" alt="Docker">
41
41
  </a>
42
+ <a href="https://artifacthub.io/packages/search?repo=kreuzberg">
43
+ <img src="https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/kreuzberg" alt="Artifact Hub">
44
+ </a>
42
45
 
43
46
  <!-- Project Info -->
44
47
  <a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
45
- <img src="https://img.shields.io/badge/License-Elastic--2.0-blue.svg" alt="License">
48
+ <img src="https://img.shields.io/badge/License-MIT-007ec6" alt="License">
46
49
  </a>
47
50
  <a href="https://docs.kreuzberg.dev">
48
51
  <img src="https://img.shields.io/badge/docs-kreuzberg.dev-007ec6" alt="Documentation">
@@ -419,7 +422,7 @@ Contributions are welcome! See [Contributing Guide](https://github.com/kreuzberg
419
422
 
420
423
  ## License
421
424
 
422
- Elastic License 2.0 (ELv2) - see [LICENSE](../../LICENSE) for details.
425
+ MIT License - see LICENSE file for details.
423
426
 
424
427
  ## Support
425
428