kreuzberg 4.2.6 → 4.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +7 -4
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +36 -9
  5. data/ext/kreuzberg_rb/native/Cargo.toml +32 -0
  6. data/ext/kreuzberg_rb/native/src/config/types.rs +4 -2
  7. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -1
  8. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +1 -1
  9. data/ext/kreuzberg_rb/native/src/result.rs +5 -3
  10. data/lib/kreuzberg/version.rb +1 -1
  11. data/sig/kreuzberg.rbs +228 -37
  12. data/spec/binding/batch_operations_spec.rb +2 -0
  13. data/vendor/Cargo.toml +3 -2
  14. data/vendor/kreuzberg/Cargo.toml +2 -1
  15. data/vendor/kreuzberg/README.md +1 -1
  16. data/vendor/kreuzberg/src/api/error.rs +29 -1
  17. data/vendor/kreuzberg/src/api/handlers.rs +28 -25
  18. data/vendor/kreuzberg/src/api/openapi.rs +14 -1
  19. data/vendor/kreuzberg/src/chunking/config.rs +2 -37
  20. data/vendor/kreuzberg/src/chunking/core.rs +78 -2
  21. data/vendor/kreuzberg/src/chunking/mod.rs +1 -1
  22. data/vendor/kreuzberg/src/chunking/processor.rs +15 -17
  23. data/vendor/kreuzberg/src/core/config/extraction/env.rs +13 -9
  24. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +12 -12
  25. data/vendor/kreuzberg/src/core/config/mod.rs +1 -1
  26. data/vendor/kreuzberg/src/core/config/processing.rs +65 -8
  27. data/vendor/kreuzberg/src/core/config_validation/mod.rs +8 -0
  28. data/vendor/kreuzberg/src/core/config_validation/sections.rs +5 -0
  29. data/vendor/kreuzberg/src/core/extractor/batch.rs +9 -9
  30. data/vendor/kreuzberg/src/core/extractor/file.rs +4 -2
  31. data/vendor/kreuzberg/src/core/extractor/legacy.rs +7 -7
  32. data/vendor/kreuzberg/src/core/extractor/sync.rs +3 -3
  33. data/vendor/kreuzberg/src/core/pipeline/execution.rs +2 -1
  34. data/vendor/kreuzberg/src/core/pipeline/features.rs +16 -22
  35. data/vendor/kreuzberg/src/core/pipeline/format.rs +20 -18
  36. data/vendor/kreuzberg/src/core/pipeline/tests.rs +40 -35
  37. data/vendor/kreuzberg/src/extraction/email.rs +31 -19
  38. data/vendor/kreuzberg/src/extraction/excel.rs +6 -5
  39. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +6 -1
  40. data/vendor/kreuzberg/src/extraction/html/types.rs +4 -3
  41. data/vendor/kreuzberg/src/extraction/libreoffice.rs +10 -9
  42. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +10 -8
  43. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +8 -4
  44. data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
  45. data/vendor/kreuzberg/src/extraction/transform/content.rs +1 -1
  46. data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -7
  47. data/vendor/kreuzberg/src/extractors/archive.rs +7 -5
  48. data/vendor/kreuzberg/src/extractors/bibtex.rs +34 -17
  49. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +7 -10
  50. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +4 -2
  51. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +3 -2
  52. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +1 -1
  53. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +2 -4
  54. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +1 -1
  55. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +4 -5
  56. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +1 -1
  57. data/vendor/kreuzberg/src/extractors/docbook.rs +1 -1
  58. data/vendor/kreuzberg/src/extractors/docx.rs +32 -24
  59. data/vendor/kreuzberg/src/extractors/email.rs +5 -3
  60. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +10 -10
  61. data/vendor/kreuzberg/src/extractors/epub/mod.rs +7 -3
  62. data/vendor/kreuzberg/src/extractors/excel.rs +8 -6
  63. data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -1
  64. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +15 -10
  65. data/vendor/kreuzberg/src/extractors/html.rs +1 -1
  66. data/vendor/kreuzberg/src/extractors/image.rs +3 -3
  67. data/vendor/kreuzberg/src/extractors/jats/mod.rs +1 -1
  68. data/vendor/kreuzberg/src/extractors/jupyter.rs +11 -9
  69. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +4 -3
  70. data/vendor/kreuzberg/src/extractors/latex/mod.rs +1 -1
  71. data/vendor/kreuzberg/src/extractors/markdown.rs +6 -4
  72. data/vendor/kreuzberg/src/extractors/odt.rs +38 -21
  73. data/vendor/kreuzberg/src/extractors/opml/core.rs +1 -1
  74. data/vendor/kreuzberg/src/extractors/opml/parser.rs +13 -9
  75. data/vendor/kreuzberg/src/extractors/orgmode.rs +11 -9
  76. data/vendor/kreuzberg/src/extractors/pdf/mod.rs +10 -3
  77. data/vendor/kreuzberg/src/extractors/pptx.rs +13 -11
  78. data/vendor/kreuzberg/src/extractors/rst.rs +15 -13
  79. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +22 -21
  80. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +1 -1
  81. data/vendor/kreuzberg/src/extractors/structured.rs +10 -5
  82. data/vendor/kreuzberg/src/extractors/text.rs +2 -2
  83. data/vendor/kreuzberg/src/extractors/typst.rs +11 -5
  84. data/vendor/kreuzberg/src/extractors/xml.rs +1 -1
  85. data/vendor/kreuzberg/src/keywords/processor.rs +9 -8
  86. data/vendor/kreuzberg/src/language_detection/processor.rs +6 -5
  87. data/vendor/kreuzberg/src/lib.rs +1 -1
  88. data/vendor/kreuzberg/src/mcp/errors.rs +7 -6
  89. data/vendor/kreuzberg/src/mcp/format.rs +5 -4
  90. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +3 -2
  91. data/vendor/kreuzberg/src/ocr/hocr.rs +4 -2
  92. data/vendor/kreuzberg/src/ocr/processor/execution.rs +128 -14
  93. data/vendor/kreuzberg/src/ocr/processor/validation.rs +129 -0
  94. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +36 -6
  95. data/vendor/kreuzberg/src/ocr/types.rs +3 -4
  96. data/vendor/kreuzberg/src/ocr/validation.rs +14 -0
  97. data/vendor/kreuzberg/src/pdf/metadata.rs +1 -0
  98. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +3 -2
  99. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +5 -4
  100. data/vendor/kreuzberg/src/plugins/ocr.rs +5 -4
  101. data/vendor/kreuzberg/src/plugins/processor/mod.rs +13 -12
  102. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +3 -2
  103. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +3 -2
  104. data/vendor/kreuzberg/src/plugins/validator/mod.rs +15 -14
  105. data/vendor/kreuzberg/src/text/quality.rs +13 -13
  106. data/vendor/kreuzberg/src/text/quality_processor.rs +7 -6
  107. data/vendor/kreuzberg/src/types/djot.rs +15 -4
  108. data/vendor/kreuzberg/src/types/extraction.rs +24 -4
  109. data/vendor/kreuzberg/src/types/formats.rs +9 -5
  110. data/vendor/kreuzberg/src/types/metadata.rs +68 -7
  111. data/vendor/kreuzberg/src/types/mod.rs +7 -5
  112. data/vendor/kreuzberg/src/types/page.rs +9 -0
  113. data/vendor/kreuzberg/src/types/tables.rs +2 -0
  114. data/vendor/kreuzberg/tests/concurrency_stress.rs +2 -1
  115. data/vendor/kreuzberg/tests/config_behavioral.rs +12 -16
  116. data/vendor/kreuzberg/tests/config_features.rs +19 -11
  117. data/vendor/kreuzberg/tests/config_loading_tests.rs +9 -9
  118. data/vendor/kreuzberg/tests/contract_mcp.rs +2 -2
  119. data/vendor/kreuzberg/tests/core_integration.rs +5 -6
  120. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +1 -1
  121. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +1 -1
  122. data/vendor/kreuzberg/tests/pipeline_integration.rs +36 -32
  123. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +19 -13
  124. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +3 -2
  125. data/vendor/kreuzberg/tests/plugin_system.rs +7 -6
  126. data/vendor/kreuzberg/tests/plugin_validator_test.rs +1 -1
  127. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -1
  128. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  129. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +3 -2
  130. data/vendor/kreuzberg-ffi/kreuzberg.h +32 -0
  131. data/vendor/kreuzberg-ffi/src/error.rs +56 -0
  132. data/vendor/kreuzberg-ffi/src/helpers.rs +6 -5
  133. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +1 -1
  134. data/vendor/kreuzberg-ffi/src/result.rs +2 -1
  135. data/vendor/kreuzberg-ffi/src/result_view.rs +3 -2
  136. data/vendor/kreuzberg-ffi/src/string_intern.rs +3 -3
  137. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +2 -2
  138. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  139. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7d0b60c586fdfacb40e14d02a0f21f0d8844e347095a7d566a310ca5d3600e82
4
- data.tar.gz: 5319a67e46fa8422b2f37b82b06329dfb4322c4afa0182a5ca434f3068cde531
3
+ metadata.gz: 2f7a9518bec5df1adf4b81c08136aff44f7ab6b43cd85d7188625a3224e3a0e9
4
+ data.tar.gz: bd5b5a05b0612d6b66d59d458fd7b0366b0256f290e0158fa86002a64c016429
5
5
  SHA512:
6
- metadata.gz: 4670ecee61b6ba3f0e13979587a746f3a57e7910408984f73e504258bd1a8ef053421a3e1c6ea99d86cd79f8dae899dc64c13b0ff5e997a2eda2fff2a08ba179
7
- data.tar.gz: dd750e605c25e6db09093ea50c9ab44aa1b7c7fd8141ad9bb6c691edabbba553e455a358809bfafbab1a450f6c22127733eb63c5dff4737385dd3f6d200848fb
6
+ metadata.gz: dec54496350fe7c4ba61171ebb2c145bc7c2612fde428a395c5aa279bd25fcfedf21ddaeb9012f80db78717315200f2d7f4529119f217fc00e772e066a5de6e0
7
+ data.tar.gz: 68ae3d48391637936e8edd6b16947790bf4bf1da00a1c1efcf23b88da0dec4876d749a358327ea5e5f93e8a8f898a9a34d891bb6ab4311f46fec202de3b2e57a
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.2.6)
4
+ kreuzberg (4.2.7)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -80,8 +80,9 @@ GEM
80
80
  ffi (~> 1.0)
81
81
  rb_sys (0.9.119)
82
82
  rake-compiler-dock (= 1.10.0)
83
- rbs (3.10.2)
83
+ rbs (3.10.3)
84
84
  logger
85
+ tsort
85
86
  regexp_parser (2.11.3)
86
87
  reline (0.6.3)
87
88
  io-console (~> 0.5)
@@ -142,6 +143,7 @@ GEM
142
143
  strscan (3.1.7)
143
144
  terminal-table (4.0.0)
144
145
  unicode-display_width (>= 1.1.1, < 4)
146
+ tsort (0.2.0)
145
147
  tzinfo (2.0.6)
146
148
  concurrent-ruby (~> 1.0)
147
149
  unicode-display_width (3.2.0)
@@ -207,7 +209,7 @@ CHECKSUMS
207
209
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
208
210
  io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
209
211
  json (2.18.0) sha256=b10506aee4183f5cf49e0efc48073d7b75843ce3782c68dbeb763351c08fd505
210
- kreuzberg (4.2.6)
212
+ kreuzberg (4.2.7)
211
213
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
212
214
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
213
215
  listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
@@ -228,7 +230,7 @@ CHECKSUMS
228
230
  rb-fsevent (0.11.2) sha256=43900b972e7301d6570f64b850a5aa67833ee7d87b458ee92805d56b7318aefe
229
231
  rb-inotify (0.11.1) sha256=a0a700441239b0ff18eb65e3866236cd78613d6b9f78fea1f9ac47a85e47be6e
230
232
  rb_sys (0.9.119) sha256=64393fa148e402e1b79b64496d2aabfc7df79da6b822b8bb48dc1141eaf40b4b
231
- rbs (3.10.2) sha256=bd8a5dc4c62f229f020146b61844a31f9c79e649449d212904a474eb79c846fc
233
+ rbs (3.10.3) sha256=70627f3919016134d554e6c99195552ae3ef6020fe034c8e983facc9c192daa6
232
234
  regexp_parser (2.11.3) sha256=ca13f381a173b7a93450e53459075c9b76a10433caadcb2f1180f2c741fc55a4
233
235
  reline (0.6.3) sha256=1198b04973565b36ec0f11542ab3f5cfeeec34823f4e54cebde90968092b1835
234
236
  rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
@@ -246,6 +248,7 @@ CHECKSUMS
246
248
  steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
247
249
  strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
248
250
  terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
251
+ tsort (0.2.0) sha256=9650a793f6859a43b6641671278f79cfead60ac714148aabe4e3f0060480089f
249
252
  tzinfo (2.0.6) sha256=8daf828cc77bcf7d63b0e3bdb6caa47e2272dcfaf4fbfe46f8c3a9df087a829b
250
253
  unicode-display_width (3.2.0) sha256=0cdd96b5681a5949cdbc2c55e7b420facae74c4aaf9a9815eee1087cb1853c42
251
254
  unicode-emoji (4.2.0) sha256=519e69150f75652e40bf736106cfbc8f0f73aa3fb6a65afe62fefa7f80b0f80f
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.6" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.7" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -716,9 +716,9 @@ dependencies = [
716
716
 
717
717
  [[package]]
718
718
  name = "cc"
719
- version = "1.2.54"
719
+ version = "1.2.55"
720
720
  source = "registry+https://github.com/rust-lang/crates.io-index"
721
- checksum = "6354c81bbfd62d9cfa9cb3c773c2b7b2a3a482d569de977fd0e961f6e7c00583"
721
+ checksum = "47b26a0954ae34af09b50f0de26458fa95369a0d478d8236d3f93082b219bd29"
722
722
  dependencies = [
723
723
  "find-msvc-tools",
724
724
  "jobserver",
@@ -1631,9 +1631,9 @@ dependencies = [
1631
1631
 
1632
1632
  [[package]]
1633
1633
  name = "find-msvc-tools"
1634
- version = "0.1.8"
1634
+ version = "0.1.9"
1635
1635
  source = "registry+https://github.com/rust-lang/crates.io-index"
1636
- checksum = "8591b0bcc8a98a64310a2fae1bb3e9b8564dd10e381e6e28010fde8e8e8568db"
1636
+ checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582"
1637
1637
 
1638
1638
  [[package]]
1639
1639
  name = "flate2"
@@ -2027,9 +2027,9 @@ dependencies = [
2027
2027
 
2028
2028
  [[package]]
2029
2029
  name = "html-to-markdown-rs"
2030
- version = "2.23.4"
2030
+ version = "2.24.3"
2031
2031
  source = "registry+https://github.com/rust-lang/crates.io-index"
2032
- checksum = "ffbf49450676163bdf69fac2acf72674fcf2d2aaf690aa247368c567cc9afb2a"
2032
+ checksum = "51e190e3bcf14728f08547b10ba3afdae24c84299045e0831bb9ee1e199d54ad"
2033
2033
  dependencies = [
2034
2034
  "astral-tl",
2035
2035
  "base64 0.22.1",
@@ -2627,7 +2627,7 @@ dependencies = [
2627
2627
 
2628
2628
  [[package]]
2629
2629
  name = "kreuzberg"
2630
- version = "4.2.0"
2630
+ version = "4.2.6"
2631
2631
  dependencies = [
2632
2632
  "ahash",
2633
2633
  "async-trait",
@@ -2636,6 +2636,7 @@ dependencies = [
2636
2636
  "base64-simd",
2637
2637
  "biblatex",
2638
2638
  "bitvec",
2639
+ "bytes",
2639
2640
  "calamine",
2640
2641
  "chardetng",
2641
2642
  "dashmap",
@@ -2703,6 +2704,7 @@ dependencies = [
2703
2704
  "typst-syntax",
2704
2705
  "unicode-normalization",
2705
2706
  "ureq 3.1.4",
2707
+ "utoipa",
2706
2708
  "uuid",
2707
2709
  "whatlang",
2708
2710
  "yake-rust",
@@ -2711,8 +2713,9 @@ dependencies = [
2711
2713
 
2712
2714
  [[package]]
2713
2715
  name = "kreuzberg-ffi"
2714
- version = "4.2.0"
2716
+ version = "4.2.6"
2715
2717
  dependencies = [
2718
+ "ahash",
2716
2719
  "async-trait",
2717
2720
  "cbindgen",
2718
2721
  "html-to-markdown-rs",
@@ -2768,7 +2771,7 @@ dependencies = [
2768
2771
 
2769
2772
  [[package]]
2770
2773
  name = "kreuzberg-tesseract"
2771
- version = "4.2.0"
2774
+ version = "4.2.6"
2772
2775
  dependencies = [
2773
2776
  "cc",
2774
2777
  "cmake",
@@ -6704,6 +6707,30 @@ version = "0.2.2"
6704
6707
  source = "registry+https://github.com/rust-lang/crates.io-index"
6705
6708
  checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
6706
6709
 
6710
+ [[package]]
6711
+ name = "utoipa"
6712
+ version = "5.4.0"
6713
+ source = "registry+https://github.com/rust-lang/crates.io-index"
6714
+ checksum = "2fcc29c80c21c31608227e0912b2d7fddba57ad76b606890627ba8ee7964e993"
6715
+ dependencies = [
6716
+ "indexmap",
6717
+ "serde",
6718
+ "serde_json",
6719
+ "utoipa-gen",
6720
+ ]
6721
+
6722
+ [[package]]
6723
+ name = "utoipa-gen"
6724
+ version = "5.4.0"
6725
+ source = "registry+https://github.com/rust-lang/crates.io-index"
6726
+ checksum = "6d79d08d92ab8af4c5e8a6da20c47ae3f61a0f1dabc1997cdf2d082b757ca08b"
6727
+ dependencies = [
6728
+ "proc-macro2",
6729
+ "quote",
6730
+ "regex",
6731
+ "syn",
6732
+ ]
6733
+
6707
6734
  [[package]]
6708
6735
  name = "uuid"
6709
6736
  version = "1.20.0"
@@ -1,5 +1,37 @@
1
1
  [workspace]
2
2
 
3
+ [workspace.dependencies]
4
+ bytes = { version = "1", features = ["serde"] }
5
+ serde = { version = "1.0.228", features = ["derive"] }
6
+ serde_json = { version = "1.0.149" }
7
+ tokio = { version = "1.49.0", features = [
8
+ "rt",
9
+ "rt-multi-thread",
10
+ "macros",
11
+ "sync",
12
+ "process",
13
+ "fs",
14
+ "time",
15
+ "io-util",
16
+ ] }
17
+ thiserror = "2.0.18"
18
+ anyhow = "1.0"
19
+ libc = "0.2.180"
20
+ async-trait = "0.1.89"
21
+ tracing = "0.1"
22
+ ahash = "0.8.12"
23
+ base64 = "0.22.1"
24
+ hex = "0.4.3"
25
+ num_cpus = "1.17.0"
26
+ once_cell = "1.21.3"
27
+ parking_lot = "0.12.5"
28
+ html-to-markdown-rs = { version = "2.24.3", default-features = false }
29
+ reqwest = { version = "0.13.1", default-features = false }
30
+ image = { version = "0.25.9", default-features = false }
31
+ toml = "0.9.11"
32
+ tempfile = "3.24.0"
33
+ lzma-rust2 = { version = "0.15.7" }
34
+
3
35
  [workspace.lints.clippy]
4
36
  collapsible_if = "allow"
5
37
 
@@ -93,8 +93,10 @@ pub fn parse_chunking_config(ruby: &Ruby, hash: RHash) -> Result<ChunkingConfig,
93
93
  };
94
94
 
95
95
  let config = ChunkingConfig {
96
- max_chars,
97
- max_overlap,
96
+ max_characters: max_chars,
97
+ overlap: max_overlap,
98
+ trim: true,
99
+ chunker_type: kreuzberg::ChunkerType::Text,
98
100
  embedding,
99
101
  preset,
100
102
  };
@@ -90,7 +90,7 @@ impl OcrBackend for RubyOcrBackend {
90
90
 
91
91
  Ok(ExtractionResult {
92
92
  content,
93
- mime_type: "text/plain".to_string(),
93
+ mime_type: std::borrow::Cow::Borrowed("text/plain"),
94
94
  metadata: Metadata::default(),
95
95
  tables: vec![],
96
96
  detected_languages: None,
@@ -95,7 +95,7 @@ pub fn register_post_processor(args: &[Value]) -> Result<(), Error> {
95
95
  message: format!("Failed to convert mime_type: {}", e),
96
96
  plugin_name: processor_name.clone(),
97
97
  })?;
98
- updated_result.mime_type = new_mime;
98
+ updated_result.mime_type = std::borrow::Cow::Owned(new_mime);
99
99
  }
100
100
 
101
101
  Ok::<kreuzberg::ExtractionResult, kreuzberg::KreuzbergError>(updated_result)
@@ -27,7 +27,7 @@ pub fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> R
27
27
  let content_value = ruby.str_new(result.content.as_str()).into_value_with(ruby);
28
28
  set_hash_entry(ruby, &hash, "content", content_value)?;
29
29
 
30
- let mime_value = ruby.str_new(result.mime_type.as_str()).into_value_with(ruby);
30
+ let mime_value = ruby.str_new(result.mime_type.as_ref()).into_value_with(ruby);
31
31
  set_hash_entry(ruby, &hash, "mime_type", mime_value)?;
32
32
 
33
33
  // Set metadata both as JSON string and parsed hash
@@ -117,7 +117,8 @@ pub fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> R
117
117
  let image_hash = ruby.hash_new();
118
118
  let data_value = ruby.str_from_slice(&image.data).into_value_with(ruby);
119
119
  image_hash.aset("data", data_value)?;
120
- image_hash.aset("format", image.format)?;
120
+ let format_value = ruby.str_new(image.format.as_ref()).into_value_with(ruby);
121
+ image_hash.aset("format", format_value)?;
121
122
  image_hash.aset("image_index", image.image_index as i64)?;
122
123
  if let Some(page) = image.page_number {
123
124
  image_hash.aset("page_number", page as i64)?;
@@ -200,7 +201,8 @@ pub fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> R
200
201
  let image_hash = ruby.hash_new();
201
202
  let data_value = ruby.str_from_slice(&image.data).into_value_with(ruby);
202
203
  image_hash.aset("data", data_value)?;
203
- image_hash.aset("format", image.format.clone())?;
204
+ let format_value = ruby.str_new(image.format.as_ref()).into_value_with(ruby);
205
+ image_hash.aset("format", format_value)?;
204
206
  image_hash.aset("image_index", image.image_index as i64)?;
205
207
  if let Some(page) = image.page_number {
206
208
  image_hash.aset("page_number", page as i64)?;
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.2.6'
4
+ VERSION = '4.2.7'
5
5
  end