kreuzberg 4.0.0.pre.rc.7 → 4.0.0.pre.rc.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +13 -12
  3. data/README.md +22 -0
  4. data/ext/kreuzberg_rb/native/.cargo/config.toml +1 -1
  5. data/ext/kreuzberg_rb/native/Cargo.lock +397 -183
  6. data/ext/kreuzberg_rb/native/Cargo.toml +3 -3
  7. data/ext/kreuzberg_rb/native/src/lib.rs +36 -13
  8. data/kreuzberg.gemspec +34 -2
  9. data/lib/kreuzberg/cache_api.rb +35 -0
  10. data/lib/kreuzberg/error_context.rb +49 -1
  11. data/lib/kreuzberg/extraction_api.rb +255 -0
  12. data/lib/kreuzberg/version.rb +1 -1
  13. data/lib/kreuzberg.rb +6 -0
  14. data/lib/libpdfium.dylib +0 -0
  15. data/sig/kreuzberg.rbs +9 -0
  16. data/vendor/Cargo.toml +44 -0
  17. data/vendor/kreuzberg/Cargo.toml +65 -35
  18. data/vendor/kreuzberg/README.md +50 -0
  19. data/vendor/kreuzberg/build.rs +548 -190
  20. data/vendor/kreuzberg/src/api/mod.rs +0 -2
  21. data/vendor/kreuzberg/src/core/pipeline.rs +13 -0
  22. data/vendor/kreuzberg/src/embeddings.rs +71 -3
  23. data/vendor/kreuzberg/src/error.rs +1 -1
  24. data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
  25. data/vendor/kreuzberg/src/extraction/html.rs +37 -5
  26. data/vendor/kreuzberg/src/extractors/pdf.rs +99 -47
  27. data/vendor/kreuzberg/src/mcp/mod.rs +3 -2
  28. data/vendor/kreuzberg/src/mcp/server.rs +106 -0
  29. data/vendor/kreuzberg/src/pdf/bindings.rs +44 -0
  30. data/vendor/kreuzberg/src/pdf/bundled.rs +346 -0
  31. data/vendor/kreuzberg/src/pdf/metadata.rs +2 -2
  32. data/vendor/kreuzberg/src/pdf/mod.rs +6 -0
  33. data/vendor/kreuzberg/src/pdf/rendering.rs +2 -2
  34. data/vendor/kreuzberg/src/pdf/table.rs +3 -0
  35. data/vendor/kreuzberg/src/pdf/text.rs +2 -2
  36. data/vendor/kreuzberg/src/text/quality_processor.rs +1 -1
  37. data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -1
  38. data/vendor/kreuzberg/tests/format_integration.rs +4 -1
  39. data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -0
  40. data/vendor/kreuzberg-ffi/Cargo.toml +63 -0
  41. data/vendor/kreuzberg-ffi/README.md +851 -0
  42. data/vendor/kreuzberg-ffi/build.rs +176 -0
  43. data/vendor/kreuzberg-ffi/cbindgen.toml +27 -0
  44. data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +12 -0
  45. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
  46. data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -0
  47. data/vendor/kreuzberg-ffi/src/lib.rs +3616 -0
  48. data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -0
  49. data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
  50. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
  51. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
  52. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
  53. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
  54. data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
  55. data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
  56. data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
  57. data/vendor/kreuzberg-tesseract/Cargo.toml +48 -0
  58. data/vendor/kreuzberg-tesseract/LICENSE +22 -0
  59. data/vendor/kreuzberg-tesseract/README.md +399 -0
  60. data/vendor/kreuzberg-tesseract/build.rs +1354 -0
  61. data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
  62. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
  63. data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
  64. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
  65. data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
  66. data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
  67. data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
  68. data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
  69. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
  70. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
  71. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
  72. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
  73. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
  74. data/vendor/rb-sys/src/lib.rs +1 -0
  75. metadata +41 -3
  76. data/vendor/rb-sys/bin/release.sh +0 -22
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 86ef74ea08ce1971136cf72aac3e1082a25bd60f10675f9d9e8cab7659076189
4
- data.tar.gz: 7635098e6aa3d1aeaa238e88f993aece6b3adf46497f6423fcc92fa0c75e478f
3
+ metadata.gz: ea76ed9d63fda80f47f1054c421989e77269b2d3bc8810ab25cd7e59b062ec7d
4
+ data.tar.gz: de96aea5d18ed67f34117fca308c5cc30b9e719e60c2a8fb0384f50d1fbd704f
5
5
  SHA512:
6
- metadata.gz: e29f2972b1384283dee4a11de61e3de8330a4a523ae754dea6995729a0cc03485efc22dc966846328f998d3fa0e91b973fa8e072169c177dcae432f3b1655118
7
- data.tar.gz: 3c6146bbca9b009ddb2b1dd107ef767ffd921747555a973390eebd36c4074e4b520380b0a4d4925ed8e2e3f6c4aef351505a1d4b4d0aa0ceabafef28e8c517b7
6
+ metadata.gz: 96fec6456241cf9a03ab60ea19ccf1ca8beda412ade51e298bc13480374a5ff7565fda57b2fb8c73edebc54ab3641d4832d740fc68a0e4bba2d610e9b340f682
7
+ data.tar.gz: 254f57261deda88616238abeb5faffd7ebdc767fb01cdc712193edbabd6a5329aeadc55dc0d93c1e5eda491924662b947a737e40eb49e3a9d9b764239e4c1221
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.0.0.pre.rc.7)
4
+ kreuzberg (4.0.0.pre.rc.11)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -21,16 +21,17 @@ GEM
21
21
  uri (>= 0.13.1)
22
22
  ast (2.4.3)
23
23
  base64 (0.3.0)
24
- bigdecimal (3.3.1)
24
+ bigdecimal (4.0.1)
25
25
  byebug (12.0.0)
26
26
  coderay (1.1.3)
27
- concurrent-ruby (1.3.5)
27
+ concurrent-ruby (1.3.6)
28
28
  connection_pool (3.0.2)
29
29
  csv (3.3.5)
30
30
  diff-lcs (1.6.2)
31
31
  drb (2.2.3)
32
32
  ffi (1.17.2)
33
33
  ffi (1.17.2-arm64-darwin)
34
+ ffi (1.17.2-x86_64-linux-gnu)
34
35
  fileutils (1.8.0)
35
36
  i18n (1.14.7)
36
37
  concurrent-ruby (~> 1.0)
@@ -42,13 +43,14 @@ GEM
42
43
  rb-inotify (~> 0.9, >= 0.9.10)
43
44
  logger (1.7.0)
44
45
  method_source (1.1.0)
45
- minitest (5.27.0)
46
+ minitest (6.0.0)
47
+ prism (~> 1.5)
46
48
  mutex_m (0.3.0)
47
49
  parallel (1.27.0)
48
50
  parser (3.3.10.0)
49
51
  ast (~> 2.4.1)
50
52
  racc
51
- prism (1.6.0)
53
+ prism (1.7.0)
52
54
  pry (0.15.2)
53
55
  coderay (~> 1.1)
54
56
  method_source (~> 1.0)
@@ -58,13 +60,13 @@ GEM
58
60
  racc (1.8.1)
59
61
  rainbow (3.1.1)
60
62
  rake (13.3.1)
61
- rake-compiler (1.3.0)
63
+ rake-compiler (1.3.1)
62
64
  rake
63
65
  rake-compiler-dock (1.10.0)
64
66
  rb-fsevent (0.11.2)
65
67
  rb-inotify (0.11.1)
66
68
  ffi (~> 1.0)
67
- rb_sys (0.9.119)
69
+ rb_sys (0.9.123)
68
70
  rake-compiler-dock (= 1.10.0)
69
71
  rbs (3.9.5)
70
72
  logger
@@ -82,7 +84,7 @@ GEM
82
84
  diff-lcs (>= 1.2.0, < 2.0)
83
85
  rspec-support (~> 3.13.0)
84
86
  rspec-support (3.13.6)
85
- rubocop (1.81.7)
87
+ rubocop (1.82.0)
86
88
  json (~> 2.3)
87
89
  language_server-protocol (~> 3.17.0.2)
88
90
  lint_roller (~> 1.1.0)
@@ -90,7 +92,7 @@ GEM
90
92
  parser (>= 3.3.0.2)
91
93
  rainbow (>= 2.2.2, < 4.0)
92
94
  regexp_parser (>= 2.9.3, < 3.0)
93
- rubocop-ast (>= 1.47.1, < 2.0)
95
+ rubocop-ast (>= 1.48.0, < 2.0)
94
96
  ruby-progressbar (~> 1.7)
95
97
  unicode-display_width (>= 2.4.0, < 4.0)
96
98
  rubocop-ast (1.48.0)
@@ -122,21 +124,20 @@ GEM
122
124
  strscan (>= 1.0.0)
123
125
  terminal-table (>= 2, < 5)
124
126
  uri (>= 0.12.0)
125
- strscan (3.1.5)
127
+ strscan (3.1.6)
126
128
  terminal-table (4.0.0)
127
129
  unicode-display_width (>= 1.1.1, < 4)
128
130
  tzinfo (2.0.6)
129
131
  concurrent-ruby (~> 1.0)
130
132
  unicode-display_width (3.2.0)
131
133
  unicode-emoji (~> 4.1)
132
- unicode-emoji (4.1.0)
134
+ unicode-emoji (4.2.0)
133
135
  uri (1.1.1)
134
136
  yard (0.9.38)
135
137
 
136
138
  PLATFORMS
137
139
  arm64-darwin-23
138
140
  arm64-darwin-24
139
- x64-mingw-ucrt
140
141
  x86_64-linux
141
142
 
142
143
  DEPENDENCIES
data/README.md CHANGED
@@ -38,6 +38,11 @@ Extract text, tables, images, and metadata from 56 file formats including PDF, D
38
38
 
39
39
  ### Optional System Dependencies
40
40
 
41
+ - **ONNX Runtime**: For embeddings functionality
42
+ - macOS: `brew install onnxruntime`
43
+ - Ubuntu: `sudo apt-get install libonnxruntime libonnxruntime-dev`
44
+ - Windows: `scoop install onnxruntime` or download from [GitHub](https://github.com/microsoft/onnxruntime/releases)
45
+
41
46
  - **Tesseract**: For OCR functionality
42
47
  - macOS: `brew install tesseract`
43
48
  - Ubuntu: `sudo apt-get install tesseract-ocr`
@@ -417,6 +422,23 @@ bundle exec rubocop
417
422
 
418
423
  **Note**: The Ruby bindings use a vendored copy of the core `kreuzberg` Rust crate. For local development, create a symlink at `vendor/kreuzberg` pointing to `../../crates/kreuzberg`. In CI and gem packaging, the actual vendored files are copied to this location.
419
424
 
425
+ ## PDFium Integration
426
+
427
+ PDF extraction is powered by PDFium, which is automatically bundled with this package. No system installation required.
428
+
429
+ ### Platform Support
430
+
431
+ | Platform | Status | Notes |
432
+ |----------|--------|-------|
433
+ | Linux x86_64 | ✅ | Bundled |
434
+ | macOS ARM64 | ✅ | Bundled |
435
+ | macOS x86_64 | ✅ | Bundled |
436
+ | Windows x86_64 | ✅ | Bundled |
437
+
438
+ ### Binary Size Impact
439
+
440
+ PDFium adds approximately 8-15 MB to the package size depending on platform. This ensures consistent PDF extraction across all environments without external dependencies.
441
+
420
442
  ## License
421
443
 
422
444
  MIT License. See [LICENSE](../../LICENSE) for details.
data/ext/kreuzberg_rb/native/.cargo/config.toml CHANGED
@@ -1,2 +1,2 @@
1
1
  [build]
2
- rustflags = ["-A", "unpredictable-function-pointer-comparisons"]
2
+ rustflags = ["-A", "unpredictable-function-pointer-comparisons", "-A", "fn_ptr_eq"]