kreuzberg 4.0.0.pre.rc.8 → 4.0.0.pre.rc.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +12 -9
  3. data/README.md +22 -0
  4. data/ext/kreuzberg_rb/native/Cargo.lock +397 -177
  5. data/ext/kreuzberg_rb/native/Cargo.toml +3 -3
  6. data/ext/kreuzberg_rb/native/src/lib.rs +36 -13
  7. data/kreuzberg.gemspec +34 -2
  8. data/lib/kreuzberg/cache_api.rb +35 -0
  9. data/lib/kreuzberg/error_context.rb +49 -1
  10. data/lib/kreuzberg/extraction_api.rb +255 -0
  11. data/lib/kreuzberg/version.rb +1 -1
  12. data/lib/kreuzberg.rb +6 -0
  13. data/lib/libpdfium.dylib +0 -0
  14. data/sig/kreuzberg.rbs +9 -0
  15. data/vendor/Cargo.toml +44 -0
  16. data/vendor/kreuzberg/Cargo.toml +61 -38
  17. data/vendor/kreuzberg/README.md +36 -27
  18. data/vendor/kreuzberg/build.rs +197 -245
  19. data/vendor/kreuzberg/src/core/pipeline.rs +13 -0
  20. data/vendor/kreuzberg/src/embeddings.rs +71 -3
  21. data/vendor/kreuzberg/src/error.rs +1 -1
  22. data/vendor/kreuzberg/src/extraction/html.rs +37 -5
  23. data/vendor/kreuzberg/src/extractors/pdf.rs +93 -44
  24. data/vendor/kreuzberg/src/pdf/bindings.rs +44 -0
  25. data/vendor/kreuzberg/src/pdf/bundled.rs +19 -1
  26. data/vendor/kreuzberg/src/pdf/metadata.rs +2 -2
  27. data/vendor/kreuzberg/src/pdf/mod.rs +2 -0
  28. data/vendor/kreuzberg/src/pdf/rendering.rs +2 -2
  29. data/vendor/kreuzberg/src/pdf/table.rs +3 -0
  30. data/vendor/kreuzberg/src/pdf/text.rs +2 -2
  31. data/vendor/kreuzberg/src/text/quality_processor.rs +1 -1
  32. data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -1
  33. data/vendor/kreuzberg/tests/format_integration.rs +4 -1
  34. data/vendor/kreuzberg-ffi/Cargo.toml +63 -0
  35. data/vendor/kreuzberg-ffi/README.md +851 -0
  36. data/vendor/kreuzberg-ffi/build.rs +176 -0
  37. data/vendor/kreuzberg-ffi/cbindgen.toml +27 -0
  38. data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +12 -0
  39. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
  40. data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -0
  41. data/vendor/kreuzberg-ffi/src/lib.rs +3616 -0
  42. data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -0
  43. data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
  44. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
  45. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
  46. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
  47. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
  48. data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
  49. data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
  50. data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
  51. data/vendor/kreuzberg-tesseract/Cargo.toml +48 -0
  52. data/vendor/kreuzberg-tesseract/LICENSE +22 -0
  53. data/vendor/kreuzberg-tesseract/README.md +399 -0
  54. data/vendor/kreuzberg-tesseract/build.rs +1354 -0
  55. data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
  56. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
  57. data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
  58. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
  59. data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
  60. data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
  61. data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
  62. data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
  63. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
  64. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
  65. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
  66. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
  67. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
  68. metadata +39 -3
  69. data/vendor/rb-sys/bin/release.sh +0 -21
data/sig/kreuzberg.rbs CHANGED
@@ -226,6 +226,15 @@ module Kreuzberg
226
226
  # Alias for Config::Extraction (for API consistency with other language bindings)
227
227
  ExtractionConfig: singleton(Config::Extraction)
228
228
 
229
+ # Alias for Config::PageConfig (for API consistency with other language bindings)
230
+ PageConfig: singleton(Config::PageConfig)
231
+
232
+ # Keyword algorithm constants
233
+ module KeywordAlgorithm
234
+ YAKE: Symbol
235
+ RAKE: Symbol
236
+ end
237
+
229
238
  # Extraction result type
230
239
  type extraction_result_hash = {
231
240
  content: String,
data/vendor/Cargo.toml ADDED
@@ -0,0 +1,44 @@
1
+ [workspace]
2
+ members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract"]
3
+
4
+ [workspace.package]
5
+ version = "4.0.0-rc.11"
6
+ edition = "2024"
7
+ rust-version = "1.91"
8
+ authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
9
+ license = "MIT"
10
+ repository = "https://github.com/kreuzberg-dev/kreuzberg"
11
+ homepage = "https://kreuzberg.dev"
12
+
13
+ [workspace.dependencies]
14
+ # Core async runtime
15
+ tokio = { version = "1.48.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
16
+
17
+ # Serialization
18
+ serde = { version = "1.0.228", features = ["derive"] }
19
+ serde_json = { version = "1.0.145" }
20
+
21
+ # Error handling
22
+ thiserror = "2.0.17"
23
+ anyhow = "1.0"
24
+
25
+ # Async utilities
26
+ async-trait = "0.1.89"
27
+ libc = "0.2.178"
28
+
29
+ # Tracing/observability
30
+ tracing = "0.1"
31
+
32
+ # Utilities
33
+ ahash = "0.8.12"
34
+ base64 = "0.22.1"
35
+ hex = "0.4.3"
36
+ num_cpus = "1.17.0"
37
+ once_cell = "1.21.3"
38
+ html-to-markdown-rs = { version = "2.14.11", default-features = false }
39
+ reqwest = { version = "0.12.25", default-features = false }
40
+ image = { version = "0.25.9", default-features = false }
41
+
42
+ # Testing (dev)
43
+ tempfile = "3.23.0"
44
+ criterion = { version = "0.8", features = ["html_reports"] }
@@ -1,11 +1,11 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version.workspace = true
4
- edition.workspace = true
5
- rust-version.workspace = true
6
- authors.workspace = true
3
+ version = "4.0.0-rc.11"
4
+ edition = "2024"
5
+ rust-version = "1.91"
6
+ authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
7
7
  description = "High-performance document intelligence library for Rust. Extract text, metadata, and structured data from PDFs, Office documents, images, and 50+ formats with async/sync APIs."
8
- license.workspace = true
8
+ license = "MIT"
9
9
  repository = "https://github.com/kreuzberg-dev/kreuzberg"
10
10
  homepage = "https://kreuzberg.dev"
11
11
  documentation = "https://docs.rs/kreuzberg"
@@ -26,9 +26,13 @@ tokio-runtime = ["dep:tokio"]
26
26
 
27
27
  # Format extractors
28
28
  pdf = ["dep:pdfium-render", "dep:lopdf", "dep:image"]
29
- pdf-static = ["pdf"] # Download + static link (no runtime dependency)
30
- pdf-bundled = ["pdf"] # Embed library in binary (self-contained)
31
- pdf-system = ["pdf"] # Use system-installed pdfium via pkg-config
29
+ static-pdfium = ["pdf"] # Static link (no runtime dep) - requires PDFIUM_STATIC_LIB_PATH env var
30
+ bundled-pdfium = ["pdf"] # Embed library in binary (self-contained, dynamic link)
31
+ system-pdfium = ["pdf"] # Use system-installed pdfium via pkg-config
32
+ # Legacy names for backward compatibility
33
+ pdf-static = ["static-pdfium"]
34
+ pdf-bundled = ["bundled-pdfium"]
35
+ pdf-system = ["system-pdfium"]
32
36
  excel = ["dep:calamine", "dep:polars", "tokio-runtime"]
33
37
  office = [
34
38
  "dep:roxmltree",
@@ -62,7 +66,12 @@ ocr = [
62
66
  ]
63
67
  language-detection = ["dep:whatlang"]
64
68
  chunking = ["dep:text-splitter"]
65
- embeddings = ["dep:fastembed", "dep:reqwest", "chunking", "tokio-runtime"]
69
+ embeddings = [
70
+ "dep:fastembed",
71
+ "dep:reqwest",
72
+ "chunking",
73
+ "tokio-runtime",
74
+ ] # Requires system ONNX Runtime
66
75
  stopwords = [] # Stopwords for keyword extraction and token reduction
67
76
  quality = ["dep:unicode-normalization", "dep:chardetng", "dep:encoding_rs", "stopwords"]
68
77
 
@@ -80,9 +89,10 @@ mcp-http = ["mcp", "api"] # NEW - enabl
80
89
  otel = ["dep:opentelemetry", "dep:opentelemetry_sdk", "dep:tracing-opentelemetry"]
81
90
 
82
91
  # WASM-compatible feature bundle
83
- wasm-target = ["html", "xml", "email", "language-detection", "chunking", "quality"]
92
+ wasm-target = ["pdf", "html", "xml", "email", "language-detection", "chunking", "quality"]
93
+ wasm-threads = ["dep:wasm-bindgen-rayon"]
84
94
 
85
- # Convenience bundles
95
+ # Convenience bundles (all use static PDF linking except wasm)
86
96
  full = [
87
97
  "pdf",
88
98
  "excel",
@@ -94,43 +104,56 @@ full = [
94
104
  "ocr",
95
105
  "language-detection",
96
106
  "chunking",
107
+ "embeddings",
97
108
  "quality",
98
109
  "keywords",
110
+ "api",
111
+ "mcp",
112
+ "otel",
113
+ ]
114
+ server = ["static-pdfium", "excel", "html", "ocr", "api", "mcp"]
115
+ cli = [
116
+ "static-pdfium",
117
+ "excel",
118
+ "office",
119
+ "html",
120
+ "ocr",
121
+ "language-detection",
122
+ "chunking",
123
+ "quality",
99
124
  ]
100
- server = ["pdf", "excel", "html", "ocr", "api", "mcp"]
101
- cli = ["pdf", "excel", "office", "html", "ocr", "language-detection", "chunking", "quality"]
102
125
 
103
126
  [build-dependencies]
104
- tracing = { workspace = true }
127
+ tracing = "0.1"
105
128
  pkg-config = "0.3" # For system pdfium detection
106
129
 
107
130
  [dependencies]
108
131
  # Core dependencies (always included)
109
- ahash = { workspace = true }
110
- async-trait = { workspace = true }
111
- base64 = { workspace = true }
132
+ ahash = "0.8.12"
133
+ async-trait = "0.1.89"
134
+ base64 = "0.22.1"
112
135
  base64-simd = "0.8"
113
- hex = { workspace = true }
136
+ hex = "0.4.3"
114
137
  lazy_static = "1.5.0"
115
- libc = { workspace = true }
138
+ libc = "0.2.178"
116
139
  memchr = "2.7.6"
117
- num_cpus = { workspace = true }
140
+ num_cpus = "1.17.0"
118
141
  once_cell = "1.21.3"
119
- paste = "1.0"
142
+ pastey = "0.2"
120
143
  rayon = "1.11.0"
121
144
  regex = "1.12.2"
122
- serde = { workspace = true }
123
- serde_json = { workspace = true }
145
+ serde = { version = "1.0.228", features = ["derive"] }
146
+ serde_json = "1.0.145"
124
147
  serde_yaml_ng = "0.10.0"
125
- toml = "0.9.8"
148
+ toml = { workspace = true }
126
149
  mime_guess = "2.0"
127
150
  rmp-serde = "1.3"
128
- thiserror = { workspace = true }
151
+ thiserror = "2.0.17"
129
152
  tokio = { workspace = true, optional = true }
130
153
  uuid = { version = "1.19.0", features = ["v4", "js"] }
131
154
  indexmap = "2.12.1"
132
- tracing = { workspace = true }
133
- reqwest = { workspace = true, default-features = false, features = [
155
+ tracing = "0.1"
156
+ reqwest = { version = "0.12.25", default-features = false, features = [
134
157
  "json",
135
158
  "rustls-tls",
136
159
  ], optional = true }
@@ -143,7 +166,7 @@ roxmltree = { version = "0.21.1", optional = true }
143
166
  zip = { version = "6.0.0", optional = true }
144
167
  mail-parser = { version = "0.11.1", optional = true }
145
168
  msg_parser = { version = "0.1.1", optional = true }
146
- html-to-markdown-rs = { version = "2.14.2", features = ["inline-images"], optional = true }
169
+ html-to-markdown-rs = { version = "2.14.11", default-features = false, features = ["inline-images"], optional = true }
147
170
  quick-xml = { version = "0.38.4", features = ["serialize"], optional = true }
148
171
  tar = { version = "0.4.44", optional = true }
149
172
  sevenz-rust = { version = "0.6.1", optional = true }
@@ -158,8 +181,8 @@ fb2 = { version = "0.4", optional = true }
158
181
  typst-syntax = { version = "0.14", optional = true }
159
182
 
160
183
  # Processing features (optional)
161
- kreuzberg-tesseract = { version = "4.0.0-rc.7", optional = true }
162
- image = { workspace = true, default-features = false, features = [
184
+ kreuzberg-tesseract = { path = "../kreuzberg-tesseract", optional = true }
185
+ image = { version = "0.25.9", default-features = false, features = [
163
186
  "png",
164
187
  "jpeg",
165
188
  "webp",
@@ -174,9 +197,9 @@ ndarray = { version = "0.17.1", optional = true }
174
197
  kamadak-exif = { version = "0.6.1", optional = true }
175
198
  whatlang = { version = "0.18.0", optional = true }
176
199
  text-splitter = { version = "0.28.0", features = ["markdown"], optional = true }
177
- fastembed = { version = "5.4", default-features = false, features = [
200
+ fastembed = { version = "5.5", default-features = false, features = [
178
201
  "hf-hub-rustls-tls",
179
- "ort-download-binaries",
202
+ "ort-load-dynamic", # Runtime detection, requires system ONNX Runtime
180
203
  ], optional = true }
181
204
  unicode-normalization = { version = "0.1.25", optional = true }
182
205
  chardetng = { version = "0.1.17", optional = true }
@@ -187,7 +210,7 @@ rake = { version = "0.3.6", optional = true }
187
210
  axum = { version = "0.8", features = ["macros", "json", "multipart"], optional = true }
188
211
  tower = { version = "0.5", optional = true }
189
212
  tower-http = { version = "0.6", features = ["cors", "trace", "limit"], optional = true }
190
- rmcp = { version = "0.11.0", features = [
213
+ rmcp = { version = "0.12.0", features = [
191
214
  "server",
192
215
  "macros",
193
216
  "base64",
@@ -202,16 +225,16 @@ tracing-opentelemetry = { version = "0.32", optional = true }
202
225
  infer = "0.19.0"
203
226
 
204
227
  [dev-dependencies]
205
- tempfile = { workspace = true }
228
+ tempfile = "3.23.0"
206
229
  filetime = "0.2"
207
230
  tar = "0.4.44"
208
231
  zip = "6.0.0"
209
232
  serial_test = "3.2.0"
210
- anyhow = { workspace = true }
233
+ anyhow = "1.0"
211
234
  tokio-test = "0.4"
212
235
  tracing-subscriber = { version = "0.3", features = ["env-filter"] }
213
- criterion = { workspace = true }
214
- image = { workspace = true, default-features = false, features = ["png"] }
236
+ criterion = { version = "0.8", features = ["html_reports"] }
237
+ image = { version = "0.25.9", default-features = false, features = ["png"] }
215
238
 
216
239
  [[bench]]
217
240
  name = "otel_overhead"
@@ -222,4 +245,4 @@ harness = false
222
245
  pprof = { version = "0.15.0", features = ["flamegraph"], optional = true }
223
246
 
224
247
  [target.'cfg(target_arch = "wasm32")'.dependencies]
225
- wasm-bindgen-rayon = "1.3"
248
+ wasm-bindgen-rayon = { version = "1.3", optional = true }
@@ -30,6 +30,26 @@ kreuzberg = "4.0"
30
30
  tokio = { version = "1", features = ["rt", "macros"] }
31
31
  ```
32
32
 
33
+ ## System Requirements
34
+
35
+ ### ONNX Runtime (for embeddings)
36
+
37
+ If using embeddings functionality, ONNX Runtime must be installed:
38
+
39
+ ```bash
40
+ # macOS
41
+ brew install onnxruntime
42
+
43
+ # Ubuntu/Debian
44
+ sudo apt install libonnxruntime libonnxruntime-dev
45
+
46
+ # Windows (MSVC)
47
+ scoop install onnxruntime
48
+ # OR download from https://github.com/microsoft/onnxruntime/releases
49
+ ```
50
+
51
+ Without ONNX Runtime, embedding operations fail with a `MissingDependencyError` whose message includes installation instructions.
52
+
33
53
  ## Quick Start
34
54
 
35
55
  ```rust
@@ -169,46 +189,35 @@ kreuzberg = { version = "4.0", features = ["server"] }
169
189
  kreuzberg = { version = "4.0", features = ["cli"] }
170
190
  ```
171
191
 
172
- ## PDFium Linking Options
192
+ ## PDF Support and Linking Options
173
193
 
174
- When using the `pdf` feature, you can choose how PDFium is linked to your binary. Four strategies are supported:
194
+ Kreuzberg supports three PDFium linking strategies. **The default is `bundled-pdfium`** (best developer experience).
175
195
 
176
- | Strategy | Feature | Use Case |
177
- |----------|---------|----------|
178
- | **Dynamic (default)** | `pdf` | Fast builds, runtime library dependency |
179
- | **Static** | `pdf`, `pdf-static` | Embed PDFium in binary, larger binary size |
180
- | **Bundled** | `pdf`, `pdf-bundled` | Self-contained per-binary copies |
181
- | **System** | `pdf`, `pdf-system` | Use system-installed PDFium |
196
+ | Strategy | Feature | Use Case | Binary Size | Runtime Deps |
197
+ |----------|---------|----------|-------------|--------------|
198
+ | **Bundled (default)** | `bundled-pdfium` | Development, production | +8-15MB | None |
199
+ | **Static** | `static-pdfium` | Docker, musl, standalone binaries | +200MB | None |
200
+ | **System** | `system-pdfium` | Package managers, distros | +2MB | libpdfium.so |
182
201
 
183
- ### Examples
202
+ ### Quick Start
184
203
 
185
- **Default (dynamic linking)** - Fastest compilation, requires libpdfium at runtime:
186
204
  ```toml
205
+ # Default - bundled PDFium (recommended)
187
206
  [dependencies]
188
- kreuzberg = { version = "4.0", features = ["pdf"] }
189
- ```
190
-
191
- **Static linking** - Larger binary, no runtime dependency:
192
- ```toml
193
- [dependencies]
194
- kreuzberg = { version = "4.0", features = ["pdf", "pdf-static"] }
195
- ```
207
+ kreuzberg = "4.0"
196
208
 
197
- **Bundled** - Each binary extracts its own copy:
198
- ```toml
209
+ # Static linking (Docker, musl)
199
210
  [dependencies]
200
- kreuzberg = { version = "4.0", features = ["pdf", "pdf-bundled"] }
201
- ```
211
+ kreuzberg = { version = "4.0", features = ["static-pdfium"] }
202
212
 
203
- **System-installed** - Use pkg-config or manual paths:
204
- ```toml
213
+ # System PDFium (package managers)
205
214
  [dependencies]
206
- kreuzberg = { version = "4.0", features = ["pdf", "pdf-system"] }
215
+ kreuzberg = { version = "4.0", features = ["system-pdfium"] }
207
216
  ```
208
217
 
209
- For comprehensive guidance on linking strategies, environment variables, and troubleshooting, see the [PDFium Linking Guide](../../docs/guides/pdfium-linking.md).
218
+ For detailed information, see the [PDFium Linking Guide](../../docs/guides/pdfium-linking.md).
210
219
 
211
- **Note:** Language bindings (Python, TypeScript, Ruby, Java, Go) bundle PDFium automatically and do not expose linking options.
220
+ **Note:** Language bindings (Python, TypeScript, Ruby, Java, Go) automatically bundle PDFium. No configuration needed.
212
221
 
213
222
  ## Documentation
214
223