kreuzberg 4.0.0.pre.rc.7 → 4.0.0.pre.rc.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +13 -12
  3. data/README.md +22 -0
  4. data/ext/kreuzberg_rb/native/.cargo/config.toml +1 -1
  5. data/ext/kreuzberg_rb/native/Cargo.lock +397 -183
  6. data/ext/kreuzberg_rb/native/Cargo.toml +3 -3
  7. data/ext/kreuzberg_rb/native/src/lib.rs +36 -13
  8. data/kreuzberg.gemspec +34 -2
  9. data/lib/kreuzberg/cache_api.rb +35 -0
  10. data/lib/kreuzberg/error_context.rb +49 -1
  11. data/lib/kreuzberg/extraction_api.rb +255 -0
  12. data/lib/kreuzberg/version.rb +1 -1
  13. data/lib/kreuzberg.rb +6 -0
  14. data/lib/libpdfium.dylib +0 -0
  15. data/sig/kreuzberg.rbs +9 -0
  16. data/vendor/Cargo.toml +44 -0
  17. data/vendor/kreuzberg/Cargo.toml +65 -35
  18. data/vendor/kreuzberg/README.md +50 -0
  19. data/vendor/kreuzberg/build.rs +548 -190
  20. data/vendor/kreuzberg/src/api/mod.rs +0 -2
  21. data/vendor/kreuzberg/src/core/pipeline.rs +13 -0
  22. data/vendor/kreuzberg/src/embeddings.rs +71 -3
  23. data/vendor/kreuzberg/src/error.rs +1 -1
  24. data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
  25. data/vendor/kreuzberg/src/extraction/html.rs +37 -5
  26. data/vendor/kreuzberg/src/extractors/pdf.rs +99 -47
  27. data/vendor/kreuzberg/src/mcp/mod.rs +3 -2
  28. data/vendor/kreuzberg/src/mcp/server.rs +106 -0
  29. data/vendor/kreuzberg/src/pdf/bindings.rs +44 -0
  30. data/vendor/kreuzberg/src/pdf/bundled.rs +346 -0
  31. data/vendor/kreuzberg/src/pdf/metadata.rs +2 -2
  32. data/vendor/kreuzberg/src/pdf/mod.rs +6 -0
  33. data/vendor/kreuzberg/src/pdf/rendering.rs +2 -2
  34. data/vendor/kreuzberg/src/pdf/table.rs +3 -0
  35. data/vendor/kreuzberg/src/pdf/text.rs +2 -2
  36. data/vendor/kreuzberg/src/text/quality_processor.rs +1 -1
  37. data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -1
  38. data/vendor/kreuzberg/tests/format_integration.rs +4 -1
  39. data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -0
  40. data/vendor/kreuzberg-ffi/Cargo.toml +63 -0
  41. data/vendor/kreuzberg-ffi/README.md +851 -0
  42. data/vendor/kreuzberg-ffi/build.rs +176 -0
  43. data/vendor/kreuzberg-ffi/cbindgen.toml +27 -0
  44. data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +12 -0
  45. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
  46. data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -0
  47. data/vendor/kreuzberg-ffi/src/lib.rs +3616 -0
  48. data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -0
  49. data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
  50. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
  51. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
  52. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
  53. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
  54. data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
  55. data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
  56. data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
  57. data/vendor/kreuzberg-tesseract/Cargo.toml +48 -0
  58. data/vendor/kreuzberg-tesseract/LICENSE +22 -0
  59. data/vendor/kreuzberg-tesseract/README.md +399 -0
  60. data/vendor/kreuzberg-tesseract/build.rs +1354 -0
  61. data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
  62. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
  63. data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
  64. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
  65. data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
  66. data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
  67. data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
  68. data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
  69. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
  70. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
  71. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
  72. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
  73. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
  74. data/vendor/rb-sys/src/lib.rs +1 -0
  75. metadata +41 -3
  76. data/vendor/rb-sys/bin/release.sh +0 -22
data/sig/kreuzberg.rbs CHANGED
@@ -226,6 +226,15 @@ module Kreuzberg
226
226
  # Alias for Config::Extraction (for API consistency with other language bindings)
227
227
  ExtractionConfig: singleton(Config::Extraction)
228
228
 
229
+ # Alias for Config::PageConfig (for API consistency with other language bindings)
230
+ PageConfig: singleton(Config::PageConfig)
231
+
232
+ # Keyword algorithm constants
233
+ module KeywordAlgorithm
234
+ YAKE: Symbol
235
+ RAKE: Symbol
236
+ end
237
+
229
238
  # Extraction result type
230
239
  type extraction_result_hash = {
231
240
  content: String,
data/vendor/Cargo.toml ADDED
@@ -0,0 +1,44 @@
1
+ [workspace]
2
+ members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract"]
3
+
4
+ [workspace.package]
5
+ version = "4.0.0-rc.11"
6
+ edition = "2024"
7
+ rust-version = "1.91"
8
+ authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
9
+ license = "MIT"
10
+ repository = "https://github.com/kreuzberg-dev/kreuzberg"
11
+ homepage = "https://kreuzberg.dev"
12
+
13
+ [workspace.dependencies]
14
+ # Core async runtime
15
+ tokio = { version = "1.48.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
16
+
17
+ # Serialization
18
+ serde = { version = "1.0.228", features = ["derive"] }
19
+ serde_json = { version = "1.0.145" }
20
+
21
+ # Error handling
22
+ thiserror = "2.0.17"
23
+ anyhow = "1.0"
24
+
25
+ # Async utilities
26
+ async-trait = "0.1.89"
27
+ libc = "0.2.178"
28
+
29
+ # Tracing/observability
30
+ tracing = "0.1"
31
+
32
+ # Utilities
33
+ ahash = "0.8.12"
34
+ base64 = "0.22.1"
35
+ hex = "0.4.3"
36
+ num_cpus = "1.17.0"
37
+ once_cell = "1.21.3"
38
+ html-to-markdown-rs = { version = "2.14.11", default-features = false }
39
+ reqwest = { version = "0.12.25", default-features = false }
40
+ image = { version = "0.25.9", default-features = false }
41
+
42
+ # Testing (dev)
43
+ tempfile = "3.23.0"
44
+ criterion = { version = "0.8", features = ["html_reports"] }
data/vendor/kreuzberg/Cargo.toml CHANGED
@@ -1,11 +1,11 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version.workspace = true
4
- edition.workspace = true
5
- rust-version.workspace = true
6
- authors.workspace = true
3
+ version = "4.0.0-rc.11"
4
+ edition = "2024"
5
+ rust-version = "1.91"
6
+ authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
7
7
  description = "High-performance document intelligence library for Rust. Extract text, metadata, and structured data from PDFs, Office documents, images, and 50+ formats with async/sync APIs."
8
- license.workspace = true
8
+ license = "MIT"
9
9
  repository = "https://github.com/kreuzberg-dev/kreuzberg"
10
10
  homepage = "https://kreuzberg.dev"
11
11
  documentation = "https://docs.rs/kreuzberg"
@@ -26,6 +26,13 @@ tokio-runtime = ["dep:tokio"]
26
26
 
27
27
  # Format extractors
28
28
  pdf = ["dep:pdfium-render", "dep:lopdf", "dep:image"]
29
+ static-pdfium = ["pdf"] # Static link (no runtime dep) - requires PDFIUM_STATIC_LIB_PATH env var
30
+ bundled-pdfium = ["pdf"] # Embed library in binary (self-contained, dynamic link)
31
+ system-pdfium = ["pdf"] # Use system-installed pdfium via pkg-config
32
+ # Legacy names for backward compatibility
33
+ pdf-static = ["static-pdfium"]
34
+ pdf-bundled = ["bundled-pdfium"]
35
+ pdf-system = ["system-pdfium"]
29
36
  excel = ["dep:calamine", "dep:polars", "tokio-runtime"]
30
37
  office = [
31
38
  "dep:roxmltree",
@@ -59,7 +66,12 @@ ocr = [
59
66
  ]
60
67
  language-detection = ["dep:whatlang"]
61
68
  chunking = ["dep:text-splitter"]
62
- embeddings = ["dep:fastembed", "dep:reqwest", "chunking", "tokio-runtime"]
69
+ embeddings = [
70
+ "dep:fastembed",
71
+ "dep:reqwest",
72
+ "chunking",
73
+ "tokio-runtime",
74
+ ] # Requires system ONNX Runtime
63
75
  stopwords = [] # Stopwords for keyword extraction and token reduction
64
76
  quality = ["dep:unicode-normalization", "dep:chardetng", "dep:encoding_rs", "stopwords"]
65
77
 
@@ -71,14 +83,16 @@ keywords = ["keywords-yake", "keywords-rake"]
71
83
  # Server features
72
84
  api = ["dep:axum", "dep:tower", "dep:tower-http", "tokio-runtime"]
73
85
  mcp = ["dep:rmcp", "tokio-runtime"]
86
+ mcp-http = ["mcp", "api"] # NEW - enables HTTP transport
74
87
 
75
88
  # Observability features
76
89
  otel = ["dep:opentelemetry", "dep:opentelemetry_sdk", "dep:tracing-opentelemetry"]
77
90
 
78
91
  # WASM-compatible feature bundle
79
- wasm-target = ["html", "xml", "email", "language-detection", "chunking", "quality"]
92
+ wasm-target = ["pdf", "html", "xml", "email", "language-detection", "chunking", "quality"]
93
+ wasm-threads = ["dep:wasm-bindgen-rayon"]
80
94
 
81
- # Convenience bundles
95
+ # Convenience bundles (server and cli use static PDF linking; full uses the plain pdf feature)
82
96
  full = [
83
97
  "pdf",
84
98
  "excel",
@@ -90,42 +104,56 @@ full = [
90
104
  "ocr",
91
105
  "language-detection",
92
106
  "chunking",
107
+ "embeddings",
93
108
  "quality",
94
109
  "keywords",
110
+ "api",
111
+ "mcp",
112
+ "otel",
113
+ ]
114
+ server = ["static-pdfium", "excel", "html", "ocr", "api", "mcp"]
115
+ cli = [
116
+ "static-pdfium",
117
+ "excel",
118
+ "office",
119
+ "html",
120
+ "ocr",
121
+ "language-detection",
122
+ "chunking",
123
+ "quality",
95
124
  ]
96
- server = ["pdf", "excel", "html", "ocr", "api", "mcp"]
97
- cli = ["pdf", "excel", "office", "html", "ocr", "language-detection", "chunking", "quality"]
98
125
 
99
126
  [build-dependencies]
100
- tracing = { workspace = true }
127
+ tracing = "0.1"
128
+ pkg-config = "0.3" # For system pdfium detection
101
129
 
102
130
  [dependencies]
103
131
  # Core dependencies (always included)
104
- ahash = { workspace = true }
105
- async-trait = { workspace = true }
106
- base64 = { workspace = true }
132
+ ahash = "0.8.12"
133
+ async-trait = "0.1.89"
134
+ base64 = "0.22.1"
107
135
  base64-simd = "0.8"
108
- hex = { workspace = true }
136
+ hex = "0.4.3"
109
137
  lazy_static = "1.5.0"
110
- libc = { workspace = true }
138
+ libc = "0.2.178"
111
139
  memchr = "2.7.6"
112
- num_cpus = { workspace = true }
140
+ num_cpus = "1.17.0"
113
141
  once_cell = "1.21.3"
114
- paste = "1.0"
142
+ pastey = "0.2"
115
143
  rayon = "1.11.0"
116
144
  regex = "1.12.2"
117
- serde = { workspace = true }
118
- serde_json = { workspace = true }
145
+ serde = { version = "1.0.228", features = ["derive"] }
146
+ serde_json = "1.0.145"
119
147
  serde_yaml_ng = "0.10.0"
120
- toml = "0.9.8"
148
+ toml = { workspace = true }
121
149
  mime_guess = "2.0"
122
150
  rmp-serde = "1.3"
123
- thiserror = { workspace = true }
151
+ thiserror = "2.0.17"
124
152
  tokio = { workspace = true, optional = true }
125
153
  uuid = { version = "1.19.0", features = ["v4", "js"] }
126
154
  indexmap = "2.12.1"
127
- tracing = { workspace = true }
128
- reqwest = { workspace = true, default-features = false, features = [
155
+ tracing = "0.1"
156
+ reqwest = { version = "0.12.25", default-features = false, features = [
129
157
  "json",
130
158
  "rustls-tls",
131
159
  ], optional = true }
@@ -138,7 +166,7 @@ roxmltree = { version = "0.21.1", optional = true }
138
166
  zip = { version = "6.0.0", optional = true }
139
167
  mail-parser = { version = "0.11.1", optional = true }
140
168
  msg_parser = { version = "0.1.1", optional = true }
141
- html-to-markdown-rs = { version = "2.14.1", features = ["inline-images"], optional = true }
169
+ html-to-markdown-rs = { version = "2.14.11", default-features = false, features = ["inline-images"], optional = true }
142
170
  quick-xml = { version = "0.38.4", features = ["serialize"], optional = true }
143
171
  tar = { version = "0.4.44", optional = true }
144
172
  sevenz-rust = { version = "0.6.1", optional = true }
@@ -153,8 +181,8 @@ fb2 = { version = "0.4", optional = true }
153
181
  typst-syntax = { version = "0.14", optional = true }
154
182
 
155
183
  # Processing features (optional)
156
- kreuzberg-tesseract = { version = "4.0.0-rc.7", optional = true }
157
- image = { workspace = true, default-features = false, features = [
184
+ kreuzberg-tesseract = { path = "../kreuzberg-tesseract", optional = true }
185
+ image = { version = "0.25.9", default-features = false, features = [
158
186
  "png",
159
187
  "jpeg",
160
188
  "webp",
@@ -169,9 +197,9 @@ ndarray = { version = "0.17.1", optional = true }
169
197
  kamadak-exif = { version = "0.6.1", optional = true }
170
198
  whatlang = { version = "0.18.0", optional = true }
171
199
  text-splitter = { version = "0.28.0", features = ["markdown"], optional = true }
172
- fastembed = { version = "5.4", default-features = false, features = [
200
+ fastembed = { version = "5.5", default-features = false, features = [
173
201
  "hf-hub-rustls-tls",
174
- "ort-download-binaries",
202
+ "ort-load-dynamic", # Runtime detection, requires system ONNX Runtime
175
203
  ], optional = true }
176
204
  unicode-normalization = { version = "0.1.25", optional = true }
177
205
  chardetng = { version = "0.1.17", optional = true }
@@ -182,11 +210,13 @@ rake = { version = "0.3.6", optional = true }
182
210
  axum = { version = "0.8", features = ["macros", "json", "multipart"], optional = true }
183
211
  tower = { version = "0.5", optional = true }
184
212
  tower-http = { version = "0.6", features = ["cors", "trace", "limit"], optional = true }
185
- rmcp = { version = "0.11.0", features = [
213
+ rmcp = { version = "0.12.0", features = [
186
214
  "server",
187
215
  "macros",
188
216
  "base64",
189
217
  "transport-io",
218
+ "transport-streamable-http-server",
219
+ "axum",
190
220
  ], optional = true }
191
221
  # Observability features (optional)
192
222
  opentelemetry = { version = "0.31", features = ["trace"], optional = true }
@@ -195,16 +225,16 @@ tracing-opentelemetry = { version = "0.32", optional = true }
195
225
  infer = "0.19.0"
196
226
 
197
227
  [dev-dependencies]
198
- tempfile = { workspace = true }
228
+ tempfile = "3.23.0"
199
229
  filetime = "0.2"
200
230
  tar = "0.4.44"
201
231
  zip = "6.0.0"
202
232
  serial_test = "3.2.0"
203
- anyhow = { workspace = true }
233
+ anyhow = "1.0"
204
234
  tokio-test = "0.4"
205
235
  tracing-subscriber = { version = "0.3", features = ["env-filter"] }
206
- criterion = { workspace = true }
207
- image = { workspace = true, default-features = false, features = ["png"] }
236
+ criterion = { version = "0.8", features = ["html_reports"] }
237
+ image = { version = "0.25.9", default-features = false, features = ["png"] }
208
238
 
209
239
  [[bench]]
210
240
  name = "otel_overhead"
@@ -215,4 +245,4 @@ harness = false
215
245
  pprof = { version = "0.15.0", features = ["flamegraph"], optional = true }
216
246
 
217
247
  [target.'cfg(target_arch = "wasm32")'.dependencies]
218
- wasm-bindgen-rayon = "1.2"
248
+ wasm-bindgen-rayon = { version = "1.3", optional = true }
data/vendor/kreuzberg/README.md CHANGED
@@ -30,6 +30,26 @@ kreuzberg = "4.0"
30
30
  tokio = { version = "1", features = ["rt", "macros"] }
31
31
  ```
32
32
 
33
+ ## System Requirements
34
+
35
+ ### ONNX Runtime (for embeddings)
36
+
37
+ If using embeddings functionality, ONNX Runtime must be installed:
38
+
39
+ ```bash
40
+ # macOS
41
+ brew install onnxruntime
42
+
43
+ # Ubuntu/Debian
44
+ sudo apt install libonnxruntime libonnxruntime-dev
45
+
46
+ # Windows (MSVC)
47
+ scoop install onnxruntime
48
+ # OR download from https://github.com/microsoft/onnxruntime/releases
49
+ ```
50
+
51
+ Without ONNX Runtime, embedding calls fail with a `MissingDependencyError` that includes installation instructions.
52
+
33
53
  ## Quick Start
34
54
 
35
55
  ```rust
@@ -169,6 +189,36 @@ kreuzberg = { version = "4.0", features = ["server"] }
169
189
  kreuzberg = { version = "4.0", features = ["cli"] }
170
190
  ```
171
191
 
192
+ ## PDF Support and Linking Options
193
+
194
+ Kreuzberg supports three PDFium linking strategies. **Default is `bundled-pdfium`** (best developer experience).
195
+
196
+ | Strategy | Feature | Use Case | Binary Size | Runtime Deps |
197
+ |----------|---------|----------|-------------|--------------|
198
+ | **Bundled (default)** | `bundled-pdfium` | Development, production | +8-15MB | None |
199
+ | **Static** | `static-pdfium` | Docker, musl, standalone binaries | +200MB | None |
200
+ | **System** | `system-pdfium` | Package managers, distros | +2MB | libpdfium.so |
201
+
202
+ ### Quick Start
203
+
204
+ ```toml
205
+ # Default - bundled PDFium (recommended)
206
+ [dependencies]
207
+ kreuzberg = "4.0"
208
+
209
+ # Static linking (Docker, musl)
210
+ [dependencies]
211
+ kreuzberg = { version = "4.0", features = ["static-pdfium"] }
212
+
213
+ # System PDFium (package managers)
214
+ [dependencies]
215
+ kreuzberg = { version = "4.0", features = ["system-pdfium"] }
216
+ ```
217
+
218
+ For detailed information, see the [PDFium Linking Guide](../../docs/guides/pdfium-linking.md).
219
+
220
+ **Note:** Language bindings (Python, TypeScript, Ruby, Java, Go) automatically bundle PDFium. No configuration needed.
221
+
172
222
  ## Documentation
173
223
 
174
224
  **[API Documentation](https://docs.rs/kreuzberg)** – Complete API reference with examples