kreuzberg 4.0.0.pre.rc.6 → 4.0.0.pre.rc.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +5 -5
  3. data/README.md +15 -9
  4. data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -0
  5. data/ext/kreuzberg_rb/native/Cargo.lock +511 -325
  6. data/ext/kreuzberg_rb/native/Cargo.toml +13 -3
  7. data/ext/kreuzberg_rb/native/src/lib.rs +139 -2
  8. data/kreuzberg.gemspec +38 -4
  9. data/lib/kreuzberg/config.rb +34 -1
  10. data/lib/kreuzberg/result.rb +77 -14
  11. data/lib/kreuzberg/version.rb +1 -1
  12. data/sig/kreuzberg.rbs +23 -6
  13. data/vendor/kreuzberg/Cargo.toml +32 -11
  14. data/vendor/kreuzberg/README.md +54 -8
  15. data/vendor/kreuzberg/build.rs +549 -132
  16. data/vendor/kreuzberg/src/chunking/mod.rs +1279 -79
  17. data/vendor/kreuzberg/src/chunking/processor.rs +220 -0
  18. data/vendor/kreuzberg/src/core/config.rs +49 -1
  19. data/vendor/kreuzberg/src/core/extractor.rs +134 -2
  20. data/vendor/kreuzberg/src/core/mod.rs +4 -2
  21. data/vendor/kreuzberg/src/core/pipeline.rs +188 -1
  22. data/vendor/kreuzberg/src/extraction/docx.rs +358 -0
  23. data/vendor/kreuzberg/src/extraction/html.rs +24 -8
  24. data/vendor/kreuzberg/src/extraction/image.rs +124 -1
  25. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -2
  26. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -3
  27. data/vendor/kreuzberg/src/extraction/pptx.rs +187 -87
  28. data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
  29. data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
  30. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  31. data/vendor/kreuzberg/src/extractors/docx.rs +50 -17
  32. data/vendor/kreuzberg/src/extractors/email.rs +29 -15
  33. data/vendor/kreuzberg/src/extractors/epub.rs +1 -0
  34. data/vendor/kreuzberg/src/extractors/excel.rs +2 -0
  35. data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
  36. data/vendor/kreuzberg/src/extractors/html.rs +29 -15
  37. data/vendor/kreuzberg/src/extractors/image.rs +25 -4
  38. data/vendor/kreuzberg/src/extractors/jats.rs +3 -0
  39. data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
  40. data/vendor/kreuzberg/src/extractors/latex.rs +1 -0
  41. data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
  42. data/vendor/kreuzberg/src/extractors/mod.rs +78 -14
  43. data/vendor/kreuzberg/src/extractors/odt.rs +3 -3
  44. data/vendor/kreuzberg/src/extractors/opml.rs +1 -0
  45. data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
  46. data/vendor/kreuzberg/src/extractors/pdf.rs +197 -17
  47. data/vendor/kreuzberg/src/extractors/pptx.rs +32 -13
  48. data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
  49. data/vendor/kreuzberg/src/extractors/rtf.rs +3 -4
  50. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  51. data/vendor/kreuzberg/src/extractors/text.rs +7 -2
  52. data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
  53. data/vendor/kreuzberg/src/extractors/xml.rs +27 -15
  54. data/vendor/kreuzberg/src/keywords/processor.rs +9 -1
  55. data/vendor/kreuzberg/src/language_detection/mod.rs +43 -0
  56. data/vendor/kreuzberg/src/language_detection/processor.rs +219 -0
  57. data/vendor/kreuzberg/src/lib.rs +10 -2
  58. data/vendor/kreuzberg/src/mcp/mod.rs +3 -0
  59. data/vendor/kreuzberg/src/mcp/server.rs +120 -12
  60. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
  61. data/vendor/kreuzberg/src/pdf/bundled.rs +328 -0
  62. data/vendor/kreuzberg/src/pdf/error.rs +8 -0
  63. data/vendor/kreuzberg/src/pdf/metadata.rs +238 -95
  64. data/vendor/kreuzberg/src/pdf/mod.rs +18 -2
  65. data/vendor/kreuzberg/src/pdf/rendering.rs +1 -2
  66. data/vendor/kreuzberg/src/pdf/table.rs +26 -2
  67. data/vendor/kreuzberg/src/pdf/text.rs +89 -7
  68. data/vendor/kreuzberg/src/plugins/extractor.rs +34 -3
  69. data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
  70. data/vendor/kreuzberg/src/plugins/ocr.rs +22 -3
  71. data/vendor/kreuzberg/src/plugins/processor.rs +8 -0
  72. data/vendor/kreuzberg/src/plugins/registry.rs +2 -0
  73. data/vendor/kreuzberg/src/plugins/validator.rs +11 -0
  74. data/vendor/kreuzberg/src/text/mod.rs +6 -0
  75. data/vendor/kreuzberg/src/text/quality_processor.rs +219 -0
  76. data/vendor/kreuzberg/src/types.rs +173 -21
  77. data/vendor/kreuzberg/tests/archive_integration.rs +2 -0
  78. data/vendor/kreuzberg/tests/batch_processing.rs +5 -3
  79. data/vendor/kreuzberg/tests/concurrency_stress.rs +14 -6
  80. data/vendor/kreuzberg/tests/config_features.rs +15 -1
  81. data/vendor/kreuzberg/tests/config_loading_tests.rs +1 -0
  82. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +2 -0
  83. data/vendor/kreuzberg/tests/email_integration.rs +2 -0
  84. data/vendor/kreuzberg/tests/error_handling.rs +43 -34
  85. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  86. data/vendor/kreuzberg/tests/image_integration.rs +2 -0
  87. data/vendor/kreuzberg/tests/mime_detection.rs +17 -16
  88. data/vendor/kreuzberg/tests/ocr_configuration.rs +4 -0
  89. data/vendor/kreuzberg/tests/ocr_errors.rs +22 -0
  90. data/vendor/kreuzberg/tests/ocr_quality.rs +2 -0
  91. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -21
  92. data/vendor/kreuzberg/tests/pdf_integration.rs +2 -0
  93. data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -0
  94. data/vendor/kreuzberg/tests/pipeline_integration.rs +25 -0
  95. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +5 -0
  96. data/vendor/kreuzberg/tests/plugin_system.rs +6 -0
  97. data/vendor/kreuzberg/tests/registry_integration_tests.rs +1 -0
  98. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +2 -0
  99. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -1
  100. data/vendor/kreuzberg/tests/security_validation.rs +1 -0
  101. data/vendor/kreuzberg/tests/test_fastembed.rs +45 -23
  102. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1 -0
  103. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +3 -2
  104. data/vendor/rb-sys/.cargo_vcs_info.json +2 -2
  105. data/vendor/rb-sys/Cargo.lock +15 -15
  106. data/vendor/rb-sys/Cargo.toml +4 -4
  107. data/vendor/rb-sys/Cargo.toml.orig +4 -4
  108. data/vendor/rb-sys/build/features.rs +5 -2
  109. data/vendor/rb-sys/build/main.rs +55 -15
  110. data/vendor/rb-sys/build/stable_api_config.rs +4 -2
  111. data/vendor/rb-sys/build/version.rs +3 -1
  112. data/vendor/rb-sys/src/lib.rs +1 -0
  113. data/vendor/rb-sys/src/macros.rs +2 -2
  114. data/vendor/rb-sys/src/special_consts.rs +1 -1
  115. data/vendor/rb-sys/src/stable_api/compiled.rs +1 -1
  116. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +12 -4
  117. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +12 -4
  118. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +12 -4
  119. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +12 -4
  120. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +19 -6
  121. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +17 -5
  122. data/vendor/rb-sys/src/stable_api.rs +0 -1
  123. data/vendor/rb-sys/src/tracking_allocator.rs +1 -3
  124. metadata +13 -10
  125. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  126. data/vendor/rb-sys/.cargo-ok +0 -1
  127. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
@@ -19,12 +19,17 @@ crate-type = ["rlib"]
19
19
  [features]
20
20
  default = ["tokio-runtime"]
21
21
 
22
- tokio-runtime = []
23
22
  profiling = ["dep:pprof"]
24
23
 
24
+ # Runtime features
25
+ tokio-runtime = ["dep:tokio"]
26
+
25
27
  # Format extractors
26
28
  pdf = ["dep:pdfium-render", "dep:lopdf", "dep:image"]
27
- excel = ["dep:calamine", "dep:polars"]
29
+ pdf-static = ["pdf"] # Download + static link (no runtime dependency)
30
+ pdf-bundled = ["pdf"] # Embed library in binary (self-contained)
31
+ pdf-system = ["pdf"] # Use system-installed pdfium via pkg-config
32
+ excel = ["dep:calamine", "dep:polars", "tokio-runtime"]
28
33
  office = [
29
34
  "dep:roxmltree",
30
35
  "dep:zip",
@@ -37,7 +42,8 @@ office = [
37
42
  "dep:rst_parser",
38
43
  "dep:fb2",
39
44
  "dep:typst-syntax",
40
- "html", # EPUB needs HTML parsing (zip + roxmltree + html-to-markdown-rs)
45
+ "html", # EPUB needs HTML parsing (zip + roxmltree + html-to-markdown-rs)
46
+ "tokio-runtime",
41
47
  ]
42
48
  email = ["dep:mail-parser", "dep:msg_parser"]
43
49
  html = ["dep:html-to-markdown-rs"]
@@ -48,6 +54,7 @@ archives = ["dep:zip", "dep:tar", "dep:sevenz-rust"]
48
54
  ocr = [
49
55
  "dep:kreuzberg-tesseract",
50
56
  "dep:image",
57
+ "dep:tiff",
51
58
  "dep:fast_image_resize",
52
59
  "dep:ndarray",
53
60
  "dep:kamadak-exif",
@@ -55,7 +62,7 @@ ocr = [
55
62
  ]
56
63
  language-detection = ["dep:whatlang"]
57
64
  chunking = ["dep:text-splitter"]
58
- embeddings = ["dep:fastembed", "chunking"]
65
+ embeddings = ["dep:fastembed", "dep:reqwest", "chunking", "tokio-runtime"]
59
66
  stopwords = [] # Stopwords for keyword extraction and token reduction
60
67
  quality = ["dep:unicode-normalization", "dep:chardetng", "dep:encoding_rs", "stopwords"]
61
68
 
@@ -65,12 +72,16 @@ keywords-rake = ["dep:rake", "stopwords"]
65
72
  keywords = ["keywords-yake", "keywords-rake"]
66
73
 
67
74
  # Server features
68
- api = ["dep:axum", "dep:tower", "dep:tower-http"]
69
- mcp = ["dep:rmcp"]
75
+ api = ["dep:axum", "dep:tower", "dep:tower-http", "tokio-runtime"]
76
+ mcp = ["dep:rmcp", "tokio-runtime"]
77
+ mcp-http = ["mcp", "api"] # Enables the HTTP transport for the MCP server
70
78
 
71
79
  # Observability features
72
80
  otel = ["dep:opentelemetry", "dep:opentelemetry_sdk", "dep:tracing-opentelemetry"]
73
81
 
82
+ # WASM-compatible feature bundle
83
+ wasm-target = ["html", "xml", "email", "language-detection", "chunking", "quality"]
84
+
74
85
  # Convenience bundles
75
86
  full = [
76
87
  "pdf",
@@ -91,6 +102,7 @@ cli = ["pdf", "excel", "office", "html", "ocr", "language-detection", "chunking"
91
102
 
92
103
  [build-dependencies]
93
104
  tracing = { workspace = true }
105
+ pkg-config = "0.3" # For system pdfium detection
94
106
 
95
107
  [dependencies]
96
108
  # Core dependencies (always included)
@@ -114,11 +126,14 @@ toml = "0.9.8"
114
126
  mime_guess = "2.0"
115
127
  rmp-serde = "1.3"
116
128
  thiserror = { workspace = true }
117
- tokio = { workspace = true }
118
- uuid = { version = "1.19.0", features = ["v4"] }
129
+ tokio = { workspace = true, optional = true }
130
+ uuid = { version = "1.19.0", features = ["v4", "js"] }
119
131
  indexmap = "2.12.1"
120
132
  tracing = { workspace = true }
121
- reqwest = { workspace = true, default-features = false, features = ["json", "rustls-tls"] }
133
+ reqwest = { workspace = true, default-features = false, features = [
134
+ "json",
135
+ "rustls-tls",
136
+ ], optional = true }
122
137
  # Format extractors (optional)
123
138
  pdfium-render = { version = "0.8.37", features = ["thread_safe", "image"], optional = true }
124
139
  lopdf = { version = "0.38.0", optional = true }
@@ -128,7 +143,7 @@ roxmltree = { version = "0.21.1", optional = true }
128
143
  zip = { version = "6.0.0", optional = true }
129
144
  mail-parser = { version = "0.11.1", optional = true }
130
145
  msg_parser = { version = "0.1.1", optional = true }
131
- html-to-markdown-rs = { version = "2.12.0", features = ["inline-images"], optional = true }
146
+ html-to-markdown-rs = { version = "2.14.2", features = ["inline-images"], optional = true }
132
147
  quick-xml = { version = "0.38.4", features = ["serialize"], optional = true }
133
148
  tar = { version = "0.4.44", optional = true }
134
149
  sevenz-rust = { version = "0.6.1", optional = true }
@@ -143,7 +158,7 @@ fb2 = { version = "0.4", optional = true }
143
158
  typst-syntax = { version = "0.14", optional = true }
144
159
 
145
160
  # Processing features (optional)
146
- kreuzberg-tesseract = { version = "4.0.0-rc.6", optional = true }
161
+ kreuzberg-tesseract = { version = "4.0.0-rc.7", optional = true }
147
162
  image = { workspace = true, default-features = false, features = [
148
163
  "png",
149
164
  "jpeg",
@@ -153,6 +168,7 @@ image = { workspace = true, default-features = false, features = [
153
168
  "gif",
154
169
  "rayon",
155
170
  ], optional = true }
171
+ tiff = { version = "0.10", optional = true }
156
172
  fast_image_resize = { version = "5.4.0", optional = true }
157
173
  ndarray = { version = "0.17.1", optional = true }
158
174
  kamadak-exif = { version = "0.6.1", optional = true }
@@ -176,6 +192,8 @@ rmcp = { version = "0.11.0", features = [
176
192
  "macros",
177
193
  "base64",
178
194
  "transport-io",
195
+ "transport-streamable-http-server",
196
+ "axum",
179
197
  ], optional = true }
180
198
  # Observability features (optional)
181
199
  opentelemetry = { version = "0.31", features = ["trace"], optional = true }
@@ -202,3 +220,6 @@ harness = false
202
220
  # Only build profiling tooling on non-Windows targets (pprof depends on Unix APIs)
203
221
  [target.'cfg(not(target_os = "windows"))'.dependencies]
204
222
  pprof = { version = "0.15.0", features = ["flamegraph"], optional = true }
223
+
224
+ [target.'cfg(target_arch = "wasm32")'.dependencies]
225
+ wasm-bindgen-rayon = "1.3"
@@ -1,14 +1,19 @@
1
- # Kreuzberg (Rust Core)
1
+ # Kreuzberg
2
+
3
+ [![Rust](https://img.shields.io/crates/v/kreuzberg?label=Rust)](https://crates.io/crates/kreuzberg)
4
+ [![Python](https://img.shields.io/pypi/v/kreuzberg?label=Python)](https://pypi.org/project/kreuzberg/)
5
+ [![TypeScript](https://img.shields.io/npm/v/@kreuzberg/node?label=TypeScript)](https://www.npmjs.com/package/@kreuzberg/node)
6
+ [![WASM](https://img.shields.io/npm/v/@kreuzberg/wasm?label=WASM)](https://www.npmjs.com/package/@kreuzberg/wasm)
7
+ [![Ruby](https://img.shields.io/gem/v/kreuzberg?label=Ruby)](https://rubygems.org/gems/kreuzberg)
8
+ [![Java](https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java)](https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg)
9
+ [![Go](https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go)](https://pkg.go.dev/github.com/kreuzberg-dev/kreuzberg)
10
+ [![C#](https://img.shields.io/nuget/v/Goldziher.Kreuzberg?label=C%23)](https://www.nuget.org/packages/Goldziher.Kreuzberg/)
2
11
 
3
- [![Crates.io](https://img.shields.io/crates/v/kreuzberg)](https://crates.io/crates/kreuzberg)
4
- [![PyPI](https://img.shields.io/pypi/v/kreuzberg)](https://pypi.org/project/kreuzberg/)
5
- [![npm](https://img.shields.io/npm/v/kreuzberg)](https://www.npmjs.com/package/kreuzberg)
6
- [![RubyGems](https://img.shields.io/gem/v/kreuzberg)](https://rubygems.org/gems/kreuzberg)
7
- [![docs.rs](https://docs.rs/kreuzberg/badge.svg)](https://docs.rs/kreuzberg)
8
12
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
9
- [![Documentation](https://img.shields.io/badge/docs-kreuzberg.dev-blue)](https://kreuzberg.dev)
13
+ [![Documentation](https://img.shields.io/badge/docs-kreuzberg.dev-blue)](https://kreuzberg.dev/)
14
+ [![Discord](https://img.shields.io/badge/Discord-Join%20our%20community-7289da)](https://discord.gg/pXxagNK2zN)
10
15
 
11
- High-performance document intelligence library for Rust. Extract text, metadata, and structured information from PDFs, Office documents, images, and 50+ formats.
16
+ High-performance document intelligence library for Rust. Extract text, metadata, and structured information from PDFs, Office documents, images, and more (56 formats supported).
12
17
 
13
18
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
14
19
 
@@ -164,6 +169,47 @@ kreuzberg = { version = "4.0", features = ["server"] }
164
169
  kreuzberg = { version = "4.0", features = ["cli"] }
165
170
  ```
166
171
 
172
+ ## PDFium Linking Options
173
+
174
+ When using the `pdf` feature, you can choose how PDFium is linked to your binary. Four strategies are supported:
175
+
176
+ | Strategy | Feature | Use Case |
177
+ |----------|---------|----------|
178
+ | **Dynamic (default)** | `pdf` | Fast builds, runtime library dependency |
179
+ | **Static** | `pdf`, `pdf-static` | Statically linked PDFium; larger binary, no runtime dependency |
180
+ | **Bundled** | `pdf`, `pdf-bundled` | PDFium embedded in the binary; each binary extracts its own copy |
181
+ | **System** | `pdf`, `pdf-system` | Use system-installed PDFium |
182
+
183
+ ### Examples
184
+
185
+ **Default (dynamic linking)** - Fastest compilation, requires libpdfium at runtime:
186
+ ```toml
187
+ [dependencies]
188
+ kreuzberg = { version = "4.0", features = ["pdf"] }
189
+ ```
190
+
191
+ **Static linking** - Larger binary, no runtime dependency:
192
+ ```toml
193
+ [dependencies]
194
+ kreuzberg = { version = "4.0", features = ["pdf", "pdf-static"] }
195
+ ```
196
+
197
+ **Bundled** - Each binary extracts its own copy:
198
+ ```toml
199
+ [dependencies]
200
+ kreuzberg = { version = "4.0", features = ["pdf", "pdf-bundled"] }
201
+ ```
202
+
203
+ **System-installed** - Use pkg-config or manual paths:
204
+ ```toml
205
+ [dependencies]
206
+ kreuzberg = { version = "4.0", features = ["pdf", "pdf-system"] }
207
+ ```
208
+
209
+ For comprehensive guidance on linking strategies, environment variables, and troubleshooting, see the [PDFium Linking Guide](../../docs/guides/pdfium-linking.md).
210
+
211
+ **Note:** Language bindings (Python, TypeScript, Ruby, Java, Go) bundle PDFium automatically and do not expose linking options.
212
+
167
213
  ## Documentation
168
214
 
169
215
  **[API Documentation](https://docs.rs/kreuzberg)** – Complete API reference with examples