kreuzberg 4.0.0.pre.rc.6 → 4.0.0.pre.rc.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +5 -3
  3. data/README.md +15 -9
  4. data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -0
  5. data/ext/kreuzberg_rb/native/Cargo.lock +516 -324
  6. data/ext/kreuzberg_rb/native/Cargo.toml +13 -3
  7. data/ext/kreuzberg_rb/native/src/lib.rs +139 -2
  8. data/kreuzberg.gemspec +38 -4
  9. data/lib/kreuzberg/config.rb +34 -1
  10. data/lib/kreuzberg/result.rb +77 -14
  11. data/lib/kreuzberg/version.rb +1 -1
  12. data/sig/kreuzberg.rbs +23 -6
  13. data/vendor/kreuzberg/Cargo.toml +25 -11
  14. data/vendor/kreuzberg/README.md +13 -8
  15. data/vendor/kreuzberg/build.rs +17 -6
  16. data/vendor/kreuzberg/src/api/mod.rs +2 -0
  17. data/vendor/kreuzberg/src/chunking/mod.rs +1279 -79
  18. data/vendor/kreuzberg/src/chunking/processor.rs +220 -0
  19. data/vendor/kreuzberg/src/core/config.rs +49 -1
  20. data/vendor/kreuzberg/src/core/extractor.rs +134 -2
  21. data/vendor/kreuzberg/src/core/mod.rs +4 -2
  22. data/vendor/kreuzberg/src/core/pipeline.rs +188 -1
  23. data/vendor/kreuzberg/src/extraction/docx.rs +358 -0
  24. data/vendor/kreuzberg/src/extraction/html.rs +24 -8
  25. data/vendor/kreuzberg/src/extraction/image.rs +124 -1
  26. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -2
  27. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -3
  28. data/vendor/kreuzberg/src/extraction/pptx.rs +187 -87
  29. data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
  30. data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
  31. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  32. data/vendor/kreuzberg/src/extractors/docx.rs +50 -17
  33. data/vendor/kreuzberg/src/extractors/email.rs +29 -15
  34. data/vendor/kreuzberg/src/extractors/epub.rs +1 -0
  35. data/vendor/kreuzberg/src/extractors/excel.rs +2 -0
  36. data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
  37. data/vendor/kreuzberg/src/extractors/html.rs +29 -15
  38. data/vendor/kreuzberg/src/extractors/image.rs +25 -4
  39. data/vendor/kreuzberg/src/extractors/jats.rs +3 -0
  40. data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
  41. data/vendor/kreuzberg/src/extractors/latex.rs +1 -0
  42. data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
  43. data/vendor/kreuzberg/src/extractors/mod.rs +78 -14
  44. data/vendor/kreuzberg/src/extractors/odt.rs +3 -3
  45. data/vendor/kreuzberg/src/extractors/opml.rs +1 -0
  46. data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
  47. data/vendor/kreuzberg/src/extractors/pdf.rs +194 -17
  48. data/vendor/kreuzberg/src/extractors/pptx.rs +32 -13
  49. data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
  50. data/vendor/kreuzberg/src/extractors/rtf.rs +3 -4
  51. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  52. data/vendor/kreuzberg/src/extractors/text.rs +7 -2
  53. data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
  54. data/vendor/kreuzberg/src/extractors/xml.rs +27 -15
  55. data/vendor/kreuzberg/src/keywords/processor.rs +9 -1
  56. data/vendor/kreuzberg/src/language_detection/mod.rs +43 -0
  57. data/vendor/kreuzberg/src/language_detection/processor.rs +219 -0
  58. data/vendor/kreuzberg/src/lib.rs +10 -2
  59. data/vendor/kreuzberg/src/mcp/mod.rs +2 -0
  60. data/vendor/kreuzberg/src/mcp/server.rs +14 -12
  61. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
  62. data/vendor/kreuzberg/src/pdf/error.rs +8 -0
  63. data/vendor/kreuzberg/src/pdf/metadata.rs +238 -95
  64. data/vendor/kreuzberg/src/pdf/mod.rs +14 -2
  65. data/vendor/kreuzberg/src/pdf/rendering.rs +1 -2
  66. data/vendor/kreuzberg/src/pdf/table.rs +26 -2
  67. data/vendor/kreuzberg/src/pdf/text.rs +89 -7
  68. data/vendor/kreuzberg/src/plugins/extractor.rs +34 -3
  69. data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
  70. data/vendor/kreuzberg/src/plugins/ocr.rs +22 -3
  71. data/vendor/kreuzberg/src/plugins/processor.rs +8 -0
  72. data/vendor/kreuzberg/src/plugins/registry.rs +2 -0
  73. data/vendor/kreuzberg/src/plugins/validator.rs +11 -0
  74. data/vendor/kreuzberg/src/text/mod.rs +6 -0
  75. data/vendor/kreuzberg/src/text/quality_processor.rs +219 -0
  76. data/vendor/kreuzberg/src/types.rs +173 -21
  77. data/vendor/kreuzberg/tests/archive_integration.rs +2 -0
  78. data/vendor/kreuzberg/tests/batch_processing.rs +5 -3
  79. data/vendor/kreuzberg/tests/concurrency_stress.rs +14 -6
  80. data/vendor/kreuzberg/tests/config_features.rs +15 -1
  81. data/vendor/kreuzberg/tests/config_loading_tests.rs +1 -0
  82. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +2 -0
  83. data/vendor/kreuzberg/tests/email_integration.rs +2 -0
  84. data/vendor/kreuzberg/tests/error_handling.rs +43 -34
  85. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  86. data/vendor/kreuzberg/tests/image_integration.rs +2 -0
  87. data/vendor/kreuzberg/tests/mime_detection.rs +17 -16
  88. data/vendor/kreuzberg/tests/ocr_configuration.rs +4 -0
  89. data/vendor/kreuzberg/tests/ocr_errors.rs +22 -0
  90. data/vendor/kreuzberg/tests/ocr_quality.rs +2 -0
  91. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -21
  92. data/vendor/kreuzberg/tests/pdf_integration.rs +2 -0
  93. data/vendor/kreuzberg/tests/pipeline_integration.rs +25 -0
  94. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +5 -0
  95. data/vendor/kreuzberg/tests/plugin_system.rs +6 -0
  96. data/vendor/kreuzberg/tests/registry_integration_tests.rs +1 -0
  97. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +2 -0
  98. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -1
  99. data/vendor/kreuzberg/tests/security_validation.rs +1 -0
  100. data/vendor/kreuzberg/tests/test_fastembed.rs +45 -23
  101. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1 -0
  102. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +3 -2
  103. data/vendor/rb-sys/.cargo_vcs_info.json +2 -2
  104. data/vendor/rb-sys/Cargo.lock +15 -15
  105. data/vendor/rb-sys/Cargo.toml +4 -4
  106. data/vendor/rb-sys/Cargo.toml.orig +4 -4
  107. data/vendor/rb-sys/bin/release.sh +9 -8
  108. data/vendor/rb-sys/build/features.rs +5 -2
  109. data/vendor/rb-sys/build/main.rs +55 -15
  110. data/vendor/rb-sys/build/stable_api_config.rs +4 -2
  111. data/vendor/rb-sys/build/version.rs +3 -1
  112. data/vendor/rb-sys/src/macros.rs +2 -2
  113. data/vendor/rb-sys/src/special_consts.rs +1 -1
  114. data/vendor/rb-sys/src/stable_api/compiled.rs +1 -1
  115. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +12 -4
  116. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +12 -4
  117. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +12 -4
  118. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +12 -4
  119. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +19 -6
  120. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +17 -5
  121. data/vendor/rb-sys/src/stable_api.rs +0 -1
  122. data/vendor/rb-sys/src/tracking_allocator.rs +1 -3
  123. metadata +11 -10
  124. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  125. data/vendor/rb-sys/.cargo-ok +0 -1
  126. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
@@ -19,12 +19,14 @@ crate-type = ["rlib"]
19
19
  [features]
20
20
  default = ["tokio-runtime"]
21
21
 
22
- tokio-runtime = []
23
22
  profiling = ["dep:pprof"]
24
23
 
24
+ # Runtime features
25
+ tokio-runtime = ["dep:tokio"]
26
+
25
27
  # Format extractors
26
28
  pdf = ["dep:pdfium-render", "dep:lopdf", "dep:image"]
27
- excel = ["dep:calamine", "dep:polars"]
29
+ excel = ["dep:calamine", "dep:polars", "tokio-runtime"]
28
30
  office = [
29
31
  "dep:roxmltree",
30
32
  "dep:zip",
@@ -37,7 +39,8 @@ office = [
37
39
  "dep:rst_parser",
38
40
  "dep:fb2",
39
41
  "dep:typst-syntax",
40
- "html", # EPUB needs HTML parsing (zip + roxmltree + html-to-markdown-rs)
42
+ "html", # EPUB needs HTML parsing (zip + roxmltree + html-to-markdown-rs)
43
+ "tokio-runtime",
41
44
  ]
42
45
  email = ["dep:mail-parser", "dep:msg_parser"]
43
46
  html = ["dep:html-to-markdown-rs"]
@@ -48,6 +51,7 @@ archives = ["dep:zip", "dep:tar", "dep:sevenz-rust"]
48
51
  ocr = [
49
52
  "dep:kreuzberg-tesseract",
50
53
  "dep:image",
54
+ "dep:tiff",
51
55
  "dep:fast_image_resize",
52
56
  "dep:ndarray",
53
57
  "dep:kamadak-exif",
@@ -55,7 +59,7 @@ ocr = [
55
59
  ]
56
60
  language-detection = ["dep:whatlang"]
57
61
  chunking = ["dep:text-splitter"]
58
- embeddings = ["dep:fastembed", "chunking"]
62
+ embeddings = ["dep:fastembed", "dep:reqwest", "chunking", "tokio-runtime"]
59
63
  stopwords = [] # Stopwords for keyword extraction and token reduction
60
64
  quality = ["dep:unicode-normalization", "dep:chardetng", "dep:encoding_rs", "stopwords"]
61
65
 
@@ -65,12 +69,15 @@ keywords-rake = ["dep:rake", "stopwords"]
65
69
  keywords = ["keywords-yake", "keywords-rake"]
66
70
 
67
71
  # Server features
68
- api = ["dep:axum", "dep:tower", "dep:tower-http"]
69
- mcp = ["dep:rmcp"]
72
+ api = ["dep:axum", "dep:tower", "dep:tower-http", "tokio-runtime"]
73
+ mcp = ["dep:rmcp", "tokio-runtime"]
70
74
 
71
75
  # Observability features
72
76
  otel = ["dep:opentelemetry", "dep:opentelemetry_sdk", "dep:tracing-opentelemetry"]
73
77
 
78
+ # WASM-compatible feature bundle
79
+ wasm-target = ["html", "xml", "email", "language-detection", "chunking", "quality"]
80
+
74
81
  # Convenience bundles
75
82
  full = [
76
83
  "pdf",
@@ -114,11 +121,14 @@ toml = "0.9.8"
114
121
  mime_guess = "2.0"
115
122
  rmp-serde = "1.3"
116
123
  thiserror = { workspace = true }
117
- tokio = { workspace = true }
118
- uuid = { version = "1.19.0", features = ["v4"] }
124
+ tokio = { workspace = true, optional = true }
125
+ uuid = { version = "1.19.0", features = ["v4", "js"] }
119
126
  indexmap = "2.12.1"
120
127
  tracing = { workspace = true }
121
- reqwest = { workspace = true, default-features = false, features = ["json", "rustls-tls"] }
128
+ reqwest = { workspace = true, default-features = false, features = [
129
+ "json",
130
+ "rustls-tls",
131
+ ], optional = true }
122
132
  # Format extractors (optional)
123
133
  pdfium-render = { version = "0.8.37", features = ["thread_safe", "image"], optional = true }
124
134
  lopdf = { version = "0.38.0", optional = true }
@@ -128,7 +138,7 @@ roxmltree = { version = "0.21.1", optional = true }
128
138
  zip = { version = "6.0.0", optional = true }
129
139
  mail-parser = { version = "0.11.1", optional = true }
130
140
  msg_parser = { version = "0.1.1", optional = true }
131
- html-to-markdown-rs = { version = "2.12.0", features = ["inline-images"], optional = true }
141
+ html-to-markdown-rs = { version = "2.14.1", features = ["inline-images"], optional = true }
132
142
  quick-xml = { version = "0.38.4", features = ["serialize"], optional = true }
133
143
  tar = { version = "0.4.44", optional = true }
134
144
  sevenz-rust = { version = "0.6.1", optional = true }
@@ -143,7 +153,7 @@ fb2 = { version = "0.4", optional = true }
143
153
  typst-syntax = { version = "0.14", optional = true }
144
154
 
145
155
  # Processing features (optional)
146
- kreuzberg-tesseract = { version = "4.0.0-rc.6", optional = true }
156
+ kreuzberg-tesseract = { version = "4.0.0-rc.7", optional = true }
147
157
  image = { workspace = true, default-features = false, features = [
148
158
  "png",
149
159
  "jpeg",
@@ -153,6 +163,7 @@ image = { workspace = true, default-features = false, features = [
153
163
  "gif",
154
164
  "rayon",
155
165
  ], optional = true }
166
+ tiff = { version = "0.10", optional = true }
156
167
  fast_image_resize = { version = "5.4.0", optional = true }
157
168
  ndarray = { version = "0.17.1", optional = true }
158
169
  kamadak-exif = { version = "0.6.1", optional = true }
@@ -202,3 +213,6 @@ harness = false
202
213
  # Only build profiling tooling on non-Windows targets (pprof depends on Unix APIs)
203
214
  [target.'cfg(not(target_os = "windows"))'.dependencies]
204
215
  pprof = { version = "0.15.0", features = ["flamegraph"], optional = true }
216
+
217
+ [target.'cfg(target_arch = "wasm32")'.dependencies]
218
+ wasm-bindgen-rayon = "1.2"
@@ -1,14 +1,19 @@
1
- # Kreuzberg (Rust Core)
1
+ # Kreuzberg
2
+
3
+ [![Rust](https://img.shields.io/crates/v/kreuzberg?label=Rust)](https://crates.io/crates/kreuzberg)
4
+ [![Python](https://img.shields.io/pypi/v/kreuzberg?label=Python)](https://pypi.org/project/kreuzberg/)
5
+ [![TypeScript](https://img.shields.io/npm/v/@kreuzberg/node?label=TypeScript)](https://www.npmjs.com/package/@kreuzberg/node)
6
+ [![WASM](https://img.shields.io/npm/v/@kreuzberg/wasm?label=WASM)](https://www.npmjs.com/package/@kreuzberg/wasm)
7
+ [![Ruby](https://img.shields.io/gem/v/kreuzberg?label=Ruby)](https://rubygems.org/gems/kreuzberg)
8
+ [![Java](https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java)](https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg)
9
+ [![Go](https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go)](https://pkg.go.dev/github.com/kreuzberg-dev/kreuzberg)
10
+ [![C#](https://img.shields.io/nuget/v/Goldziher.Kreuzberg?label=C%23)](https://www.nuget.org/packages/Goldziher.Kreuzberg/)
2
11
 
3
- [![Crates.io](https://img.shields.io/crates/v/kreuzberg)](https://crates.io/crates/kreuzberg)
4
- [![PyPI](https://img.shields.io/pypi/v/kreuzberg)](https://pypi.org/project/kreuzberg/)
5
- [![npm](https://img.shields.io/npm/v/kreuzberg)](https://www.npmjs.com/package/kreuzberg)
6
- [![RubyGems](https://img.shields.io/gem/v/kreuzberg)](https://rubygems.org/gems/kreuzberg)
7
- [![docs.rs](https://docs.rs/kreuzberg/badge.svg)](https://docs.rs/kreuzberg)
8
12
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
9
- [![Documentation](https://img.shields.io/badge/docs-kreuzberg.dev-blue)](https://kreuzberg.dev)
13
+ [![Documentation](https://img.shields.io/badge/docs-kreuzberg.dev-blue)](https://kreuzberg.dev/)
14
+ [![Discord](https://img.shields.io/badge/Discord-Join%20our%20community-7289da)](https://discord.gg/pXxagNK2zN)
10
15
 
11
- High-performance document intelligence library for Rust. Extract text, metadata, and structured information from PDFs, Office documents, images, and 50+ formats.
16
+ High-performance document intelligence library for Rust. Extract text, metadata, and structured information from PDFs, Office documents, images, and 56 formats.
12
17
 
13
18
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
14
19
 
@@ -12,6 +12,12 @@ fn main() {
12
12
 
13
13
  println!("cargo::rustc-check-cfg=cfg(coverage)");
14
14
 
15
+ // Skip pdfium linking if the pdf feature is not enabled
16
+ if !cfg!(feature = "pdf") {
17
+ tracing::debug!("PDF feature not enabled, skipping pdfium linking");
18
+ return;
19
+ }
20
+
15
21
  let (download_url, lib_name) = get_pdfium_url_and_lib(&target);
16
22
 
17
23
  let pdfium_dir = out_dir.join("pdfium");
@@ -59,7 +65,10 @@ fn main() {
59
65
 
60
66
  let lib_dir = pdfium_dir.join("lib");
61
67
  println!("cargo:rustc-link-search=native={}", lib_dir.display());
62
- println!("cargo:rustc-link-lib=dylib={}", lib_name);
68
+
69
+ // WASM requires static linking
70
+ let link_type = if target.contains("wasm") { "static" } else { "dylib" };
71
+ println!("cargo:rustc-link-lib={}={}", link_type, lib_name);
63
72
 
64
73
  if target.contains("darwin") {
65
74
  println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path");
@@ -121,11 +130,12 @@ fn get_pdfium_url_and_lib(target: &str) -> (String, String) {
121
130
  .unwrap_or_else(|| get_latest_version("paulocoutinhox/pdfium-lib"));
122
131
  tracing::debug!("Using pdfium-lib version: {}", version);
123
132
 
124
- let wasm_arch = if target.contains("wasm32") { "wasm32" } else { "wasm64" };
133
+ // WASM builds use a single 'wasm.tgz' asset regardless of architecture
134
+ // The archive contains both wasm32 and wasm64 if available
125
135
  return (
126
136
  format!(
127
- "https://github.com/paulocoutinhox/pdfium-lib/releases/download/{}/pdfium-{}.tar.gz",
128
- version, wasm_arch
137
+ "https://github.com/paulocoutinhox/pdfium-lib/releases/download/{}/wasm.tgz",
138
+ version
129
139
  ),
130
140
  "pdfium".to_string(),
131
141
  );
@@ -314,7 +324,6 @@ fn copy_lib_to_package(pdfium_dir: &Path, target: &str) {
314
324
  );
315
325
  }
316
326
 
317
- // Also copy to target/{profile} for Java FFI (Maven expects it here)
318
327
  let simple_target_dir = workspace_root.join("target").join(&profile);
319
328
  if simple_target_dir != target_dir {
320
329
  fs::create_dir_all(&simple_target_dir).ok();
@@ -416,7 +425,9 @@ fn codesign_if_needed(target: &str, binary: &Path) {
416
425
  }
417
426
 
418
427
  fn runtime_library_info(target: &str) -> (String, &'static str) {
419
- if target.contains("windows") {
428
+ if target.contains("wasm") {
429
+ ("libpdfium.a".to_string(), "lib")
430
+ } else if target.contains("windows") {
420
431
  ("pdfium.dll".to_string(), "bin")
421
432
  } else if target.contains("darwin") {
422
433
  ("libpdfium.dylib".to_string(), "lib")
@@ -1,3 +1,5 @@
1
+ #![cfg(feature = "api")]
2
+
1
3
  //! REST API server for Kreuzberg document extraction.
2
4
  //!
3
5
  //! This module provides an Axum-based HTTP server for document extraction