kreuzberg 4.0.0.pre.rc.6 → 4.0.0.pre.rc.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +5 -3
- data/README.md +15 -9
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +516 -324
- data/ext/kreuzberg_rb/native/Cargo.toml +13 -3
- data/ext/kreuzberg_rb/native/src/lib.rs +139 -2
- data/kreuzberg.gemspec +38 -4
- data/lib/kreuzberg/config.rb +34 -1
- data/lib/kreuzberg/result.rb +77 -14
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +23 -6
- data/vendor/kreuzberg/Cargo.toml +25 -11
- data/vendor/kreuzberg/README.md +13 -8
- data/vendor/kreuzberg/build.rs +17 -6
- data/vendor/kreuzberg/src/api/mod.rs +2 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +1279 -79
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -0
- data/vendor/kreuzberg/src/core/config.rs +49 -1
- data/vendor/kreuzberg/src/core/extractor.rs +134 -2
- data/vendor/kreuzberg/src/core/mod.rs +4 -2
- data/vendor/kreuzberg/src/core/pipeline.rs +188 -1
- data/vendor/kreuzberg/src/extraction/docx.rs +358 -0
- data/vendor/kreuzberg/src/extraction/html.rs +24 -8
- data/vendor/kreuzberg/src/extraction/image.rs +124 -1
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -2
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -3
- data/vendor/kreuzberg/src/extraction/pptx.rs +187 -87
- data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +50 -17
- data/vendor/kreuzberg/src/extractors/email.rs +29 -15
- data/vendor/kreuzberg/src/extractors/epub.rs +1 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +2 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
- data/vendor/kreuzberg/src/extractors/html.rs +29 -15
- data/vendor/kreuzberg/src/extractors/image.rs +25 -4
- data/vendor/kreuzberg/src/extractors/jats.rs +3 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
- data/vendor/kreuzberg/src/extractors/latex.rs +1 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +78 -14
- data/vendor/kreuzberg/src/extractors/odt.rs +3 -3
- data/vendor/kreuzberg/src/extractors/opml.rs +1 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +194 -17
- data/vendor/kreuzberg/src/extractors/pptx.rs +32 -13
- data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
- data/vendor/kreuzberg/src/extractors/rtf.rs +3 -4
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +7 -2
- data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +27 -15
- data/vendor/kreuzberg/src/keywords/processor.rs +9 -1
- data/vendor/kreuzberg/src/language_detection/mod.rs +43 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -0
- data/vendor/kreuzberg/src/lib.rs +10 -2
- data/vendor/kreuzberg/src/mcp/mod.rs +2 -0
- data/vendor/kreuzberg/src/mcp/server.rs +14 -12
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
- data/vendor/kreuzberg/src/pdf/error.rs +8 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +238 -95
- data/vendor/kreuzberg/src/pdf/mod.rs +14 -2
- data/vendor/kreuzberg/src/pdf/rendering.rs +1 -2
- data/vendor/kreuzberg/src/pdf/table.rs +26 -2
- data/vendor/kreuzberg/src/pdf/text.rs +89 -7
- data/vendor/kreuzberg/src/plugins/extractor.rs +34 -3
- data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +22 -3
- data/vendor/kreuzberg/src/plugins/processor.rs +8 -0
- data/vendor/kreuzberg/src/plugins/registry.rs +2 -0
- data/vendor/kreuzberg/src/plugins/validator.rs +11 -0
- data/vendor/kreuzberg/src/text/mod.rs +6 -0
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -0
- data/vendor/kreuzberg/src/types.rs +173 -21
- data/vendor/kreuzberg/tests/archive_integration.rs +2 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +5 -3
- data/vendor/kreuzberg/tests/concurrency_stress.rs +14 -6
- data/vendor/kreuzberg/tests/config_features.rs +15 -1
- data/vendor/kreuzberg/tests/config_loading_tests.rs +1 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +2 -0
- data/vendor/kreuzberg/tests/email_integration.rs +2 -0
- data/vendor/kreuzberg/tests/error_handling.rs +43 -34
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/image_integration.rs +2 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +17 -16
- data/vendor/kreuzberg/tests/ocr_configuration.rs +4 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +22 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +2 -0
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -21
- data/vendor/kreuzberg/tests/pdf_integration.rs +2 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +25 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +5 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +6 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +1 -0
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +2 -0
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -1
- data/vendor/kreuzberg/tests/security_validation.rs +1 -0
- data/vendor/kreuzberg/tests/test_fastembed.rs +45 -23
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1 -0
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +3 -2
- data/vendor/rb-sys/.cargo_vcs_info.json +2 -2
- data/vendor/rb-sys/Cargo.lock +15 -15
- data/vendor/rb-sys/Cargo.toml +4 -4
- data/vendor/rb-sys/Cargo.toml.orig +4 -4
- data/vendor/rb-sys/bin/release.sh +9 -8
- data/vendor/rb-sys/build/features.rs +5 -2
- data/vendor/rb-sys/build/main.rs +55 -15
- data/vendor/rb-sys/build/stable_api_config.rs +4 -2
- data/vendor/rb-sys/build/version.rs +3 -1
- data/vendor/rb-sys/src/macros.rs +2 -2
- data/vendor/rb-sys/src/special_consts.rs +1 -1
- data/vendor/rb-sys/src/stable_api/compiled.rs +1 -1
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +19 -6
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +17 -5
- data/vendor/rb-sys/src/stable_api.rs +0 -1
- data/vendor/rb-sys/src/tracking_allocator.rs +1 -3
- metadata +11 -10
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -19,12 +19,14 @@ crate-type = ["rlib"]
|
|
|
19
19
|
[features]
|
|
20
20
|
default = ["tokio-runtime"]
|
|
21
21
|
|
|
22
|
-
tokio-runtime = []
|
|
23
22
|
profiling = ["dep:pprof"]
|
|
24
23
|
|
|
24
|
+
# Runtime features
|
|
25
|
+
tokio-runtime = ["dep:tokio"]
|
|
26
|
+
|
|
25
27
|
# Format extractors
|
|
26
28
|
pdf = ["dep:pdfium-render", "dep:lopdf", "dep:image"]
|
|
27
|
-
excel = ["dep:calamine", "dep:polars"]
|
|
29
|
+
excel = ["dep:calamine", "dep:polars", "tokio-runtime"]
|
|
28
30
|
office = [
|
|
29
31
|
"dep:roxmltree",
|
|
30
32
|
"dep:zip",
|
|
@@ -37,7 +39,8 @@ office = [
|
|
|
37
39
|
"dep:rst_parser",
|
|
38
40
|
"dep:fb2",
|
|
39
41
|
"dep:typst-syntax",
|
|
40
|
-
"html",
|
|
42
|
+
"html", # EPUB needs HTML parsing (zip + roxmltree + html-to-markdown-rs)
|
|
43
|
+
"tokio-runtime",
|
|
41
44
|
]
|
|
42
45
|
email = ["dep:mail-parser", "dep:msg_parser"]
|
|
43
46
|
html = ["dep:html-to-markdown-rs"]
|
|
@@ -48,6 +51,7 @@ archives = ["dep:zip", "dep:tar", "dep:sevenz-rust"]
|
|
|
48
51
|
ocr = [
|
|
49
52
|
"dep:kreuzberg-tesseract",
|
|
50
53
|
"dep:image",
|
|
54
|
+
"dep:tiff",
|
|
51
55
|
"dep:fast_image_resize",
|
|
52
56
|
"dep:ndarray",
|
|
53
57
|
"dep:kamadak-exif",
|
|
@@ -55,7 +59,7 @@ ocr = [
|
|
|
55
59
|
]
|
|
56
60
|
language-detection = ["dep:whatlang"]
|
|
57
61
|
chunking = ["dep:text-splitter"]
|
|
58
|
-
embeddings = ["dep:fastembed", "chunking"]
|
|
62
|
+
embeddings = ["dep:fastembed", "dep:reqwest", "chunking", "tokio-runtime"]
|
|
59
63
|
stopwords = [] # Stopwords for keyword extraction and token reduction
|
|
60
64
|
quality = ["dep:unicode-normalization", "dep:chardetng", "dep:encoding_rs", "stopwords"]
|
|
61
65
|
|
|
@@ -65,12 +69,15 @@ keywords-rake = ["dep:rake", "stopwords"]
|
|
|
65
69
|
keywords = ["keywords-yake", "keywords-rake"]
|
|
66
70
|
|
|
67
71
|
# Server features
|
|
68
|
-
api = ["dep:axum", "dep:tower", "dep:tower-http"]
|
|
69
|
-
mcp = ["dep:rmcp"]
|
|
72
|
+
api = ["dep:axum", "dep:tower", "dep:tower-http", "tokio-runtime"]
|
|
73
|
+
mcp = ["dep:rmcp", "tokio-runtime"]
|
|
70
74
|
|
|
71
75
|
# Observability features
|
|
72
76
|
otel = ["dep:opentelemetry", "dep:opentelemetry_sdk", "dep:tracing-opentelemetry"]
|
|
73
77
|
|
|
78
|
+
# WASM-compatible feature bundle
|
|
79
|
+
wasm-target = ["html", "xml", "email", "language-detection", "chunking", "quality"]
|
|
80
|
+
|
|
74
81
|
# Convenience bundles
|
|
75
82
|
full = [
|
|
76
83
|
"pdf",
|
|
@@ -114,11 +121,14 @@ toml = "0.9.8"
|
|
|
114
121
|
mime_guess = "2.0"
|
|
115
122
|
rmp-serde = "1.3"
|
|
116
123
|
thiserror = { workspace = true }
|
|
117
|
-
tokio = { workspace = true }
|
|
118
|
-
uuid = { version = "1.19.0", features = ["v4"] }
|
|
124
|
+
tokio = { workspace = true, optional = true }
|
|
125
|
+
uuid = { version = "1.19.0", features = ["v4", "js"] }
|
|
119
126
|
indexmap = "2.12.1"
|
|
120
127
|
tracing = { workspace = true }
|
|
121
|
-
reqwest = { workspace = true, default-features = false, features = [
|
|
128
|
+
reqwest = { workspace = true, default-features = false, features = [
|
|
129
|
+
"json",
|
|
130
|
+
"rustls-tls",
|
|
131
|
+
], optional = true }
|
|
122
132
|
# Format extractors (optional)
|
|
123
133
|
pdfium-render = { version = "0.8.37", features = ["thread_safe", "image"], optional = true }
|
|
124
134
|
lopdf = { version = "0.38.0", optional = true }
|
|
@@ -128,7 +138,7 @@ roxmltree = { version = "0.21.1", optional = true }
|
|
|
128
138
|
zip = { version = "6.0.0", optional = true }
|
|
129
139
|
mail-parser = { version = "0.11.1", optional = true }
|
|
130
140
|
msg_parser = { version = "0.1.1", optional = true }
|
|
131
|
-
html-to-markdown-rs = { version = "2.
|
|
141
|
+
html-to-markdown-rs = { version = "2.14.1", features = ["inline-images"], optional = true }
|
|
132
142
|
quick-xml = { version = "0.38.4", features = ["serialize"], optional = true }
|
|
133
143
|
tar = { version = "0.4.44", optional = true }
|
|
134
144
|
sevenz-rust = { version = "0.6.1", optional = true }
|
|
@@ -143,7 +153,7 @@ fb2 = { version = "0.4", optional = true }
|
|
|
143
153
|
typst-syntax = { version = "0.14", optional = true }
|
|
144
154
|
|
|
145
155
|
# Processing features (optional)
|
|
146
|
-
kreuzberg-tesseract = { version = "4.0.0-rc.
|
|
156
|
+
kreuzberg-tesseract = { version = "4.0.0-rc.7", optional = true }
|
|
147
157
|
image = { workspace = true, default-features = false, features = [
|
|
148
158
|
"png",
|
|
149
159
|
"jpeg",
|
|
@@ -153,6 +163,7 @@ image = { workspace = true, default-features = false, features = [
|
|
|
153
163
|
"gif",
|
|
154
164
|
"rayon",
|
|
155
165
|
], optional = true }
|
|
166
|
+
tiff = { version = "0.10", optional = true }
|
|
156
167
|
fast_image_resize = { version = "5.4.0", optional = true }
|
|
157
168
|
ndarray = { version = "0.17.1", optional = true }
|
|
158
169
|
kamadak-exif = { version = "0.6.1", optional = true }
|
|
@@ -202,3 +213,6 @@ harness = false
|
|
|
202
213
|
# Only build profiling tooling on non-Windows targets (pprof depends on Unix APIs)
|
|
203
214
|
[target.'cfg(not(target_os = "windows"))'.dependencies]
|
|
204
215
|
pprof = { version = "0.15.0", features = ["flamegraph"], optional = true }
|
|
216
|
+
|
|
217
|
+
[target.'cfg(target_arch = "wasm32")'.dependencies]
|
|
218
|
+
wasm-bindgen-rayon = "1.2"
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -1,14 +1,19 @@
|
|
|
1
|
-
# Kreuzberg
|
|
1
|
+
# Kreuzberg
|
|
2
|
+
|
|
3
|
+
[](https://crates.io/crates/kreuzberg)
|
|
4
|
+
[](https://pypi.org/project/kreuzberg/)
|
|
5
|
+
[](https://www.npmjs.com/package/@kreuzberg/node)
|
|
6
|
+
[](https://www.npmjs.com/package/@kreuzberg/wasm)
|
|
7
|
+
[](https://rubygems.org/gems/kreuzberg)
|
|
8
|
+
[](https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg)
|
|
9
|
+
[](https://pkg.go.dev/github.com/kreuzberg-dev/kreuzberg)
|
|
10
|
+
[](https://www.nuget.org/packages/Goldziher.Kreuzberg/)
|
|
2
11
|
|
|
3
|
-
[](https://crates.io/crates/kreuzberg)
|
|
4
|
-
[](https://pypi.org/project/kreuzberg/)
|
|
5
|
-
[](https://www.npmjs.com/package/kreuzberg)
|
|
6
|
-
[](https://rubygems.org/gems/kreuzberg)
|
|
7
|
-
[](https://docs.rs/kreuzberg)
|
|
8
12
|
[](https://opensource.org/licenses/MIT)
|
|
9
|
-
[](https://kreuzberg.dev)
|
|
13
|
+
[](https://kreuzberg.dev/)
|
|
14
|
+
[](https://discord.gg/pXxagNK2zN)
|
|
10
15
|
|
|
11
|
-
High-performance document intelligence library for Rust. Extract text, metadata, and structured information from PDFs, Office documents, images, and
|
|
16
|
+
High-performance document intelligence library for Rust. Extract text, metadata, and structured information from PDFs, Office documents, images, and 56 formats.
|
|
12
17
|
|
|
13
18
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
14
19
|
|
data/vendor/kreuzberg/build.rs
CHANGED
|
@@ -12,6 +12,12 @@ fn main() {
|
|
|
12
12
|
|
|
13
13
|
println!("cargo::rustc-check-cfg=cfg(coverage)");
|
|
14
14
|
|
|
15
|
+
// Skip pdfium linking if the pdf feature is not enabled
|
|
16
|
+
if !cfg!(feature = "pdf") {
|
|
17
|
+
tracing::debug!("PDF feature not enabled, skipping pdfium linking");
|
|
18
|
+
return;
|
|
19
|
+
}
|
|
20
|
+
|
|
15
21
|
let (download_url, lib_name) = get_pdfium_url_and_lib(&target);
|
|
16
22
|
|
|
17
23
|
let pdfium_dir = out_dir.join("pdfium");
|
|
@@ -59,7 +65,10 @@ fn main() {
|
|
|
59
65
|
|
|
60
66
|
let lib_dir = pdfium_dir.join("lib");
|
|
61
67
|
println!("cargo:rustc-link-search=native={}", lib_dir.display());
|
|
62
|
-
|
|
68
|
+
|
|
69
|
+
// WASM requires static linking
|
|
70
|
+
let link_type = if target.contains("wasm") { "static" } else { "dylib" };
|
|
71
|
+
println!("cargo:rustc-link-lib={}={}", link_type, lib_name);
|
|
63
72
|
|
|
64
73
|
if target.contains("darwin") {
|
|
65
74
|
println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path");
|
|
@@ -121,11 +130,12 @@ fn get_pdfium_url_and_lib(target: &str) -> (String, String) {
|
|
|
121
130
|
.unwrap_or_else(|| get_latest_version("paulocoutinhox/pdfium-lib"));
|
|
122
131
|
tracing::debug!("Using pdfium-lib version: {}", version);
|
|
123
132
|
|
|
124
|
-
|
|
133
|
+
// WASM builds use a single 'wasm.tgz' asset regardless of architecture
|
|
134
|
+
// The archive contains both wasm32 and wasm64 if available
|
|
125
135
|
return (
|
|
126
136
|
format!(
|
|
127
|
-
"https://github.com/paulocoutinhox/pdfium-lib/releases/download/{}/
|
|
128
|
-
version
|
|
137
|
+
"https://github.com/paulocoutinhox/pdfium-lib/releases/download/{}/wasm.tgz",
|
|
138
|
+
version
|
|
129
139
|
),
|
|
130
140
|
"pdfium".to_string(),
|
|
131
141
|
);
|
|
@@ -314,7 +324,6 @@ fn copy_lib_to_package(pdfium_dir: &Path, target: &str) {
|
|
|
314
324
|
);
|
|
315
325
|
}
|
|
316
326
|
|
|
317
|
-
// Also copy to target/{profile} for Java FFI (Maven expects it here)
|
|
318
327
|
let simple_target_dir = workspace_root.join("target").join(&profile);
|
|
319
328
|
if simple_target_dir != target_dir {
|
|
320
329
|
fs::create_dir_all(&simple_target_dir).ok();
|
|
@@ -416,7 +425,9 @@ fn codesign_if_needed(target: &str, binary: &Path) {
|
|
|
416
425
|
}
|
|
417
426
|
|
|
418
427
|
fn runtime_library_info(target: &str) -> (String, &'static str) {
|
|
419
|
-
if target.contains("
|
|
428
|
+
if target.contains("wasm") {
|
|
429
|
+
("libpdfium.a".to_string(), "lib")
|
|
430
|
+
} else if target.contains("windows") {
|
|
420
431
|
("pdfium.dll".to_string(), "bin")
|
|
421
432
|
} else if target.contains("darwin") {
|
|
422
433
|
("libpdfium.dylib".to_string(), "lib")
|