kreuzberg 4.0.7 → 4.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +24 -16
- data/README.md +3 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +2 -2
- data/vendor/kreuzberg/Cargo.toml +1 -1
- data/vendor/kreuzberg-ffi/Cargo.toml +1 -1
- data/vendor/kreuzberg-ffi/src/helpers.rs +16 -353
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: f5742beaf511e059ff6d34f536e90a5636097b48cd1262a34eaa7692261de7aa
|
|
4
|
+
data.tar.gz: b913234c3ecda47e6e4944181309c2e60c09a8e708eb651cd705258048d3f7bb
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 1a459be58b389e806f6c46ffadaf0f01ea095154c3f9265b54bcd1c722493d8a094df41cc5a008bea6ec3cadc2622ecd0379f36a0ae7404f93e2a6c39f44ebfd
|
|
7
|
+
data.tar.gz: 31c2d8cb14a9bd72e7fc4fff84f598ec6ecb24c5d982394f642fcc04e09cc74d4280550e10ca1b5d703a0a824fc3c63330d5608c42785a825ea4f46ee3ad8c0a
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.0.
|
|
4
|
+
kreuzberg (4.0.8)
|
|
5
5
|
|
|
6
6
|
GEM
|
|
7
7
|
remote: https://rubygems.org/
|
|
@@ -22,7 +22,8 @@ GEM
|
|
|
22
22
|
ast (2.4.3)
|
|
23
23
|
base64 (0.3.0)
|
|
24
24
|
bigdecimal (4.0.1)
|
|
25
|
-
byebug (
|
|
25
|
+
byebug (13.0.0)
|
|
26
|
+
reline (>= 0.6.0)
|
|
26
27
|
coderay (1.1.3)
|
|
27
28
|
concurrent-ruby (1.3.6)
|
|
28
29
|
connection_pool (3.0.2)
|
|
@@ -43,10 +44,12 @@ GEM
|
|
|
43
44
|
fileutils (1.8.0)
|
|
44
45
|
i18n (1.14.8)
|
|
45
46
|
concurrent-ruby (~> 1.0)
|
|
47
|
+
io-console (0.8.2)
|
|
46
48
|
json (2.18.0)
|
|
47
49
|
language_server-protocol (3.17.0.5)
|
|
48
50
|
lint_roller (1.1.0)
|
|
49
|
-
listen (3.
|
|
51
|
+
listen (3.10.0)
|
|
52
|
+
logger
|
|
50
53
|
rb-fsevent (~> 0.10, >= 0.10.3)
|
|
51
54
|
rb-inotify (~> 0.9, >= 0.9.10)
|
|
52
55
|
logger (1.7.0)
|
|
@@ -55,16 +58,17 @@ GEM
|
|
|
55
58
|
prism (~> 1.5)
|
|
56
59
|
mutex_m (0.3.0)
|
|
57
60
|
parallel (1.27.0)
|
|
58
|
-
parser (3.3.10.
|
|
61
|
+
parser (3.3.10.1)
|
|
59
62
|
ast (~> 2.4.1)
|
|
60
63
|
racc
|
|
61
64
|
prism (1.8.0)
|
|
62
|
-
pry (0.
|
|
65
|
+
pry (0.16.0)
|
|
63
66
|
coderay (~> 1.1)
|
|
64
67
|
method_source (~> 1.0)
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
+
reline (>= 0.6.0)
|
|
69
|
+
pry-byebug (3.12.0)
|
|
70
|
+
byebug (~> 13.0)
|
|
71
|
+
pry (>= 0.13, < 0.17)
|
|
68
72
|
racc (1.8.1)
|
|
69
73
|
rainbow (3.1.1)
|
|
70
74
|
rake (13.3.1)
|
|
@@ -79,6 +83,8 @@ GEM
|
|
|
79
83
|
rbs (3.10.2)
|
|
80
84
|
logger
|
|
81
85
|
regexp_parser (2.11.3)
|
|
86
|
+
reline (0.6.3)
|
|
87
|
+
io-console (~> 0.5)
|
|
82
88
|
rspec (3.13.2)
|
|
83
89
|
rspec-core (~> 3.13.0)
|
|
84
90
|
rspec-expectations (~> 3.13.0)
|
|
@@ -115,7 +121,7 @@ GEM
|
|
|
115
121
|
rubocop (~> 1.81)
|
|
116
122
|
ruby-progressbar (1.13.0)
|
|
117
123
|
securerandom (0.4.1)
|
|
118
|
-
sorbet-runtime (0.6.
|
|
124
|
+
sorbet-runtime (0.6.12894)
|
|
119
125
|
steep (1.10.0)
|
|
120
126
|
activesupport (>= 5.1)
|
|
121
127
|
concurrent-ruby (>= 1.1.10)
|
|
@@ -179,7 +185,7 @@ CHECKSUMS
|
|
|
179
185
|
ast (2.4.3) sha256=954615157c1d6a382bc27d690d973195e79db7f55e9765ac7c481c60bdb4d383
|
|
180
186
|
base64 (0.3.0) sha256=27337aeabad6ffae05c265c450490628ef3ebd4b67be58257393227588f5a97b
|
|
181
187
|
bigdecimal (4.0.1) sha256=8b07d3d065a9f921c80ceaea7c9d4ae596697295b584c296fe599dd0ad01c4a7
|
|
182
|
-
byebug (
|
|
188
|
+
byebug (13.0.0) sha256=d2263efe751941ca520fa29744b71972d39cbc41839496706f5d9b22e92ae05d
|
|
183
189
|
coderay (1.1.3) sha256=dc530018a4684512f8f38143cd2a096c9f02a1fc2459edcfe534787a7fc77d4b
|
|
184
190
|
concurrent-ruby (1.3.6) sha256=6b56837e1e7e5292f9864f34b69c5a2cbc75c0cf5338f1ce9903d10fa762d5ab
|
|
185
191
|
connection_pool (3.0.2) sha256=33fff5ba71a12d2aa26cb72b1db8bba2a1a01823559fb01d29eb74c286e62e0a
|
|
@@ -199,20 +205,21 @@ CHECKSUMS
|
|
|
199
205
|
ffi (1.17.3-x86_64-linux-musl) sha256=086b221c3a68320b7564066f46fed23449a44f7a1935f1fe5a245bd89d9aea56
|
|
200
206
|
fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
|
|
201
207
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
208
|
+
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
202
209
|
json (2.18.0) sha256=b10506aee4183f5cf49e0efc48073d7b75843ce3782c68dbeb763351c08fd505
|
|
203
|
-
kreuzberg (4.0.
|
|
210
|
+
kreuzberg (4.0.8)
|
|
204
211
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
205
212
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
206
|
-
listen (3.
|
|
213
|
+
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
|
207
214
|
logger (1.7.0) sha256=196edec7cc44b66cfb40f9755ce11b392f21f7967696af15d274dde7edff0203
|
|
208
215
|
method_source (1.1.0) sha256=181301c9c45b731b4769bc81e8860e72f9161ad7d66dd99103c9ab84f560f5c5
|
|
209
216
|
minitest (6.0.1) sha256=7854c74f48e2e975969062833adc4013f249a4b212f5e7b9d5c040bf838d54bb
|
|
210
217
|
mutex_m (0.3.0) sha256=cfcb04ac16b69c4813777022fdceda24e9f798e48092a2b817eb4c0a782b0751
|
|
211
218
|
parallel (1.27.0) sha256=4ac151e1806b755fb4e2dc2332cbf0e54f2e24ba821ff2d3dcf86bf6dc4ae130
|
|
212
|
-
parser (3.3.10.
|
|
219
|
+
parser (3.3.10.1) sha256=06f6a725d2cd91e5e7f2b7c32ba143631e1f7c8ae2fb918fc4cebec187e6a688
|
|
213
220
|
prism (1.8.0) sha256=84453a16ef5530ea62c5f03ec16b52a459575ad4e7b9c2b360fd8ce2c39c1254
|
|
214
|
-
pry (0.
|
|
215
|
-
pry-byebug (3.
|
|
221
|
+
pry (0.16.0) sha256=d76c69065698ed1f85e717bd33d7942c38a50868f6b0673c636192b3d1b6054e
|
|
222
|
+
pry-byebug (3.12.0) sha256=594e094ae8a8390a7ad4c7b36ae36e13304ed02664c67417d108dc5f7213d1b7
|
|
216
223
|
racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f
|
|
217
224
|
rainbow (3.1.1) sha256=039491aa3a89f42efa1d6dec2fc4e62ede96eb6acd95e52f1ad581182b79bc6a
|
|
218
225
|
rake (13.3.1) sha256=8c9e89d09f66a26a01264e7e3480ec0607f0c497a861ef16063604b1b08eb19c
|
|
@@ -223,6 +230,7 @@ CHECKSUMS
|
|
|
223
230
|
rb_sys (0.9.119) sha256=64393fa148e402e1b79b64496d2aabfc7df79da6b822b8bb48dc1141eaf40b4b
|
|
224
231
|
rbs (3.10.2) sha256=bd8a5dc4c62f229f020146b61844a31f9c79e649449d212904a474eb79c846fc
|
|
225
232
|
regexp_parser (2.11.3) sha256=ca13f381a173b7a93450e53459075c9b76a10433caadcb2f1180f2c741fc55a4
|
|
233
|
+
reline (0.6.3) sha256=1198b04973565b36ec0f11542ab3f5cfeeec34823f4e54cebde90968092b1835
|
|
226
234
|
rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
|
|
227
235
|
rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d
|
|
228
236
|
rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
|
|
@@ -234,7 +242,7 @@ CHECKSUMS
|
|
|
234
242
|
rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
|
|
235
243
|
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
|
|
236
244
|
securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
|
|
237
|
-
sorbet-runtime (0.6.
|
|
245
|
+
sorbet-runtime (0.6.12894) sha256=4f0cbe041d80dac973ec3a5a848679922074dd77cc19f46384b27a8b9ff4a90c
|
|
238
246
|
steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
|
|
239
247
|
strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
|
|
240
248
|
terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
|
data/README.md
CHANGED
|
@@ -33,6 +33,9 @@
|
|
|
33
33
|
<a href="https://rubygems.org/gems/kreuzberg">
|
|
34
34
|
<img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
|
|
35
35
|
</a>
|
|
36
|
+
<a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/kreuzberg">
|
|
37
|
+
<img src="https://img.shields.io/badge/Docker-007ec6?logo=docker&logoColor=white" alt="Docker">
|
|
38
|
+
</a>
|
|
36
39
|
|
|
37
40
|
<!-- Project Info -->
|
|
38
41
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
|
data/lib/kreuzberg/version.rb
CHANGED
data/vendor/Cargo.toml
CHANGED
|
@@ -3,7 +3,7 @@ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
|
|
|
3
3
|
resolver = "2"
|
|
4
4
|
|
|
5
5
|
[workspace.package]
|
|
6
|
-
version = "4.0.
|
|
6
|
+
version = "4.0.8"
|
|
7
7
|
edition = "2024"
|
|
8
8
|
rust-version = "1.91"
|
|
9
9
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -47,7 +47,7 @@ hex = "0.4.3"
|
|
|
47
47
|
toml = "0.9.11"
|
|
48
48
|
num_cpus = "1.17.0"
|
|
49
49
|
once_cell = "1.21.3"
|
|
50
|
-
html-to-markdown-rs = { version = "2.22.
|
|
50
|
+
html-to-markdown-rs = { version = "2.22.5", default-features = false }
|
|
51
51
|
reqwest = { version = "0.13.1", default-features = false, features = ["json", "rustls"] }
|
|
52
52
|
image = { version = "0.25.9", default-features = false }
|
|
53
53
|
lzma-rust2 = { version = "0.15.7" }
|
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -28,7 +28,7 @@ serde_json = { workspace = true }
|
|
|
28
28
|
serde = { workspace = true }
|
|
29
29
|
async-trait = { workspace = true }
|
|
30
30
|
tokio = { workspace = true }
|
|
31
|
-
html-to-markdown-rs = { version = "2.22.
|
|
31
|
+
html-to-markdown-rs = { version = "2.22.5", default-features = false }
|
|
32
32
|
rayon = { version = "1.11", optional = true }
|
|
33
33
|
|
|
34
34
|
[target.'cfg(all(windows, target_env = "gnu"))'.dependencies]
|
|
@@ -47,352 +47,11 @@ pub fn string_to_c_string(value: String) -> std::result::Result<*mut c_char, Str
|
|
|
47
47
|
|
|
48
48
|
/// Parse extraction configuration from JSON string
|
|
49
49
|
pub fn parse_extraction_config_from_json(config_str: &str) -> FfiResult<ExtractionConfig> {
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
};
|
|
54
|
-
|
|
55
|
-
fn parse_enum<T, F>(value: Option<&serde_json::Value>, parse_fn: F) -> FfiResult<Option<T>>
|
|
56
|
-
where
|
|
57
|
-
F: Fn(&str) -> std::result::Result<T, String>,
|
|
58
|
-
{
|
|
59
|
-
if let Some(raw) = value {
|
|
60
|
-
let text = raw
|
|
61
|
-
.as_str()
|
|
62
|
-
.ok_or_else(|| "Expected string for html_options enum field".to_string())?;
|
|
63
|
-
return parse_fn(text).map(Some);
|
|
64
|
-
}
|
|
65
|
-
Ok(None)
|
|
66
|
-
}
|
|
67
|
-
|
|
68
|
-
fn parse_heading_style(value: &str) -> FfiResult<HeadingStyle> {
|
|
69
|
-
match value.to_lowercase().as_str() {
|
|
70
|
-
"atx" => Ok(HeadingStyle::Atx),
|
|
71
|
-
"underlined" => Ok(HeadingStyle::Underlined),
|
|
72
|
-
"atx_closed" => Ok(HeadingStyle::AtxClosed),
|
|
73
|
-
other => Err(format!(
|
|
74
|
-
"Invalid heading_style '{}'. Expected one of: atx, underlined, atx_closed",
|
|
75
|
-
other
|
|
76
|
-
)),
|
|
77
|
-
}
|
|
78
|
-
}
|
|
79
|
-
|
|
80
|
-
fn parse_list_indent_type(value: &str) -> FfiResult<ListIndentType> {
|
|
81
|
-
match value.to_lowercase().as_str() {
|
|
82
|
-
"spaces" => Ok(ListIndentType::Spaces),
|
|
83
|
-
"tabs" => Ok(ListIndentType::Tabs),
|
|
84
|
-
other => Err(format!(
|
|
85
|
-
"Invalid list_indent_type '{}'. Expected 'spaces' or 'tabs'",
|
|
86
|
-
other
|
|
87
|
-
)),
|
|
88
|
-
}
|
|
89
|
-
}
|
|
90
|
-
|
|
91
|
-
fn parse_highlight_style(value: &str) -> FfiResult<HighlightStyle> {
|
|
92
|
-
match value.to_lowercase().as_str() {
|
|
93
|
-
"double_equal" | "==" | "highlight" => Ok(HighlightStyle::DoubleEqual),
|
|
94
|
-
"html" => Ok(HighlightStyle::Html),
|
|
95
|
-
"bold" => Ok(HighlightStyle::Bold),
|
|
96
|
-
"none" => Ok(HighlightStyle::None),
|
|
97
|
-
other => Err(format!(
|
|
98
|
-
"Invalid highlight_style '{}'. Expected one of: double_equal, html, bold, none",
|
|
99
|
-
other
|
|
100
|
-
)),
|
|
101
|
-
}
|
|
102
|
-
}
|
|
103
|
-
|
|
104
|
-
fn parse_whitespace_mode(value: &str) -> FfiResult<WhitespaceMode> {
|
|
105
|
-
match value.to_lowercase().as_str() {
|
|
106
|
-
"normalized" => Ok(WhitespaceMode::Normalized),
|
|
107
|
-
"strict" => Ok(WhitespaceMode::Strict),
|
|
108
|
-
other => Err(format!(
|
|
109
|
-
"Invalid whitespace_mode '{}'. Expected 'normalized' or 'strict'",
|
|
110
|
-
other
|
|
111
|
-
)),
|
|
112
|
-
}
|
|
113
|
-
}
|
|
114
|
-
|
|
115
|
-
fn parse_newline_style(value: &str) -> FfiResult<NewlineStyle> {
|
|
116
|
-
match value.to_lowercase().as_str() {
|
|
117
|
-
"spaces" => Ok(NewlineStyle::Spaces),
|
|
118
|
-
"backslash" => Ok(NewlineStyle::Backslash),
|
|
119
|
-
other => Err(format!(
|
|
120
|
-
"Invalid newline_style '{}'. Expected 'spaces' or 'backslash'",
|
|
121
|
-
other
|
|
122
|
-
)),
|
|
123
|
-
}
|
|
124
|
-
}
|
|
125
|
-
|
|
126
|
-
fn parse_code_block_style(value: &str) -> FfiResult<CodeBlockStyle> {
|
|
127
|
-
match value.to_lowercase().as_str() {
|
|
128
|
-
"indented" => Ok(CodeBlockStyle::Indented),
|
|
129
|
-
"backticks" => Ok(CodeBlockStyle::Backticks),
|
|
130
|
-
"tildes" => Ok(CodeBlockStyle::Tildes),
|
|
131
|
-
other => Err(format!(
|
|
132
|
-
"Invalid code_block_style '{}'. Expected 'indented', 'backticks', or 'tildes'",
|
|
133
|
-
other
|
|
134
|
-
)),
|
|
135
|
-
}
|
|
136
|
-
}
|
|
137
|
-
|
|
138
|
-
fn parse_preprocessing_preset(value: &str) -> FfiResult<PreprocessingPreset> {
|
|
139
|
-
match value.to_lowercase().as_str() {
|
|
140
|
-
"minimal" => Ok(PreprocessingPreset::Minimal),
|
|
141
|
-
"standard" => Ok(PreprocessingPreset::Standard),
|
|
142
|
-
"aggressive" => Ok(PreprocessingPreset::Aggressive),
|
|
143
|
-
other => Err(format!(
|
|
144
|
-
"Invalid preprocessing.preset '{}'. Expected one of: minimal, standard, aggressive",
|
|
145
|
-
other
|
|
146
|
-
)),
|
|
147
|
-
}
|
|
148
|
-
}
|
|
149
|
-
|
|
150
|
-
fn parse_html_options(value: &serde_json::Value) -> FfiResult<ConversionOptions> {
|
|
151
|
-
let mut opts = ConversionOptions::default();
|
|
152
|
-
let obj = value
|
|
153
|
-
.as_object()
|
|
154
|
-
.ok_or_else(|| "html_options must be an object".to_string())?;
|
|
155
|
-
|
|
156
|
-
if let Some(val) = obj.get("heading_style") {
|
|
157
|
-
opts.heading_style = parse_enum(Some(val), parse_heading_style)?.unwrap_or(opts.heading_style);
|
|
158
|
-
}
|
|
159
|
-
|
|
160
|
-
if let Some(val) = obj.get("list_indent_type") {
|
|
161
|
-
opts.list_indent_type = parse_enum(Some(val), parse_list_indent_type)?.unwrap_or(opts.list_indent_type);
|
|
162
|
-
}
|
|
163
|
-
|
|
164
|
-
if let Some(val) = obj.get("list_indent_width") {
|
|
165
|
-
opts.list_indent_width = val
|
|
166
|
-
.as_u64()
|
|
167
|
-
.map(|v| v as usize)
|
|
168
|
-
.ok_or_else(|| "list_indent_width must be an integer".to_string())?;
|
|
169
|
-
}
|
|
170
|
-
|
|
171
|
-
if let Some(val) = obj.get("bullets") {
|
|
172
|
-
opts.bullets = val
|
|
173
|
-
.as_str()
|
|
174
|
-
.map(str::to_string)
|
|
175
|
-
.ok_or_else(|| "bullets must be a string".to_string())?;
|
|
176
|
-
}
|
|
177
|
-
|
|
178
|
-
if let Some(val) = obj.get("strong_em_symbol") {
|
|
179
|
-
let symbol = val
|
|
180
|
-
.as_str()
|
|
181
|
-
.ok_or_else(|| "strong_em_symbol must be a string".to_string())?;
|
|
182
|
-
let mut chars = symbol.chars();
|
|
183
|
-
opts.strong_em_symbol = chars
|
|
184
|
-
.next()
|
|
185
|
-
.ok_or_else(|| "strong_em_symbol must not be empty".to_string())?;
|
|
186
|
-
}
|
|
187
|
-
|
|
188
|
-
if let Some(val) = obj.get("escape_asterisks") {
|
|
189
|
-
opts.escape_asterisks = val
|
|
190
|
-
.as_bool()
|
|
191
|
-
.ok_or_else(|| "escape_asterisks must be a boolean".to_string())?;
|
|
192
|
-
}
|
|
193
|
-
if let Some(val) = obj.get("escape_underscores") {
|
|
194
|
-
opts.escape_underscores = val
|
|
195
|
-
.as_bool()
|
|
196
|
-
.ok_or_else(|| "escape_underscores must be a boolean".to_string())?;
|
|
197
|
-
}
|
|
198
|
-
if let Some(val) = obj.get("escape_misc") {
|
|
199
|
-
opts.escape_misc = val
|
|
200
|
-
.as_bool()
|
|
201
|
-
.ok_or_else(|| "escape_misc must be a boolean".to_string())?;
|
|
202
|
-
}
|
|
203
|
-
if let Some(val) = obj.get("escape_ascii") {
|
|
204
|
-
opts.escape_ascii = val
|
|
205
|
-
.as_bool()
|
|
206
|
-
.ok_or_else(|| "escape_ascii must be a boolean".to_string())?;
|
|
207
|
-
}
|
|
208
|
-
|
|
209
|
-
if let Some(val) = obj.get("code_language") {
|
|
210
|
-
opts.code_language = val
|
|
211
|
-
.as_str()
|
|
212
|
-
.map(str::to_string)
|
|
213
|
-
.ok_or_else(|| "code_language must be a string".to_string())?;
|
|
214
|
-
}
|
|
215
|
-
|
|
216
|
-
if let Some(val) = obj.get("autolinks") {
|
|
217
|
-
opts.autolinks = val.as_bool().ok_or_else(|| "autolinks must be a boolean".to_string())?;
|
|
218
|
-
}
|
|
219
|
-
|
|
220
|
-
if let Some(val) = obj.get("default_title") {
|
|
221
|
-
opts.default_title = val
|
|
222
|
-
.as_bool()
|
|
223
|
-
.ok_or_else(|| "default_title must be a boolean".to_string())?;
|
|
224
|
-
}
|
|
225
|
-
|
|
226
|
-
if let Some(val) = obj.get("br_in_tables") {
|
|
227
|
-
opts.br_in_tables = val
|
|
228
|
-
.as_bool()
|
|
229
|
-
.ok_or_else(|| "br_in_tables must be a boolean".to_string())?;
|
|
230
|
-
}
|
|
231
|
-
|
|
232
|
-
if let Some(val) = obj.get("hocr_spatial_tables") {
|
|
233
|
-
opts.hocr_spatial_tables = val
|
|
234
|
-
.as_bool()
|
|
235
|
-
.ok_or_else(|| "hocr_spatial_tables must be a boolean".to_string())?;
|
|
236
|
-
}
|
|
237
|
-
|
|
238
|
-
if let Some(val) = obj.get("highlight_style") {
|
|
239
|
-
opts.highlight_style = parse_enum(Some(val), parse_highlight_style)?.unwrap_or(opts.highlight_style);
|
|
240
|
-
}
|
|
241
|
-
|
|
242
|
-
if let Some(val) = obj.get("extract_metadata") {
|
|
243
|
-
opts.extract_metadata = val
|
|
244
|
-
.as_bool()
|
|
245
|
-
.ok_or_else(|| "extract_metadata must be a boolean".to_string())?;
|
|
246
|
-
}
|
|
247
|
-
|
|
248
|
-
if let Some(val) = obj.get("whitespace_mode") {
|
|
249
|
-
opts.whitespace_mode = parse_enum(Some(val), parse_whitespace_mode)?.unwrap_or(opts.whitespace_mode);
|
|
250
|
-
}
|
|
251
|
-
|
|
252
|
-
if let Some(val) = obj.get("strip_newlines") {
|
|
253
|
-
opts.strip_newlines = val
|
|
254
|
-
.as_bool()
|
|
255
|
-
.ok_or_else(|| "strip_newlines must be a boolean".to_string())?;
|
|
256
|
-
}
|
|
257
|
-
|
|
258
|
-
if let Some(val) = obj.get("wrap") {
|
|
259
|
-
opts.wrap = val.as_bool().ok_or_else(|| "wrap must be a boolean".to_string())?;
|
|
260
|
-
}
|
|
261
|
-
|
|
262
|
-
if let Some(val) = obj.get("wrap_width") {
|
|
263
|
-
opts.wrap_width = val
|
|
264
|
-
.as_u64()
|
|
265
|
-
.map(|v| v as usize)
|
|
266
|
-
.ok_or_else(|| "wrap_width must be an integer".to_string())?;
|
|
267
|
-
}
|
|
268
|
-
|
|
269
|
-
if let Some(val) = obj.get("convert_as_inline") {
|
|
270
|
-
opts.convert_as_inline = val
|
|
271
|
-
.as_bool()
|
|
272
|
-
.ok_or_else(|| "convert_as_inline must be a boolean".to_string())?;
|
|
273
|
-
}
|
|
274
|
-
|
|
275
|
-
if let Some(val) = obj.get("sub_symbol") {
|
|
276
|
-
opts.sub_symbol = val
|
|
277
|
-
.as_str()
|
|
278
|
-
.map(str::to_string)
|
|
279
|
-
.ok_or_else(|| "sub_symbol must be a string".to_string())?;
|
|
280
|
-
}
|
|
281
|
-
|
|
282
|
-
if let Some(val) = obj.get("sup_symbol") {
|
|
283
|
-
opts.sup_symbol = val
|
|
284
|
-
.as_str()
|
|
285
|
-
.map(str::to_string)
|
|
286
|
-
.ok_or_else(|| "sup_symbol must be a string".to_string())?;
|
|
287
|
-
}
|
|
288
|
-
|
|
289
|
-
if let Some(val) = obj.get("newline_style") {
|
|
290
|
-
opts.newline_style = parse_enum(Some(val), parse_newline_style)?.unwrap_or(opts.newline_style);
|
|
291
|
-
}
|
|
292
|
-
|
|
293
|
-
if let Some(val) = obj.get("code_block_style") {
|
|
294
|
-
opts.code_block_style = parse_enum(Some(val), parse_code_block_style)?.unwrap_or(opts.code_block_style);
|
|
295
|
-
}
|
|
296
|
-
|
|
297
|
-
if let Some(val) = obj.get("keep_inline_images_in") {
|
|
298
|
-
opts.keep_inline_images_in = val
|
|
299
|
-
.as_array()
|
|
300
|
-
.ok_or_else(|| "keep_inline_images_in must be an array".to_string())?
|
|
301
|
-
.iter()
|
|
302
|
-
.map(|v| {
|
|
303
|
-
v.as_str()
|
|
304
|
-
.map(str::to_string)
|
|
305
|
-
.ok_or_else(|| "keep_inline_images_in entries must be strings".to_string())
|
|
306
|
-
})
|
|
307
|
-
.collect::<std::result::Result<Vec<_>, _>>()?;
|
|
308
|
-
}
|
|
309
|
-
|
|
310
|
-
if let Some(val) = obj.get("encoding") {
|
|
311
|
-
opts.encoding = val
|
|
312
|
-
.as_str()
|
|
313
|
-
.map(str::to_string)
|
|
314
|
-
.ok_or_else(|| "encoding must be a string".to_string())?;
|
|
315
|
-
}
|
|
316
|
-
|
|
317
|
-
if let Some(val) = obj.get("debug") {
|
|
318
|
-
opts.debug = val.as_bool().ok_or_else(|| "debug must be a boolean".to_string())?;
|
|
319
|
-
}
|
|
320
|
-
|
|
321
|
-
if let Some(val) = obj.get("strip_tags") {
|
|
322
|
-
opts.strip_tags = val
|
|
323
|
-
.as_array()
|
|
324
|
-
.ok_or_else(|| "strip_tags must be an array".to_string())?
|
|
325
|
-
.iter()
|
|
326
|
-
.map(|v| {
|
|
327
|
-
v.as_str()
|
|
328
|
-
.map(str::to_string)
|
|
329
|
-
.ok_or_else(|| "strip_tags entries must be strings".to_string())
|
|
330
|
-
})
|
|
331
|
-
.collect::<std::result::Result<Vec<_>, _>>()?;
|
|
332
|
-
}
|
|
333
|
-
|
|
334
|
-
if let Some(val) = obj.get("preserve_tags") {
|
|
335
|
-
opts.preserve_tags = val
|
|
336
|
-
.as_array()
|
|
337
|
-
.ok_or_else(|| "preserve_tags must be an array".to_string())?
|
|
338
|
-
.iter()
|
|
339
|
-
.map(|v| {
|
|
340
|
-
v.as_str()
|
|
341
|
-
.map(str::to_string)
|
|
342
|
-
.ok_or_else(|| "preserve_tags entries must be strings".to_string())
|
|
343
|
-
})
|
|
344
|
-
.collect::<std::result::Result<Vec<_>, _>>()?;
|
|
345
|
-
}
|
|
346
|
-
|
|
347
|
-
if let Some(val) = obj.get("preprocessing") {
|
|
348
|
-
let pre = val
|
|
349
|
-
.as_object()
|
|
350
|
-
.ok_or_else(|| "preprocessing must be an object".to_string())?;
|
|
351
|
-
let mut preprocessing = opts.preprocessing.clone();
|
|
352
|
-
|
|
353
|
-
if let Some(v) = pre.get("enabled") {
|
|
354
|
-
preprocessing.enabled = v
|
|
355
|
-
.as_bool()
|
|
356
|
-
.ok_or_else(|| "preprocessing.enabled must be a boolean".to_string())?;
|
|
357
|
-
}
|
|
358
|
-
|
|
359
|
-
if let Some(v) = pre.get("preset") {
|
|
360
|
-
let preset = v
|
|
361
|
-
.as_str()
|
|
362
|
-
.ok_or_else(|| "preprocessing.preset must be a string".to_string())?;
|
|
363
|
-
preprocessing.preset = parse_preprocessing_preset(preset)?;
|
|
364
|
-
}
|
|
365
|
-
|
|
366
|
-
if let Some(v) = pre.get("remove_navigation") {
|
|
367
|
-
preprocessing.remove_navigation = v
|
|
368
|
-
.as_bool()
|
|
369
|
-
.ok_or_else(|| "preprocessing.remove_navigation must be a boolean".to_string())?;
|
|
370
|
-
}
|
|
371
|
-
|
|
372
|
-
if let Some(v) = pre.get("remove_forms") {
|
|
373
|
-
preprocessing.remove_forms = v
|
|
374
|
-
.as_bool()
|
|
375
|
-
.ok_or_else(|| "preprocessing.remove_forms must be a boolean".to_string())?;
|
|
376
|
-
}
|
|
377
|
-
|
|
378
|
-
opts.preprocessing = preprocessing;
|
|
379
|
-
}
|
|
380
|
-
|
|
381
|
-
Ok(opts)
|
|
382
|
-
}
|
|
383
|
-
|
|
384
|
-
let value: serde_json::Value =
|
|
50
|
+
// html-to-markdown-rs v2.22.5+ has #[serde(default)] on ConversionOptions,
|
|
51
|
+
// so serde can now handle partial deserialization with defaults for missing fields
|
|
52
|
+
let config: ExtractionConfig =
|
|
385
53
|
serde_json::from_str(config_str).map_err(|e| format!("Failed to parse config JSON: {}", e))?;
|
|
386
54
|
|
|
387
|
-
let html_options = value.get("html_options").map(parse_html_options).transpose()?;
|
|
388
|
-
|
|
389
|
-
let mut config: ExtractionConfig =
|
|
390
|
-
serde_json::from_value(value).map_err(|e| format!("Failed to parse config JSON: {}", e))?;
|
|
391
|
-
|
|
392
|
-
if let Some(options) = html_options {
|
|
393
|
-
config.html_options = Some(options);
|
|
394
|
-
}
|
|
395
|
-
|
|
396
55
|
Ok(config)
|
|
397
56
|
}
|
|
398
57
|
|
|
@@ -596,8 +255,8 @@ mod tests {
|
|
|
596
255
|
fn test_parse_extraction_config_with_html_options() {
|
|
597
256
|
let json = r#"{
|
|
598
257
|
"html_options": {
|
|
599
|
-
"
|
|
600
|
-
"
|
|
258
|
+
"headingStyle": "atx",
|
|
259
|
+
"escapeAsterisks": true,
|
|
601
260
|
"autolinks": false
|
|
602
261
|
}
|
|
603
262
|
}"#;
|
|
@@ -620,16 +279,20 @@ mod tests {
|
|
|
620
279
|
}
|
|
621
280
|
|
|
622
281
|
#[test]
|
|
623
|
-
fn
|
|
282
|
+
fn test_parse_extraction_config_invalid_heading_style_uses_default() {
|
|
283
|
+
// With #[serde(default)], invalid enum values use the default instead of failing
|
|
624
284
|
let json = r#"{
|
|
625
285
|
"html_options": {
|
|
626
|
-
"
|
|
286
|
+
"headingStyle": "invalid_style"
|
|
627
287
|
}
|
|
628
288
|
}"#;
|
|
629
289
|
|
|
630
290
|
let result = parse_extraction_config_from_json(json);
|
|
631
|
-
assert!(result.
|
|
632
|
-
|
|
291
|
+
assert!(result.is_ok(), "Parsing should succeed with default values");
|
|
292
|
+
|
|
293
|
+
// Invalid enum values should be ignored and default value used
|
|
294
|
+
let config = result.unwrap();
|
|
295
|
+
assert!(config.html_options.is_some());
|
|
633
296
|
}
|
|
634
297
|
|
|
635
298
|
#[test]
|
|
@@ -641,7 +304,7 @@ mod tests {
|
|
|
641
304
|
];
|
|
642
305
|
|
|
643
306
|
for (input, _expected) in styles {
|
|
644
|
-
let json = format!(r#"{{"html_options": {{"
|
|
307
|
+
let json = format!(r#"{{"html_options": {{"headingStyle": "{}"}}}}"#, input);
|
|
645
308
|
let result = parse_extraction_config_from_json(&json);
|
|
646
309
|
assert!(result.is_ok(), "Failed to parse heading_style: {}", input);
|
|
647
310
|
}
|
|
@@ -654,8 +317,8 @@ mod tests {
|
|
|
654
317
|
"preprocessing": {
|
|
655
318
|
"enabled": true,
|
|
656
319
|
"preset": "aggressive",
|
|
657
|
-
"
|
|
658
|
-
"
|
|
320
|
+
"removeNavigation": true,
|
|
321
|
+
"removeForms": false
|
|
659
322
|
}
|
|
660
323
|
}
|
|
661
324
|
}"#;
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg-tesseract"
|
|
3
|
-
version = "4.0.
|
|
3
|
+
version = "4.0.8"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -24,7 +24,7 @@ thiserror = { workspace = true }
|
|
|
24
24
|
image = { workspace = true }
|
|
25
25
|
|
|
26
26
|
[build-dependencies]
|
|
27
|
-
cc = { version = "^1.2.
|
|
27
|
+
cc = { version = "^1.2.53", optional = true }
|
|
28
28
|
cmake = { version = "0.1.57", optional = true }
|
|
29
29
|
zip = { version = "7.1.0", optional = true }
|
|
30
30
|
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: kreuzberg
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 4.0.
|
|
4
|
+
version: 4.0.8
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-01-
|
|
11
|
+
date: 2026-01-17 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|