kreuzberg 4.0.7 → 4.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 58ca4c0e55e9de4e3a17d748de7dc9210f14ebf2658252fc265799d092928fd9
4
- data.tar.gz: 55bcb1ed72c046c1f49707fc51ce55bf8346b24614a0c7c8a5ccb976d6351621
3
+ metadata.gz: f5742beaf511e059ff6d34f536e90a5636097b48cd1262a34eaa7692261de7aa
4
+ data.tar.gz: b913234c3ecda47e6e4944181309c2e60c09a8e708eb651cd705258048d3f7bb
5
5
  SHA512:
6
- metadata.gz: 0f76ed348b4eb214557e526b8e2d94cd0267da71cd8e1a095fee125d47cfc6868077d03c41fe58654d4fd9978376ba1641bb83f3aed2b4c1a105e21ee2f8e73e
7
- data.tar.gz: 4c6e77e98a5be13a3ccf5f38e859f13ead0438019bf413d94f17938b4f47d83c2d7378181a6561a7db76eb284289847f4cc54b0d64c8ddf1ccd9f2b54eb8b880
6
+ metadata.gz: 1a459be58b389e806f6c46ffadaf0f01ea095154c3f9265b54bcd1c722493d8a094df41cc5a008bea6ec3cadc2622ecd0379f36a0ae7404f93e2a6c39f44ebfd
7
+ data.tar.gz: 31c2d8cb14a9bd72e7fc4fff84f598ec6ecb24c5d982394f642fcc04e09cc74d4280550e10ca1b5d703a0a824fc3c63330d5608c42785a825ea4f46ee3ad8c0a
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.0.7)
4
+ kreuzberg (4.0.8)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -22,7 +22,8 @@ GEM
22
22
  ast (2.4.3)
23
23
  base64 (0.3.0)
24
24
  bigdecimal (4.0.1)
25
- byebug (12.0.0)
25
+ byebug (13.0.0)
26
+ reline (>= 0.6.0)
26
27
  coderay (1.1.3)
27
28
  concurrent-ruby (1.3.6)
28
29
  connection_pool (3.0.2)
@@ -43,10 +44,12 @@ GEM
43
44
  fileutils (1.8.0)
44
45
  i18n (1.14.8)
45
46
  concurrent-ruby (~> 1.0)
47
+ io-console (0.8.2)
46
48
  json (2.18.0)
47
49
  language_server-protocol (3.17.0.5)
48
50
  lint_roller (1.1.0)
49
- listen (3.9.0)
51
+ listen (3.10.0)
52
+ logger
50
53
  rb-fsevent (~> 0.10, >= 0.10.3)
51
54
  rb-inotify (~> 0.9, >= 0.9.10)
52
55
  logger (1.7.0)
@@ -55,16 +58,17 @@ GEM
55
58
  prism (~> 1.5)
56
59
  mutex_m (0.3.0)
57
60
  parallel (1.27.0)
58
- parser (3.3.10.0)
61
+ parser (3.3.10.1)
59
62
  ast (~> 2.4.1)
60
63
  racc
61
64
  prism (1.8.0)
62
- pry (0.15.2)
65
+ pry (0.16.0)
63
66
  coderay (~> 1.1)
64
67
  method_source (~> 1.0)
65
- pry-byebug (3.11.0)
66
- byebug (~> 12.0)
67
- pry (>= 0.13, < 0.16)
68
+ reline (>= 0.6.0)
69
+ pry-byebug (3.12.0)
70
+ byebug (~> 13.0)
71
+ pry (>= 0.13, < 0.17)
68
72
  racc (1.8.1)
69
73
  rainbow (3.1.1)
70
74
  rake (13.3.1)
@@ -79,6 +83,8 @@ GEM
79
83
  rbs (3.10.2)
80
84
  logger
81
85
  regexp_parser (2.11.3)
86
+ reline (0.6.3)
87
+ io-console (~> 0.5)
82
88
  rspec (3.13.2)
83
89
  rspec-core (~> 3.13.0)
84
90
  rspec-expectations (~> 3.13.0)
@@ -115,7 +121,7 @@ GEM
115
121
  rubocop (~> 1.81)
116
122
  ruby-progressbar (1.13.0)
117
123
  securerandom (0.4.1)
118
- sorbet-runtime (0.6.12887)
124
+ sorbet-runtime (0.6.12894)
119
125
  steep (1.10.0)
120
126
  activesupport (>= 5.1)
121
127
  concurrent-ruby (>= 1.1.10)
@@ -179,7 +185,7 @@ CHECKSUMS
179
185
  ast (2.4.3) sha256=954615157c1d6a382bc27d690d973195e79db7f55e9765ac7c481c60bdb4d383
180
186
  base64 (0.3.0) sha256=27337aeabad6ffae05c265c450490628ef3ebd4b67be58257393227588f5a97b
181
187
  bigdecimal (4.0.1) sha256=8b07d3d065a9f921c80ceaea7c9d4ae596697295b584c296fe599dd0ad01c4a7
182
- byebug (12.0.0) sha256=d4a150d291cca40b66ec9ca31f754e93fed8aa266a17335f71bb0afa7fca1a1e
188
+ byebug (13.0.0) sha256=d2263efe751941ca520fa29744b71972d39cbc41839496706f5d9b22e92ae05d
183
189
  coderay (1.1.3) sha256=dc530018a4684512f8f38143cd2a096c9f02a1fc2459edcfe534787a7fc77d4b
184
190
  concurrent-ruby (1.3.6) sha256=6b56837e1e7e5292f9864f34b69c5a2cbc75c0cf5338f1ce9903d10fa762d5ab
185
191
  connection_pool (3.0.2) sha256=33fff5ba71a12d2aa26cb72b1db8bba2a1a01823559fb01d29eb74c286e62e0a
@@ -199,20 +205,21 @@ CHECKSUMS
199
205
  ffi (1.17.3-x86_64-linux-musl) sha256=086b221c3a68320b7564066f46fed23449a44f7a1935f1fe5a245bd89d9aea56
200
206
  fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
201
207
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
208
+ io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
202
209
  json (2.18.0) sha256=b10506aee4183f5cf49e0efc48073d7b75843ce3782c68dbeb763351c08fd505
203
- kreuzberg (4.0.7)
210
+ kreuzberg (4.0.8)
204
211
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
205
212
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
206
- listen (3.9.0) sha256=db9e4424e0e5834480385197c139cb6b0ae0ef28cc13310cfd1ca78377d59c67
213
+ listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
207
214
  logger (1.7.0) sha256=196edec7cc44b66cfb40f9755ce11b392f21f7967696af15d274dde7edff0203
208
215
  method_source (1.1.0) sha256=181301c9c45b731b4769bc81e8860e72f9161ad7d66dd99103c9ab84f560f5c5
209
216
  minitest (6.0.1) sha256=7854c74f48e2e975969062833adc4013f249a4b212f5e7b9d5c040bf838d54bb
210
217
  mutex_m (0.3.0) sha256=cfcb04ac16b69c4813777022fdceda24e9f798e48092a2b817eb4c0a782b0751
211
218
  parallel (1.27.0) sha256=4ac151e1806b755fb4e2dc2332cbf0e54f2e24ba821ff2d3dcf86bf6dc4ae130
212
- parser (3.3.10.0) sha256=ce3587fa5cc55a88c4ba5b2b37621b3329aadf5728f9eafa36bbd121462aabd6
219
+ parser (3.3.10.1) sha256=06f6a725d2cd91e5e7f2b7c32ba143631e1f7c8ae2fb918fc4cebec187e6a688
213
220
  prism (1.8.0) sha256=84453a16ef5530ea62c5f03ec16b52a459575ad4e7b9c2b360fd8ce2c39c1254
214
- pry (0.15.2) sha256=12d54b8640d3fa29c9211dd4ffb08f3fd8bf7a4fd9b5a73ce5b59c8709385b6b
215
- pry-byebug (3.11.0) sha256=0b0abb7d309bc7f00044d512a3c8567274f7012b944b38becc8440439a1cea72
221
+ pry (0.16.0) sha256=d76c69065698ed1f85e717bd33d7942c38a50868f6b0673c636192b3d1b6054e
222
+ pry-byebug (3.12.0) sha256=594e094ae8a8390a7ad4c7b36ae36e13304ed02664c67417d108dc5f7213d1b7
216
223
  racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f
217
224
  rainbow (3.1.1) sha256=039491aa3a89f42efa1d6dec2fc4e62ede96eb6acd95e52f1ad581182b79bc6a
218
225
  rake (13.3.1) sha256=8c9e89d09f66a26a01264e7e3480ec0607f0c497a861ef16063604b1b08eb19c
@@ -223,6 +230,7 @@ CHECKSUMS
223
230
  rb_sys (0.9.119) sha256=64393fa148e402e1b79b64496d2aabfc7df79da6b822b8bb48dc1141eaf40b4b
224
231
  rbs (3.10.2) sha256=bd8a5dc4c62f229f020146b61844a31f9c79e649449d212904a474eb79c846fc
225
232
  regexp_parser (2.11.3) sha256=ca13f381a173b7a93450e53459075c9b76a10433caadcb2f1180f2c741fc55a4
233
+ reline (0.6.3) sha256=1198b04973565b36ec0f11542ab3f5cfeeec34823f4e54cebde90968092b1835
226
234
  rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
227
235
  rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d
228
236
  rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
@@ -234,7 +242,7 @@ CHECKSUMS
234
242
  rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
235
243
  ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
236
244
  securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
237
- sorbet-runtime (0.6.12887) sha256=d5960676f23caba8f25f95240701f6d901c07faff53022209fc71b2dae74dfdb
245
+ sorbet-runtime (0.6.12894) sha256=4f0cbe041d80dac973ec3a5a848679922074dd77cc19f46384b27a8b9ff4a90c
238
246
  steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
239
247
  strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
240
248
  terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
data/README.md CHANGED
@@ -33,6 +33,9 @@
33
33
  <a href="https://rubygems.org/gems/kreuzberg">
34
34
  <img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
35
35
  </a>
36
+ <a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/kreuzberg">
37
+ <img src="https://img.shields.io/badge/Docker-007ec6?logo=docker&logoColor=white" alt="Docker">
38
+ </a>
36
39
 
37
40
  <!-- Project Info -->
38
41
  <a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.0.7'
4
+ VERSION = '4.0.8'
5
5
  end
data/vendor/Cargo.toml CHANGED
@@ -3,7 +3,7 @@ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
3
3
  resolver = "2"
4
4
 
5
5
  [workspace.package]
6
- version = "4.0.7"
6
+ version = "4.0.8"
7
7
  edition = "2024"
8
8
  rust-version = "1.91"
9
9
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -47,7 +47,7 @@ hex = "0.4.3"
47
47
  toml = "0.9.11"
48
48
  num_cpus = "1.17.0"
49
49
  once_cell = "1.21.3"
50
- html-to-markdown-rs = { version = "2.22.2", default-features = false }
50
+ html-to-markdown-rs = { version = "2.22.5", default-features = false }
51
51
  reqwest = { version = "0.13.1", default-features = false, features = ["json", "rustls"] }
52
52
  image = { version = "0.25.9", default-features = false }
53
53
  lzma-rust2 = { version = "0.15.7" }
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.0.7"
3
+ version = "4.0.8"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -28,7 +28,7 @@ serde_json = { workspace = true }
28
28
  serde = { workspace = true }
29
29
  async-trait = { workspace = true }
30
30
  tokio = { workspace = true }
31
- html-to-markdown-rs = { version = "2.22.2", default-features = false }
31
+ html-to-markdown-rs = { version = "2.22.5", default-features = false }
32
32
  rayon = { version = "1.11", optional = true }
33
33
 
34
34
  [target.'cfg(all(windows, target_env = "gnu"))'.dependencies]
@@ -47,352 +47,11 @@ pub fn string_to_c_string(value: String) -> std::result::Result<*mut c_char, Str
47
47
 
48
48
  /// Parse extraction configuration from JSON string
49
49
  pub fn parse_extraction_config_from_json(config_str: &str) -> FfiResult<ExtractionConfig> {
50
- use html_to_markdown_rs::options::{
51
- CodeBlockStyle, ConversionOptions, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle,
52
- PreprocessingPreset, WhitespaceMode,
53
- };
54
-
55
- fn parse_enum<T, F>(value: Option<&serde_json::Value>, parse_fn: F) -> FfiResult<Option<T>>
56
- where
57
- F: Fn(&str) -> std::result::Result<T, String>,
58
- {
59
- if let Some(raw) = value {
60
- let text = raw
61
- .as_str()
62
- .ok_or_else(|| "Expected string for html_options enum field".to_string())?;
63
- return parse_fn(text).map(Some);
64
- }
65
- Ok(None)
66
- }
67
-
68
- fn parse_heading_style(value: &str) -> FfiResult<HeadingStyle> {
69
- match value.to_lowercase().as_str() {
70
- "atx" => Ok(HeadingStyle::Atx),
71
- "underlined" => Ok(HeadingStyle::Underlined),
72
- "atx_closed" => Ok(HeadingStyle::AtxClosed),
73
- other => Err(format!(
74
- "Invalid heading_style '{}'. Expected one of: atx, underlined, atx_closed",
75
- other
76
- )),
77
- }
78
- }
79
-
80
- fn parse_list_indent_type(value: &str) -> FfiResult<ListIndentType> {
81
- match value.to_lowercase().as_str() {
82
- "spaces" => Ok(ListIndentType::Spaces),
83
- "tabs" => Ok(ListIndentType::Tabs),
84
- other => Err(format!(
85
- "Invalid list_indent_type '{}'. Expected 'spaces' or 'tabs'",
86
- other
87
- )),
88
- }
89
- }
90
-
91
- fn parse_highlight_style(value: &str) -> FfiResult<HighlightStyle> {
92
- match value.to_lowercase().as_str() {
93
- "double_equal" | "==" | "highlight" => Ok(HighlightStyle::DoubleEqual),
94
- "html" => Ok(HighlightStyle::Html),
95
- "bold" => Ok(HighlightStyle::Bold),
96
- "none" => Ok(HighlightStyle::None),
97
- other => Err(format!(
98
- "Invalid highlight_style '{}'. Expected one of: double_equal, html, bold, none",
99
- other
100
- )),
101
- }
102
- }
103
-
104
- fn parse_whitespace_mode(value: &str) -> FfiResult<WhitespaceMode> {
105
- match value.to_lowercase().as_str() {
106
- "normalized" => Ok(WhitespaceMode::Normalized),
107
- "strict" => Ok(WhitespaceMode::Strict),
108
- other => Err(format!(
109
- "Invalid whitespace_mode '{}'. Expected 'normalized' or 'strict'",
110
- other
111
- )),
112
- }
113
- }
114
-
115
- fn parse_newline_style(value: &str) -> FfiResult<NewlineStyle> {
116
- match value.to_lowercase().as_str() {
117
- "spaces" => Ok(NewlineStyle::Spaces),
118
- "backslash" => Ok(NewlineStyle::Backslash),
119
- other => Err(format!(
120
- "Invalid newline_style '{}'. Expected 'spaces' or 'backslash'",
121
- other
122
- )),
123
- }
124
- }
125
-
126
- fn parse_code_block_style(value: &str) -> FfiResult<CodeBlockStyle> {
127
- match value.to_lowercase().as_str() {
128
- "indented" => Ok(CodeBlockStyle::Indented),
129
- "backticks" => Ok(CodeBlockStyle::Backticks),
130
- "tildes" => Ok(CodeBlockStyle::Tildes),
131
- other => Err(format!(
132
- "Invalid code_block_style '{}'. Expected 'indented', 'backticks', or 'tildes'",
133
- other
134
- )),
135
- }
136
- }
137
-
138
- fn parse_preprocessing_preset(value: &str) -> FfiResult<PreprocessingPreset> {
139
- match value.to_lowercase().as_str() {
140
- "minimal" => Ok(PreprocessingPreset::Minimal),
141
- "standard" => Ok(PreprocessingPreset::Standard),
142
- "aggressive" => Ok(PreprocessingPreset::Aggressive),
143
- other => Err(format!(
144
- "Invalid preprocessing.preset '{}'. Expected one of: minimal, standard, aggressive",
145
- other
146
- )),
147
- }
148
- }
149
-
150
- fn parse_html_options(value: &serde_json::Value) -> FfiResult<ConversionOptions> {
151
- let mut opts = ConversionOptions::default();
152
- let obj = value
153
- .as_object()
154
- .ok_or_else(|| "html_options must be an object".to_string())?;
155
-
156
- if let Some(val) = obj.get("heading_style") {
157
- opts.heading_style = parse_enum(Some(val), parse_heading_style)?.unwrap_or(opts.heading_style);
158
- }
159
-
160
- if let Some(val) = obj.get("list_indent_type") {
161
- opts.list_indent_type = parse_enum(Some(val), parse_list_indent_type)?.unwrap_or(opts.list_indent_type);
162
- }
163
-
164
- if let Some(val) = obj.get("list_indent_width") {
165
- opts.list_indent_width = val
166
- .as_u64()
167
- .map(|v| v as usize)
168
- .ok_or_else(|| "list_indent_width must be an integer".to_string())?;
169
- }
170
-
171
- if let Some(val) = obj.get("bullets") {
172
- opts.bullets = val
173
- .as_str()
174
- .map(str::to_string)
175
- .ok_or_else(|| "bullets must be a string".to_string())?;
176
- }
177
-
178
- if let Some(val) = obj.get("strong_em_symbol") {
179
- let symbol = val
180
- .as_str()
181
- .ok_or_else(|| "strong_em_symbol must be a string".to_string())?;
182
- let mut chars = symbol.chars();
183
- opts.strong_em_symbol = chars
184
- .next()
185
- .ok_or_else(|| "strong_em_symbol must not be empty".to_string())?;
186
- }
187
-
188
- if let Some(val) = obj.get("escape_asterisks") {
189
- opts.escape_asterisks = val
190
- .as_bool()
191
- .ok_or_else(|| "escape_asterisks must be a boolean".to_string())?;
192
- }
193
- if let Some(val) = obj.get("escape_underscores") {
194
- opts.escape_underscores = val
195
- .as_bool()
196
- .ok_or_else(|| "escape_underscores must be a boolean".to_string())?;
197
- }
198
- if let Some(val) = obj.get("escape_misc") {
199
- opts.escape_misc = val
200
- .as_bool()
201
- .ok_or_else(|| "escape_misc must be a boolean".to_string())?;
202
- }
203
- if let Some(val) = obj.get("escape_ascii") {
204
- opts.escape_ascii = val
205
- .as_bool()
206
- .ok_or_else(|| "escape_ascii must be a boolean".to_string())?;
207
- }
208
-
209
- if let Some(val) = obj.get("code_language") {
210
- opts.code_language = val
211
- .as_str()
212
- .map(str::to_string)
213
- .ok_or_else(|| "code_language must be a string".to_string())?;
214
- }
215
-
216
- if let Some(val) = obj.get("autolinks") {
217
- opts.autolinks = val.as_bool().ok_or_else(|| "autolinks must be a boolean".to_string())?;
218
- }
219
-
220
- if let Some(val) = obj.get("default_title") {
221
- opts.default_title = val
222
- .as_bool()
223
- .ok_or_else(|| "default_title must be a boolean".to_string())?;
224
- }
225
-
226
- if let Some(val) = obj.get("br_in_tables") {
227
- opts.br_in_tables = val
228
- .as_bool()
229
- .ok_or_else(|| "br_in_tables must be a boolean".to_string())?;
230
- }
231
-
232
- if let Some(val) = obj.get("hocr_spatial_tables") {
233
- opts.hocr_spatial_tables = val
234
- .as_bool()
235
- .ok_or_else(|| "hocr_spatial_tables must be a boolean".to_string())?;
236
- }
237
-
238
- if let Some(val) = obj.get("highlight_style") {
239
- opts.highlight_style = parse_enum(Some(val), parse_highlight_style)?.unwrap_or(opts.highlight_style);
240
- }
241
-
242
- if let Some(val) = obj.get("extract_metadata") {
243
- opts.extract_metadata = val
244
- .as_bool()
245
- .ok_or_else(|| "extract_metadata must be a boolean".to_string())?;
246
- }
247
-
248
- if let Some(val) = obj.get("whitespace_mode") {
249
- opts.whitespace_mode = parse_enum(Some(val), parse_whitespace_mode)?.unwrap_or(opts.whitespace_mode);
250
- }
251
-
252
- if let Some(val) = obj.get("strip_newlines") {
253
- opts.strip_newlines = val
254
- .as_bool()
255
- .ok_or_else(|| "strip_newlines must be a boolean".to_string())?;
256
- }
257
-
258
- if let Some(val) = obj.get("wrap") {
259
- opts.wrap = val.as_bool().ok_or_else(|| "wrap must be a boolean".to_string())?;
260
- }
261
-
262
- if let Some(val) = obj.get("wrap_width") {
263
- opts.wrap_width = val
264
- .as_u64()
265
- .map(|v| v as usize)
266
- .ok_or_else(|| "wrap_width must be an integer".to_string())?;
267
- }
268
-
269
- if let Some(val) = obj.get("convert_as_inline") {
270
- opts.convert_as_inline = val
271
- .as_bool()
272
- .ok_or_else(|| "convert_as_inline must be a boolean".to_string())?;
273
- }
274
-
275
- if let Some(val) = obj.get("sub_symbol") {
276
- opts.sub_symbol = val
277
- .as_str()
278
- .map(str::to_string)
279
- .ok_or_else(|| "sub_symbol must be a string".to_string())?;
280
- }
281
-
282
- if let Some(val) = obj.get("sup_symbol") {
283
- opts.sup_symbol = val
284
- .as_str()
285
- .map(str::to_string)
286
- .ok_or_else(|| "sup_symbol must be a string".to_string())?;
287
- }
288
-
289
- if let Some(val) = obj.get("newline_style") {
290
- opts.newline_style = parse_enum(Some(val), parse_newline_style)?.unwrap_or(opts.newline_style);
291
- }
292
-
293
- if let Some(val) = obj.get("code_block_style") {
294
- opts.code_block_style = parse_enum(Some(val), parse_code_block_style)?.unwrap_or(opts.code_block_style);
295
- }
296
-
297
- if let Some(val) = obj.get("keep_inline_images_in") {
298
- opts.keep_inline_images_in = val
299
- .as_array()
300
- .ok_or_else(|| "keep_inline_images_in must be an array".to_string())?
301
- .iter()
302
- .map(|v| {
303
- v.as_str()
304
- .map(str::to_string)
305
- .ok_or_else(|| "keep_inline_images_in entries must be strings".to_string())
306
- })
307
- .collect::<std::result::Result<Vec<_>, _>>()?;
308
- }
309
-
310
- if let Some(val) = obj.get("encoding") {
311
- opts.encoding = val
312
- .as_str()
313
- .map(str::to_string)
314
- .ok_or_else(|| "encoding must be a string".to_string())?;
315
- }
316
-
317
- if let Some(val) = obj.get("debug") {
318
- opts.debug = val.as_bool().ok_or_else(|| "debug must be a boolean".to_string())?;
319
- }
320
-
321
- if let Some(val) = obj.get("strip_tags") {
322
- opts.strip_tags = val
323
- .as_array()
324
- .ok_or_else(|| "strip_tags must be an array".to_string())?
325
- .iter()
326
- .map(|v| {
327
- v.as_str()
328
- .map(str::to_string)
329
- .ok_or_else(|| "strip_tags entries must be strings".to_string())
330
- })
331
- .collect::<std::result::Result<Vec<_>, _>>()?;
332
- }
333
-
334
- if let Some(val) = obj.get("preserve_tags") {
335
- opts.preserve_tags = val
336
- .as_array()
337
- .ok_or_else(|| "preserve_tags must be an array".to_string())?
338
- .iter()
339
- .map(|v| {
340
- v.as_str()
341
- .map(str::to_string)
342
- .ok_or_else(|| "preserve_tags entries must be strings".to_string())
343
- })
344
- .collect::<std::result::Result<Vec<_>, _>>()?;
345
- }
346
-
347
- if let Some(val) = obj.get("preprocessing") {
348
- let pre = val
349
- .as_object()
350
- .ok_or_else(|| "preprocessing must be an object".to_string())?;
351
- let mut preprocessing = opts.preprocessing.clone();
352
-
353
- if let Some(v) = pre.get("enabled") {
354
- preprocessing.enabled = v
355
- .as_bool()
356
- .ok_or_else(|| "preprocessing.enabled must be a boolean".to_string())?;
357
- }
358
-
359
- if let Some(v) = pre.get("preset") {
360
- let preset = v
361
- .as_str()
362
- .ok_or_else(|| "preprocessing.preset must be a string".to_string())?;
363
- preprocessing.preset = parse_preprocessing_preset(preset)?;
364
- }
365
-
366
- if let Some(v) = pre.get("remove_navigation") {
367
- preprocessing.remove_navigation = v
368
- .as_bool()
369
- .ok_or_else(|| "preprocessing.remove_navigation must be a boolean".to_string())?;
370
- }
371
-
372
- if let Some(v) = pre.get("remove_forms") {
373
- preprocessing.remove_forms = v
374
- .as_bool()
375
- .ok_or_else(|| "preprocessing.remove_forms must be a boolean".to_string())?;
376
- }
377
-
378
- opts.preprocessing = preprocessing;
379
- }
380
-
381
- Ok(opts)
382
- }
383
-
384
- let value: serde_json::Value =
50
+ // html-to-markdown-rs v2.22.5+ has #[serde(default)] on ConversionOptions,
51
+ // so serde can now handle partial deserialization with defaults for missing fields
52
+ let config: ExtractionConfig =
385
53
  serde_json::from_str(config_str).map_err(|e| format!("Failed to parse config JSON: {}", e))?;
386
54
 
387
- let html_options = value.get("html_options").map(parse_html_options).transpose()?;
388
-
389
- let mut config: ExtractionConfig =
390
- serde_json::from_value(value).map_err(|e| format!("Failed to parse config JSON: {}", e))?;
391
-
392
- if let Some(options) = html_options {
393
- config.html_options = Some(options);
394
- }
395
-
396
55
  Ok(config)
397
56
  }
398
57
 
@@ -596,8 +255,8 @@ mod tests {
596
255
  fn test_parse_extraction_config_with_html_options() {
597
256
  let json = r#"{
598
257
  "html_options": {
599
- "heading_style": "atx",
600
- "escape_asterisks": true,
258
+ "headingStyle": "atx",
259
+ "escapeAsterisks": true,
601
260
  "autolinks": false
602
261
  }
603
262
  }"#;
@@ -620,16 +279,20 @@ mod tests {
620
279
  }
621
280
 
622
281
  #[test]
623
- fn test_parse_extraction_config_invalid_heading_style() {
282
+ fn test_parse_extraction_config_invalid_heading_style_uses_default() {
283
+ // With #[serde(default)], invalid enum values use the default instead of failing
624
284
  let json = r#"{
625
285
  "html_options": {
626
- "heading_style": "invalid_style"
286
+ "headingStyle": "invalid_style"
627
287
  }
628
288
  }"#;
629
289
 
630
290
  let result = parse_extraction_config_from_json(json);
631
- assert!(result.is_err());
632
- assert!(result.unwrap_err().contains("Invalid heading_style"));
291
+ assert!(result.is_ok(), "Parsing should succeed with default values");
292
+
293
+ // Invalid enum values should be ignored and default value used
294
+ let config = result.unwrap();
295
+ assert!(config.html_options.is_some());
633
296
  }
634
297
 
635
298
  #[test]
@@ -641,7 +304,7 @@ mod tests {
641
304
  ];
642
305
 
643
306
  for (input, _expected) in styles {
644
- let json = format!(r#"{{"html_options": {{"heading_style": "{}"}}}}"#, input);
307
+ let json = format!(r#"{{"html_options": {{"headingStyle": "{}"}}}}"#, input);
645
308
  let result = parse_extraction_config_from_json(&json);
646
309
  assert!(result.is_ok(), "Failed to parse heading_style: {}", input);
647
310
  }
@@ -654,8 +317,8 @@ mod tests {
654
317
  "preprocessing": {
655
318
  "enabled": true,
656
319
  "preset": "aggressive",
657
- "remove_navigation": true,
658
- "remove_forms": false
320
+ "removeNavigation": true,
321
+ "removeForms": false
659
322
  }
660
323
  }
661
324
  }"#;
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-tesseract"
3
- version = "4.0.7"
3
+ version = "4.0.8"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -24,7 +24,7 @@ thiserror = { workspace = true }
24
24
  image = { workspace = true }
25
25
 
26
26
  [build-dependencies]
27
- cc = { version = "^1.2.52", optional = true }
27
+ cc = { version = "^1.2.53", optional = true }
28
28
  cmake = { version = "0.1.57", optional = true }
29
29
  zip = { version = "7.1.0", optional = true }
30
30
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kreuzberg
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.0.7
4
+ version: 4.0.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-01-16 00:00:00.000000000 Z
11
+ date: 2026-01-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler