html-to-markdown 2.29.0 → 2.30.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +22 -35
- data/ext/html-to-markdown-rb/native/Cargo.lock +14 -14
- data/ext/html-to-markdown-rb/native/Cargo.toml +1 -1
- data/lib/html_to_markdown/version.rb +1 -1
- data/sig/html_to_markdown.rbs +1 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/html-to-markdown-rs/Cargo.toml +1 -1
- data/vendor/html-to-markdown-rs/README.md +3 -1
- data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +1 -0
- data/vendor/html-to-markdown-rs/src/hocr/mod.rs +11 -0
- data/vendor/html-to-markdown-rs/src/options/conversion.rs +6 -2
- data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: a59dd088c63edcda3711c276290e4377db037bd7a7cab8beb8bcbe83cf52a6f7
|
|
4
|
+
data.tar.gz: f49297af30be7e708bbca200099d471dbaadc0c98b2916489f17e83c07644ba7
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 94a8a04d0c146886a27c184ec5aacd83ed1619611b6666d9a98e3088c1fae5421b38a7b83dc557decb14a49e05f2b70d9ae142d3db61ab8dbf2bde165ee67dcc
|
|
7
|
+
data.tar.gz: b6676434b9dcf908f84d1803de65e2bbc86c4413eae4bd861d7732acca3b6938f9184ffb6b6e32b5326e7f56003fd44c67b851863f3b63931b69a388fb2bcfa6
|
data/Gemfile.lock
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
html-to-markdown (2.29.0)
|
|
4
|
+
html-to-markdown (2.30.0)
|
|
5
5
|
rb_sys (>= 0.9, < 1.0)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
8
8
|
remote: https://rubygems.org/
|
|
9
9
|
specs:
|
|
10
|
-
activesupport (8.1.
|
|
10
|
+
activesupport (8.1.3)
|
|
11
11
|
base64
|
|
12
12
|
bigdecimal
|
|
13
13
|
concurrent-ruby (~> 1.0, >= 1.3.1)
|
|
@@ -20,8 +20,6 @@ GEM
|
|
|
20
20
|
securerandom (>= 0.3)
|
|
21
21
|
tzinfo (~> 2.0, >= 2.0.5)
|
|
22
22
|
uri (>= 0.13.1)
|
|
23
|
-
addressable (2.8.9)
|
|
24
|
-
public_suffix (>= 2.0.2, < 8.0)
|
|
25
23
|
ast (2.4.3)
|
|
26
24
|
base64 (0.3.0)
|
|
27
25
|
bigdecimal (4.0.1)
|
|
@@ -30,18 +28,15 @@ GEM
|
|
|
30
28
|
csv (3.3.5)
|
|
31
29
|
diff-lcs (1.6.2)
|
|
32
30
|
drb (2.2.3)
|
|
33
|
-
ffi (1.17.
|
|
34
|
-
ffi (1.17.
|
|
35
|
-
ffi (1.17.
|
|
36
|
-
ffi (1.17.
|
|
37
|
-
ffi (1.17.
|
|
31
|
+
ffi (1.17.4-aarch64-linux-gnu)
|
|
32
|
+
ffi (1.17.4-arm64-darwin)
|
|
33
|
+
ffi (1.17.4-x64-mingw-ucrt)
|
|
34
|
+
ffi (1.17.4-x86_64-darwin)
|
|
35
|
+
ffi (1.17.4-x86_64-linux-gnu)
|
|
38
36
|
fileutils (1.8.0)
|
|
39
37
|
i18n (1.14.8)
|
|
40
38
|
concurrent-ruby (~> 1.0)
|
|
41
|
-
json (2.19.
|
|
42
|
-
json-schema (6.2.0)
|
|
43
|
-
addressable (~> 2.8)
|
|
44
|
-
bigdecimal (>= 3.1, < 5)
|
|
39
|
+
json (2.19.3)
|
|
45
40
|
language_server-protocol (3.17.0.5)
|
|
46
41
|
lint_roller (1.1.0)
|
|
47
42
|
listen (3.10.0)
|
|
@@ -49,18 +44,15 @@ GEM
|
|
|
49
44
|
rb-fsevent (~> 0.10, >= 0.10.3)
|
|
50
45
|
rb-inotify (~> 0.9, >= 0.9.10)
|
|
51
46
|
logger (1.7.0)
|
|
52
|
-
mcp (0.9.0)
|
|
53
|
-
json-schema (>= 4.1)
|
|
54
47
|
minitest (6.0.2)
|
|
55
48
|
drb (~> 2.0)
|
|
56
49
|
prism (~> 1.5)
|
|
57
50
|
mutex_m (0.3.0)
|
|
58
51
|
parallel (1.27.0)
|
|
59
|
-
parser (3.3.
|
|
52
|
+
parser (3.3.11.1)
|
|
60
53
|
ast (~> 2.4.1)
|
|
61
54
|
racc
|
|
62
55
|
prism (1.9.0)
|
|
63
|
-
public_suffix (7.0.5)
|
|
64
56
|
racc (1.8.1)
|
|
65
57
|
rainbow (3.1.1)
|
|
66
58
|
rake (13.3.1)
|
|
@@ -72,7 +64,7 @@ GEM
|
|
|
72
64
|
ffi (~> 1.0)
|
|
73
65
|
rb_sys (0.9.124)
|
|
74
66
|
rake-compiler-dock (= 1.11.0)
|
|
75
|
-
rbs (3.10.
|
|
67
|
+
rbs (3.10.4)
|
|
76
68
|
logger
|
|
77
69
|
tsort
|
|
78
70
|
regexp_parser (2.11.3)
|
|
@@ -89,11 +81,10 @@ GEM
|
|
|
89
81
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
90
82
|
rspec-support (~> 3.13.0)
|
|
91
83
|
rspec-support (3.13.7)
|
|
92
|
-
rubocop (1.
|
|
84
|
+
rubocop (1.86.0)
|
|
93
85
|
json (~> 2.3)
|
|
94
86
|
language_server-protocol (~> 3.17.0.2)
|
|
95
87
|
lint_roller (~> 1.1.0)
|
|
96
|
-
mcp (~> 0.6)
|
|
97
88
|
parallel (~> 1.10)
|
|
98
89
|
parser (>= 3.3.0.2)
|
|
99
90
|
rainbow (>= 2.2.2, < 4.0)
|
|
@@ -156,8 +147,7 @@ DEPENDENCIES
|
|
|
156
147
|
steep
|
|
157
148
|
|
|
158
149
|
CHECKSUMS
|
|
159
|
-
activesupport (8.1.
|
|
160
|
-
addressable (2.8.9) sha256=cc154fcbe689711808a43601dee7b980238ce54368d23e127421753e46895485
|
|
150
|
+
activesupport (8.1.3) sha256=21a5e0dfbd4c3ddd9e1317ec6a4d782fa226e7867dc70b0743acda81a1dca20e
|
|
161
151
|
ast (2.4.3) sha256=954615157c1d6a382bc27d690d973195e79db7f55e9765ac7c481c60bdb4d383
|
|
162
152
|
base64 (0.3.0) sha256=27337aeabad6ffae05c265c450490628ef3ebd4b67be58257393227588f5a97b
|
|
163
153
|
bigdecimal (4.0.1) sha256=8b07d3d065a9f921c80ceaea7c9d4ae596697295b584c296fe599dd0ad01c4a7
|
|
@@ -166,27 +156,24 @@ CHECKSUMS
|
|
|
166
156
|
csv (3.3.5) sha256=6e5134ac3383ef728b7f02725d9872934f523cb40b961479f69cf3afa6c8e73f
|
|
167
157
|
diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962
|
|
168
158
|
drb (2.2.3) sha256=0b00d6fdb50995fe4a45dea13663493c841112e4068656854646f418fda13373
|
|
169
|
-
ffi (1.17.
|
|
170
|
-
ffi (1.17.
|
|
171
|
-
ffi (1.17.
|
|
172
|
-
ffi (1.17.
|
|
173
|
-
ffi (1.17.
|
|
159
|
+
ffi (1.17.4-aarch64-linux-gnu) sha256=b208f06f91ffd8f5e1193da3cae3d2ccfc27fc36fba577baf698d26d91c080df
|
|
160
|
+
ffi (1.17.4-arm64-darwin) sha256=19071aaf1419251b0a46852abf960e77330a3b334d13a4ab51d58b31a937001b
|
|
161
|
+
ffi (1.17.4-x64-mingw-ucrt) sha256=f6ff9618cfccc494138bddade27aa06c74c6c7bc367a1ea1103d80c2fcb9ed35
|
|
162
|
+
ffi (1.17.4-x86_64-darwin) sha256=aa70390523cf3235096cf64962b709b4cfbd5c082a2cb2ae714eb0fe2ccda496
|
|
163
|
+
ffi (1.17.4-x86_64-linux-gnu) sha256=9d3db14c2eae074b382fa9c083fe95aec6e0a1451da249eab096c34002bc752d
|
|
174
164
|
fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
|
|
175
|
-
html-to-markdown (2.29.0)
|
|
165
|
+
html-to-markdown (2.30.0)
|
|
176
166
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
177
|
-
json (2.19.
|
|
178
|
-
json-schema (6.2.0) sha256=e8bff46ed845a22c1ab2bd0d7eccf831c01fe23bb3920caa4c74db4306813666
|
|
167
|
+
json (2.19.3) sha256=289b0bb53052a1fa8c34ab33cc750b659ba14a5c45f3fcf4b18762dc67c78646
|
|
179
168
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
180
169
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
181
170
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
|
182
171
|
logger (1.7.0) sha256=196edec7cc44b66cfb40f9755ce11b392f21f7967696af15d274dde7edff0203
|
|
183
|
-
mcp (0.9.0) sha256=a0a3737b0ac9df0772f4ef7e2b013c260ddbcf217a5d50a66bff0baeddf03e47
|
|
184
172
|
minitest (6.0.2) sha256=db6e57956f6ecc6134683b4c87467d6dd792323c7f0eea7b93f66bd284adbc3d
|
|
185
173
|
mutex_m (0.3.0) sha256=cfcb04ac16b69c4813777022fdceda24e9f798e48092a2b817eb4c0a782b0751
|
|
186
174
|
parallel (1.27.0) sha256=4ac151e1806b755fb4e2dc2332cbf0e54f2e24ba821ff2d3dcf86bf6dc4ae130
|
|
187
|
-
parser (3.3.
|
|
175
|
+
parser (3.3.11.1) sha256=d17ace7aabe3e72c3cc94043714be27cc6f852f104d81aa284c2281aecc65d54
|
|
188
176
|
prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85
|
|
189
|
-
public_suffix (7.0.5) sha256=1a8bb08f1bbea19228d3bed6e5ed908d1cb4f7c2726d18bd9cadf60bc676f623
|
|
190
177
|
racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f
|
|
191
178
|
rainbow (3.1.1) sha256=039491aa3a89f42efa1d6dec2fc4e62ede96eb6acd95e52f1ad581182b79bc6a
|
|
192
179
|
rake (13.3.1) sha256=8c9e89d09f66a26a01264e7e3480ec0607f0c497a861ef16063604b1b08eb19c
|
|
@@ -195,14 +182,14 @@ CHECKSUMS
|
|
|
195
182
|
rb-fsevent (0.11.2) sha256=43900b972e7301d6570f64b850a5aa67833ee7d87b458ee92805d56b7318aefe
|
|
196
183
|
rb-inotify (0.11.1) sha256=a0a700441239b0ff18eb65e3866236cd78613d6b9f78fea1f9ac47a85e47be6e
|
|
197
184
|
rb_sys (0.9.124) sha256=513476557b12eaf73764b3da9f8746024558fe8699bda785fb548c9aa3877ae7
|
|
198
|
-
rbs (3.10.
|
|
185
|
+
rbs (3.10.4) sha256=b17d7c4be4bb31a11a3b529830f0aa206a807ca42f2e7921a3027dfc6b7e5ce8
|
|
199
186
|
regexp_parser (2.11.3) sha256=ca13f381a173b7a93450e53459075c9b76a10433caadcb2f1180f2c741fc55a4
|
|
200
187
|
rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
|
|
201
188
|
rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d
|
|
202
189
|
rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
|
|
203
190
|
rspec-mocks (3.13.8) sha256=086ad3d3d17533f4237643de0b5c42f04b66348c28bf6b9c2d3f4a3b01af1d47
|
|
204
191
|
rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c
|
|
205
|
-
rubocop (1.
|
|
192
|
+
rubocop (1.86.0) sha256=4ff1186fe16ebe9baff5e7aad66bb0ad4cabf5cdcd419f773146dbba2565d186
|
|
206
193
|
rubocop-ast (1.49.1) sha256=4412f3ee70f6fe4546cc489548e0f6fcf76cafcfa80fa03af67098ffed755035
|
|
207
194
|
rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
|
|
208
195
|
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
|
|
@@ -155,9 +155,9 @@ checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495"
|
|
|
155
155
|
|
|
156
156
|
[[package]]
|
|
157
157
|
name = "cc"
|
|
158
|
-
version = "1.2.
|
|
158
|
+
version = "1.2.58"
|
|
159
159
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
160
|
-
checksum = "
|
|
160
|
+
checksum = "e1e928d4b69e3077709075a938a05ffbedfa53a84c8f766efbf8220bb1ff60e1"
|
|
161
161
|
dependencies = [
|
|
162
162
|
"find-msvc-tools",
|
|
163
163
|
"shlex",
|
|
@@ -452,7 +452,7 @@ dependencies = [
|
|
|
452
452
|
|
|
453
453
|
[[package]]
|
|
454
454
|
name = "html-to-markdown-rb"
|
|
455
|
-
version = "2.
|
|
455
|
+
version = "2.29.0"
|
|
456
456
|
dependencies = [
|
|
457
457
|
"html-to-markdown-rs",
|
|
458
458
|
"magnus",
|
|
@@ -462,7 +462,7 @@ dependencies = [
|
|
|
462
462
|
|
|
463
463
|
[[package]]
|
|
464
464
|
name = "html-to-markdown-rs"
|
|
465
|
-
version = "2.
|
|
465
|
+
version = "2.29.0"
|
|
466
466
|
dependencies = [
|
|
467
467
|
"ahash",
|
|
468
468
|
"astral-tl",
|
|
@@ -1143,9 +1143,9 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
|
|
|
1143
1143
|
|
|
1144
1144
|
[[package]]
|
|
1145
1145
|
name = "simd-adler32"
|
|
1146
|
-
version = "0.3.
|
|
1146
|
+
version = "0.3.9"
|
|
1147
1147
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1148
|
-
checksum = "
|
|
1148
|
+
checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214"
|
|
1149
1149
|
|
|
1150
1150
|
[[package]]
|
|
1151
1151
|
name = "siphasher"
|
|
@@ -1206,9 +1206,9 @@ dependencies = [
|
|
|
1206
1206
|
|
|
1207
1207
|
[[package]]
|
|
1208
1208
|
name = "symbolic-common"
|
|
1209
|
-
version = "12.17.
|
|
1209
|
+
version = "12.17.3"
|
|
1210
1210
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1211
|
-
checksum = "
|
|
1211
|
+
checksum = "52ca086c1eb5c7ee74b151ba83c6487d5d33f8c08ad991b86f3f58f6629e68d5"
|
|
1212
1212
|
dependencies = [
|
|
1213
1213
|
"debugid",
|
|
1214
1214
|
"memmap2",
|
|
@@ -1218,9 +1218,9 @@ dependencies = [
|
|
|
1218
1218
|
|
|
1219
1219
|
[[package]]
|
|
1220
1220
|
name = "symbolic-demangle"
|
|
1221
|
-
version = "12.17.
|
|
1221
|
+
version = "12.17.3"
|
|
1222
1222
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1223
|
-
checksum = "
|
|
1223
|
+
checksum = "baa911a28a62823aaf2cc2e074212492a3ee69d0d926cc8f5b12b4a108ff5c0c"
|
|
1224
1224
|
dependencies = [
|
|
1225
1225
|
"cpp_demangle",
|
|
1226
1226
|
"rustc-demangle",
|
|
@@ -1316,9 +1316,9 @@ checksum = "1292c0d970b54115d14f2492fe0170adf21d68a1de108eebc51c1df4f346a091"
|
|
|
1316
1316
|
|
|
1317
1317
|
[[package]]
|
|
1318
1318
|
name = "uuid"
|
|
1319
|
-
version = "1.
|
|
1319
|
+
version = "1.23.0"
|
|
1320
1320
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1321
|
-
checksum = "
|
|
1321
|
+
checksum = "5ac8b6f42ead25368cf5b098aeb3dc8a1a2c05a3eee8a9a1a68c640edbfc79d9"
|
|
1322
1322
|
dependencies = [
|
|
1323
1323
|
"js-sys",
|
|
1324
1324
|
"wasm-bindgen",
|
|
@@ -1622,9 +1622,9 @@ checksum = "cb8a0807f7c01457d0379ba880ba6322660448ddebc890ce29bb64da71fb40f9"
|
|
|
1622
1622
|
|
|
1623
1623
|
[[package]]
|
|
1624
1624
|
name = "zune-jpeg"
|
|
1625
|
-
version = "0.5.
|
|
1625
|
+
version = "0.5.15"
|
|
1626
1626
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1627
|
-
checksum = "
|
|
1627
|
+
checksum = "27bc9d5b815bc103f142aa054f561d9187d191692ec7c2d1e2b4737f8dbd7296"
|
|
1628
1628
|
dependencies = [
|
|
1629
1629
|
"zune-core",
|
|
1630
1630
|
]
|
data/sig/html_to_markdown.rbs
CHANGED
|
@@ -62,7 +62,7 @@ module HtmlToMarkdown
|
|
|
62
62
|
autolinks?: bool,
|
|
63
63
|
default_title?: bool,
|
|
64
64
|
br_in_tables?: bool,
|
|
65
|
-
hocr_spatial_tables?: bool,
|
|
65
|
+
hocr_spatial_tables?: bool, # Deprecated since 2.30.0: hOCR support will be removed in v3.
|
|
66
66
|
highlight_style?: highlight_style,
|
|
67
67
|
extract_metadata?: bool,
|
|
68
68
|
whitespace_mode?: whitespace_mode,
|
data/vendor/Cargo.toml
CHANGED
|
@@ -3,7 +3,7 @@ members = ["html-to-markdown-rs"]
|
|
|
3
3
|
resolver = "2"
|
|
4
4
|
|
|
5
5
|
[workspace.package]
|
|
6
|
-
version = "2.
|
|
6
|
+
version = "2.30.0"
|
|
7
7
|
edition = "2024"
|
|
8
8
|
rust-version = "1.85"
|
|
9
9
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -17,9 +17,9 @@ async-trait = "0.1"
|
|
|
17
17
|
base64 = "0.22"
|
|
18
18
|
clap = { version = "4.6", features = ["derive"] }
|
|
19
19
|
clap_complete = "4.6"
|
|
20
|
-
clap_mangen = "0.
|
|
20
|
+
clap_mangen = "0.3"
|
|
21
21
|
encoding_rs = "0.8"
|
|
22
|
-
ext-php-rs = "0.15.
|
|
22
|
+
ext-php-rs = "0.15.7"
|
|
23
23
|
html5ever = "0.39.0"
|
|
24
24
|
once_cell = "1.21"
|
|
25
25
|
pyo3 = { version = "0.28.2", features = ["abi3-py310"] }
|
|
@@ -121,7 +121,9 @@ options.preprocessing.remove_forms = true;
|
|
|
121
121
|
let markdown = convert(scraped_html, Some(options))?;
|
|
122
122
|
```
|
|
123
123
|
|
|
124
|
-
## hOCR Table Extraction
|
|
124
|
+
## hOCR Table Extraction (Deprecated)
|
|
125
|
+
|
|
126
|
+
> **Deprecated since 2.30.0**: hOCR support will be removed in v3.
|
|
125
127
|
|
|
126
128
|
```rust
|
|
127
129
|
use html_to_markdown_rs::convert;
|
|
@@ -286,6 +286,7 @@ pub fn is_inline_element(tag_name: &str) -> bool {
|
|
|
286
286
|
}
|
|
287
287
|
|
|
288
288
|
/// Handle hOCR document conversion, returning true if handled, false if not hOCR.
|
|
289
|
+
#[allow(deprecated)]
|
|
289
290
|
pub fn handle_hocr_document(
|
|
290
291
|
dom: &tl::VDom<'_>,
|
|
291
292
|
parser: &tl::Parser<'_>,
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
#![allow(clippy::cast_precision_loss, clippy::cast_sign_loss, clippy::unused_self)]
|
|
2
2
|
//! hOCR 1.2 document processing.
|
|
3
3
|
//!
|
|
4
|
+
//! **Deprecated since 2.30.0**: hOCR support will be removed in v3.
|
|
5
|
+
//!
|
|
4
6
|
//! Complete hOCR 1.2 specification support for extracting structured content from OCR documents.
|
|
5
7
|
//!
|
|
6
8
|
//! ## Features
|
|
@@ -19,13 +21,22 @@
|
|
|
19
21
|
//! - [`converter`]: hOCR to Markdown conversion
|
|
20
22
|
//! - [`spatial`]: Spatial table reconstruction from bounding boxes
|
|
21
23
|
|
|
24
|
+
#[allow(deprecated)]
|
|
22
25
|
pub mod converter;
|
|
26
|
+
#[allow(deprecated)]
|
|
23
27
|
pub mod extractor;
|
|
28
|
+
#[allow(deprecated)]
|
|
24
29
|
pub mod parser;
|
|
30
|
+
#[allow(deprecated)]
|
|
25
31
|
pub mod spatial;
|
|
32
|
+
#[allow(deprecated)]
|
|
26
33
|
pub mod types;
|
|
27
34
|
|
|
35
|
+
#[deprecated(since = "2.30.0", note = "hOCR support will be removed in v3.")]
|
|
28
36
|
pub use converter::{convert_to_markdown, convert_to_markdown_with_options};
|
|
37
|
+
#[deprecated(since = "2.30.0", note = "hOCR support will be removed in v3.")]
|
|
29
38
|
pub use extractor::extract_hocr_document;
|
|
39
|
+
#[deprecated(since = "2.30.0", note = "hOCR support will be removed in v3.")]
|
|
30
40
|
pub use spatial::{HocrWord, extract_hocr_words, reconstruct_table, table_to_markdown};
|
|
41
|
+
#[deprecated(since = "2.30.0", note = "hOCR support will be removed in v3.")]
|
|
31
42
|
pub use types::{BBox, Baseline, HocrElement, HocrElementType, HocrMetadata, HocrProperties};
|
|
@@ -62,7 +62,9 @@ pub struct ConversionOptions {
|
|
|
62
62
|
/// Use HTML <br> elements in tables instead of spaces for line breaks
|
|
63
63
|
pub br_in_tables: bool,
|
|
64
64
|
|
|
65
|
-
/// Enable spatial table reconstruction in hOCR documents (via spatial positioning analysis)
|
|
65
|
+
/// Enable spatial table reconstruction in hOCR documents (via spatial positioning analysis).
|
|
66
|
+
///
|
|
67
|
+
/// **Deprecated since 2.30.0**: hOCR support will be removed in v3.
|
|
66
68
|
pub hocr_spatial_tables: bool,
|
|
67
69
|
|
|
68
70
|
/// Highlight style for <mark> elements (`DoubleEqual`, Html, Bold, None)
|
|
@@ -176,7 +178,9 @@ pub struct ConversionOptionsUpdate {
|
|
|
176
178
|
/// Optional HTML <br> usage in tables override
|
|
177
179
|
pub br_in_tables: Option<bool>,
|
|
178
180
|
|
|
179
|
-
/// Optional spatial table reconstruction for hOCR documents override
|
|
181
|
+
/// Optional spatial table reconstruction for hOCR documents override.
|
|
182
|
+
///
|
|
183
|
+
/// **Deprecated since 2.30.0**: hOCR support will be removed in v3.
|
|
180
184
|
pub hocr_spatial_tables: Option<bool>,
|
|
181
185
|
|
|
182
186
|
/// Optional highlight style override for <mark> elements
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html-to-markdown
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.29.0
|
|
4
|
+
version: 2.30.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-03-
|
|
11
|
+
date: 2026-03-27 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|