html-to-markdown 2.3.0__tar.gz → 2.3.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/Cargo.lock +25 -25
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/Cargo.toml +2 -2
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/PKG-INFO +1 -1
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/src/hocr/converter.rs +267 -39
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown-py/README.md +7 -7
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/html_to_markdown/__init__.py +12 -2
- html_to_markdown-2.3.3/html_to_markdown/api.py +143 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/html_to_markdown/bin/html-to-markdown +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/pyproject.toml +1 -1
- html_to_markdown-2.3.0/html_to_markdown/api.py +0 -74
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/LICENSE +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/README_PYPI.md +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/Cargo.toml +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/README.md +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/benches/conversion_benchmark.rs +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/benches/micro_benchmark.rs +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/benches/profiling_benchmark.rs +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/examples/basic.rs +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/examples/table.rs +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/examples/test_escape.rs +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/examples/test_inline_formatting.rs +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/examples/test_lists.rs +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/examples/test_semantic_tags.rs +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/examples/test_tables.rs +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/examples/test_task_lists.rs +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/examples/test_whitespace.rs +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/src/converter.rs +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/src/error.rs +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/src/hocr/extractor.rs +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/src/hocr/mod.rs +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/src/hocr/parser.rs +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/src/hocr/spatial.rs +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/src/hocr/types.rs +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/src/inline_images.rs +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/src/lib.rs +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/src/options.rs +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/src/sanitizer.rs +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/src/text.rs +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/src/wrapper.rs +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/tests/commonmark_compliance_test.rs +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/tests/hocr_compliance_test.rs +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/tests/integration_test.rs +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown-py/Cargo.toml +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown-py/python/html_to_markdown/__init__.py +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown-py/python/html_to_markdown/_html_to_markdown.pyi +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown-py/src/lib.rs +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown-py/uv.lock +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/html_to_markdown/__main__.py +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/html_to_markdown/_rust.pyi +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/html_to_markdown/cli.py +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/html_to_markdown/cli_proxy.py +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/html_to_markdown/exceptions.py +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/html_to_markdown/options.py +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/html_to_markdown/py.typed +0 -0
- {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/html_to_markdown/v1_compat.py +0 -0
|
@@ -200,9 +200,9 @@ dependencies = [
|
|
|
200
200
|
|
|
201
201
|
[[package]]
|
|
202
202
|
name = "clap"
|
|
203
|
-
version = "4.5.
|
|
203
|
+
version = "4.5.49"
|
|
204
204
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
205
|
-
checksum = "
|
|
205
|
+
checksum = "f4512b90fa68d3a9932cea5184017c5d200f5921df706d45e853537dea51508f"
|
|
206
206
|
dependencies = [
|
|
207
207
|
"clap_builder",
|
|
208
208
|
"clap_derive",
|
|
@@ -210,9 +210,9 @@ dependencies = [
|
|
|
210
210
|
|
|
211
211
|
[[package]]
|
|
212
212
|
name = "clap_builder"
|
|
213
|
-
version = "4.5.
|
|
213
|
+
version = "4.5.49"
|
|
214
214
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
215
|
-
checksum = "
|
|
215
|
+
checksum = "0025e98baa12e766c67ba13ff4695a887a1eba19569aad00a472546795bd6730"
|
|
216
216
|
dependencies = [
|
|
217
217
|
"anstream",
|
|
218
218
|
"anstyle",
|
|
@@ -222,18 +222,18 @@ dependencies = [
|
|
|
222
222
|
|
|
223
223
|
[[package]]
|
|
224
224
|
name = "clap_complete"
|
|
225
|
-
version = "4.5.
|
|
225
|
+
version = "4.5.59"
|
|
226
226
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
227
|
-
checksum = "
|
|
227
|
+
checksum = "2348487adcd4631696ced64ccdb40d38ac4d31cae7f2eec8817fcea1b9d1c43c"
|
|
228
228
|
dependencies = [
|
|
229
229
|
"clap",
|
|
230
230
|
]
|
|
231
231
|
|
|
232
232
|
[[package]]
|
|
233
233
|
name = "clap_derive"
|
|
234
|
-
version = "4.5.
|
|
234
|
+
version = "4.5.49"
|
|
235
235
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
236
|
-
checksum = "
|
|
236
|
+
checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671"
|
|
237
237
|
dependencies = [
|
|
238
238
|
"heck",
|
|
239
239
|
"proc-macro2",
|
|
@@ -243,15 +243,15 @@ dependencies = [
|
|
|
243
243
|
|
|
244
244
|
[[package]]
|
|
245
245
|
name = "clap_lex"
|
|
246
|
-
version = "0.7.
|
|
246
|
+
version = "0.7.6"
|
|
247
247
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
248
|
-
checksum = "
|
|
248
|
+
checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d"
|
|
249
249
|
|
|
250
250
|
[[package]]
|
|
251
251
|
name = "clap_mangen"
|
|
252
|
-
version = "0.2.
|
|
252
|
+
version = "0.2.30"
|
|
253
253
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
254
|
-
checksum = "
|
|
254
|
+
checksum = "263c8214a8e0cb8129f3c62036c50e9c6e15c7bd364c42e0437c492b9293f778"
|
|
255
255
|
dependencies = [
|
|
256
256
|
"clap",
|
|
257
257
|
"roff",
|
|
@@ -583,9 +583,9 @@ dependencies = [
|
|
|
583
583
|
|
|
584
584
|
[[package]]
|
|
585
585
|
name = "half"
|
|
586
|
-
version = "2.7.
|
|
586
|
+
version = "2.7.1"
|
|
587
587
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
588
|
-
checksum = "
|
|
588
|
+
checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b"
|
|
589
589
|
dependencies = [
|
|
590
590
|
"cfg-if",
|
|
591
591
|
"crunchy",
|
|
@@ -615,7 +615,7 @@ dependencies = [
|
|
|
615
615
|
|
|
616
616
|
[[package]]
|
|
617
617
|
name = "html-to-markdown-cli"
|
|
618
|
-
version = "2.3.
|
|
618
|
+
version = "2.3.3"
|
|
619
619
|
dependencies = [
|
|
620
620
|
"assert_cmd",
|
|
621
621
|
"clap",
|
|
@@ -629,7 +629,7 @@ dependencies = [
|
|
|
629
629
|
|
|
630
630
|
[[package]]
|
|
631
631
|
name = "html-to-markdown-node"
|
|
632
|
-
version = "2.3.
|
|
632
|
+
version = "2.3.3"
|
|
633
633
|
dependencies = [
|
|
634
634
|
"html-to-markdown-rs",
|
|
635
635
|
"mimalloc-rust",
|
|
@@ -640,7 +640,7 @@ dependencies = [
|
|
|
640
640
|
|
|
641
641
|
[[package]]
|
|
642
642
|
name = "html-to-markdown-py"
|
|
643
|
-
version = "2.3.
|
|
643
|
+
version = "2.3.3"
|
|
644
644
|
dependencies = [
|
|
645
645
|
"base64",
|
|
646
646
|
"html-to-markdown-rs",
|
|
@@ -650,7 +650,7 @@ dependencies = [
|
|
|
650
650
|
|
|
651
651
|
[[package]]
|
|
652
652
|
name = "html-to-markdown-rs"
|
|
653
|
-
version = "2.3.
|
|
653
|
+
version = "2.3.3"
|
|
654
654
|
dependencies = [
|
|
655
655
|
"ammonia",
|
|
656
656
|
"base64",
|
|
@@ -667,7 +667,7 @@ dependencies = [
|
|
|
667
667
|
|
|
668
668
|
[[package]]
|
|
669
669
|
name = "html-to-markdown-wasm"
|
|
670
|
-
version = "2.3.
|
|
670
|
+
version = "2.3.3"
|
|
671
671
|
dependencies = [
|
|
672
672
|
"console_error_panic_hook",
|
|
673
673
|
"getrandom 0.2.16",
|
|
@@ -1435,9 +1435,9 @@ dependencies = [
|
|
|
1435
1435
|
|
|
1436
1436
|
[[package]]
|
|
1437
1437
|
name = "regex"
|
|
1438
|
-
version = "1.12.
|
|
1438
|
+
version = "1.12.2"
|
|
1439
1439
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1440
|
-
checksum = "
|
|
1440
|
+
checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4"
|
|
1441
1441
|
dependencies = [
|
|
1442
1442
|
"aho-corasick",
|
|
1443
1443
|
"memchr",
|
|
@@ -1447,9 +1447,9 @@ dependencies = [
|
|
|
1447
1447
|
|
|
1448
1448
|
[[package]]
|
|
1449
1449
|
name = "regex-automata"
|
|
1450
|
-
version = "0.4.
|
|
1450
|
+
version = "0.4.13"
|
|
1451
1451
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1452
|
-
checksum = "
|
|
1452
|
+
checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c"
|
|
1453
1453
|
dependencies = [
|
|
1454
1454
|
"aho-corasick",
|
|
1455
1455
|
"memchr",
|
|
@@ -1458,9 +1458,9 @@ dependencies = [
|
|
|
1458
1458
|
|
|
1459
1459
|
[[package]]
|
|
1460
1460
|
name = "regex-syntax"
|
|
1461
|
-
version = "0.8.
|
|
1461
|
+
version = "0.8.8"
|
|
1462
1462
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1463
|
-
checksum = "
|
|
1463
|
+
checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58"
|
|
1464
1464
|
|
|
1465
1465
|
[[package]]
|
|
1466
1466
|
name = "roff"
|
|
@@ -3,7 +3,7 @@ resolver = "2"
|
|
|
3
3
|
members = ["crates/html-to-markdown-py"]
|
|
4
4
|
|
|
5
5
|
[workspace.package]
|
|
6
|
-
version = "2.3.
|
|
6
|
+
version = "2.3.3"
|
|
7
7
|
edition = "2021"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
9
9
|
license = "MIT"
|
|
@@ -15,7 +15,7 @@ rust-version = "1.80"
|
|
|
15
15
|
|
|
16
16
|
[workspace.dependencies]
|
|
17
17
|
# Core library
|
|
18
|
-
html-to-markdown-rs = { version = "2.3.
|
|
18
|
+
html-to-markdown-rs = { version = "2.3.3", path = "crates/html-to-markdown" }
|
|
19
19
|
|
|
20
20
|
# HTML parsing and sanitization
|
|
21
21
|
tl = "0.7"
|
{html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/src/hocr/converter.rs
RENAMED
|
@@ -156,8 +156,15 @@ fn convert_element(
|
|
|
156
156
|
|
|
157
157
|
// Paragraphs
|
|
158
158
|
HocrElementType::OcrPar => {
|
|
159
|
-
|
|
160
|
-
|
|
159
|
+
let bullet_paragraph = is_bullet_paragraph(element);
|
|
160
|
+
if !output.is_empty() {
|
|
161
|
+
if bullet_paragraph {
|
|
162
|
+
if !output.ends_with('\n') {
|
|
163
|
+
output.push('\n');
|
|
164
|
+
}
|
|
165
|
+
} else if !output.ends_with("\n\n") {
|
|
166
|
+
output.push_str("\n\n");
|
|
167
|
+
}
|
|
161
168
|
}
|
|
162
169
|
|
|
163
170
|
if let Some(heading) = detect_heading_paragraph(element) {
|
|
@@ -188,7 +195,13 @@ fn convert_element(
|
|
|
188
195
|
if output.ends_with(' ') {
|
|
189
196
|
output.pop();
|
|
190
197
|
}
|
|
191
|
-
|
|
198
|
+
if bullet_paragraph {
|
|
199
|
+
if !output.ends_with('\n') {
|
|
200
|
+
output.push('\n');
|
|
201
|
+
}
|
|
202
|
+
} else {
|
|
203
|
+
output.push_str("\n\n");
|
|
204
|
+
}
|
|
192
205
|
}
|
|
193
206
|
|
|
194
207
|
// Blockquotes
|
|
@@ -588,6 +601,43 @@ fn try_spatial_table_reconstruction(element: &HocrElement) -> Option<String> {
|
|
|
588
601
|
None
|
|
589
602
|
}
|
|
590
603
|
|
|
604
|
+
fn is_bullet_paragraph(element: &HocrElement) -> bool {
|
|
605
|
+
if element.element_type != HocrElementType::OcrPar {
|
|
606
|
+
return false;
|
|
607
|
+
}
|
|
608
|
+
|
|
609
|
+
let text = element_text_content(element);
|
|
610
|
+
let trimmed = text.trim_start();
|
|
611
|
+
if trimmed.is_empty() {
|
|
612
|
+
return false;
|
|
613
|
+
}
|
|
614
|
+
|
|
615
|
+
if matches!(trimmed.chars().next(), Some('•' | '●' | '-' | '+' | '*')) {
|
|
616
|
+
return true;
|
|
617
|
+
}
|
|
618
|
+
|
|
619
|
+
let mut chars = trimmed.chars().peekable();
|
|
620
|
+
let mut digit_count = 0;
|
|
621
|
+
while let Some(&ch) = chars.peek() {
|
|
622
|
+
if ch.is_ascii_digit() {
|
|
623
|
+
digit_count += 1;
|
|
624
|
+
chars.next();
|
|
625
|
+
} else {
|
|
626
|
+
break;
|
|
627
|
+
}
|
|
628
|
+
}
|
|
629
|
+
|
|
630
|
+
if digit_count > 0 {
|
|
631
|
+
if let Some(&ch) = chars.peek() {
|
|
632
|
+
if (ch == '.' || ch == ')') && chars.clone().nth(1).map(|c| c.is_whitespace()).unwrap_or(false) {
|
|
633
|
+
return true;
|
|
634
|
+
}
|
|
635
|
+
}
|
|
636
|
+
}
|
|
637
|
+
|
|
638
|
+
false
|
|
639
|
+
}
|
|
640
|
+
|
|
591
641
|
#[derive(Clone)]
|
|
592
642
|
struct CodeLineInfo {
|
|
593
643
|
text: String,
|
|
@@ -662,6 +712,10 @@ fn collect_code_block(children: &[&HocrElement]) -> Option<(Vec<String>, usize,
|
|
|
662
712
|
return None;
|
|
663
713
|
}
|
|
664
714
|
|
|
715
|
+
if !is_confident_code_block(&collected) {
|
|
716
|
+
return None;
|
|
717
|
+
}
|
|
718
|
+
|
|
665
719
|
// Determine base indentation metrics
|
|
666
720
|
let mut x_values: Vec<u32> = collected
|
|
667
721
|
.iter()
|
|
@@ -844,6 +898,42 @@ fn is_bullet_like(line: &str) -> bool {
|
|
|
844
898
|
false
|
|
845
899
|
}
|
|
846
900
|
|
|
901
|
+
fn contains_keyword_token(text: &str, keyword: &str) -> bool {
|
|
902
|
+
text.split(|ch: char| !(ch.is_ascii_alphanumeric() || ch == '_'))
|
|
903
|
+
.any(|token| token == keyword)
|
|
904
|
+
}
|
|
905
|
+
|
|
906
|
+
fn is_shell_prompt(text: &str) -> bool {
|
|
907
|
+
let trimmed = text.trim_start();
|
|
908
|
+
if trimmed.is_empty() {
|
|
909
|
+
return false;
|
|
910
|
+
}
|
|
911
|
+
|
|
912
|
+
trimmed.starts_with('$')
|
|
913
|
+
|| trimmed.starts_with('#')
|
|
914
|
+
|| trimmed.contains("]#")
|
|
915
|
+
|| trimmed.starts_with("sudo ")
|
|
916
|
+
|| trimmed.starts_with("./")
|
|
917
|
+
|| trimmed.starts_with("python ")
|
|
918
|
+
|| trimmed.starts_with("pip ")
|
|
919
|
+
|| trimmed.starts_with("uv ")
|
|
920
|
+
}
|
|
921
|
+
|
|
922
|
+
fn starts_with_keyword(trimmed: &str, keyword: &str) -> bool {
|
|
923
|
+
if !trimmed.starts_with(keyword) {
|
|
924
|
+
return false;
|
|
925
|
+
}
|
|
926
|
+
if let Some(first) = trimmed.chars().next() {
|
|
927
|
+
if !first.is_ascii_lowercase() {
|
|
928
|
+
return false;
|
|
929
|
+
}
|
|
930
|
+
}
|
|
931
|
+
match trimmed.chars().nth(keyword.len()) {
|
|
932
|
+
None => true,
|
|
933
|
+
Some(ch) => ch.is_whitespace() || matches!(ch, '(' | ':' | '{' | '[' | '.'),
|
|
934
|
+
}
|
|
935
|
+
}
|
|
936
|
+
|
|
847
937
|
fn is_code_paragraph(lines: &[CodeLineInfo]) -> bool {
|
|
848
938
|
if lines.is_empty() {
|
|
849
939
|
return false;
|
|
@@ -865,47 +955,70 @@ fn is_code_paragraph(lines: &[CodeLineInfo]) -> bool {
|
|
|
865
955
|
|
|
866
956
|
total += 1;
|
|
867
957
|
let lower = text.to_lowercase();
|
|
958
|
+
let trimmed = text.trim_start();
|
|
868
959
|
|
|
869
|
-
let
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
||
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
||
|
|
897
|
-
|| text.contains('
|
|
898
|
-
|| text.contains(
|
|
899
|
-
|| text.contains(
|
|
900
|
-
|| text.contains(
|
|
960
|
+
let documentation_tokens = [
|
|
961
|
+
"definition",
|
|
962
|
+
"theorem",
|
|
963
|
+
"lemma",
|
|
964
|
+
"proof",
|
|
965
|
+
"corollary",
|
|
966
|
+
"algorithm",
|
|
967
|
+
"figure",
|
|
968
|
+
"table",
|
|
969
|
+
"appendix",
|
|
970
|
+
];
|
|
971
|
+
if documentation_tokens
|
|
972
|
+
.iter()
|
|
973
|
+
.any(|token| contains_keyword_token(&lower, token))
|
|
974
|
+
{
|
|
975
|
+
return false;
|
|
976
|
+
}
|
|
977
|
+
|
|
978
|
+
let has_keyword = (starts_with_keyword(trimmed, "function") && text.contains('('))
|
|
979
|
+
|| (starts_with_keyword(trimmed, "return")
|
|
980
|
+
&& trimmed
|
|
981
|
+
.chars()
|
|
982
|
+
.nth("return".len())
|
|
983
|
+
.map(|c| c.is_whitespace())
|
|
984
|
+
.unwrap_or(true))
|
|
985
|
+
|| trimmed.starts_with("console.")
|
|
986
|
+
|| starts_with_keyword(trimmed, "async")
|
|
987
|
+
|| starts_with_keyword(trimmed, "await")
|
|
988
|
+
|| (starts_with_keyword(trimmed, "class") && (text.contains('{') || text.contains(':')))
|
|
989
|
+
|| (starts_with_keyword(trimmed, "struct") && text.contains('{'))
|
|
990
|
+
|| (starts_with_keyword(trimmed, "enum") && text.contains('{'))
|
|
991
|
+
|| (starts_with_keyword(trimmed, "def") && (text.contains('(') || text.contains(':')))
|
|
992
|
+
|| (starts_with_keyword(trimmed, "fn") && text.contains('('))
|
|
993
|
+
|| (starts_with_keyword(trimmed, "pub")
|
|
994
|
+
&& (text.contains("fn") || text.contains("struct") || text.contains("enum")))
|
|
995
|
+
|| starts_with_keyword(trimmed, "import")
|
|
996
|
+
|| starts_with_keyword(trimmed, "using")
|
|
997
|
+
|| starts_with_keyword(trimmed, "namespace")
|
|
998
|
+
|| starts_with_keyword(trimmed, "public")
|
|
999
|
+
|| starts_with_keyword(trimmed, "private")
|
|
1000
|
+
|| starts_with_keyword(trimmed, "protected")
|
|
1001
|
+
|| starts_with_keyword(trimmed, "static")
|
|
1002
|
+
|| starts_with_keyword(trimmed, "void")
|
|
1003
|
+
|| starts_with_keyword(trimmed, "try")
|
|
1004
|
+
|| starts_with_keyword(trimmed, "catch")
|
|
1005
|
+
|| starts_with_keyword(trimmed, "finally")
|
|
1006
|
+
|| starts_with_keyword(trimmed, "throw")
|
|
1007
|
+
|| starts_with_keyword(trimmed, "typedef")
|
|
1008
|
+
|| starts_with_keyword(trimmed, "package")
|
|
1009
|
+
|| starts_with_keyword(trimmed, "module");
|
|
1010
|
+
|
|
1011
|
+
let has_symbol = text.contains(';') || text.contains("::");
|
|
901
1012
|
|
|
902
1013
|
if has_keyword || has_symbol {
|
|
903
1014
|
strong_markers += 1;
|
|
904
1015
|
continue;
|
|
905
1016
|
}
|
|
906
1017
|
|
|
907
|
-
|
|
908
|
-
|
|
1018
|
+
if is_shell_prompt(text) {
|
|
1019
|
+
strong_markers += 1;
|
|
1020
|
+
continue;
|
|
1021
|
+
}
|
|
909
1022
|
let has_assignment = text.contains(" = ")
|
|
910
1023
|
|| text.contains("+=")
|
|
911
1024
|
|| text.contains("-=")
|
|
@@ -914,7 +1027,11 @@ fn is_code_paragraph(lines: &[CodeLineInfo]) -> bool {
|
|
|
914
1027
|
|| text.contains(" := ")
|
|
915
1028
|
|| text.contains(" == ");
|
|
916
1029
|
|
|
917
|
-
|
|
1030
|
+
let has_arrow = text.contains("=>");
|
|
1031
|
+
let has_brace = text.contains('{') || text.contains('}');
|
|
1032
|
+
let has_pointer_arrow = text.contains("->");
|
|
1033
|
+
|
|
1034
|
+
if has_assignment || has_arrow || has_brace || has_pointer_arrow {
|
|
918
1035
|
moderate_markers += 1;
|
|
919
1036
|
}
|
|
920
1037
|
}
|
|
@@ -922,10 +1039,13 @@ fn is_code_paragraph(lines: &[CodeLineInfo]) -> bool {
|
|
|
922
1039
|
if total == 0 {
|
|
923
1040
|
return false;
|
|
924
1041
|
}
|
|
1042
|
+
if strong_markers == 0 {
|
|
1043
|
+
return false;
|
|
1044
|
+
}
|
|
925
1045
|
if strong_markers * 2 >= total {
|
|
926
1046
|
return true;
|
|
927
1047
|
}
|
|
928
|
-
|
|
1048
|
+
(strong_markers + moderate_markers) * 2 >= total
|
|
929
1049
|
}
|
|
930
1050
|
|
|
931
1051
|
fn normalize_code_line(text: &str) -> String {
|
|
@@ -980,6 +1100,114 @@ fn normalize_code_line(text: &str) -> String {
|
|
|
980
1100
|
final_line.trim().to_string()
|
|
981
1101
|
}
|
|
982
1102
|
|
|
1103
|
+
fn is_confident_code_block(lines: &[CodeLineInfo]) -> bool {
|
|
1104
|
+
let mut total = 0;
|
|
1105
|
+
let mut keyword_lines = 0;
|
|
1106
|
+
let mut punctuation_lines = 0;
|
|
1107
|
+
let mut assignment_lines = 0;
|
|
1108
|
+
let mut shell_lines = 0;
|
|
1109
|
+
let mut indent_lines = 0;
|
|
1110
|
+
|
|
1111
|
+
let min_x = lines.iter().map(|info| info.x1).min().unwrap_or_default();
|
|
1112
|
+
|
|
1113
|
+
for info in lines {
|
|
1114
|
+
let text = info.text.trim();
|
|
1115
|
+
if text.is_empty() {
|
|
1116
|
+
continue;
|
|
1117
|
+
}
|
|
1118
|
+
total += 1;
|
|
1119
|
+
|
|
1120
|
+
if is_shell_prompt(text) {
|
|
1121
|
+
shell_lines += 1;
|
|
1122
|
+
}
|
|
1123
|
+
|
|
1124
|
+
let trimmed = text.trim_start();
|
|
1125
|
+
|
|
1126
|
+
if (starts_with_keyword(trimmed, "function") && text.contains('('))
|
|
1127
|
+
|| trimmed.starts_with("console.")
|
|
1128
|
+
|| (starts_with_keyword(trimmed, "return")
|
|
1129
|
+
&& trimmed
|
|
1130
|
+
.chars()
|
|
1131
|
+
.nth("return".len())
|
|
1132
|
+
.map(|c| c.is_whitespace())
|
|
1133
|
+
.unwrap_or(true))
|
|
1134
|
+
|| starts_with_keyword(trimmed, "async")
|
|
1135
|
+
|| starts_with_keyword(trimmed, "await")
|
|
1136
|
+
|| (starts_with_keyword(trimmed, "class") && (text.contains('{') || text.contains(':')))
|
|
1137
|
+
|| (starts_with_keyword(trimmed, "struct") && text.contains('{'))
|
|
1138
|
+
|| (starts_with_keyword(trimmed, "enum") && text.contains('{'))
|
|
1139
|
+
|| (starts_with_keyword(trimmed, "def") && (text.contains('(') || text.contains(':')))
|
|
1140
|
+
|| (starts_with_keyword(trimmed, "fn") && text.contains('('))
|
|
1141
|
+
|| (starts_with_keyword(trimmed, "pub")
|
|
1142
|
+
&& (text.contains("fn") || text.contains("struct") || text.contains("enum")))
|
|
1143
|
+
|| starts_with_keyword(trimmed, "import")
|
|
1144
|
+
|| starts_with_keyword(trimmed, "using")
|
|
1145
|
+
|| starts_with_keyword(trimmed, "namespace")
|
|
1146
|
+
|| starts_with_keyword(trimmed, "public")
|
|
1147
|
+
|| starts_with_keyword(trimmed, "private")
|
|
1148
|
+
|| starts_with_keyword(trimmed, "protected")
|
|
1149
|
+
|| starts_with_keyword(trimmed, "static")
|
|
1150
|
+
|| starts_with_keyword(trimmed, "void")
|
|
1151
|
+
|| starts_with_keyword(trimmed, "try")
|
|
1152
|
+
|| starts_with_keyword(trimmed, "catch")
|
|
1153
|
+
|| starts_with_keyword(trimmed, "finally")
|
|
1154
|
+
|| starts_with_keyword(trimmed, "throw")
|
|
1155
|
+
|| starts_with_keyword(trimmed, "typedef")
|
|
1156
|
+
|| starts_with_keyword(trimmed, "package")
|
|
1157
|
+
|| starts_with_keyword(trimmed, "module")
|
|
1158
|
+
{
|
|
1159
|
+
keyword_lines += 1;
|
|
1160
|
+
}
|
|
1161
|
+
|
|
1162
|
+
if text.contains(';')
|
|
1163
|
+
|| text.contains('{')
|
|
1164
|
+
|| text.contains('}')
|
|
1165
|
+
|| text.contains("::")
|
|
1166
|
+
|| text.contains("->")
|
|
1167
|
+
|| text.contains("=>")
|
|
1168
|
+
{
|
|
1169
|
+
punctuation_lines += 1;
|
|
1170
|
+
}
|
|
1171
|
+
|
|
1172
|
+
if text.contains(" = ")
|
|
1173
|
+
|| text.contains("+=")
|
|
1174
|
+
|| text.contains("-=")
|
|
1175
|
+
|| text.contains("*=")
|
|
1176
|
+
|| text.contains("/=")
|
|
1177
|
+
|| text.contains(" := ")
|
|
1178
|
+
|| text.contains(" == ")
|
|
1179
|
+
{
|
|
1180
|
+
assignment_lines += 1;
|
|
1181
|
+
}
|
|
1182
|
+
|
|
1183
|
+
if info.x1 > min_x + 8 {
|
|
1184
|
+
indent_lines += 1;
|
|
1185
|
+
}
|
|
1186
|
+
}
|
|
1187
|
+
|
|
1188
|
+
if total < 3 {
|
|
1189
|
+
return false;
|
|
1190
|
+
}
|
|
1191
|
+
|
|
1192
|
+
if shell_lines >= 2 && shell_lines * 2 >= total {
|
|
1193
|
+
return true;
|
|
1194
|
+
}
|
|
1195
|
+
|
|
1196
|
+
if keyword_lines >= 2 && assignment_lines >= 1 {
|
|
1197
|
+
return true;
|
|
1198
|
+
}
|
|
1199
|
+
|
|
1200
|
+
if keyword_lines >= 1 && punctuation_lines >= 1 && assignment_lines >= 1 {
|
|
1201
|
+
return true;
|
|
1202
|
+
}
|
|
1203
|
+
|
|
1204
|
+
if indent_lines == total && keyword_lines >= 1 && assignment_lines >= 1 {
|
|
1205
|
+
return true;
|
|
1206
|
+
}
|
|
1207
|
+
|
|
1208
|
+
false
|
|
1209
|
+
}
|
|
1210
|
+
|
|
983
1211
|
fn detect_code_language(lines: &[String]) -> Option<&'static str> {
|
|
984
1212
|
let lower_lines: Vec<String> = lines.iter().map(|line| line.to_lowercase()).collect();
|
|
985
1213
|
if lower_lines.iter().any(|line| line.contains("function"))
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
High-performance HTML → Markdown conversion powered by Rust. Shipping as a Rust crate, Python package, Node.js bindings, WebAssembly, and standalone CLI with identical rendering behaviour.
|
|
4
4
|
|
|
5
5
|
[](https://pypi.org/project/html-to-markdown/)
|
|
6
|
-
[](https://www.npmjs.com/package/html-to-markdown)
|
|
6
|
+
[](https://www.npmjs.com/package/html-to-markdown-node)
|
|
7
7
|
[](https://crates.io/crates/html-to-markdown-rs)
|
|
8
8
|
[](https://pypi.org/project/html-to-markdown/)
|
|
9
9
|
[](https://github.com/Goldziher/html-to-markdown/blob/main/LICENSE)
|
|
@@ -23,9 +23,9 @@ High-performance HTML → Markdown conversion powered by Rust. Shipping as a Rus
|
|
|
23
23
|
|
|
24
24
|
| Target | Command |
|
|
25
25
|
| --------------------------- | ------------------------------------------------------------------------- |
|
|
26
|
-
| **Node.js/Bun** (native) | `npm install
|
|
27
|
-
| **WebAssembly** (universal) | `npm install
|
|
28
|
-
| **Deno** | `import { convert } from "npm
|
|
26
|
+
| **Node.js/Bun** (native) | `npm install html-to-markdown-node` |
|
|
27
|
+
| **WebAssembly** (universal) | `npm install html-to-markdown-wasm` |
|
|
28
|
+
| **Deno** | `import { convert } from "npm:html-to-markdown-wasm"` |
|
|
29
29
|
| **Python** (bindings + CLI) | `pip install html-to-markdown` |
|
|
30
30
|
| **Rust** crate | `cargo add html-to-markdown-rs` |
|
|
31
31
|
| Rust CLI | `cargo install html-to-markdown-cli` |
|
|
@@ -39,7 +39,7 @@ High-performance HTML → Markdown conversion powered by Rust. Shipping as a Rus
|
|
|
39
39
|
**Node.js / Bun (Native - Fastest):**
|
|
40
40
|
|
|
41
41
|
```typescript
|
|
42
|
-
import { convert } from '
|
|
42
|
+
import { convert } from 'html-to-markdown-node';
|
|
43
43
|
|
|
44
44
|
const html = '<h1>Hello</h1><p>Rust ❤️ Markdown</p>';
|
|
45
45
|
const markdown = convert(html, {
|
|
@@ -52,8 +52,8 @@ const markdown = convert(html, {
|
|
|
52
52
|
**Deno / Browsers / Edge (Universal):**
|
|
53
53
|
|
|
54
54
|
```typescript
|
|
55
|
-
import { convert } from "npm
|
|
56
|
-
// or: import { convert } from '
|
|
55
|
+
import { convert } from "npm:html-to-markdown-wasm"; // Deno
|
|
56
|
+
// or: import { convert } from 'html-to-markdown-wasm'; // Bundlers
|
|
57
57
|
|
|
58
58
|
const markdown = convert(html, {
|
|
59
59
|
headingStyle: 'atx',
|
|
@@ -15,7 +15,13 @@ V1 API (backward compatibility):
|
|
|
15
15
|
markdown = convert_to_markdown(html, heading_style="atx")
|
|
16
16
|
"""
|
|
17
17
|
|
|
18
|
-
from html_to_markdown.api import
|
|
18
|
+
from html_to_markdown.api import (
|
|
19
|
+
InlineImage,
|
|
20
|
+
InlineImageConfig,
|
|
21
|
+
InlineImageWarning,
|
|
22
|
+
convert,
|
|
23
|
+
convert_with_inline_images,
|
|
24
|
+
)
|
|
19
25
|
from html_to_markdown.exceptions import (
|
|
20
26
|
ConflictingOptionsError,
|
|
21
27
|
EmptyHtmlError,
|
|
@@ -31,12 +37,16 @@ __all__ = [
|
|
|
31
37
|
"ConversionOptions",
|
|
32
38
|
"EmptyHtmlError",
|
|
33
39
|
"HtmlToMarkdownError",
|
|
40
|
+
"InlineImage",
|
|
41
|
+
"InlineImageConfig",
|
|
42
|
+
"InlineImageWarning",
|
|
34
43
|
"InvalidParserError",
|
|
35
44
|
"MissingDependencyError",
|
|
36
45
|
"PreprocessingOptions",
|
|
37
46
|
"convert",
|
|
38
47
|
"convert_to_markdown",
|
|
48
|
+
"convert_with_inline_images",
|
|
39
49
|
"markdownify",
|
|
40
50
|
]
|
|
41
51
|
|
|
42
|
-
__version__ = "2.3.
|
|
52
|
+
__version__ = "2.3.3"
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"""New v2 functional API for HTML to Markdown conversion.
|
|
2
|
+
|
|
3
|
+
This module provides the new functional API with dataclass-based options,
|
|
4
|
+
using the Rust backend for conversion.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from typing import TYPE_CHECKING, Literal, TypedDict, cast
|
|
10
|
+
|
|
11
|
+
import html_to_markdown._html_to_markdown as _rust # type: ignore[import-not-found]
|
|
12
|
+
from html_to_markdown.options import ConversionOptions, PreprocessingOptions
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from html_to_markdown._html_to_markdown import InlineImageConfig
|
|
16
|
+
else:
|
|
17
|
+
InlineImageConfig = _rust.InlineImageConfig # type: ignore[misc, assignment]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class InlineImage(TypedDict):
|
|
21
|
+
"""Inline image extracted during conversion."""
|
|
22
|
+
|
|
23
|
+
data: bytes
|
|
24
|
+
format: str
|
|
25
|
+
filename: str | None
|
|
26
|
+
description: str | None
|
|
27
|
+
dimensions: tuple[int, int] | None
|
|
28
|
+
source: Literal["img_data_uri", "svg_element"]
|
|
29
|
+
attributes: dict[str, str]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class InlineImageWarning(TypedDict):
|
|
33
|
+
"""Warning produced during inline image extraction."""
|
|
34
|
+
|
|
35
|
+
index: int
|
|
36
|
+
message: str
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _to_rust_preprocessing(options: PreprocessingOptions) -> _rust.PreprocessingOptions:
|
|
40
|
+
"""Convert high-level preprocessing options to the Rust bindings."""
|
|
41
|
+
return _rust.PreprocessingOptions(
|
|
42
|
+
enabled=options.enabled,
|
|
43
|
+
preset=options.preset,
|
|
44
|
+
remove_navigation=options.remove_navigation,
|
|
45
|
+
remove_forms=options.remove_forms,
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _to_rust_options(
|
|
50
|
+
options: ConversionOptions,
|
|
51
|
+
preprocessing: PreprocessingOptions,
|
|
52
|
+
) -> _rust.ConversionOptions:
|
|
53
|
+
"""Convert high-level conversion options to the Rust bindings."""
|
|
54
|
+
return _rust.ConversionOptions(
|
|
55
|
+
heading_style=options.heading_style,
|
|
56
|
+
list_indent_type=options.list_indent_type,
|
|
57
|
+
list_indent_width=options.list_indent_width,
|
|
58
|
+
bullets=options.bullets,
|
|
59
|
+
strong_em_symbol=options.strong_em_symbol,
|
|
60
|
+
escape_asterisks=options.escape_asterisks,
|
|
61
|
+
escape_underscores=options.escape_underscores,
|
|
62
|
+
escape_misc=options.escape_misc,
|
|
63
|
+
escape_ascii=options.escape_ascii,
|
|
64
|
+
code_language=options.code_language,
|
|
65
|
+
autolinks=options.autolinks,
|
|
66
|
+
default_title=options.default_title,
|
|
67
|
+
br_in_tables=options.br_in_tables,
|
|
68
|
+
hocr_spatial_tables=options.hocr_spatial_tables,
|
|
69
|
+
highlight_style=options.highlight_style,
|
|
70
|
+
extract_metadata=options.extract_metadata,
|
|
71
|
+
whitespace_mode=options.whitespace_mode,
|
|
72
|
+
strip_newlines=options.strip_newlines,
|
|
73
|
+
wrap=options.wrap,
|
|
74
|
+
wrap_width=options.wrap_width,
|
|
75
|
+
convert_as_inline=options.convert_as_inline,
|
|
76
|
+
sub_symbol=options.sub_symbol,
|
|
77
|
+
sup_symbol=options.sup_symbol,
|
|
78
|
+
newline_style=options.newline_style,
|
|
79
|
+
code_block_style=options.code_block_style,
|
|
80
|
+
keep_inline_images_in=list(options.keep_inline_images_in) if options.keep_inline_images_in else [],
|
|
81
|
+
preprocessing=_to_rust_preprocessing(preprocessing),
|
|
82
|
+
encoding=options.encoding,
|
|
83
|
+
debug=options.debug,
|
|
84
|
+
strip_tags=list(options.strip_tags) if options.strip_tags else [],
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def convert(
    html: str,
    options: ConversionOptions | None = None,
    preprocessing: PreprocessingOptions | None = None,
) -> str:
    """Convert HTML to Markdown using the Rust backend.

    Args:
        html: HTML string to convert.
        options: Conversion configuration; a default ``ConversionOptions()``
            is created when ``None`` is given.
        preprocessing: HTML preprocessing configuration; a default
            ``PreprocessingOptions()`` is created when ``None`` is given.

    Returns:
        Converted Markdown string.
    """
    effective_options = ConversionOptions() if options is None else options
    effective_preprocessing = PreprocessingOptions() if preprocessing is None else preprocessing

    # Translate the Python option objects once, then hand off to the binding.
    native_options = _to_rust_options(effective_options, effective_preprocessing)
    return cast("str", _rust.convert(html, native_options))
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def convert_with_inline_images(
    html: str,
    options: ConversionOptions | None = None,
    preprocessing: PreprocessingOptions | None = None,
    image_config: InlineImageConfig | None = None,
) -> tuple[str, list[InlineImage], list[InlineImageWarning]]:
    """Convert HTML to Markdown and collect the inline images found on the way.

    Args:
        html: HTML string to convert.
        options: Conversion configuration; defaults to ``ConversionOptions()``.
        preprocessing: HTML preprocessing configuration; defaults to
            ``PreprocessingOptions()``.
        image_config: Inline image configuration; defaults to
            ``InlineImageConfig()``.

    Returns:
        A ``(markdown, images, warnings)`` tuple, where ``images`` and
        ``warnings`` are new lists built from what the Rust binding returned.
    """
    effective_options = ConversionOptions() if options is None else options
    effective_preprocessing = PreprocessingOptions() if preprocessing is None else preprocessing
    effective_image_config = InlineImageConfig() if image_config is None else image_config

    native_options = _to_rust_options(effective_options, effective_preprocessing)
    text, extracted, issues = cast(
        "tuple[str, list[InlineImage], list[InlineImageWarning]]",
        _rust.convert_with_inline_images(html, native_options, effective_image_config),
    )
    # Copy the two sequences into plain lists before returning them.
    return text, list(extracted), list(issues)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
# Names exported by ``from html_to_markdown.api import *``.
__all__ = [
    "InlineImage",
    "InlineImageConfig",
    "InlineImageWarning",
    "convert",
    "convert_with_inline_images",
]
|
|
Binary file
|
|
@@ -1,74 +0,0 @@
|
|
|
1
|
-
"""New v2 functional API for HTML to Markdown conversion.
|
|
2
|
-
|
|
3
|
-
This module provides the new functional API with dataclass-based options,
|
|
4
|
-
using the Rust backend for conversion.
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
from __future__ import annotations
|
|
8
|
-
|
|
9
|
-
import html_to_markdown._html_to_markdown as _rust # type: ignore[import-not-found]
|
|
10
|
-
from html_to_markdown.options import ConversionOptions, PreprocessingOptions
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
def convert(
    html: str,
    options: ConversionOptions | None = None,
    preprocessing: PreprocessingOptions | None = None,
) -> str:
    """Convert HTML to Markdown using the Rust backend.

    Args:
        html: HTML string to convert.
        options: Conversion configuration; a default ``ConversionOptions()``
            is created when ``None`` is given.
        preprocessing: HTML preprocessing configuration; a default
            ``PreprocessingOptions()`` is created when ``None`` is given.

    Returns:
        Converted Markdown string.
    """
    effective_options = ConversionOptions() if options is None else options
    effective_preprocessing = PreprocessingOptions() if preprocessing is None else preprocessing

    # The preprocessing object is built first because it is passed into the
    # conversion options below.
    native_preprocessing = _rust.PreprocessingOptions(
        enabled=effective_preprocessing.enabled,
        preset=effective_preprocessing.preset,
        remove_navigation=effective_preprocessing.remove_navigation,
        remove_forms=effective_preprocessing.remove_forms,
    )

    native_options = _rust.ConversionOptions(
        heading_style=effective_options.heading_style,
        list_indent_type=effective_options.list_indent_type,
        list_indent_width=effective_options.list_indent_width,
        bullets=effective_options.bullets,
        strong_em_symbol=effective_options.strong_em_symbol,
        escape_asterisks=effective_options.escape_asterisks,
        escape_underscores=effective_options.escape_underscores,
        escape_misc=effective_options.escape_misc,
        escape_ascii=effective_options.escape_ascii,
        code_language=effective_options.code_language,
        autolinks=effective_options.autolinks,
        default_title=effective_options.default_title,
        br_in_tables=effective_options.br_in_tables,
        hocr_spatial_tables=effective_options.hocr_spatial_tables,
        highlight_style=effective_options.highlight_style,
        extract_metadata=effective_options.extract_metadata,
        whitespace_mode=effective_options.whitespace_mode,
        strip_newlines=effective_options.strip_newlines,
        wrap=effective_options.wrap,
        wrap_width=effective_options.wrap_width,
        convert_as_inline=effective_options.convert_as_inline,
        sub_symbol=effective_options.sub_symbol,
        sup_symbol=effective_options.sup_symbol,
        newline_style=effective_options.newline_style,
        code_block_style=effective_options.code_block_style,
        # Falsy values (None or an empty collection) collapse to [].
        keep_inline_images_in=list(effective_options.keep_inline_images_in or []),
        preprocessing=native_preprocessing,
        encoding=effective_options.encoding,
        debug=effective_options.debug,
        strip_tags=list(effective_options.strip_tags or []),
    )

    markdown: str = _rust.convert(html, native_options)
    return markdown
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/benches/micro_benchmark.rs
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/examples/test_escape.rs
RENAMED
|
File without changes
|
|
File without changes
|
{html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/examples/test_lists.rs
RENAMED
|
File without changes
|
|
File without changes
|
{html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/examples/test_tables.rs
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/src/hocr/extractor.rs
RENAMED
|
File without changes
|
|
File without changes
|
{html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/src/hocr/parser.rs
RENAMED
|
File without changes
|
{html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/src/hocr/spatial.rs
RENAMED
|
File without changes
|
|
File without changes
|
{html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/src/inline_images.rs
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/tests/integration_test.rs
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|