html-to-markdown 2.3.0__tar.gz → 2.3.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

Files changed (55) hide show
  1. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/Cargo.lock +25 -25
  2. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/Cargo.toml +2 -2
  3. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/PKG-INFO +1 -1
  4. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/src/hocr/converter.rs +267 -39
  5. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown-py/README.md +7 -7
  6. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/html_to_markdown/__init__.py +12 -2
  7. html_to_markdown-2.3.3/html_to_markdown/api.py +143 -0
  8. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/html_to_markdown/bin/html-to-markdown +0 -0
  9. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/pyproject.toml +1 -1
  10. html_to_markdown-2.3.0/html_to_markdown/api.py +0 -74
  11. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/LICENSE +0 -0
  12. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/README_PYPI.md +0 -0
  13. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/Cargo.toml +0 -0
  14. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/README.md +0 -0
  15. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/benches/conversion_benchmark.rs +0 -0
  16. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/benches/micro_benchmark.rs +0 -0
  17. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/benches/profiling_benchmark.rs +0 -0
  18. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/examples/basic.rs +0 -0
  19. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/examples/table.rs +0 -0
  20. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/examples/test_escape.rs +0 -0
  21. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/examples/test_inline_formatting.rs +0 -0
  22. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/examples/test_lists.rs +0 -0
  23. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/examples/test_semantic_tags.rs +0 -0
  24. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/examples/test_tables.rs +0 -0
  25. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/examples/test_task_lists.rs +0 -0
  26. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/examples/test_whitespace.rs +0 -0
  27. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/src/converter.rs +0 -0
  28. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/src/error.rs +0 -0
  29. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/src/hocr/extractor.rs +0 -0
  30. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/src/hocr/mod.rs +0 -0
  31. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/src/hocr/parser.rs +0 -0
  32. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/src/hocr/spatial.rs +0 -0
  33. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/src/hocr/types.rs +0 -0
  34. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/src/inline_images.rs +0 -0
  35. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/src/lib.rs +0 -0
  36. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/src/options.rs +0 -0
  37. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/src/sanitizer.rs +0 -0
  38. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/src/text.rs +0 -0
  39. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/src/wrapper.rs +0 -0
  40. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/tests/commonmark_compliance_test.rs +0 -0
  41. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/tests/hocr_compliance_test.rs +0 -0
  42. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown/tests/integration_test.rs +0 -0
  43. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown-py/Cargo.toml +0 -0
  44. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown-py/python/html_to_markdown/__init__.py +0 -0
  45. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown-py/python/html_to_markdown/_html_to_markdown.pyi +0 -0
  46. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown-py/src/lib.rs +0 -0
  47. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/crates/html-to-markdown-py/uv.lock +0 -0
  48. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/html_to_markdown/__main__.py +0 -0
  49. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/html_to_markdown/_rust.pyi +0 -0
  50. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/html_to_markdown/cli.py +0 -0
  51. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/html_to_markdown/cli_proxy.py +0 -0
  52. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/html_to_markdown/exceptions.py +0 -0
  53. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/html_to_markdown/options.py +0 -0
  54. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/html_to_markdown/py.typed +0 -0
  55. {html_to_markdown-2.3.0 → html_to_markdown-2.3.3}/html_to_markdown/v1_compat.py +0 -0
@@ -200,9 +200,9 @@ dependencies = [
200
200
 
201
201
  [[package]]
202
202
  name = "clap"
203
- version = "4.5.48"
203
+ version = "4.5.49"
204
204
  source = "registry+https://github.com/rust-lang/crates.io-index"
205
- checksum = "e2134bb3ea021b78629caa971416385309e0131b351b25e01dc16fb54e1b5fae"
205
+ checksum = "f4512b90fa68d3a9932cea5184017c5d200f5921df706d45e853537dea51508f"
206
206
  dependencies = [
207
207
  "clap_builder",
208
208
  "clap_derive",
@@ -210,9 +210,9 @@ dependencies = [
210
210
 
211
211
  [[package]]
212
212
  name = "clap_builder"
213
- version = "4.5.48"
213
+ version = "4.5.49"
214
214
  source = "registry+https://github.com/rust-lang/crates.io-index"
215
- checksum = "c2ba64afa3c0a6df7fa517765e31314e983f51dda798ffba27b988194fb65dc9"
215
+ checksum = "0025e98baa12e766c67ba13ff4695a887a1eba19569aad00a472546795bd6730"
216
216
  dependencies = [
217
217
  "anstream",
218
218
  "anstyle",
@@ -222,18 +222,18 @@ dependencies = [
222
222
 
223
223
  [[package]]
224
224
  name = "clap_complete"
225
- version = "4.5.58"
225
+ version = "4.5.59"
226
226
  source = "registry+https://github.com/rust-lang/crates.io-index"
227
- checksum = "75bf0b32ad2e152de789bb635ea4d3078f6b838ad7974143e99b99f45a04af4a"
227
+ checksum = "2348487adcd4631696ced64ccdb40d38ac4d31cae7f2eec8817fcea1b9d1c43c"
228
228
  dependencies = [
229
229
  "clap",
230
230
  ]
231
231
 
232
232
  [[package]]
233
233
  name = "clap_derive"
234
- version = "4.5.47"
234
+ version = "4.5.49"
235
235
  source = "registry+https://github.com/rust-lang/crates.io-index"
236
- checksum = "bbfd7eae0b0f1a6e63d4b13c9c478de77c2eb546fba158ad50b4203dc24b9f9c"
236
+ checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671"
237
237
  dependencies = [
238
238
  "heck",
239
239
  "proc-macro2",
@@ -243,15 +243,15 @@ dependencies = [
243
243
 
244
244
  [[package]]
245
245
  name = "clap_lex"
246
- version = "0.7.5"
246
+ version = "0.7.6"
247
247
  source = "registry+https://github.com/rust-lang/crates.io-index"
248
- checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675"
248
+ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d"
249
249
 
250
250
  [[package]]
251
251
  name = "clap_mangen"
252
- version = "0.2.29"
252
+ version = "0.2.30"
253
253
  source = "registry+https://github.com/rust-lang/crates.io-index"
254
- checksum = "27b4c3c54b30f0d9adcb47f25f61fcce35c4dd8916638c6b82fbd5f4fb4179e2"
254
+ checksum = "263c8214a8e0cb8129f3c62036c50e9c6e15c7bd364c42e0437c492b9293f778"
255
255
  dependencies = [
256
256
  "clap",
257
257
  "roff",
@@ -583,9 +583,9 @@ dependencies = [
583
583
 
584
584
  [[package]]
585
585
  name = "half"
586
- version = "2.7.0"
586
+ version = "2.7.1"
587
587
  source = "registry+https://github.com/rust-lang/crates.io-index"
588
- checksum = "e54c115d4f30f52c67202f079c5f9d8b49db4691f460fdb0b4c2e838261b2ba5"
588
+ checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b"
589
589
  dependencies = [
590
590
  "cfg-if",
591
591
  "crunchy",
@@ -615,7 +615,7 @@ dependencies = [
615
615
 
616
616
  [[package]]
617
617
  name = "html-to-markdown-cli"
618
- version = "2.3.0"
618
+ version = "2.3.3"
619
619
  dependencies = [
620
620
  "assert_cmd",
621
621
  "clap",
@@ -629,7 +629,7 @@ dependencies = [
629
629
 
630
630
  [[package]]
631
631
  name = "html-to-markdown-node"
632
- version = "2.3.0"
632
+ version = "2.3.3"
633
633
  dependencies = [
634
634
  "html-to-markdown-rs",
635
635
  "mimalloc-rust",
@@ -640,7 +640,7 @@ dependencies = [
640
640
 
641
641
  [[package]]
642
642
  name = "html-to-markdown-py"
643
- version = "2.3.0"
643
+ version = "2.3.3"
644
644
  dependencies = [
645
645
  "base64",
646
646
  "html-to-markdown-rs",
@@ -650,7 +650,7 @@ dependencies = [
650
650
 
651
651
  [[package]]
652
652
  name = "html-to-markdown-rs"
653
- version = "2.3.0"
653
+ version = "2.3.3"
654
654
  dependencies = [
655
655
  "ammonia",
656
656
  "base64",
@@ -667,7 +667,7 @@ dependencies = [
667
667
 
668
668
  [[package]]
669
669
  name = "html-to-markdown-wasm"
670
- version = "2.3.0"
670
+ version = "2.3.3"
671
671
  dependencies = [
672
672
  "console_error_panic_hook",
673
673
  "getrandom 0.2.16",
@@ -1435,9 +1435,9 @@ dependencies = [
1435
1435
 
1436
1436
  [[package]]
1437
1437
  name = "regex"
1438
- version = "1.12.1"
1438
+ version = "1.12.2"
1439
1439
  source = "registry+https://github.com/rust-lang/crates.io-index"
1440
- checksum = "4a52d8d02cacdb176ef4678de6c052efb4b3da14b78e4db683a4252762be5433"
1440
+ checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4"
1441
1441
  dependencies = [
1442
1442
  "aho-corasick",
1443
1443
  "memchr",
@@ -1447,9 +1447,9 @@ dependencies = [
1447
1447
 
1448
1448
  [[package]]
1449
1449
  name = "regex-automata"
1450
- version = "0.4.12"
1450
+ version = "0.4.13"
1451
1451
  source = "registry+https://github.com/rust-lang/crates.io-index"
1452
- checksum = "722166aa0d7438abbaa4d5cc2c649dac844e8c56d82fb3d33e9c34b5cd268fc6"
1452
+ checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c"
1453
1453
  dependencies = [
1454
1454
  "aho-corasick",
1455
1455
  "memchr",
@@ -1458,9 +1458,9 @@ dependencies = [
1458
1458
 
1459
1459
  [[package]]
1460
1460
  name = "regex-syntax"
1461
- version = "0.8.7"
1461
+ version = "0.8.8"
1462
1462
  source = "registry+https://github.com/rust-lang/crates.io-index"
1463
- checksum = "c3160422bbd54dd5ecfdca71e5fd59b7b8fe2b1697ab2baf64f6d05dcc66d298"
1463
+ checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58"
1464
1464
 
1465
1465
  [[package]]
1466
1466
  name = "roff"
@@ -3,7 +3,7 @@ resolver = "2"
3
3
  members = ["crates/html-to-markdown-py"]
4
4
 
5
5
  [workspace.package]
6
- version = "2.3.0"
6
+ version = "2.3.3"
7
7
  edition = "2021"
8
8
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
9
9
  license = "MIT"
@@ -15,7 +15,7 @@ rust-version = "1.80"
15
15
 
16
16
  [workspace.dependencies]
17
17
  # Core library
18
- html-to-markdown-rs = { version = "2.3.0", path = "crates/html-to-markdown" }
18
+ html-to-markdown-rs = { version = "2.3.3", path = "crates/html-to-markdown" }
19
19
 
20
20
  # HTML parsing and sanitization
21
21
  tl = "0.7"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: html-to-markdown
3
- Version: 2.3.0
3
+ Version: 2.3.3
4
4
  Classifier: Development Status :: 5 - Production/Stable
5
5
  Classifier: Environment :: Console
6
6
  Classifier: Intended Audience :: Developers
@@ -156,8 +156,15 @@ fn convert_element(
156
156
 
157
157
  // Paragraphs
158
158
  HocrElementType::OcrPar => {
159
- if !output.is_empty() && !output.ends_with("\n\n") {
160
- output.push_str("\n\n");
159
+ let bullet_paragraph = is_bullet_paragraph(element);
160
+ if !output.is_empty() {
161
+ if bullet_paragraph {
162
+ if !output.ends_with('\n') {
163
+ output.push('\n');
164
+ }
165
+ } else if !output.ends_with("\n\n") {
166
+ output.push_str("\n\n");
167
+ }
161
168
  }
162
169
 
163
170
  if let Some(heading) = detect_heading_paragraph(element) {
@@ -188,7 +195,13 @@ fn convert_element(
188
195
  if output.ends_with(' ') {
189
196
  output.pop();
190
197
  }
191
- output.push_str("\n\n");
198
+ if bullet_paragraph {
199
+ if !output.ends_with('\n') {
200
+ output.push('\n');
201
+ }
202
+ } else {
203
+ output.push_str("\n\n");
204
+ }
192
205
  }
193
206
 
194
207
  // Blockquotes
@@ -588,6 +601,43 @@ fn try_spatial_table_reconstruction(element: &HocrElement) -> Option<String> {
588
601
  None
589
602
  }
590
603
 
604
+ fn is_bullet_paragraph(element: &HocrElement) -> bool {
605
+ if element.element_type != HocrElementType::OcrPar {
606
+ return false;
607
+ }
608
+
609
+ let text = element_text_content(element);
610
+ let trimmed = text.trim_start();
611
+ if trimmed.is_empty() {
612
+ return false;
613
+ }
614
+
615
+ if matches!(trimmed.chars().next(), Some('•' | '●' | '-' | '+' | '*')) {
616
+ return true;
617
+ }
618
+
619
+ let mut chars = trimmed.chars().peekable();
620
+ let mut digit_count = 0;
621
+ while let Some(&ch) = chars.peek() {
622
+ if ch.is_ascii_digit() {
623
+ digit_count += 1;
624
+ chars.next();
625
+ } else {
626
+ break;
627
+ }
628
+ }
629
+
630
+ if digit_count > 0 {
631
+ if let Some(&ch) = chars.peek() {
632
+ if (ch == '.' || ch == ')') && chars.clone().nth(1).map(|c| c.is_whitespace()).unwrap_or(false) {
633
+ return true;
634
+ }
635
+ }
636
+ }
637
+
638
+ false
639
+ }
640
+
591
641
  #[derive(Clone)]
592
642
  struct CodeLineInfo {
593
643
  text: String,
@@ -662,6 +712,10 @@ fn collect_code_block(children: &[&HocrElement]) -> Option<(Vec<String>, usize,
662
712
  return None;
663
713
  }
664
714
 
715
+ if !is_confident_code_block(&collected) {
716
+ return None;
717
+ }
718
+
665
719
  // Determine base indentation metrics
666
720
  let mut x_values: Vec<u32> = collected
667
721
  .iter()
@@ -844,6 +898,42 @@ fn is_bullet_like(line: &str) -> bool {
844
898
  false
845
899
  }
846
900
 
901
+ fn contains_keyword_token(text: &str, keyword: &str) -> bool {
902
+ text.split(|ch: char| !(ch.is_ascii_alphanumeric() || ch == '_'))
903
+ .any(|token| token == keyword)
904
+ }
905
+
906
+ fn is_shell_prompt(text: &str) -> bool {
907
+ let trimmed = text.trim_start();
908
+ if trimmed.is_empty() {
909
+ return false;
910
+ }
911
+
912
+ trimmed.starts_with('$')
913
+ || trimmed.starts_with('#')
914
+ || trimmed.contains("]#")
915
+ || trimmed.starts_with("sudo ")
916
+ || trimmed.starts_with("./")
917
+ || trimmed.starts_with("python ")
918
+ || trimmed.starts_with("pip ")
919
+ || trimmed.starts_with("uv ")
920
+ }
921
+
922
+ fn starts_with_keyword(trimmed: &str, keyword: &str) -> bool {
923
+ if !trimmed.starts_with(keyword) {
924
+ return false;
925
+ }
926
+ if let Some(first) = trimmed.chars().next() {
927
+ if !first.is_ascii_lowercase() {
928
+ return false;
929
+ }
930
+ }
931
+ match trimmed.chars().nth(keyword.len()) {
932
+ None => true,
933
+ Some(ch) => ch.is_whitespace() || matches!(ch, '(' | ':' | '{' | '[' | '.'),
934
+ }
935
+ }
936
+
847
937
  fn is_code_paragraph(lines: &[CodeLineInfo]) -> bool {
848
938
  if lines.is_empty() {
849
939
  return false;
@@ -865,47 +955,70 @@ fn is_code_paragraph(lines: &[CodeLineInfo]) -> bool {
865
955
 
866
956
  total += 1;
867
957
  let lower = text.to_lowercase();
958
+ let trimmed = text.trim_start();
868
959
 
869
- let has_keyword = lower.contains("function")
870
- || lower.contains("console.")
871
- || lower.starts_with("return")
872
- || lower.starts_with("var ")
873
- || lower.starts_with("let ")
874
- || lower.starts_with("const ")
875
- || lower.starts_with("async ")
876
- || lower.starts_with("await ")
877
- || lower.starts_with("if ")
878
- || lower.starts_with("elif ")
879
- || lower.starts_with("else if ")
880
- || lower.starts_with("for ")
881
- || lower.starts_with("while ")
882
- || lower.starts_with("switch ")
883
- || lower.starts_with("case ")
884
- || lower.starts_with("class ")
885
- || lower.starts_with("struct ")
886
- || lower.starts_with("enum ")
887
- || lower.starts_with("def ")
888
- || lower.starts_with("fn ")
889
- || lower.starts_with("pub ")
890
- || lower.starts_with("import ")
891
- || lower.starts_with("from ")
892
- || lower.starts_with("using ")
893
- || lower.starts_with("namespace ");
894
-
895
- let has_symbol = text.contains('{')
896
- || text.contains('}')
897
- || text.contains(';')
898
- || text.contains("::")
899
- || text.contains("->")
900
- || text.contains("=>");
960
+ let documentation_tokens = [
961
+ "definition",
962
+ "theorem",
963
+ "lemma",
964
+ "proof",
965
+ "corollary",
966
+ "algorithm",
967
+ "figure",
968
+ "table",
969
+ "appendix",
970
+ ];
971
+ if documentation_tokens
972
+ .iter()
973
+ .any(|token| contains_keyword_token(&lower, token))
974
+ {
975
+ return false;
976
+ }
977
+
978
+ let has_keyword = (starts_with_keyword(trimmed, "function") && text.contains('('))
979
+ || (starts_with_keyword(trimmed, "return")
980
+ && trimmed
981
+ .chars()
982
+ .nth("return".len())
983
+ .map(|c| c.is_whitespace())
984
+ .unwrap_or(true))
985
+ || trimmed.starts_with("console.")
986
+ || starts_with_keyword(trimmed, "async")
987
+ || starts_with_keyword(trimmed, "await")
988
+ || (starts_with_keyword(trimmed, "class") && (text.contains('{') || text.contains(':')))
989
+ || (starts_with_keyword(trimmed, "struct") && text.contains('{'))
990
+ || (starts_with_keyword(trimmed, "enum") && text.contains('{'))
991
+ || (starts_with_keyword(trimmed, "def") && (text.contains('(') || text.contains(':')))
992
+ || (starts_with_keyword(trimmed, "fn") && text.contains('('))
993
+ || (starts_with_keyword(trimmed, "pub")
994
+ && (text.contains("fn") || text.contains("struct") || text.contains("enum")))
995
+ || starts_with_keyword(trimmed, "import")
996
+ || starts_with_keyword(trimmed, "using")
997
+ || starts_with_keyword(trimmed, "namespace")
998
+ || starts_with_keyword(trimmed, "public")
999
+ || starts_with_keyword(trimmed, "private")
1000
+ || starts_with_keyword(trimmed, "protected")
1001
+ || starts_with_keyword(trimmed, "static")
1002
+ || starts_with_keyword(trimmed, "void")
1003
+ || starts_with_keyword(trimmed, "try")
1004
+ || starts_with_keyword(trimmed, "catch")
1005
+ || starts_with_keyword(trimmed, "finally")
1006
+ || starts_with_keyword(trimmed, "throw")
1007
+ || starts_with_keyword(trimmed, "typedef")
1008
+ || starts_with_keyword(trimmed, "package")
1009
+ || starts_with_keyword(trimmed, "module");
1010
+
1011
+ let has_symbol = text.contains(';') || text.contains("::");
901
1012
 
902
1013
  if has_keyword || has_symbol {
903
1014
  strong_markers += 1;
904
1015
  continue;
905
1016
  }
906
1017
 
907
- let trimmed = text.trim_start();
908
- let starts_with_indent = trimmed.len() + 2 <= text.len();
1018
+ if is_shell_prompt(text) {
1019
+ strong_markers += 1;
1020
+ continue;
1021
+ }
909
1022
  let has_assignment = text.contains(" = ")
910
1023
  || text.contains("+=")
911
1024
  || text.contains("-=")
@@ -914,7 +1027,11 @@ fn is_code_paragraph(lines: &[CodeLineInfo]) -> bool {
914
1027
  || text.contains(" := ")
915
1028
  || text.contains(" == ");
916
1029
 
917
- if has_assignment || starts_with_indent {
1030
+ let has_arrow = text.contains("=>");
1031
+ let has_brace = text.contains('{') || text.contains('}');
1032
+ let has_pointer_arrow = text.contains("->");
1033
+
1034
+ if has_assignment || has_arrow || has_brace || has_pointer_arrow {
918
1035
  moderate_markers += 1;
919
1036
  }
920
1037
  }
@@ -922,10 +1039,13 @@ fn is_code_paragraph(lines: &[CodeLineInfo]) -> bool {
922
1039
  if total == 0 {
923
1040
  return false;
924
1041
  }
1042
+ if strong_markers == 0 {
1043
+ return false;
1044
+ }
925
1045
  if strong_markers * 2 >= total {
926
1046
  return true;
927
1047
  }
928
- strong_markers > 0 && (strong_markers + moderate_markers) * 2 >= total
1048
+ (strong_markers + moderate_markers) * 2 >= total
929
1049
  }
930
1050
 
931
1051
  fn normalize_code_line(text: &str) -> String {
@@ -980,6 +1100,114 @@ fn normalize_code_line(text: &str) -> String {
980
1100
  final_line.trim().to_string()
981
1101
  }
982
1102
 
1103
+ fn is_confident_code_block(lines: &[CodeLineInfo]) -> bool {
1104
+ let mut total = 0;
1105
+ let mut keyword_lines = 0;
1106
+ let mut punctuation_lines = 0;
1107
+ let mut assignment_lines = 0;
1108
+ let mut shell_lines = 0;
1109
+ let mut indent_lines = 0;
1110
+
1111
+ let min_x = lines.iter().map(|info| info.x1).min().unwrap_or_default();
1112
+
1113
+ for info in lines {
1114
+ let text = info.text.trim();
1115
+ if text.is_empty() {
1116
+ continue;
1117
+ }
1118
+ total += 1;
1119
+
1120
+ if is_shell_prompt(text) {
1121
+ shell_lines += 1;
1122
+ }
1123
+
1124
+ let trimmed = text.trim_start();
1125
+
1126
+ if (starts_with_keyword(trimmed, "function") && text.contains('('))
1127
+ || trimmed.starts_with("console.")
1128
+ || (starts_with_keyword(trimmed, "return")
1129
+ && trimmed
1130
+ .chars()
1131
+ .nth("return".len())
1132
+ .map(|c| c.is_whitespace())
1133
+ .unwrap_or(true))
1134
+ || starts_with_keyword(trimmed, "async")
1135
+ || starts_with_keyword(trimmed, "await")
1136
+ || (starts_with_keyword(trimmed, "class") && (text.contains('{') || text.contains(':')))
1137
+ || (starts_with_keyword(trimmed, "struct") && text.contains('{'))
1138
+ || (starts_with_keyword(trimmed, "enum") && text.contains('{'))
1139
+ || (starts_with_keyword(trimmed, "def") && (text.contains('(') || text.contains(':')))
1140
+ || (starts_with_keyword(trimmed, "fn") && text.contains('('))
1141
+ || (starts_with_keyword(trimmed, "pub")
1142
+ && (text.contains("fn") || text.contains("struct") || text.contains("enum")))
1143
+ || starts_with_keyword(trimmed, "import")
1144
+ || starts_with_keyword(trimmed, "using")
1145
+ || starts_with_keyword(trimmed, "namespace")
1146
+ || starts_with_keyword(trimmed, "public")
1147
+ || starts_with_keyword(trimmed, "private")
1148
+ || starts_with_keyword(trimmed, "protected")
1149
+ || starts_with_keyword(trimmed, "static")
1150
+ || starts_with_keyword(trimmed, "void")
1151
+ || starts_with_keyword(trimmed, "try")
1152
+ || starts_with_keyword(trimmed, "catch")
1153
+ || starts_with_keyword(trimmed, "finally")
1154
+ || starts_with_keyword(trimmed, "throw")
1155
+ || starts_with_keyword(trimmed, "typedef")
1156
+ || starts_with_keyword(trimmed, "package")
1157
+ || starts_with_keyword(trimmed, "module")
1158
+ {
1159
+ keyword_lines += 1;
1160
+ }
1161
+
1162
+ if text.contains(';')
1163
+ || text.contains('{')
1164
+ || text.contains('}')
1165
+ || text.contains("::")
1166
+ || text.contains("->")
1167
+ || text.contains("=>")
1168
+ {
1169
+ punctuation_lines += 1;
1170
+ }
1171
+
1172
+ if text.contains(" = ")
1173
+ || text.contains("+=")
1174
+ || text.contains("-=")
1175
+ || text.contains("*=")
1176
+ || text.contains("/=")
1177
+ || text.contains(" := ")
1178
+ || text.contains(" == ")
1179
+ {
1180
+ assignment_lines += 1;
1181
+ }
1182
+
1183
+ if info.x1 > min_x + 8 {
1184
+ indent_lines += 1;
1185
+ }
1186
+ }
1187
+
1188
+ if total < 3 {
1189
+ return false;
1190
+ }
1191
+
1192
+ if shell_lines >= 2 && shell_lines * 2 >= total {
1193
+ return true;
1194
+ }
1195
+
1196
+ if keyword_lines >= 2 && assignment_lines >= 1 {
1197
+ return true;
1198
+ }
1199
+
1200
+ if keyword_lines >= 1 && punctuation_lines >= 1 && assignment_lines >= 1 {
1201
+ return true;
1202
+ }
1203
+
1204
+ if indent_lines == total && keyword_lines >= 1 && assignment_lines >= 1 {
1205
+ return true;
1206
+ }
1207
+
1208
+ false
1209
+ }
1210
+
983
1211
  fn detect_code_language(lines: &[String]) -> Option<&'static str> {
984
1212
  let lower_lines: Vec<String> = lines.iter().map(|line| line.to_lowercase()).collect();
985
1213
  if lower_lines.iter().any(|line| line.contains("function"))
@@ -3,7 +3,7 @@
3
3
  High-performance HTML → Markdown conversion powered by Rust. Shipping as a Rust crate, Python package, Node.js bindings, WebAssembly, and standalone CLI with identical rendering behaviour.
4
4
 
5
5
  [![PyPI version](https://badge.fury.io/py/html-to-markdown.svg)](https://pypi.org/project/html-to-markdown/)
6
- [![npm version](https://badge.fury.io/js/html-to-markdown.svg)](https://www.npmjs.com/package/html-to-markdown)
6
+ [![npm version](https://badge.fury.io/js/html-to-markdown.svg)](https://www.npmjs.com/package/html-to-markdown-node)
7
7
  [![Crates.io](https://img.shields.io/crates/v/html-to-markdown-rs.svg)](https://crates.io/crates/html-to-markdown-rs)
8
8
  [![Python Versions](https://img.shields.io/pypi/pyversions/html-to-markdown.svg)](https://pypi.org/project/html-to-markdown/)
9
9
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/Goldziher/html-to-markdown/blob/main/LICENSE)
@@ -23,9 +23,9 @@ High-performance HTML → Markdown conversion powered by Rust. Shipping as a Rus
23
23
 
24
24
  | Target | Command |
25
25
  | --------------------------- | ------------------------------------------------------------------------- |
26
- | **Node.js/Bun** (native) | `npm install @html-to-markdown/node` |
27
- | **WebAssembly** (universal) | `npm install @html-to-markdown/wasm` |
28
- | **Deno** | `import { convert } from "npm:@html-to-markdown/wasm"` |
26
+ | **Node.js/Bun** (native) | `npm install html-to-markdown-node` |
27
+ | **WebAssembly** (universal) | `npm install html-to-markdown-wasm` |
28
+ | **Deno** | `import { convert } from "npm:html-to-markdown-wasm"` |
29
29
  | **Python** (bindings + CLI) | `pip install html-to-markdown` |
30
30
  | **Rust** crate | `cargo add html-to-markdown-rs` |
31
31
  | Rust CLI | `cargo install html-to-markdown-cli` |
@@ -39,7 +39,7 @@ High-performance HTML → Markdown conversion powered by Rust. Shipping as a Rus
39
39
  **Node.js / Bun (Native - Fastest):**
40
40
 
41
41
  ```typescript
42
- import { convert } from '@html-to-markdown/node';
42
+ import { convert } from 'html-to-markdown-node';
43
43
 
44
44
  const html = '<h1>Hello</h1><p>Rust ❤️ Markdown</p>';
45
45
  const markdown = convert(html, {
@@ -52,8 +52,8 @@ const markdown = convert(html, {
52
52
  **Deno / Browsers / Edge (Universal):**
53
53
 
54
54
  ```typescript
55
- import { convert } from "npm:@html-to-markdown/wasm"; // Deno
56
- // or: import { convert } from '@html-to-markdown/wasm'; // Bundlers
55
+ import { convert } from "npm:html-to-markdown-wasm"; // Deno
56
+ // or: import { convert } from 'html-to-markdown-wasm'; // Bundlers
57
57
 
58
58
  const markdown = convert(html, {
59
59
  headingStyle: 'atx',
@@ -15,7 +15,13 @@ V1 API (backward compatibility):
15
15
  markdown = convert_to_markdown(html, heading_style="atx")
16
16
  """
17
17
 
18
- from html_to_markdown.api import convert
18
+ from html_to_markdown.api import (
19
+ InlineImage,
20
+ InlineImageConfig,
21
+ InlineImageWarning,
22
+ convert,
23
+ convert_with_inline_images,
24
+ )
19
25
  from html_to_markdown.exceptions import (
20
26
  ConflictingOptionsError,
21
27
  EmptyHtmlError,
@@ -31,12 +37,16 @@ __all__ = [
31
37
  "ConversionOptions",
32
38
  "EmptyHtmlError",
33
39
  "HtmlToMarkdownError",
40
+ "InlineImage",
41
+ "InlineImageConfig",
42
+ "InlineImageWarning",
34
43
  "InvalidParserError",
35
44
  "MissingDependencyError",
36
45
  "PreprocessingOptions",
37
46
  "convert",
38
47
  "convert_to_markdown",
48
+ "convert_with_inline_images",
39
49
  "markdownify",
40
50
  ]
41
51
 
42
- __version__ = "2.3.0"
52
+ __version__ = "2.3.3"
@@ -0,0 +1,143 @@
1
+ """New v2 functional API for HTML to Markdown conversion.
2
+
3
+ This module provides the new functional API with dataclass-based options,
4
+ using the Rust backend for conversion.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from typing import TYPE_CHECKING, Literal, TypedDict, cast
10
+
11
+ import html_to_markdown._html_to_markdown as _rust # type: ignore[import-not-found]
12
+ from html_to_markdown.options import ConversionOptions, PreprocessingOptions
13
+
14
+ if TYPE_CHECKING:
15
+ from html_to_markdown._html_to_markdown import InlineImageConfig
16
+ else:
17
+ InlineImageConfig = _rust.InlineImageConfig # type: ignore[misc, assignment]
18
+
19
+
20
+ class InlineImage(TypedDict):
21
+ """Inline image extracted during conversion."""
22
+
23
+ data: bytes
24
+ format: str
25
+ filename: str | None
26
+ description: str | None
27
+ dimensions: tuple[int, int] | None
28
+ source: Literal["img_data_uri", "svg_element"]
29
+ attributes: dict[str, str]
30
+
31
+
32
+ class InlineImageWarning(TypedDict):
33
+ """Warning produced during inline image extraction."""
34
+
35
+ index: int
36
+ message: str
37
+
38
+
39
+ def _to_rust_preprocessing(options: PreprocessingOptions) -> _rust.PreprocessingOptions:
40
+ """Convert high-level preprocessing options to the Rust bindings."""
41
+ return _rust.PreprocessingOptions(
42
+ enabled=options.enabled,
43
+ preset=options.preset,
44
+ remove_navigation=options.remove_navigation,
45
+ remove_forms=options.remove_forms,
46
+ )
47
+
48
+
49
+ def _to_rust_options(
50
+ options: ConversionOptions,
51
+ preprocessing: PreprocessingOptions,
52
+ ) -> _rust.ConversionOptions:
53
+ """Convert high-level conversion options to the Rust bindings."""
54
+ return _rust.ConversionOptions(
55
+ heading_style=options.heading_style,
56
+ list_indent_type=options.list_indent_type,
57
+ list_indent_width=options.list_indent_width,
58
+ bullets=options.bullets,
59
+ strong_em_symbol=options.strong_em_symbol,
60
+ escape_asterisks=options.escape_asterisks,
61
+ escape_underscores=options.escape_underscores,
62
+ escape_misc=options.escape_misc,
63
+ escape_ascii=options.escape_ascii,
64
+ code_language=options.code_language,
65
+ autolinks=options.autolinks,
66
+ default_title=options.default_title,
67
+ br_in_tables=options.br_in_tables,
68
+ hocr_spatial_tables=options.hocr_spatial_tables,
69
+ highlight_style=options.highlight_style,
70
+ extract_metadata=options.extract_metadata,
71
+ whitespace_mode=options.whitespace_mode,
72
+ strip_newlines=options.strip_newlines,
73
+ wrap=options.wrap,
74
+ wrap_width=options.wrap_width,
75
+ convert_as_inline=options.convert_as_inline,
76
+ sub_symbol=options.sub_symbol,
77
+ sup_symbol=options.sup_symbol,
78
+ newline_style=options.newline_style,
79
+ code_block_style=options.code_block_style,
80
+ keep_inline_images_in=list(options.keep_inline_images_in) if options.keep_inline_images_in else [],
81
+ preprocessing=_to_rust_preprocessing(preprocessing),
82
+ encoding=options.encoding,
83
+ debug=options.debug,
84
+ strip_tags=list(options.strip_tags) if options.strip_tags else [],
85
+ )
86
+
87
+
88
+ def convert(
89
+ html: str,
90
+ options: ConversionOptions | None = None,
91
+ preprocessing: PreprocessingOptions | None = None,
92
+ ) -> str:
93
+ """Convert HTML to Markdown using the Rust backend.
94
+
95
+ Args:
96
+ html: HTML string to convert.
97
+ options: Conversion configuration options (defaults to ConversionOptions()).
98
+ preprocessing: HTML preprocessing options (defaults to PreprocessingOptions()).
99
+
100
+ Returns:
101
+ Converted Markdown string.
102
+ """
103
+ if options is None:
104
+ options = ConversionOptions()
105
+ if preprocessing is None:
106
+ preprocessing = PreprocessingOptions()
107
+
108
+ rust_options = _to_rust_options(options, preprocessing)
109
+ return cast("str", _rust.convert(html, rust_options))
110
+
111
+
112
+ def convert_with_inline_images(
113
+ html: str,
114
+ options: ConversionOptions | None = None,
115
+ preprocessing: PreprocessingOptions | None = None,
116
+ image_config: InlineImageConfig | None = None,
117
+ ) -> tuple[str, list[InlineImage], list[InlineImageWarning]]:
118
+ """Convert HTML and extract inline images.
119
+
120
+ Returns Markdown along with extracted inline images and any warnings.
121
+ """
122
+ if options is None:
123
+ options = ConversionOptions()
124
+ if preprocessing is None:
125
+ preprocessing = PreprocessingOptions()
126
+ if image_config is None:
127
+ image_config = InlineImageConfig()
128
+
129
+ rust_options = _to_rust_options(options, preprocessing)
130
+ markdown, images, warnings = cast(
131
+ "tuple[str, list[InlineImage], list[InlineImageWarning]]",
132
+ _rust.convert_with_inline_images(html, rust_options, image_config),
133
+ )
134
+ return markdown, list(images), list(warnings)
135
+
136
+
137
+ __all__ = [
138
+ "InlineImage",
139
+ "InlineImageConfig",
140
+ "InlineImageWarning",
141
+ "convert",
142
+ "convert_with_inline_images",
143
+ ]
@@ -7,7 +7,7 @@ requires = [
7
7
 
8
8
  [project]
9
9
  name = "html-to-markdown"
10
- version = "2.3.0"
10
+ version = "2.3.3"
11
11
  description = "High-performance HTML to Markdown converter powered by Rust with a clean Python API"
12
12
  readme = "README_PYPI.md"
13
13
  keywords = [
@@ -1,74 +0,0 @@
1
- """New v2 functional API for HTML to Markdown conversion.
2
-
3
- This module provides the new functional API with dataclass-based options,
4
- using the Rust backend for conversion.
5
- """
6
-
7
- from __future__ import annotations
8
-
9
- import html_to_markdown._html_to_markdown as _rust # type: ignore[import-not-found]
10
- from html_to_markdown.options import ConversionOptions, PreprocessingOptions
11
-
12
-
13
def convert(
    html: str,
    options: ConversionOptions | None = None,
    preprocessing: PreprocessingOptions | None = None,
) -> str:
    """Render an HTML document as Markdown via the Rust backend.

    The Python-side option objects are copied field by field into the
    corresponding ``_rust`` option objects before the backend is invoked.

    Args:
        html: The HTML source text to convert.
        options: Conversion settings. When omitted, a fresh
            ``ConversionOptions()`` is created for this call.
        preprocessing: HTML preprocessing settings. When omitted, a fresh
            ``PreprocessingOptions()`` is created for this call.

    Returns:
        The Markdown text returned by the Rust backend.
    """
    opts = ConversionOptions() if options is None else options
    pre = PreprocessingOptions() if preprocessing is None else preprocessing

    # The preprocessing settings are built first because they are nested
    # inside the backend conversion options below.
    backend_preprocessing = _rust.PreprocessingOptions(
        enabled=pre.enabled,
        preset=pre.preset,
        remove_navigation=pre.remove_navigation,
        remove_forms=pre.remove_forms,
    )

    # Collection-valued options are handed to the backend as plain lists;
    # an unset or empty value becomes an empty list.
    inline_image_parents = opts.keep_inline_images_in
    tags_to_strip = opts.strip_tags

    backend_options = _rust.ConversionOptions(
        heading_style=opts.heading_style,
        list_indent_type=opts.list_indent_type,
        list_indent_width=opts.list_indent_width,
        bullets=opts.bullets,
        strong_em_symbol=opts.strong_em_symbol,
        escape_asterisks=opts.escape_asterisks,
        escape_underscores=opts.escape_underscores,
        escape_misc=opts.escape_misc,
        escape_ascii=opts.escape_ascii,
        code_language=opts.code_language,
        autolinks=opts.autolinks,
        default_title=opts.default_title,
        br_in_tables=opts.br_in_tables,
        hocr_spatial_tables=opts.hocr_spatial_tables,
        highlight_style=opts.highlight_style,
        extract_metadata=opts.extract_metadata,
        whitespace_mode=opts.whitespace_mode,
        strip_newlines=opts.strip_newlines,
        wrap=opts.wrap,
        wrap_width=opts.wrap_width,
        convert_as_inline=opts.convert_as_inline,
        sub_symbol=opts.sub_symbol,
        sup_symbol=opts.sup_symbol,
        newline_style=opts.newline_style,
        code_block_style=opts.code_block_style,
        keep_inline_images_in=list(inline_image_parents) if inline_image_parents else [],
        preprocessing=backend_preprocessing,
        encoding=opts.encoding,
        debug=opts.debug,
        strip_tags=list(tags_to_strip) if tags_to_strip else [],
    )

    # The annotated local gives type checkers a concrete ``str`` return type.
    markdown: str = _rust.convert(html, backend_options)
    return markdown