html-to-markdown 2.30.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -14
- data/README.md +37 -50
- data/ext/html-to-markdown-rb/native/Cargo.lock +13 -701
- data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
- data/ext/html-to-markdown-rb/native/README.md +4 -13
- data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
- data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
- data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
- data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
- data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +13 -194
- data/sig/html_to_markdown.rbs +12 -373
- data/vendor/Cargo.toml +5 -2
- data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
- data/vendor/html-to-markdown-rs/README.md +126 -52
- data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
- data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
- data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
- data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
- data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
- data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
- data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
- data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
- data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
- data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
- data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
- data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
- data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
- data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
- data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
- data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
- data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
- data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
- data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
- data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
- data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
- data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -68
- data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
- data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
- data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
- data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
- data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
- data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
- data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
- data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
- data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
- data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
- data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
- data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
- data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
- data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
- data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
- data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
- data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
- data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
- data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
- data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
- data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -323
- data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
- data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
- data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
- data/vendor/html-to-markdown-rs/src/text.rs +25 -14
- data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
- data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
- data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
- data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
- data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
- data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
- data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
- data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
- data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
- data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
- data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
- data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
- data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
- data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
- data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
- data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
- data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
- data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
- data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
- data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
- data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
- metadata +9 -37
- data/bin/benchmark.rb +0 -232
- data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
- data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
- data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
- data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
- data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
- data/spec/convert_spec.rb +0 -77
- data/spec/convert_with_tables_spec.rb +0 -194
- data/spec/metadata_extraction_spec.rb +0 -437
- data/spec/visitor_issue_187_spec.rb +0 -605
- data/spec/visitor_spec.rb +0 -1149
- data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
- data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
- data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
- data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
- data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
- data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
- data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
- data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
- data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
- data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -42
- data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
- data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
- data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
- data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
- data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
- data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
- data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
- data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
- data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
- data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
- data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
|
@@ -1,254 +0,0 @@
|
|
|
1
|
-
//! Code block analysis and validation
|
|
2
|
-
|
|
3
|
-
use super::hierarchy::CodeLineInfo;
|
|
4
|
-
use super::keywords::{contains_keyword_token, is_shell_prompt, starts_with_keyword};
|
|
5
|
-
|
|
6
|
-
/// Report whether `line` opens with a list marker.
///
/// Leading whitespace is ignored. Two marker shapes are recognised:
///
/// * unordered: `- `, `* ` or `+ ` (each including the trailing space), or a bare `•`;
/// * ordered: one or more ASCII digits, then `.` or `)`, then a whitespace character.
///
/// Empty and whitespace-only lines are never bullet-like.
pub fn is_bullet_like(line: &str) -> bool {
    let body = line.trim_start();

    let unordered_markers = ["- ", "* ", "+ ", "•"];
    if unordered_markers.iter().any(|marker| body.starts_with(*marker)) {
        return true;
    }

    // Ordered marker: drop the leading digit run and inspect what follows it.
    let after_digits = body.trim_start_matches(|c: char| c.is_ascii_digit());
    if after_digits.len() == body.len() {
        // No digits were stripped, so this cannot be a numbered item.
        return false;
    }

    let mut tail = after_digits.chars();
    matches!(tail.next(), Some('.' | ')')) && tail.next().is_some_and(char::is_whitespace)
}
|
|
39
|
-
|
|
40
|
-
/// Determine if lines form a paragraph of code
|
|
41
|
-
pub fn is_code_paragraph(lines: &[CodeLineInfo]) -> bool {
|
|
42
|
-
if lines.is_empty() {
|
|
43
|
-
return false;
|
|
44
|
-
}
|
|
45
|
-
|
|
46
|
-
let mut strong_markers = 0;
|
|
47
|
-
let mut moderate_markers = 0;
|
|
48
|
-
let mut total = 0;
|
|
49
|
-
|
|
50
|
-
for info in lines {
|
|
51
|
-
let text = info.text.trim();
|
|
52
|
-
if text.is_empty() {
|
|
53
|
-
continue;
|
|
54
|
-
}
|
|
55
|
-
|
|
56
|
-
if is_bullet_like(&info.text) {
|
|
57
|
-
return false;
|
|
58
|
-
}
|
|
59
|
-
|
|
60
|
-
total += 1;
|
|
61
|
-
let lower = text.to_lowercase();
|
|
62
|
-
let trimmed = text.trim_start();
|
|
63
|
-
|
|
64
|
-
let documentation_tokens = [
|
|
65
|
-
"definition",
|
|
66
|
-
"theorem",
|
|
67
|
-
"lemma",
|
|
68
|
-
"proof",
|
|
69
|
-
"corollary",
|
|
70
|
-
"algorithm",
|
|
71
|
-
"figure",
|
|
72
|
-
"table",
|
|
73
|
-
"appendix",
|
|
74
|
-
];
|
|
75
|
-
if documentation_tokens
|
|
76
|
-
.iter()
|
|
77
|
-
.any(|token| contains_keyword_token(&lower, token))
|
|
78
|
-
{
|
|
79
|
-
return false;
|
|
80
|
-
}
|
|
81
|
-
|
|
82
|
-
let has_keyword = (starts_with_keyword(trimmed, "function") && text.contains('('))
|
|
83
|
-
|| (starts_with_keyword(trimmed, "return")
|
|
84
|
-
&& trimmed.chars().nth("return".len()).is_none_or(char::is_whitespace))
|
|
85
|
-
|| trimmed.starts_with("console.")
|
|
86
|
-
|| starts_with_keyword(trimmed, "async")
|
|
87
|
-
|| starts_with_keyword(trimmed, "await")
|
|
88
|
-
|| (starts_with_keyword(trimmed, "class") && (text.contains('{') || text.contains(':')))
|
|
89
|
-
|| (starts_with_keyword(trimmed, "struct") && text.contains('{'))
|
|
90
|
-
|| (starts_with_keyword(trimmed, "enum") && text.contains('{'))
|
|
91
|
-
|| (starts_with_keyword(trimmed, "def") && (text.contains('(') || text.contains(':')))
|
|
92
|
-
|| (starts_with_keyword(trimmed, "fn") && text.contains('('))
|
|
93
|
-
|| (starts_with_keyword(trimmed, "pub")
|
|
94
|
-
&& (text.contains("fn") || text.contains("struct") || text.contains("enum")))
|
|
95
|
-
|| starts_with_keyword(trimmed, "import")
|
|
96
|
-
|| starts_with_keyword(trimmed, "using")
|
|
97
|
-
|| starts_with_keyword(trimmed, "namespace")
|
|
98
|
-
|| starts_with_keyword(trimmed, "public")
|
|
99
|
-
|| starts_with_keyword(trimmed, "private")
|
|
100
|
-
|| starts_with_keyword(trimmed, "protected")
|
|
101
|
-
|| starts_with_keyword(trimmed, "static")
|
|
102
|
-
|| starts_with_keyword(trimmed, "void")
|
|
103
|
-
|| starts_with_keyword(trimmed, "try")
|
|
104
|
-
|| starts_with_keyword(trimmed, "catch")
|
|
105
|
-
|| starts_with_keyword(trimmed, "finally")
|
|
106
|
-
|| starts_with_keyword(trimmed, "throw")
|
|
107
|
-
|| starts_with_keyword(trimmed, "typedef")
|
|
108
|
-
|| starts_with_keyword(trimmed, "package")
|
|
109
|
-
|| starts_with_keyword(trimmed, "module");
|
|
110
|
-
|
|
111
|
-
let has_symbol = text.contains(';') || text.contains("::");
|
|
112
|
-
|
|
113
|
-
if has_keyword || has_symbol {
|
|
114
|
-
strong_markers += 1;
|
|
115
|
-
continue;
|
|
116
|
-
}
|
|
117
|
-
|
|
118
|
-
if is_shell_prompt(text) {
|
|
119
|
-
strong_markers += 1;
|
|
120
|
-
continue;
|
|
121
|
-
}
|
|
122
|
-
let has_assignment = text.contains(" = ")
|
|
123
|
-
|| text.contains("+=")
|
|
124
|
-
|| text.contains("-=")
|
|
125
|
-
|| text.contains("*=")
|
|
126
|
-
|| text.contains("/=")
|
|
127
|
-
|| text.contains(" := ")
|
|
128
|
-
|| text.contains(" == ");
|
|
129
|
-
|
|
130
|
-
let has_arrow = text.contains("=>");
|
|
131
|
-
let has_brace = text.contains('{') || text.contains('}');
|
|
132
|
-
let has_pointer_arrow = text.contains("->");
|
|
133
|
-
|
|
134
|
-
if has_assignment || has_arrow || has_brace || has_pointer_arrow {
|
|
135
|
-
moderate_markers += 1;
|
|
136
|
-
}
|
|
137
|
-
}
|
|
138
|
-
|
|
139
|
-
if total == 0 {
|
|
140
|
-
return false;
|
|
141
|
-
}
|
|
142
|
-
if strong_markers == 0 {
|
|
143
|
-
return false;
|
|
144
|
-
}
|
|
145
|
-
if strong_markers * 2 >= total {
|
|
146
|
-
return true;
|
|
147
|
-
}
|
|
148
|
-
(strong_markers + moderate_markers) * 2 >= total
|
|
149
|
-
}
|
|
150
|
-
|
|
151
|
-
/// Check if code block is likely valid and not just prose
|
|
152
|
-
pub fn is_confident_code_block(lines: &[CodeLineInfo]) -> bool {
|
|
153
|
-
let mut total = 0;
|
|
154
|
-
let mut keyword_lines = 0;
|
|
155
|
-
let mut punctuation_lines = 0;
|
|
156
|
-
let mut assignment_lines = 0;
|
|
157
|
-
let mut shell_lines = 0;
|
|
158
|
-
let mut indent_lines = 0;
|
|
159
|
-
|
|
160
|
-
let min_x = lines.iter().map(|info| info.x1).min().unwrap_or_default();
|
|
161
|
-
|
|
162
|
-
for info in lines {
|
|
163
|
-
let text = info.text.trim();
|
|
164
|
-
if text.is_empty() {
|
|
165
|
-
continue;
|
|
166
|
-
}
|
|
167
|
-
total += 1;
|
|
168
|
-
|
|
169
|
-
if is_shell_prompt(text) {
|
|
170
|
-
shell_lines += 1;
|
|
171
|
-
}
|
|
172
|
-
|
|
173
|
-
let trimmed = text.trim_start();
|
|
174
|
-
|
|
175
|
-
if (starts_with_keyword(trimmed, "function") && text.contains('('))
|
|
176
|
-
|| trimmed.starts_with("console.")
|
|
177
|
-
|| (starts_with_keyword(trimmed, "return")
|
|
178
|
-
&& trimmed.chars().nth("return".len()).is_none_or(char::is_whitespace))
|
|
179
|
-
|| starts_with_keyword(trimmed, "async")
|
|
180
|
-
|| starts_with_keyword(trimmed, "await")
|
|
181
|
-
|| (starts_with_keyword(trimmed, "class") && (text.contains('{') || text.contains(':')))
|
|
182
|
-
|| (starts_with_keyword(trimmed, "struct") && text.contains('{'))
|
|
183
|
-
|| (starts_with_keyword(trimmed, "enum") && text.contains('{'))
|
|
184
|
-
|| (starts_with_keyword(trimmed, "def") && (text.contains('(') || text.contains(':')))
|
|
185
|
-
|| (starts_with_keyword(trimmed, "fn") && text.contains('('))
|
|
186
|
-
|| (starts_with_keyword(trimmed, "pub")
|
|
187
|
-
&& (text.contains("fn") || text.contains("struct") || text.contains("enum")))
|
|
188
|
-
|| starts_with_keyword(trimmed, "import")
|
|
189
|
-
|| starts_with_keyword(trimmed, "using")
|
|
190
|
-
|| starts_with_keyword(trimmed, "namespace")
|
|
191
|
-
|| starts_with_keyword(trimmed, "public")
|
|
192
|
-
|| starts_with_keyword(trimmed, "private")
|
|
193
|
-
|| starts_with_keyword(trimmed, "protected")
|
|
194
|
-
|| starts_with_keyword(trimmed, "static")
|
|
195
|
-
|| starts_with_keyword(trimmed, "void")
|
|
196
|
-
|| starts_with_keyword(trimmed, "try")
|
|
197
|
-
|| starts_with_keyword(trimmed, "catch")
|
|
198
|
-
|| starts_with_keyword(trimmed, "finally")
|
|
199
|
-
|| starts_with_keyword(trimmed, "throw")
|
|
200
|
-
|| starts_with_keyword(trimmed, "typedef")
|
|
201
|
-
|| starts_with_keyword(trimmed, "package")
|
|
202
|
-
|| starts_with_keyword(trimmed, "module")
|
|
203
|
-
{
|
|
204
|
-
keyword_lines += 1;
|
|
205
|
-
}
|
|
206
|
-
|
|
207
|
-
if text.contains(';')
|
|
208
|
-
|| text.contains('{')
|
|
209
|
-
|| text.contains('}')
|
|
210
|
-
|| text.contains("::")
|
|
211
|
-
|| text.contains("->")
|
|
212
|
-
|| text.contains("=>")
|
|
213
|
-
{
|
|
214
|
-
punctuation_lines += 1;
|
|
215
|
-
}
|
|
216
|
-
|
|
217
|
-
if text.contains(" = ")
|
|
218
|
-
|| text.contains("+=")
|
|
219
|
-
|| text.contains("-=")
|
|
220
|
-
|| text.contains("*=")
|
|
221
|
-
|| text.contains("/=")
|
|
222
|
-
|| text.contains(" := ")
|
|
223
|
-
|| text.contains(" == ")
|
|
224
|
-
{
|
|
225
|
-
assignment_lines += 1;
|
|
226
|
-
}
|
|
227
|
-
|
|
228
|
-
if info.x1 > min_x + 8 {
|
|
229
|
-
indent_lines += 1;
|
|
230
|
-
}
|
|
231
|
-
}
|
|
232
|
-
|
|
233
|
-
if total < 3 {
|
|
234
|
-
return false;
|
|
235
|
-
}
|
|
236
|
-
|
|
237
|
-
if shell_lines >= 2 && shell_lines * 2 >= total {
|
|
238
|
-
return true;
|
|
239
|
-
}
|
|
240
|
-
|
|
241
|
-
if keyword_lines >= 2 && assignment_lines >= 1 {
|
|
242
|
-
return true;
|
|
243
|
-
}
|
|
244
|
-
|
|
245
|
-
if keyword_lines >= 1 && punctuation_lines >= 1 && assignment_lines >= 1 {
|
|
246
|
-
return true;
|
|
247
|
-
}
|
|
248
|
-
|
|
249
|
-
if indent_lines == total && keyword_lines >= 1 && assignment_lines >= 1 {
|
|
250
|
-
return true;
|
|
251
|
-
}
|
|
252
|
-
|
|
253
|
-
false
|
|
254
|
-
}
|
|
@@ -1,249 +0,0 @@
|
|
|
1
|
-
//! Core conversion logic for hOCR to Markdown
|
|
2
|
-
|
|
3
|
-
use super::elements::convert_element;
|
|
4
|
-
use super::output::{ConvertContext, collapse_extra_newlines};
|
|
5
|
-
use crate::hocr::types::HocrElement;
|
|
6
|
-
|
|
7
|
-
/// Convert hOCR elements to Markdown with semantic formatting
|
|
8
|
-
///
|
|
9
|
-
/// Transforms hOCR document structure into clean, readable Markdown while preserving
|
|
10
|
-
/// document hierarchy and semantic meaning.
|
|
11
|
-
///
|
|
12
|
-
/// # Arguments
|
|
13
|
-
///
|
|
14
|
-
/// * `elements` - hOCR elements to convert (typically from `extract_hocr_document`)
|
|
15
|
-
/// * `preserve_structure` - If `true`, sorts elements by their `order` property to respect reading order
|
|
16
|
-
///
|
|
17
|
-
/// # Returns
|
|
18
|
-
///
|
|
19
|
-
/// A `String` containing the formatted Markdown output
|
|
20
|
-
///
|
|
21
|
-
/// # Semantic Conversion
|
|
22
|
-
///
|
|
23
|
-
/// All 40 hOCR 1.2 element types are converted with appropriate markdown formatting:
|
|
24
|
-
///
|
|
25
|
-
/// | hOCR Element | Markdown Output |
|
|
26
|
-
/// |--------------|-----------------|
|
|
27
|
-
/// | `ocr_title`, `ocr_chapter` | `# Heading` |
|
|
28
|
-
/// | `ocr_section` | `## Heading` |
|
|
29
|
-
/// | `ocr_subsection` | `### Heading` |
|
|
30
|
-
/// | `ocr_par` | Paragraph with blank lines |
|
|
31
|
-
/// | `ocr_blockquote` | `> Quote` |
|
|
32
|
-
/// | `ocr_abstract` | `**Abstract**` header |
|
|
33
|
-
/// | `ocr_author` | `*Author*` (italic) |
|
|
34
|
-
/// | `ocr_image`, `ocr_photo` | `` |
|
|
35
|
-
/// | `ocr_math`, `ocr_chem` | `` `formula` `` (inline code) |
|
|
36
|
-
/// | `ocr_display` | ` ```equation``` ` (code block) |
|
|
37
|
-
/// | `ocr_separator` | `---` (horizontal rule) |
|
|
38
|
-
/// | `ocr_dropcap` | `**Letter**` (bold) |
|
|
39
|
-
/// | `ocrx_word` | Word with markdown escaping |
|
|
40
|
-
///
|
|
41
|
-
/// # Example
|
|
42
|
-
///
|
|
43
|
-
/// ```rust
|
|
44
|
-
/// use html_to_markdown_rs::hocr::{extract_hocr_document, convert_to_markdown};
|
|
45
|
-
///
|
|
46
|
-
/// let html = r#"<div class="ocr_page">
|
|
47
|
-
/// <h1 class="ocr_title">Document Title</h1>
|
|
48
|
-
/// <p class="ocr_par" title="order 1">
|
|
49
|
-
/// <span class="ocrx_word" title="bbox 10 10 50 30; x_wconf 95">Hello</span>
|
|
50
|
-
/// <span class="ocrx_word" title="bbox 60 10 100 30; x_wconf 92">World</span>
|
|
51
|
-
/// </p>
|
|
52
|
-
/// </div>"#;
|
|
53
|
-
///
|
|
54
|
-
/// let dom = tl::parse(html, tl::ParserOptions::default()).unwrap();
|
|
55
|
-
/// let (elements, _) = extract_hocr_document(&dom);
|
|
56
|
-
/// let markdown = convert_to_markdown(&elements, true);
|
|
57
|
-
/// // Output: "# Document Title\n\nHello World"
|
|
58
|
-
/// ```
|
|
59
|
-
#[must_use]
|
|
60
|
-
pub fn convert_to_markdown(elements: &[HocrElement], preserve_structure: bool) -> String {
|
|
61
|
-
convert_to_markdown_with_options(elements, preserve_structure, true)
|
|
62
|
-
}
|
|
63
|
-
|
|
64
|
-
/// Convert hOCR elements to Markdown with advanced options.
|
|
65
|
-
///
|
|
66
|
-
/// Transforms hOCR document structure into clean, readable Markdown with fine-grained
|
|
67
|
-
/// control over structure preservation and spatial table reconstruction behavior.
|
|
68
|
-
///
|
|
69
|
-
/// # Arguments
|
|
70
|
-
///
|
|
71
|
-
/// * `elements` - hOCR elements to convert (typically from `extract_hocr_document`)
|
|
72
|
-
/// * `preserve_structure` - If `true`, sorts elements by their `order` property to respect reading order.
|
|
73
|
-
/// If `false`, elements are processed in their original tree order.
|
|
74
|
-
/// * `enable_spatial_tables` - If `true`, attempts to reconstruct table structure from spatial
|
|
75
|
-
/// positioning of words. If `false`, word positions are ignored and only text content is used.
|
|
76
|
-
///
|
|
77
|
-
/// # Returns
|
|
78
|
-
///
|
|
79
|
-
/// A `String` containing the formatted Markdown output
|
|
80
|
-
///
|
|
81
|
-
/// # Performance
|
|
82
|
-
///
|
|
83
|
-
/// - Spatial table reconstruction is more computationally expensive but produces better table formatting
|
|
84
|
-
/// - For documents without tables, setting `enable_spatial_tables` to `false` improves performance
|
|
85
|
-
/// - Structure preservation requires sorting which adds O(n log n) complexity; disable if not needed
|
|
86
|
-
#[must_use]
|
|
87
|
-
pub fn convert_to_markdown_with_options(
|
|
88
|
-
elements: &[HocrElement],
|
|
89
|
-
preserve_structure: bool,
|
|
90
|
-
enable_spatial_tables: bool,
|
|
91
|
-
) -> String {
|
|
92
|
-
let mut output = String::new();
|
|
93
|
-
let mut ctx = ConvertContext::default();
|
|
94
|
-
|
|
95
|
-
if preserve_structure && should_sort_children(elements) {
|
|
96
|
-
let mut sorted_elements: Vec<&HocrElement> = elements.iter().collect();
|
|
97
|
-
sorted_elements.sort_by_key(|e| e.properties.order.unwrap_or(u32::MAX));
|
|
98
|
-
for element in sorted_elements {
|
|
99
|
-
convert_element(
|
|
100
|
-
element,
|
|
101
|
-
&mut output,
|
|
102
|
-
0,
|
|
103
|
-
preserve_structure,
|
|
104
|
-
enable_spatial_tables,
|
|
105
|
-
&mut ctx,
|
|
106
|
-
);
|
|
107
|
-
}
|
|
108
|
-
} else {
|
|
109
|
-
for element in elements {
|
|
110
|
-
convert_element(
|
|
111
|
-
element,
|
|
112
|
-
&mut output,
|
|
113
|
-
0,
|
|
114
|
-
preserve_structure,
|
|
115
|
-
enable_spatial_tables,
|
|
116
|
-
&mut ctx,
|
|
117
|
-
);
|
|
118
|
-
}
|
|
119
|
-
}
|
|
120
|
-
|
|
121
|
-
collapse_extra_newlines(&mut output);
|
|
122
|
-
output.trim().to_string()
|
|
123
|
-
}
|
|
124
|
-
|
|
125
|
-
/// Report whether `children` are out of sequence with respect to `order`.
///
/// Returns `true` as soon as some element has a smaller `order` than the element
/// directly before it. Elements without an `order` are ranked as `u32::MAX`.
/// Slices with fewer than two elements are never out of sequence.
fn should_sort_children(children: &[HocrElement]) -> bool {
    let rank = |element: &HocrElement| element.properties.order.unwrap_or(u32::MAX);

    children
        .windows(2)
        .any(|pair| rank(&pair[1]) < rank(&pair[0]))
}
|
|
140
|
-
|
|
141
|
-
#[cfg(test)]
mod tests {
    use super::*;
    use crate::hocr::types::{BBox, HocrElement, HocrElementType, HocrProperties};

    /// A lone title element renders as a level-one heading.
    #[test]
    fn test_convert_title() {
        let element = HocrElement {
            element_type: HocrElementType::OcrTitle,
            properties: HocrProperties::default(),
            text: "Document Title".to_string(),
            children: vec![],
        };

        let markdown = convert_to_markdown(&[element], true);
        assert_eq!(markdown, "# Document Title");
    }

    /// The `enable_spatial_tables` flag switches table reconstruction on and off.
    #[test]
    fn test_spatial_table_reconstruction_can_be_disabled() {
        /// Build a word element with a 40x20 bounding box anchored at (`x1`, `y1`).
        fn word(text: &str, x1: u32, y1: u32) -> HocrElement {
            HocrElement {
                element_type: HocrElementType::OcrxWord,
                properties: HocrProperties {
                    bbox: Some(BBox {
                        x1,
                        y1,
                        x2: x1 + 40,
                        y2: y1 + 20,
                    }),
                    x_wconf: Some(95.0),
                    ..HocrProperties::default()
                },
                text: text.to_string(),
                children: vec![],
            }
        }

        // Two rows of three words laid out on a grid.
        let paragraph = HocrElement {
            element_type: HocrElementType::OcrPar,
            properties: HocrProperties::default(),
            text: String::new(),
            children: vec![
                word("A", 10, 10),
                word("B", 120, 10),
                word("C", 230, 10),
                word("D", 12, 60),
                word("E", 122, 60),
                word("F", 232, 60),
            ],
        };

        let markdown_with_tables = convert_to_markdown_with_options(std::slice::from_ref(&paragraph), true, true);
        assert!(
            markdown_with_tables.contains("| --- |"),
            "Expected spatial table reconstruction to produce a markdown table"
        );

        let markdown_without_tables = convert_to_markdown_with_options(std::slice::from_ref(&paragraph), true, false);
        assert!(
            !markdown_without_tables.contains('|'),
            "Table reconstruction should be disabled when the flag is false"
        );
        assert!(
            markdown_without_tables.contains("A B C"),
            "Plain text output should retain original word order"
        );
    }

    /// Words nested in a paragraph all appear in the output.
    #[test]
    fn test_convert_paragraph_with_words() {
        let par = HocrElement {
            element_type: HocrElementType::OcrPar,
            properties: HocrProperties::default(),
            text: String::new(),
            children: vec![
                HocrElement {
                    element_type: HocrElementType::OcrxWord,
                    properties: HocrProperties::default(),
                    text: "Hello".to_string(),
                    children: vec![],
                },
                HocrElement {
                    element_type: HocrElementType::OcrxWord,
                    properties: HocrProperties::default(),
                    text: "World".to_string(),
                    children: vec![],
                },
            ],
        };

        let markdown = convert_to_markdown(&[par], true);
        assert!(markdown.contains("Hello"));
        assert!(markdown.contains("World"));
    }

    /// A blockquote element is emitted with the `> ` prefix.
    #[test]
    fn test_convert_blockquote() {
        let quote = HocrElement {
            element_type: HocrElementType::OcrBlockquote,
            properties: HocrProperties::default(),
            text: "This is a quote".to_string(),
            children: vec![],
        };

        let markdown = convert_to_markdown(&[quote], true);
        assert!(markdown.starts_with("> "));
    }
}
|