html-to-markdown 3.2.4 → 3.4.0.pre.rc.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Steepfile +6 -0
- data/ext/html_to_markdown_rb/Cargo.toml +2 -2
- data/ext/html_to_markdown_rb/native/Cargo.toml +28 -0
- data/ext/html_to_markdown_rb/src/html-to-markdown/version.rb +10 -0
- data/ext/html_to_markdown_rb/src/html-to-markdown.rb +13 -0
- data/ext/html_to_markdown_rb/src/lib.rs +2088 -268
- data/lib/bin/html-to-markdown +0 -0
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +5 -3
- data/sig/types.rbs +769 -0
- data/vendor/Cargo.toml +2 -2
- data/vendor/html-to-markdown-rs/Cargo.toml +1 -1
- data/vendor/html-to-markdown-rs/examples/basic.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/table.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_deser.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_escape.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_lists.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_tables.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +1 -1
- data/vendor/html-to-markdown-rs/src/convert_api.rs +15 -25
- data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/container.rs +3 -3
- data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +6 -7
- data/vendor/html-to-markdown-rs/src/converter/block/horizontal_rule.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/line_break.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/mod.rs +0 -108
- data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/layout.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +2 -4
- data/vendor/html-to-markdown-rs/src/converter/block/unknown.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/context.rs +10 -0
- data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
- data/vendor/html-to-markdown-rs/src/converter/form/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/format/mod.rs +0 -3
- data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +2 -2
- data/vendor/html-to-markdown-rs/src/converter/inline/mod.rs +0 -1
- data/vendor/html-to-markdown-rs/src/converter/inline/ruby.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/inline/semantic/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/list/definition.rs +3 -3
- data/vendor/html-to-markdown-rs/src/converter/list/item.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/list/mod.rs +0 -1
- data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +2 -2
- data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +2 -2
- data/vendor/html-to-markdown-rs/src/converter/main.rs +57 -31
- data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +8 -8
- data/vendor/html-to-markdown-rs/src/converter/media/image.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/media/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +5 -5
- data/vendor/html-to-markdown-rs/src/converter/mod.rs +6 -17
- data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +64 -11
- data/vendor/html-to-markdown-rs/src/converter/preprocessing_helpers.rs +80 -22
- data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/semantic/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +0 -4
- data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +5 -9
- data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +3 -3
- data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +10 -10
- data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +13 -13
- data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +4 -4
- data/vendor/html-to-markdown-rs/src/converter/utility/siblings.rs +6 -14
- data/vendor/html-to-markdown-rs/src/inline_images.rs +6 -0
- data/vendor/html-to-markdown-rs/src/lib.rs +17 -18
- data/vendor/html-to-markdown-rs/src/options/conversion.rs +31 -0
- data/vendor/html-to-markdown-rs/src/prelude.rs +1 -12
- data/vendor/html-to-markdown-rs/src/text.rs +0 -44
- data/vendor/html-to-markdown-rs/src/types/warnings.rs +2 -0
- data/vendor/html-to-markdown-rs/src/visitor/types.rs +5 -1
- data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +4 -1
- data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/exclude_selectors_test.rs +136 -0
- data/vendor/html-to-markdown-rs/tests/integration_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +2 -2
- data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +2 -2
- data/vendor/html-to-markdown-rs/tests/lists_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/reference_links_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/sectioning_elements_test.rs +137 -0
- data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/tables_test.rs +2 -2
- data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/test_issue_187.rs +5 -2
- data/vendor/html-to-markdown-rs/tests/test_issue_218.rs +4 -4
- data/vendor/html-to-markdown-rs/tests/test_issue_277.rs +77 -0
- data/vendor/html-to-markdown-rs/tests/test_max_depth.rs +82 -0
- data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +4 -4
- data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/visitor_code_integration_test.rs +6 -6
- data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +103 -35
- data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +1 -1
- metadata +21 -43
- data/.bundle/config +0 -2
- data/.gitignore +0 -3
- data/.rubocop.yml +0 -59
- data/Gemfile +0 -18
- data/Gemfile.lock +0 -173
- data/README.md +0 -331
- data/Rakefile +0 -26
- data/exe/html-to-markdown +0 -6
- data/ext/html_to_markdown_rb/src/html_to_markdown_rs/version.rb +0 -6
- data/ext/html_to_markdown_rb/src/html_to_markdown_rs.rb +0 -9
- data/html-to-markdown-rb.gemspec +0 -99
- data/lib/html_to_markdown_rs.rb +0 -3
- data/sig/html_to_markdown.rbs +0 -149
- data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +0 -94
- data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -86
- data/vendor/html-to-markdown-rs/src/safety.rs +0 -70
data/sig/html_to_markdown.rbs
DELETED
|
@@ -1,149 +0,0 @@
|
|
|
1
|
-
# Native extension module (Magnus/rb-sys)
|
|
2
|
-
module HtmlToMarkdownRs
|
|
3
|
-
class ConversionOptions
|
|
4
|
-
def initialize: (Hash[Symbol, untyped]) -> void
|
|
5
|
-
end
|
|
6
|
-
|
|
7
|
-
class ConversionResult
|
|
8
|
-
def content: () -> String?
|
|
9
|
-
end
|
|
10
|
-
|
|
11
|
-
def self.convert: (String html, ConversionOptions? options) -> ConversionResult
|
|
12
|
-
end
|
|
13
|
-
|
|
14
|
-
# Type definitions for HtmlToMarkdown Ruby gem
|
|
15
|
-
module HtmlToMarkdown
|
|
16
|
-
VERSION: String
|
|
17
|
-
|
|
18
|
-
type heading_style = :underlined | :atx | :atx_closed
|
|
19
|
-
type list_indent_type = :spaces | :tabs
|
|
20
|
-
type highlight_style = :double_equal | :html | :bold | :none
|
|
21
|
-
type whitespace_mode = :normalized | :strict
|
|
22
|
-
type newline_style = :spaces | :backslash
|
|
23
|
-
type code_block_style = :indented | :backticks | :tildes
|
|
24
|
-
type link_style = :inline | :reference
|
|
25
|
-
type output_format = :markdown | :djot
|
|
26
|
-
type preprocessing_preset = :minimal | :standard | :aggressive
|
|
27
|
-
|
|
28
|
-
type preprocessing_options = {
|
|
29
|
-
enabled?: bool,
|
|
30
|
-
preset?: preprocessing_preset,
|
|
31
|
-
remove_navigation?: bool,
|
|
32
|
-
remove_forms?: bool
|
|
33
|
-
}
|
|
34
|
-
|
|
35
|
-
type conversion_options = {
|
|
36
|
-
heading_style?: heading_style,
|
|
37
|
-
list_indent_type?: list_indent_type,
|
|
38
|
-
list_indent_width?: Integer,
|
|
39
|
-
bullets?: String,
|
|
40
|
-
strong_em_symbol?: String,
|
|
41
|
-
escape_asterisks?: bool,
|
|
42
|
-
escape_underscores?: bool,
|
|
43
|
-
escape_misc?: bool,
|
|
44
|
-
escape_ascii?: bool,
|
|
45
|
-
code_language?: String,
|
|
46
|
-
autolinks?: bool,
|
|
47
|
-
default_title?: bool,
|
|
48
|
-
br_in_tables?: bool,
|
|
49
|
-
highlight_style?: highlight_style,
|
|
50
|
-
extract_metadata?: bool,
|
|
51
|
-
whitespace_mode?: whitespace_mode,
|
|
52
|
-
strip_newlines?: bool,
|
|
53
|
-
wrap?: bool,
|
|
54
|
-
wrap_width?: Integer,
|
|
55
|
-
convert_as_inline?: bool,
|
|
56
|
-
sub_symbol?: String,
|
|
57
|
-
sup_symbol?: String,
|
|
58
|
-
newline_style?: newline_style,
|
|
59
|
-
code_block_style?: code_block_style,
|
|
60
|
-
keep_inline_images_in?: Array[String],
|
|
61
|
-
preprocessing?: preprocessing_options,
|
|
62
|
-
encoding?: String,
|
|
63
|
-
debug?: bool,
|
|
64
|
-
strip_tags?: Array[String],
|
|
65
|
-
preserve_tags?: Array[String],
|
|
66
|
-
link_style?: link_style,
|
|
67
|
-
output_format?: output_format,
|
|
68
|
-
skip_images?: bool,
|
|
69
|
-
include_document_structure?: bool,
|
|
70
|
-
extract_images?: bool,
|
|
71
|
-
max_image_size?: Integer,
|
|
72
|
-
capture_svg?: bool,
|
|
73
|
-
infer_dimensions?: bool
|
|
74
|
-
}
|
|
75
|
-
|
|
76
|
-
type text_direction = "ltr" | "rtl" | "auto" | nil
|
|
77
|
-
|
|
78
|
-
type document_metadata = {
|
|
79
|
-
title: String?,
|
|
80
|
-
description: String?,
|
|
81
|
-
keywords: Array[String],
|
|
82
|
-
author: String?,
|
|
83
|
-
canonical_url: String?,
|
|
84
|
-
base_href: String?,
|
|
85
|
-
language: String?,
|
|
86
|
-
text_direction: text_direction,
|
|
87
|
-
open_graph: Hash[String, String],
|
|
88
|
-
twitter_card: Hash[String, String],
|
|
89
|
-
meta_tags: Hash[String, String]
|
|
90
|
-
}
|
|
91
|
-
|
|
92
|
-
type header_metadata = {
|
|
93
|
-
level: Integer,
|
|
94
|
-
text: String,
|
|
95
|
-
id: String?,
|
|
96
|
-
depth: Integer,
|
|
97
|
-
html_offset: Integer
|
|
98
|
-
}
|
|
99
|
-
|
|
100
|
-
type link_type = "anchor" | "internal" | "external" | "email" | "phone" | "other"
|
|
101
|
-
|
|
102
|
-
type link_metadata = {
|
|
103
|
-
href: String,
|
|
104
|
-
text: String,
|
|
105
|
-
title: String?,
|
|
106
|
-
link_type: link_type,
|
|
107
|
-
rel: Array[String],
|
|
108
|
-
attributes: Hash[String, String]
|
|
109
|
-
}
|
|
110
|
-
|
|
111
|
-
type image_type = "data_uri" | "inline_svg" | "external" | "relative"
|
|
112
|
-
|
|
113
|
-
type image_metadata = {
|
|
114
|
-
src: String,
|
|
115
|
-
alt: String?,
|
|
116
|
-
title: String?,
|
|
117
|
-
dimensions: [Integer, Integer]?,
|
|
118
|
-
image_type: image_type,
|
|
119
|
-
attributes: Hash[String, String]
|
|
120
|
-
}
|
|
121
|
-
|
|
122
|
-
type structured_data = {
|
|
123
|
-
data_type: "json_ld" | "microdata" | "rdfa",
|
|
124
|
-
raw_json: String,
|
|
125
|
-
schema_type: String?
|
|
126
|
-
}
|
|
127
|
-
|
|
128
|
-
type extended_metadata = {
|
|
129
|
-
document: document_metadata,
|
|
130
|
-
headers: Array[header_metadata],
|
|
131
|
-
links: Array[link_metadata],
|
|
132
|
-
images: Array[image_metadata],
|
|
133
|
-
structured_data: Array[structured_data]
|
|
134
|
-
}
|
|
135
|
-
|
|
136
|
-
# Native method (implemented in Rust via Magnus/rb-sys)
|
|
137
|
-
private
|
|
138
|
-
|
|
139
|
-
def self.native_convert: (String html, conversion_options? options) -> Hash[String, untyped]
|
|
140
|
-
def native_convert: (String html, conversion_options? options) -> Hash[String, untyped]
|
|
141
|
-
|
|
142
|
-
public
|
|
143
|
-
|
|
144
|
-
# Convert HTML to Markdown, returning the markdown content string.
|
|
145
|
-
#
|
|
146
|
-
# Example:
|
|
147
|
-
# result = HtmlToMarkdown.convert(html)
|
|
148
|
-
def self.convert: (String html, ?conversion_options options) -> String
|
|
149
|
-
end
|
|
@@ -1,94 +0,0 @@
|
|
|
1
|
-
//! Escaping utilities for Markdown special characters.
|
|
2
|
-
//!
|
|
3
|
-
//! This module provides functions for escaping characters that have special meaning
|
|
4
|
-
//! in Markdown, including brackets in link labels and angle brackets.
|
|
5
|
-
|
|
6
|
-
use std::borrow::Cow;
|
|
7
|
-
|
|
8
|
-
/// Escape special characters in link labels.
|
|
9
|
-
///
|
|
10
|
-
/// Markdown link labels can contain brackets, which need careful escaping to avoid
|
|
11
|
-
/// being interpreted as nested links. This function escapes unescaped closing brackets
|
|
12
|
-
/// that would break the link syntax.
|
|
13
|
-
///
|
|
14
|
-
/// # Examples
|
|
15
|
-
///
|
|
16
|
-
/// ```text
|
|
17
|
-
/// "Simple text" → "Simple text"
|
|
18
|
-
/// "Text [with brackets]" → "Text [with brackets\]"
|
|
19
|
-
/// "Text \\[escaped\\]" → "Text \\[escaped\\]"
|
|
20
|
-
/// ```
|
|
21
|
-
pub fn escape_link_label(text: &str) -> String {
|
|
22
|
-
crate::converter::utility::content::escape_link_label(text)
|
|
23
|
-
}
|
|
24
|
-
|
|
25
|
-
/// Escape malformed angle brackets in markdown output.
|
|
26
|
-
///
|
|
27
|
-
/// Markdown uses `<...>` for automatic links. Angle brackets that don't form valid
|
|
28
|
-
/// link syntax should be escaped as `&lt;` to prevent parser confusion.
|
|
29
|
-
///
|
|
30
|
-
/// A valid tag must have:
|
|
31
|
-
/// - `<!` followed by `-` or alphabetic character (for comments/declarations)
|
|
32
|
-
/// - `</` followed by alphabetic character (for closing tags)
|
|
33
|
-
/// - `<?` (for processing instructions)
|
|
34
|
-
/// - `<` followed by alphabetic character (for opening tags)
|
|
35
|
-
///
|
|
36
|
-
/// # Examples
|
|
37
|
-
///
|
|
38
|
-
/// ```text
|
|
39
|
-
/// "<valid@example.com>" → "<valid@example.com>" (unchanged, valid link)
|
|
40
|
-
/// "< invalid" → "&lt; invalid" (escaped, invalid)
|
|
41
|
-
/// "Text <2 more" → "Text &lt;2 more" (escaped, invalid)
|
|
42
|
-
/// ```
|
|
43
|
-
pub fn escape_malformed_angle_brackets(input: &str) -> Cow<'_, str> {
|
|
44
|
-
let bytes = input.as_bytes();
|
|
45
|
-
let len = bytes.len();
|
|
46
|
-
let mut idx = 0;
|
|
47
|
-
let mut last = 0;
|
|
48
|
-
let mut output: Option<String> = None;
|
|
49
|
-
|
|
50
|
-
while idx < len {
|
|
51
|
-
if bytes[idx] == b'<' {
|
|
52
|
-
if idx + 1 < len {
|
|
53
|
-
let next = bytes[idx + 1];
|
|
54
|
-
|
|
55
|
-
let is_valid_tag = match next {
|
|
56
|
-
b'!' => {
|
|
57
|
-
idx + 2 < len
|
|
58
|
-
&& (bytes[idx + 2] == b'-'
|
|
59
|
-
|| bytes[idx + 2].is_ascii_alphabetic()
|
|
60
|
-
|| bytes[idx + 2].is_ascii_uppercase())
|
|
61
|
-
}
|
|
62
|
-
b'/' => {
|
|
63
|
-
idx + 2 < len && (bytes[idx + 2].is_ascii_alphabetic() || bytes[idx + 2].is_ascii_uppercase())
|
|
64
|
-
}
|
|
65
|
-
b'?' => true,
|
|
66
|
-
c if c.is_ascii_alphabetic() || c.is_ascii_uppercase() => true,
|
|
67
|
-
_ => false,
|
|
68
|
-
};
|
|
69
|
-
|
|
70
|
-
if !is_valid_tag {
|
|
71
|
-
let out = output.get_or_insert_with(|| String::with_capacity(input.len() + 4));
|
|
72
|
-
out.push_str(&input[last..idx]);
|
|
73
|
-
out.push_str("&lt;");
|
|
74
|
-
last = idx + 1;
|
|
75
|
-
}
|
|
76
|
-
} else {
|
|
77
|
-
let out = output.get_or_insert_with(|| String::with_capacity(input.len() + 4));
|
|
78
|
-
out.push_str(&input[last..idx]);
|
|
79
|
-
out.push_str("&lt;");
|
|
80
|
-
last = idx + 1;
|
|
81
|
-
}
|
|
82
|
-
}
|
|
83
|
-
idx += 1;
|
|
84
|
-
}
|
|
85
|
-
|
|
86
|
-
if let Some(mut out) = output {
|
|
87
|
-
if last < input.len() {
|
|
88
|
-
out.push_str(&input[last..]);
|
|
89
|
-
}
|
|
90
|
-
Cow::Owned(out)
|
|
91
|
-
} else {
|
|
92
|
-
Cow::Borrowed(input)
|
|
93
|
-
}
|
|
94
|
-
}
|
|
@@ -1,86 +0,0 @@
|
|
|
1
|
-
//! Text normalization utilities for HTML to Markdown conversion.
|
|
2
|
-
//!
|
|
3
|
-
//! This module provides functions for normalizing text content extracted from HTML,
|
|
4
|
-
//! including chomping whitespace, normalizing heading text, and handling trailing whitespace.
|
|
5
|
-
|
|
6
|
-
use std::borrow::Cow;
|
|
7
|
-
|
|
8
|
-
/// Remove trailing spaces/tabs from every line while preserving newlines.
|
|
9
|
-
pub fn trim_line_end_whitespace(output: &mut String) {
|
|
10
|
-
if output.is_empty() {
|
|
11
|
-
return;
|
|
12
|
-
}
|
|
13
|
-
|
|
14
|
-
let mut cleaned = String::with_capacity(output.len());
|
|
15
|
-
for (idx, line) in output.split('\n').enumerate() {
|
|
16
|
-
if idx > 0 {
|
|
17
|
-
cleaned.push('\n');
|
|
18
|
-
}
|
|
19
|
-
|
|
20
|
-
let has_soft_break = line.ends_with(" ");
|
|
21
|
-
let trimmed = line.trim_end_matches([' ', '\t']);
|
|
22
|
-
|
|
23
|
-
cleaned.push_str(trimmed);
|
|
24
|
-
if has_soft_break {
|
|
25
|
-
cleaned.push_str(" ");
|
|
26
|
-
}
|
|
27
|
-
}
|
|
28
|
-
|
|
29
|
-
cleaned.push('\n');
|
|
30
|
-
*output = cleaned;
|
|
31
|
-
}
|
|
32
|
-
|
|
33
|
-
/// Truncate a string at a valid UTF-8 boundary.
|
|
34
|
-
pub fn truncate_at_char_boundary(value: &mut String, max_len: usize) {
|
|
35
|
-
if value.len() <= max_len {
|
|
36
|
-
return;
|
|
37
|
-
}
|
|
38
|
-
|
|
39
|
-
let mut new_len = max_len.min(value.len());
|
|
40
|
-
while new_len > 0 && !value.is_char_boundary(new_len) {
|
|
41
|
-
new_len -= 1;
|
|
42
|
-
}
|
|
43
|
-
value.truncate(new_len);
|
|
44
|
-
}
|
|
45
|
-
|
|
46
|
-
/// Normalize heading text by replacing newlines and extra whitespace.
|
|
47
|
-
///
|
|
48
|
-
/// Heading text should be on a single line in Markdown. This function collapses
|
|
49
|
-
/// any newlines and multiple spaces into single spaces.
|
|
50
|
-
///
|
|
51
|
-
/// # Examples
|
|
52
|
-
///
|
|
53
|
-
/// ```text
|
|
54
|
-
/// "Hello\nWorld" → "Hello World"
|
|
55
|
-
/// "Text with spaces" → "Text with spaces" (unchanged if no newlines)
|
|
56
|
-
/// ```
|
|
57
|
-
pub fn normalize_heading_text(text: &str) -> Cow<'_, str> {
|
|
58
|
-
if !text.contains('\n') && !text.contains('\r') {
|
|
59
|
-
return Cow::Borrowed(text);
|
|
60
|
-
}
|
|
61
|
-
|
|
62
|
-
let mut normalized = String::with_capacity(text.len());
|
|
63
|
-
let mut pending_space = false;
|
|
64
|
-
|
|
65
|
-
for ch in text.chars() {
|
|
66
|
-
match ch {
|
|
67
|
-
'\n' | '\r' => {
|
|
68
|
-
if !normalized.is_empty() {
|
|
69
|
-
pending_space = true;
|
|
70
|
-
}
|
|
71
|
-
}
|
|
72
|
-
' ' | '\t' if pending_space => {}
|
|
73
|
-
_ => {
|
|
74
|
-
if pending_space {
|
|
75
|
-
if !normalized.ends_with(' ') {
|
|
76
|
-
normalized.push(' ');
|
|
77
|
-
}
|
|
78
|
-
pending_space = false;
|
|
79
|
-
}
|
|
80
|
-
normalized.push(ch);
|
|
81
|
-
}
|
|
82
|
-
}
|
|
83
|
-
}
|
|
84
|
-
|
|
85
|
-
Cow::Owned(normalized)
|
|
86
|
-
}
|
|
@@ -1,70 +0,0 @@
|
|
|
1
|
-
#![allow(clippy::option_if_let_else)]
|
|
2
|
-
//! Helpers to keep binding entrypoints panic-safe.
|
|
3
|
-
//!
|
|
4
|
-
//! Binding layers (`PyO3`, NAPI-RS, ext-php-rs, WASM, FFI) must not allow Rust
|
|
5
|
-
//! panics to unwind into foreign runtimes. `guard_panic` wraps conversion calls,
|
|
6
|
-
//! converts panics into `ConversionError::Panic`, and preserves the original
|
|
7
|
-
//! error handling path for the caller.
|
|
8
|
-
|
|
9
|
-
use std::any::Any;
|
|
10
|
-
use std::panic::{self, AssertUnwindSafe, UnwindSafe};
|
|
11
|
-
|
|
12
|
-
use crate::error::{ConversionError, Result};
|
|
13
|
-
|
|
14
|
-
/// Run a fallible operation while preventing panics from unwinding across FFI.
|
|
15
|
-
///
|
|
16
|
-
/// Panics are captured and surfaced as `ConversionError::Panic` so bindings can
|
|
17
|
-
/// translate them into language-native errors instead of aborting.
|
|
18
|
-
#[allow(clippy::missing_errors_doc)]
|
|
19
|
-
pub fn guard_panic<F, T>(f: F) -> Result<T>
|
|
20
|
-
where
|
|
21
|
-
F: FnOnce() -> Result<T> + UnwindSafe,
|
|
22
|
-
{
|
|
23
|
-
if std::env::var("HTML_TO_MARKDOWN_FAST_FFI")
|
|
24
|
-
.ok()
|
|
25
|
-
.is_some_and(|value| matches!(value.as_str(), "1" | "true" | "yes"))
|
|
26
|
-
{
|
|
27
|
-
return f();
|
|
28
|
-
}
|
|
29
|
-
|
|
30
|
-
match panic::catch_unwind(AssertUnwindSafe(f)) {
|
|
31
|
-
Ok(result) => result,
|
|
32
|
-
Err(payload) => Err(ConversionError::Panic(panic_message(payload))),
|
|
33
|
-
}
|
|
34
|
-
}
|
|
35
|
-
|
|
36
|
-
#[allow(clippy::trivially_copy_pass_by_ref)]
|
|
37
|
-
#[allow(clippy::needless_pass_by_value)]
|
|
38
|
-
fn panic_message(payload: Box<dyn Any + Send>) -> String {
|
|
39
|
-
if let Some(msg) = payload.downcast_ref::<&str>() {
|
|
40
|
-
(*msg).to_string()
|
|
41
|
-
} else if let Some(msg) = payload.downcast_ref::<String>() {
|
|
42
|
-
msg.clone()
|
|
43
|
-
} else {
|
|
44
|
-
"unexpected panic without message".to_string()
|
|
45
|
-
}
|
|
46
|
-
}
|
|
47
|
-
|
|
48
|
-
#[cfg(test)]
|
|
49
|
-
mod tests {
|
|
50
|
-
use super::*;
|
|
51
|
-
|
|
52
|
-
#[test]
|
|
53
|
-
fn guard_panic_converts_panic_to_error() {
|
|
54
|
-
let err = guard_panic::<_, ()>(|| -> Result<()> {
|
|
55
|
-
panic!("boom");
|
|
56
|
-
})
|
|
57
|
-
.unwrap_err();
|
|
58
|
-
|
|
59
|
-
match err {
|
|
60
|
-
ConversionError::Panic(message) => assert_eq!(message, "boom"),
|
|
61
|
-
other => panic!("expected panic error, got {:?}", other),
|
|
62
|
-
}
|
|
63
|
-
}
|
|
64
|
-
|
|
65
|
-
#[test]
|
|
66
|
-
fn guard_panic_forwards_ok() {
|
|
67
|
-
let value = guard_panic(|| Ok::<_, ConversionError>(42)).unwrap();
|
|
68
|
-
assert_eq!(value, 42);
|
|
69
|
-
}
|
|
70
|
-
}
|