html-to-markdown 2.29.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +18 -41
- data/README.md +37 -50
- data/ext/html-to-markdown-rb/native/Cargo.lock +17 -705
- data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
- data/ext/html-to-markdown-rb/native/README.md +4 -13
- data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
- data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
- data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
- data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
- data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +13 -194
- data/sig/html_to_markdown.rbs +12 -373
- data/vendor/Cargo.toml +7 -4
- data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
- data/vendor/html-to-markdown-rs/README.md +127 -51
- data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
- data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
- data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
- data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
- data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
- data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
- data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
- data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
- data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
- data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
- data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
- data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
- data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
- data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
- data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
- data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
- data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
- data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
- data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
- data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
- data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
- data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -67
- data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
- data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
- data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
- data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
- data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
- data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
- data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
- data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
- data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
- data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
- data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
- data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
- data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
- data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
- data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
- data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
- data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
- data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
- data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
- data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
- data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -319
- data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
- data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
- data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
- data/vendor/html-to-markdown-rs/src/text.rs +25 -14
- data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
- data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
- data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
- data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
- data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
- data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
- data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
- data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
- data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
- data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
- data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
- data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
- data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
- data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
- data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
- data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
- data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
- data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
- data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
- data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
- data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
- metadata +9 -37
- data/bin/benchmark.rb +0 -232
- data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
- data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
- data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
- data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
- data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
- data/spec/convert_spec.rb +0 -77
- data/spec/convert_with_tables_spec.rb +0 -194
- data/spec/metadata_extraction_spec.rb +0 -437
- data/spec/visitor_issue_187_spec.rb +0 -605
- data/spec/visitor_spec.rb +0 -1149
- data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
- data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
- data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
- data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
- data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
- data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
- data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
- data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
- data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
- data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -31
- data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
- data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
- data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
- data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
- data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
- data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
- data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
- data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
- data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
- data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
- data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
data/lib/html_to_markdown.rb
CHANGED
|
@@ -7,205 +7,24 @@ module HtmlToMarkdown
|
|
|
7
7
|
autoload :CLI, 'html_to_markdown/cli'
|
|
8
8
|
autoload :CLIProxy, 'html_to_markdown/cli_proxy'
|
|
9
9
|
|
|
10
|
-
class Options; end # rubocop:disable Lint/EmptyClass
|
|
11
|
-
|
|
12
10
|
class << self
|
|
13
11
|
alias native_convert convert
|
|
14
|
-
alias native_convert_with_inline_images convert_with_inline_images
|
|
15
|
-
alias native_convert_with_inline_images_handle convert_with_inline_images_handle
|
|
16
|
-
alias native_options options
|
|
17
|
-
alias native_convert_with_options convert_with_options
|
|
18
|
-
alias native_convert_with_metadata convert_with_metadata
|
|
19
|
-
alias native_convert_with_metadata_handle convert_with_metadata_handle
|
|
20
|
-
alias native_convert_with_visitor convert_with_visitor
|
|
21
|
-
alias native_convert_with_tables convert_with_tables
|
|
22
12
|
end
|
|
23
13
|
|
|
24
14
|
module_function
|
|
25
15
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
# NOTE: visitor parameter is accepted for API compatibility but not used in inline images mode
|
|
40
|
-
# The visitor pattern is only supported in the standard convert() method
|
|
41
|
-
native_convert_with_inline_images(html.to_s, options, image_config)
|
|
42
|
-
end
|
|
43
|
-
|
|
44
|
-
def convert_with_inline_images_handle(html, options_handle, image_config = nil)
|
|
45
|
-
native_convert_with_inline_images_handle(html.to_s, options_handle, image_config)
|
|
46
|
-
end
|
|
47
|
-
|
|
48
|
-
def options(options_hash = nil)
|
|
49
|
-
native_options(options_hash)
|
|
50
|
-
end
|
|
51
|
-
|
|
52
|
-
# Convert HTML to Markdown with comprehensive metadata extraction.
|
|
53
|
-
#
|
|
54
|
-
# Performs HTML-to-Markdown conversion while extracting document metadata, headers,
|
|
55
|
-
# links, images, and structured data in a single pass. Ideal for content analysis,
|
|
56
|
-
# SEO workflows, and document indexing.
|
|
57
|
-
#
|
|
58
|
-
# @param html [String] HTML string to convert. Line endings are normalized (CRLF -> LF).
|
|
59
|
-
# @param options [ConversionOptions, Hash, nil] Optional conversion configuration.
|
|
60
|
-
# When a Hash, keys should match ConversionOptions field names (as symbols or strings).
|
|
61
|
-
# Common options:
|
|
62
|
-
# - :heading_style [String] "atx", "atx_closed", or "underlined" (default: "underlined")
|
|
63
|
-
# - :list_indent_type [String] "spaces" or "tabs" (default: "spaces")
|
|
64
|
-
# - :list_indent_width [Integer] Spaces per indent level (default: 4)
|
|
65
|
-
# - :wrap [true, false] Enable text wrapping (default: false)
|
|
66
|
-
# - :wrap_width [Integer] Wrap at this column width (default: 80)
|
|
67
|
-
# See ConversionOptions documentation for complete list.
|
|
68
|
-
#
|
|
69
|
-
# @param metadata_config [Hash, nil] Optional metadata extraction configuration.
|
|
70
|
-
# Keys should be symbols or strings. Supported keys:
|
|
71
|
-
# - :extract_headers [true, false] Extract h1-h6 heading elements (default: true)
|
|
72
|
-
# - :extract_links [true, false] Extract hyperlinks with type classification (default: true)
|
|
73
|
-
# - :extract_images [true, false] Extract image elements (default: true)
|
|
74
|
-
# - :extract_structured_data [true, false] Extract JSON-LD/Microdata/RDFa (default: true)
|
|
75
|
-
# - :max_structured_data_size [Integer] Size limit for structured data in bytes (default: 1_000_000)
|
|
76
|
-
#
|
|
77
|
-
# @return [Array<String, Hash>] Tuple of [markdown_string, metadata_hash]
|
|
78
|
-
# markdown_string: String - The converted Markdown output
|
|
79
|
-
#
|
|
80
|
-
# metadata_hash: Hash with keys:
|
|
81
|
-
# - :document [Hash] Document-level metadata:
|
|
82
|
-
# - :title [String, nil] From <title> tag
|
|
83
|
-
# - :description [String, nil] From <meta name="description">
|
|
84
|
-
# - :keywords [Array<String>] From <meta name="keywords">
|
|
85
|
-
# - :author [String, nil] From <meta name="author">
|
|
86
|
-
# - :language [String, nil] From lang attribute (e.g., "en")
|
|
87
|
-
# - :text_direction [String, nil] "ltr", "rtl", or "auto"
|
|
88
|
-
# - :canonical_url [String, nil] From <link rel="canonical">
|
|
89
|
-
# - :base_href [String, nil] From <base href="">
|
|
90
|
-
# - :open_graph [Hash<String, String>] Open Graph properties (og:* meta tags)
|
|
91
|
-
# - :twitter_card [Hash<String, String>] Twitter Card properties (twitter:* meta tags)
|
|
92
|
-
# - :meta_tags [Hash<String, String>] Other meta tags
|
|
93
|
-
#
|
|
94
|
-
# - :headers [Array<Hash>] Heading elements:
|
|
95
|
-
# - :level [Integer] 1-6
|
|
96
|
-
# - :text [String] Header text content
|
|
97
|
-
# - :id [String, nil] HTML id attribute
|
|
98
|
-
# - :depth [Integer] Tree nesting depth
|
|
99
|
-
# - :html_offset [Integer] Byte offset in original HTML
|
|
100
|
-
#
|
|
101
|
-
# - :links [Array<Hash>] Hyperlinks:
|
|
102
|
-
# - :href [String] Link URL
|
|
103
|
-
# - :text [String] Link text content
|
|
104
|
-
# - :title [String, nil] Title attribute
|
|
105
|
-
# - :link_type [String] "anchor", "internal", "external", "email", "phone", or "other"
|
|
106
|
-
# - :rel [Array<String>] Rel attribute values
|
|
107
|
-
# - :attributes [Hash<String, String>] Additional HTML attributes
|
|
108
|
-
#
|
|
109
|
-
# - :images [Array<Hash>] Image elements:
|
|
110
|
-
# - :src [String] Image source URL or data URI
|
|
111
|
-
# - :alt [String, nil] Alt text for accessibility
|
|
112
|
-
# - :title [String, nil] Title attribute
|
|
113
|
-
# - :dimensions [Array<Integer>, nil] [width, height] if available
|
|
114
|
-
# - :image_type [String] "data_uri", "external", "relative", or "inline_svg"
|
|
115
|
-
# - :attributes [Hash<String, String>] Additional HTML attributes
|
|
116
|
-
#
|
|
117
|
-
# - :structured_data [Array<Hash>] Structured data blocks:
|
|
118
|
-
# - :data_type [String] "json_ld", "microdata", or "rdfa"
|
|
119
|
-
# - :raw_json [String] Raw JSON content
|
|
120
|
-
# - :schema_type [String, nil] Schema type (e.g., "Article", "Event")
|
|
121
|
-
#
|
|
122
|
-
# @raise [StandardError] If conversion fails or invalid configuration
|
|
123
|
-
#
|
|
124
|
-
# @example Basic usage
|
|
125
|
-
# html = <<~HTML
|
|
126
|
-
# <html lang="en">
|
|
127
|
-
# <head>
|
|
128
|
-
# <title>My Article</title>
|
|
129
|
-
# <meta name="description" content="A great read">
|
|
130
|
-
# </head>
|
|
131
|
-
# <body>
|
|
132
|
-
# <h1 id="intro">Introduction</h1>
|
|
133
|
-
# <p>Visit <a href="https://example.com">our site</a></p>
|
|
134
|
-
# <img src="photo.jpg" alt="Beautiful landscape">
|
|
135
|
-
# </body>
|
|
136
|
-
# </html>
|
|
137
|
-
# HTML
|
|
138
|
-
#
|
|
139
|
-
# markdown, metadata = HtmlToMarkdown.convert_with_metadata(html)
|
|
140
|
-
#
|
|
141
|
-
# puts metadata[:document][:title] # => "My Article"
|
|
142
|
-
# puts metadata[:document][:language] # => "en"
|
|
143
|
-
# puts metadata[:headers].length # => 1
|
|
144
|
-
# puts metadata[:headers][0][:text] # => "Introduction"
|
|
145
|
-
# puts metadata[:links].length # => 1
|
|
146
|
-
# puts metadata[:images].length # => 1
|
|
147
|
-
#
|
|
148
|
-
# @example With selective metadata extraction
|
|
149
|
-
# config = {
|
|
150
|
-
# extract_headers: true,
|
|
151
|
-
# extract_links: true,
|
|
152
|
-
# extract_images: false, # Skip images
|
|
153
|
-
# extract_structured_data: false # Skip structured data
|
|
154
|
-
# }
|
|
155
|
-
#
|
|
156
|
-
# markdown, metadata = HtmlToMarkdown.convert_with_metadata(html, nil, config)
|
|
157
|
-
# puts metadata[:images].empty? # => true (not extracted)
|
|
158
|
-
#
|
|
159
|
-
# @example With conversion options
|
|
160
|
-
# options = {
|
|
161
|
-
# heading_style: "atx", # Use # H1, ## H2 style
|
|
162
|
-
# wrap: true,
|
|
163
|
-
# wrap_width: 80
|
|
164
|
-
# }
|
|
165
|
-
#
|
|
166
|
-
# config = { extract_headers: true }
|
|
167
|
-
#
|
|
168
|
-
# markdown, metadata = HtmlToMarkdown.convert_with_metadata(html, options, config)
|
|
169
|
-
# # Markdown uses ATX-style headings and wraps at 80 characters
|
|
170
|
-
#
|
|
171
|
-
# @see #convert Simple conversion without metadata
|
|
172
|
-
# @see #convert_with_inline_images Extract inline images during conversion
|
|
173
|
-
# @see ConversionOptions Detailed conversion configuration
|
|
174
|
-
def convert_with_metadata(html, options = nil, metadata_config = nil, _visitor = nil)
|
|
175
|
-
# NOTE: visitor parameter is accepted for API compatibility but not used in metadata extraction mode
|
|
176
|
-
# The visitor pattern is only supported in the standard convert() method
|
|
177
|
-
native_convert_with_metadata(html.to_s, options, metadata_config)
|
|
178
|
-
end
|
|
179
|
-
|
|
180
|
-
def convert_with_metadata_handle(html, options_handle, metadata_config = nil)
|
|
181
|
-
native_convert_with_metadata_handle(html.to_s, options_handle, metadata_config)
|
|
182
|
-
end
|
|
183
|
-
|
|
184
|
-
# Convert HTML to Markdown with table extraction.
|
|
185
|
-
#
|
|
186
|
-
# Performs HTML-to-Markdown conversion while extracting structured table data
|
|
187
|
-
# (cells, markdown representation, header row flags) in a single pass.
|
|
188
|
-
#
|
|
189
|
-
# @param html [String] HTML string to convert.
|
|
190
|
-
# @param options [Hash, nil] Optional conversion configuration.
|
|
191
|
-
# @param metadata_config [Hash, nil] Optional metadata extraction configuration.
|
|
192
|
-
#
|
|
193
|
-
# @return [Hash] A hash with keys:
|
|
194
|
-
# - :content [String] The converted Markdown output
|
|
195
|
-
# - :metadata [Hash, nil] Extended metadata (if metadata extraction was configured)
|
|
196
|
-
# - :tables [Array<Hash>] Extracted tables, each with:
|
|
197
|
-
# - :cells [Array<Array<String>>] Table cells organized as rows x columns
|
|
198
|
-
# - :markdown [String] Complete rendered table in Markdown format
|
|
199
|
-
# - :is_header_row [Array<Boolean>] Per-row flag indicating header rows
|
|
200
|
-
#
|
|
201
|
-
# @raise [StandardError] If conversion fails or invalid configuration
|
|
202
|
-
#
|
|
203
|
-
# @example Basic usage
|
|
204
|
-
# html = '<table><thead><tr><th>Name</th></tr></thead><tbody><tr><td>Alice</td></tr></tbody></table>'
|
|
205
|
-
# result = HtmlToMarkdown.convert_with_tables(html)
|
|
206
|
-
# puts result[:tables].length # => 1
|
|
207
|
-
# puts result[:tables][0][:cells] # => [["Name"], ["Alice"]]
|
|
208
|
-
def convert_with_tables(html, options = nil, metadata_config = nil)
|
|
209
|
-
native_convert_with_tables(html.to_s, options, metadata_config)
|
|
16
|
+
# Convert HTML to Markdown, returning a Hash with:
|
|
17
|
+
# - :content [String, nil] the converted Markdown output
|
|
18
|
+
# - :document [nil] document structure (not yet exposed)
|
|
19
|
+
# - :metadata [Hash, nil] extracted HTML metadata
|
|
20
|
+
# - :tables [Array<Hash>] extracted tables with :grid and :markdown
|
|
21
|
+
# - :images [Array<Hash>] extracted inline images
|
|
22
|
+
# - :warnings [Array<Hash>] processing warnings
|
|
23
|
+
#
|
|
24
|
+
# @param html [String] HTML string to convert
|
|
25
|
+
# @param options [Hash, nil] optional conversion options
|
|
26
|
+
# @return [Hash] conversion result
|
|
27
|
+
def convert(html, options = nil)
|
|
28
|
+
native_convert(html.to_s, options)
|
|
210
29
|
end
|
|
211
30
|
end
|
data/sig/html_to_markdown.rbs
CHANGED
|
@@ -2,36 +2,6 @@
|
|
|
2
2
|
module HtmlToMarkdown
|
|
3
3
|
VERSION: String
|
|
4
4
|
|
|
5
|
-
# Opaque handle for reusable conversion options
|
|
6
|
-
class Options
|
|
7
|
-
end
|
|
8
|
-
|
|
9
|
-
# Visitor context information passed to visitor callbacks
|
|
10
|
-
class NodeContext
|
|
11
|
-
attr_reader node_type: Symbol
|
|
12
|
-
attr_reader tag_name: String
|
|
13
|
-
attr_reader attributes: Hash[String, String]
|
|
14
|
-
attr_reader depth: Integer
|
|
15
|
-
attr_reader index_in_parent: Integer
|
|
16
|
-
attr_reader parent_tag: String | nil
|
|
17
|
-
attr_reader is_inline: bool
|
|
18
|
-
end
|
|
19
|
-
|
|
20
|
-
# Result of a visitor callback
|
|
21
|
-
type visitor_result = {
|
|
22
|
-
type: :continue,
|
|
23
|
-
} | {
|
|
24
|
-
type: :custom,
|
|
25
|
-
output: String,
|
|
26
|
-
} | {
|
|
27
|
-
type: :skip,
|
|
28
|
-
} | {
|
|
29
|
-
type: :preserve_html,
|
|
30
|
-
} | {
|
|
31
|
-
type: :error,
|
|
32
|
-
message: String,
|
|
33
|
-
}
|
|
34
|
-
|
|
35
5
|
type heading_style = :underlined | :atx | :atx_closed
|
|
36
6
|
type list_indent_type = :spaces | :tabs
|
|
37
7
|
type highlight_style = :double_equal | :html | :bold | :none
|
|
@@ -62,7 +32,6 @@ module HtmlToMarkdown
|
|
|
62
32
|
autolinks?: bool,
|
|
63
33
|
default_title?: bool,
|
|
64
34
|
br_in_tables?: bool,
|
|
65
|
-
hocr_spatial_tables?: bool,
|
|
66
35
|
highlight_style?: highlight_style,
|
|
67
36
|
extract_metadata?: bool,
|
|
68
37
|
whitespace_mode?: whitespace_mode,
|
|
@@ -81,50 +50,14 @@ module HtmlToMarkdown
|
|
|
81
50
|
strip_tags?: Array[String],
|
|
82
51
|
preserve_tags?: Array[String],
|
|
83
52
|
output_format?: output_format,
|
|
84
|
-
skip_images?: bool
|
|
85
|
-
}
|
|
86
|
-
|
|
87
|
-
type inline_image_config = {
|
|
88
|
-
max_decoded_size_bytes?: Integer,
|
|
89
|
-
filename_prefix?: String?,
|
|
53
|
+
skip_images?: bool,
|
|
54
|
+
include_document_structure?: bool,
|
|
55
|
+
extract_images?: bool,
|
|
56
|
+
max_image_size?: Integer,
|
|
90
57
|
capture_svg?: bool,
|
|
91
58
|
infer_dimensions?: bool
|
|
92
59
|
}
|
|
93
60
|
|
|
94
|
-
type inline_image_format = "png" | "jpeg" | "gif" | "bmp" | "webp" | "svg" | String
|
|
95
|
-
|
|
96
|
-
type inline_image_source = "img_data_uri" | "svg_element"
|
|
97
|
-
|
|
98
|
-
type inline_image = {
|
|
99
|
-
data: String,
|
|
100
|
-
format: inline_image_format,
|
|
101
|
-
filename: String?,
|
|
102
|
-
description: String?,
|
|
103
|
-
dimensions: [Integer, Integer]?,
|
|
104
|
-
source: inline_image_source,
|
|
105
|
-
attributes: Hash[String, String]
|
|
106
|
-
}
|
|
107
|
-
|
|
108
|
-
type inline_image_warning = {
|
|
109
|
-
index: Integer,
|
|
110
|
-
message: String
|
|
111
|
-
}
|
|
112
|
-
|
|
113
|
-
type html_extraction = {
|
|
114
|
-
markdown: String,
|
|
115
|
-
inline_images: Array[inline_image],
|
|
116
|
-
warnings: Array[inline_image_warning]
|
|
117
|
-
}
|
|
118
|
-
|
|
119
|
-
type metadata_config = {
|
|
120
|
-
extract_document?: bool,
|
|
121
|
-
extract_headers?: bool,
|
|
122
|
-
extract_links?: bool,
|
|
123
|
-
extract_images?: bool,
|
|
124
|
-
extract_structured_data?: bool,
|
|
125
|
-
max_structured_data_size?: Integer
|
|
126
|
-
}
|
|
127
|
-
|
|
128
61
|
type text_direction = "ltr" | "rtl" | "auto" | nil
|
|
129
62
|
|
|
130
63
|
type document_metadata = {
|
|
@@ -185,314 +118,20 @@ module HtmlToMarkdown
|
|
|
185
118
|
structured_data: Array[structured_data]
|
|
186
119
|
}
|
|
187
120
|
|
|
188
|
-
type table_data = {
|
|
189
|
-
cells: Array[Array[String]],
|
|
190
|
-
markdown: String,
|
|
191
|
-
is_header_row: Array[bool]
|
|
192
|
-
}
|
|
193
|
-
|
|
194
|
-
type table_extraction_result = {
|
|
195
|
-
content: String,
|
|
196
|
-
metadata: extended_metadata?,
|
|
197
|
-
tables: Array[table_data]
|
|
198
|
-
}
|
|
199
|
-
|
|
200
|
-
# Native methods (implemented in Rust via Magnus/rb-sys)
|
|
201
|
-
# These are aliased from the Rust extension and available as both module and instance methods
|
|
121
|
+
# Native method (implemented in Rust via Magnus/rb-sys)
|
|
202
122
|
private
|
|
203
123
|
|
|
204
|
-
def self.native_convert: (String html, conversion_options? options) -> String
|
|
205
|
-
def self.native_options: (conversion_options? options_hash) -> Options
|
|
206
|
-
def self.native_convert_with_options: (String html, Options options_handle) -> String
|
|
207
|
-
def self.native_convert_with_inline_images_handle: (
|
|
208
|
-
String html,
|
|
209
|
-
Options options_handle,
|
|
210
|
-
inline_image_config? image_config
|
|
211
|
-
) -> html_extraction
|
|
212
|
-
def self.native_convert_with_inline_images: (
|
|
213
|
-
String html,
|
|
214
|
-
conversion_options? options,
|
|
215
|
-
inline_image_config? image_config
|
|
216
|
-
) -> html_extraction
|
|
217
|
-
def self.native_convert_with_metadata_handle: (
|
|
218
|
-
String html,
|
|
219
|
-
Options options_handle,
|
|
220
|
-
metadata_config? metadata_config
|
|
221
|
-
) -> [String, extended_metadata]
|
|
222
|
-
def self.native_convert_with_metadata: (
|
|
223
|
-
String html,
|
|
224
|
-
conversion_options? options,
|
|
225
|
-
metadata_config? metadata_config
|
|
226
|
-
) -> [String, extended_metadata]
|
|
227
|
-
def self.native_convert_with_visitor: (
|
|
228
|
-
String html,
|
|
229
|
-
conversion_options? options,
|
|
230
|
-
visitor? visitor
|
|
231
|
-
) -> String
|
|
232
|
-
def self.native_convert_with_tables: (
|
|
233
|
-
String html,
|
|
234
|
-
conversion_options? options,
|
|
235
|
-
metadata_config? metadata_config
|
|
236
|
-
) -> table_extraction_result
|
|
237
|
-
|
|
238
|
-
def native_convert: (String html, conversion_options? options) -> String
|
|
239
|
-
def native_options: (conversion_options? options_hash) -> Options
|
|
240
|
-
def native_convert_with_options: (String html, Options options_handle) -> String
|
|
241
|
-
def native_convert_with_inline_images_handle: (
|
|
242
|
-
String html,
|
|
243
|
-
Options options_handle,
|
|
244
|
-
inline_image_config? image_config
|
|
245
|
-
) -> html_extraction
|
|
246
|
-
def native_convert_with_inline_images: (
|
|
247
|
-
String html,
|
|
248
|
-
conversion_options? options,
|
|
249
|
-
inline_image_config? image_config
|
|
250
|
-
) -> html_extraction
|
|
251
|
-
def native_convert_with_metadata_handle: (
|
|
252
|
-
String html,
|
|
253
|
-
Options options_handle,
|
|
254
|
-
metadata_config? metadata_config
|
|
255
|
-
) -> [String, extended_metadata]
|
|
256
|
-
def native_convert_with_metadata: (
|
|
257
|
-
String html,
|
|
258
|
-
conversion_options? options,
|
|
259
|
-
metadata_config? metadata_config
|
|
260
|
-
) -> [String, extended_metadata]
|
|
261
|
-
def native_convert_with_visitor: (
|
|
262
|
-
String html,
|
|
263
|
-
conversion_options? options,
|
|
264
|
-
visitor? visitor
|
|
265
|
-
) -> String
|
|
266
|
-
def native_convert_with_tables: (
|
|
267
|
-
String html,
|
|
268
|
-
conversion_options? options,
|
|
269
|
-
metadata_config? metadata_config
|
|
270
|
-
) -> table_extraction_result
|
|
271
|
-
|
|
272
|
-
# Visitor interface for customizing conversion behavior
|
|
273
|
-
type visitor = Object
|
|
124
|
+
def self.native_convert: (String html, conversion_options? options) -> Hash[String, untyped]
|
|
125
|
+
def native_convert: (String html, conversion_options? options) -> Hash[String, untyped]
|
|
274
126
|
|
|
275
127
|
public
|
|
276
128
|
|
|
277
|
-
# Convert HTML to Markdown with
|
|
278
|
-
#
|
|
279
|
-
# The optional visitor parameter allows customization of conversion behavior for specific elements.
|
|
280
|
-
# When both options and visitor are provided, the visitor can override default conversions.
|
|
281
|
-
#
|
|
282
|
-
# Args:
|
|
283
|
-
# html: HTML string to convert
|
|
284
|
-
# options: Optional conversion configuration
|
|
285
|
-
# visitor: Optional visitor object for customizing conversion
|
|
286
|
-
#
|
|
287
|
-
# Returns:
|
|
288
|
-
# markdown: String - Converted markdown output
|
|
289
|
-
#
|
|
290
|
-
# Example:
|
|
291
|
-
# markdown = HtmlToMarkdown.convert(html, { wrap: true }, my_visitor)
|
|
292
|
-
def self.convert: (String html, ?conversion_options options, ?visitor visitor) -> String
|
|
293
|
-
|
|
294
|
-
# Create a reusable options handle for performance
|
|
295
|
-
def self.options: (?conversion_options options_hash) -> Options
|
|
296
|
-
|
|
297
|
-
# Convert HTML using a pre-built options handle
|
|
298
|
-
def self.convert_with_options: (String html, Options options_handle) -> String
|
|
299
|
-
def self.convert_with_inline_images_handle: (
|
|
300
|
-
String html,
|
|
301
|
-
Options options_handle,
|
|
302
|
-
?inline_image_config image_config
|
|
303
|
-
) -> html_extraction
|
|
304
|
-
|
|
305
|
-
# Convert HTML with inline image extraction
|
|
306
|
-
#
|
|
307
|
-
# Optionally accepts a visitor for customizing conversion behavior.
|
|
308
|
-
#
|
|
309
|
-
# Args:
|
|
310
|
-
# html: HTML string to convert
|
|
311
|
-
# options: Optional conversion configuration
|
|
312
|
-
# image_config: Optional inline image extraction configuration
|
|
313
|
-
# visitor: Optional visitor object for customizing conversion
|
|
314
|
-
#
|
|
315
|
-
# Returns:
|
|
316
|
-
# html_extraction: Hash containing markdown, inline_images array, and warnings array
|
|
317
|
-
#
|
|
318
|
-
# Example:
|
|
319
|
-
# result = HtmlToMarkdown.convert_with_inline_images(html, { wrap: true }, image_config, my_visitor)
|
|
320
|
-
def self.convert_with_inline_images: (
|
|
321
|
-
String html,
|
|
322
|
-
?conversion_options options,
|
|
323
|
-
?inline_image_config image_config,
|
|
324
|
-
?visitor visitor
|
|
325
|
-
) -> html_extraction
|
|
326
|
-
|
|
327
|
-
# Convert HTML to Markdown with a custom visitor (deprecated)
|
|
328
|
-
#
|
|
329
|
-
# DEPRECATED: Use convert() with the optional visitor parameter instead.
|
|
330
|
-
# This method is maintained for backward compatibility.
|
|
331
|
-
#
|
|
332
|
-
# All convert functions now accept optional visitors:
|
|
333
|
-
# - convert(html, options, visitor)
|
|
334
|
-
# - convert_with_inline_images(html, options, image_config, visitor)
|
|
335
|
-
# - convert_with_metadata(html, options, metadata_config, visitor)
|
|
336
|
-
#
|
|
337
|
-
# The visitor object can implement any of the following methods:
|
|
338
|
-
# - visit_element_start(ctx) -> visitor_result
|
|
339
|
-
# - visit_element_end(ctx, output) -> visitor_result
|
|
340
|
-
# - visit_text(ctx, text) -> visitor_result
|
|
341
|
-
# - visit_link(ctx, href, text, title) -> visitor_result
|
|
342
|
-
# - visit_image(ctx, src, alt, title) -> visitor_result
|
|
343
|
-
# - visit_heading(ctx, level, text, id) -> visitor_result
|
|
344
|
-
# - visit_code_block(ctx, lang, code) -> visitor_result
|
|
345
|
-
# - visit_code_inline(ctx, code) -> visitor_result
|
|
346
|
-
# - visit_list_item(ctx, ordered, marker, text) -> visitor_result
|
|
347
|
-
# - visit_list_start(ctx, ordered) -> visitor_result
|
|
348
|
-
# - visit_list_end(ctx, ordered, output) -> visitor_result
|
|
349
|
-
# - visit_table_start(ctx) -> visitor_result
|
|
350
|
-
# - visit_table_row(ctx, cells, is_header) -> visitor_result
|
|
351
|
-
# - visit_table_end(ctx, output) -> visitor_result
|
|
352
|
-
# - visit_blockquote(ctx, content, depth) -> visitor_result
|
|
353
|
-
# - visit_strong(ctx, text) -> visitor_result
|
|
354
|
-
# - visit_emphasis(ctx, text) -> visitor_result
|
|
355
|
-
# - visit_strikethrough(ctx, text) -> visitor_result
|
|
356
|
-
# - visit_underline(ctx, text) -> visitor_result
|
|
357
|
-
# - visit_subscript(ctx, text) -> visitor_result
|
|
358
|
-
# - visit_superscript(ctx, text) -> visitor_result
|
|
359
|
-
# - visit_mark(ctx, text) -> visitor_result
|
|
360
|
-
# - visit_line_break(ctx) -> visitor_result
|
|
361
|
-
# - visit_horizontal_rule(ctx) -> visitor_result
|
|
362
|
-
# - visit_custom_element(ctx, tag_name, html) -> visitor_result
|
|
363
|
-
# - visit_definition_list_start(ctx) -> visitor_result
|
|
364
|
-
# - visit_definition_term(ctx, text) -> visitor_result
|
|
365
|
-
# - visit_definition_description(ctx, text) -> visitor_result
|
|
366
|
-
# - visit_definition_list_end(ctx, output) -> visitor_result
|
|
367
|
-
# - visit_form(ctx, action, method) -> visitor_result
|
|
368
|
-
# - visit_input(ctx, input_type, name, value) -> visitor_result
|
|
369
|
-
# - visit_button(ctx, text) -> visitor_result
|
|
370
|
-
# - visit_audio(ctx, src) -> visitor_result
|
|
371
|
-
# - visit_video(ctx, src) -> visitor_result
|
|
372
|
-
# - visit_iframe(ctx, src) -> visitor_result
|
|
373
|
-
# - visit_details(ctx, open) -> visitor_result
|
|
374
|
-
# - visit_summary(ctx, text) -> visitor_result
|
|
375
|
-
# - visit_figure_start(ctx) -> visitor_result
|
|
376
|
-
# - visit_figcaption(ctx, text) -> visitor_result
|
|
377
|
-
# - visit_figure_end(ctx, output) -> visitor_result
|
|
378
|
-
#
|
|
379
|
-
# Each method should return a Hash with at least :type key:
|
|
380
|
-
# { type: :continue } - Continue with default behavior
|
|
381
|
-
# { type: :custom, output: "..." } - Replace with custom markdown
|
|
382
|
-
# { type: :skip } - Skip this element entirely
|
|
383
|
-
# { type: :preserve_html } - Keep original HTML
|
|
384
|
-
# { type: :error, message: "..." } - Stop conversion with error
|
|
385
|
-
#
|
|
386
|
-
# Args:
|
|
387
|
-
# html: HTML string to convert
|
|
388
|
-
# options: Optional conversion configuration
|
|
389
|
-
# visitor: Visitor object that responds to visitor callback methods
|
|
390
|
-
#
|
|
391
|
-
# Returns:
|
|
392
|
-
# markdown: String - Converted markdown output
|
|
393
|
-
#
|
|
394
|
-
# Example:
|
|
395
|
-
# class MyVisitor
|
|
396
|
-
# def visit_link(ctx, href, text, title = nil)
|
|
397
|
-
# { type: :custom, output: "[#{text}](#{href})" }
|
|
398
|
-
# end
|
|
399
|
-
# end
|
|
400
|
-
#
|
|
401
|
-
# HtmlToMarkdown.convert_with_visitor(html, visitor: MyVisitor.new)
|
|
402
|
-
def self.convert_with_visitor: (String html, ?conversion_options options, visitor: visitor) -> String
|
|
403
|
-
|
|
404
|
-
# Convert HTML to Markdown with metadata extraction
|
|
405
|
-
#
|
|
406
|
-
# Extracts comprehensive metadata (headers, links, images, structured data) during conversion.
|
|
407
|
-
# Optionally accepts a visitor for customizing conversion behavior.
|
|
408
|
-
#
|
|
409
|
-
# Args:
|
|
410
|
-
# html: HTML string to convert
|
|
411
|
-
# options: Optional conversion configuration
|
|
412
|
-
# metadata_config: Optional metadata extraction configuration
|
|
413
|
-
# visitor: Optional visitor object for customizing conversion
|
|
414
|
-
#
|
|
415
|
-
# Returns:
|
|
416
|
-
# Array containing:
|
|
417
|
-
# - [0] markdown: String - Converted markdown output
|
|
418
|
-
# - [1] metadata: Hash - Extracted metadata with document, headers, links, images, structured_data
|
|
419
|
-
#
|
|
420
|
-
# The metadata hash contains:
|
|
421
|
-
# - document: Document-level metadata (title, description, lang, etc.)
|
|
422
|
-
# - headers: List of header elements with hierarchy
|
|
423
|
-
# - links: List of extracted hyperlinks with classification
|
|
424
|
-
# - images: List of extracted images with metadata
|
|
425
|
-
# - structured_data: List of JSON-LD, Microdata, or RDFa blocks
|
|
426
|
-
#
|
|
427
|
-
# Example:
|
|
428
|
-
# html = '<html lang="en"><head><title>Test</title></head><body><h1>Hello</h1></body></html>'
|
|
429
|
-
# markdown, metadata = HtmlToMarkdown.convert_with_metadata(html)
|
|
430
|
-
# puts "Title: #{metadata['document']['title']}"
|
|
431
|
-
# puts "Headers: #{metadata['headers'].length}"
|
|
432
|
-
#
|
|
433
|
-
# Example with visitor:
|
|
434
|
-
# markdown, metadata = HtmlToMarkdown.convert_with_metadata(html, options, metadata_config, my_visitor)
|
|
435
|
-
def self.convert_with_metadata: (
|
|
436
|
-
String html,
|
|
437
|
-
?conversion_options options,
|
|
438
|
-
?metadata_config metadata_config,
|
|
439
|
-
?visitor visitor
|
|
440
|
-
) -> [String, extended_metadata]
|
|
441
|
-
def self.convert_with_metadata_handle: (
|
|
442
|
-
String html,
|
|
443
|
-
Options options_handle,
|
|
444
|
-
?metadata_config metadata_config
|
|
445
|
-
) -> [String, extended_metadata]
|
|
446
|
-
|
|
447
|
-
# Convert HTML and extract tables as structured data
|
|
448
|
-
#
|
|
449
|
-
# Args:
|
|
450
|
-
# html: HTML string to convert
|
|
451
|
-
# options: Optional conversion configuration
|
|
452
|
-
# metadata_config: Optional metadata extraction configuration
|
|
453
|
-
#
|
|
454
|
-
# Returns:
|
|
455
|
-
# table_extraction_result: Hash containing content, metadata, and tables array
|
|
129
|
+
# Convert HTML to Markdown, returning a Hash with content, metadata, tables, images, and warnings.
|
|
456
130
|
#
|
|
457
131
|
# Example:
|
|
458
|
-
# result = HtmlToMarkdown.convert_with_tables(html)
|
|
459
|
-
|
|
460
|
-
def self.convert_with_tables: (
|
|
461
|
-
String html,
|
|
462
|
-
?conversion_options options,
|
|
463
|
-
?metadata_config metadata_config
|
|
464
|
-
) -> table_extraction_result
|
|
132
|
+
# result = HtmlToMarkdown.convert(html)
|
|
133
|
+
def self.convert: (String html, ?conversion_options options) -> Hash[String, untyped]
|
|
465
134
|
|
|
466
|
-
# Instance method
|
|
467
|
-
def convert: (String html, ?conversion_options options, ?visitor visitor) -> String
|
|
468
|
-
def options: (?conversion_options options_hash) -> Options
|
|
469
|
-
def convert_with_options: (String html, Options options_handle) -> String
|
|
470
|
-
def convert_with_inline_images_handle: (
|
|
471
|
-
String html,
|
|
472
|
-
Options options_handle,
|
|
473
|
-
?inline_image_config image_config
|
|
474
|
-
) -> html_extraction
|
|
475
|
-
def convert_with_inline_images: (
|
|
476
|
-
String html,
|
|
477
|
-
?conversion_options options,
|
|
478
|
-
?inline_image_config image_config,
|
|
479
|
-
?visitor visitor
|
|
480
|
-
) -> html_extraction
|
|
481
|
-
def convert_with_visitor: (String html, ?conversion_options options, visitor: visitor) -> String
|
|
482
|
-
def convert_with_metadata: (
|
|
483
|
-
String html,
|
|
484
|
-
?conversion_options options,
|
|
485
|
-
?metadata_config metadata_config,
|
|
486
|
-
?visitor visitor
|
|
487
|
-
) -> [String, extended_metadata]
|
|
488
|
-
def convert_with_metadata_handle: (
|
|
489
|
-
String html,
|
|
490
|
-
Options options_handle,
|
|
491
|
-
?metadata_config metadata_config
|
|
492
|
-
) -> [String, extended_metadata]
|
|
493
|
-
def convert_with_tables: (
|
|
494
|
-
String html,
|
|
495
|
-
?conversion_options options,
|
|
496
|
-
?metadata_config metadata_config
|
|
497
|
-
) -> table_extraction_result
|
|
135
|
+
# Instance method version (created by module_function)
|
|
136
|
+
def convert: (String html, ?conversion_options options) -> Hash[String, untyped]
|
|
498
137
|
end
|
data/vendor/Cargo.toml
CHANGED
|
@@ -3,7 +3,7 @@ members = ["html-to-markdown-rs"]
|
|
|
3
3
|
resolver = "2"
|
|
4
4
|
|
|
5
5
|
[workspace.package]
|
|
6
|
-
version = "2.29.0"
|
|
6
|
+
version = "3.0.0"
|
|
7
7
|
edition = "2024"
|
|
8
8
|
rust-version = "1.85"
|
|
9
9
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -13,18 +13,21 @@ homepage = "https://kreuzberg.dev"
|
|
|
13
13
|
|
|
14
14
|
[workspace.dependencies]
|
|
15
15
|
ahash = { version = "0.8", features = ["std", "compile-time-rng"], default-features = false }
|
|
16
|
-
async-trait = "0.1"
|
|
17
16
|
base64 = "0.22"
|
|
18
17
|
clap = { version = "4.6", features = ["derive"] }
|
|
19
18
|
clap_complete = "4.6"
|
|
20
|
-
clap_mangen = "0.
|
|
19
|
+
clap_mangen = "0.3"
|
|
21
20
|
encoding_rs = "0.8"
|
|
22
|
-
ext-php-rs = "0.15.
|
|
21
|
+
ext-php-rs = "0.15.7"
|
|
23
22
|
html5ever = "0.39.0"
|
|
24
23
|
once_cell = "1.21"
|
|
25
24
|
pyo3 = { version = "0.28.2", features = ["abi3-py310"] }
|
|
25
|
+
rayon = "1.11"
|
|
26
26
|
regex = "1.12"
|
|
27
27
|
serde = { version = "1.0", features = ["derive"] }
|
|
28
28
|
serde_json = "1.0"
|
|
29
|
+
tempfile = "3.27"
|
|
29
30
|
thiserror = "2.0"
|
|
30
31
|
tl = { package = "astral-tl", version = "0.7.11" }
|
|
32
|
+
toml = "1.1"
|
|
33
|
+
which = "8"
|