html-to-markdown 3.4.1-aarch64-linux → 3.5.0-aarch64-linux

This diff shows the differences between the contents of two publicly available versions of a package, each released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
@@ -1,10 +1,10 @@
1
1
  # This file is auto-generated by alef — DO NOT EDIT.
2
- # alef:hash:ee219b60565c49b67427d4a5c825fcafe3db1061d9c81a06de306dd72495b28b
2
+ # alef:hash:13800159bcd376ddaa37a79cd1dbda8cbefca3a8450e10e9a6510a9576cad1f4
3
3
  # To regenerate: alef generate
4
4
  # To verify freshness: alef verify --exit-code
5
5
  # Issues & docs: https://github.com/kreuzberg-dev/alef
6
6
  # frozen_string_literal: true
7
7
 
8
8
  module HtmlToMarkdown
9
- VERSION = '3.4.1'
9
+ VERSION = "3.5.0"
10
10
  end
@@ -1,12 +1,12 @@
1
1
  # This file is auto-generated by alef — DO NOT EDIT.
2
- # alef:hash:8697ef2add7937bd1fef3fb3a54b34edd14a9764276d0e9f4a53aab7ce9499e0
2
+ # alef:hash:47eeb6be7adffaa90e4383392c292f7b374ce905d84da6b6997e9798053a05a0
3
3
  # To regenerate: alef generate
4
4
  # To verify freshness: alef verify --exit-code
5
5
  # Issues & docs: https://github.com/kreuzberg-dev/alef
6
6
  # frozen_string_literal: true
7
7
 
8
- require_relative 'html_to_markdown/version'
9
- require_relative 'html_to_markdown/native'
8
+ require_relative "html_to_markdown/version"
9
+ require_relative "html_to_markdown/native"
10
10
 
11
11
  module HtmlToMarkdown
12
12
  # Re-export all types and functions from native extension
Binary file
data/sig/types.rbs CHANGED
@@ -1,5 +1,5 @@
1
1
  # This file is auto-generated by alef — DO NOT EDIT.
2
- # alef:hash:7e5aa912785759cbcad9e2b4750726cbd2e1d7d0717eb2078ffde17d2dd6344f
2
+ # alef:hash:86b0e721bec1828e9c4c971cf6a562e12a69f76d8e0f89e96dc5c1b80f4dd75e
3
3
  # To regenerate: alef generate
4
4
  # To verify freshness: alef verify --exit-code
5
5
  # Issues & docs: https://github.com/kreuzberg-dev/alef
@@ -8,25 +8,9 @@ module HtmlToMarkdown
8
8
 
9
9
  VERSION: String
10
10
 
11
- class DocumentMetadata
12
- # Document-level metadata extracted from `<head>` and top-level elements.
13
- #
14
- # Contains all metadata typically used by search engines, social media platforms,
15
- # and browsers for document indexing and presentation.
16
- #
17
- # # Examples
18
- #
19
- # ```
20
- # let doc = DocumentMetadata {
21
- # title: Some("My Article".to_string()),
22
- # description: Some("A great article about Rust".to_string()),
23
- # keywords: vec!["rust".to_string(), "programming".to_string()],
24
- # ..Default::default()
25
- # };
26
- #
27
- # assert_eq!(doc.title, Some("My Article".to_string()));
28
- # ```
11
+ type json_value = Hash[String, untyped] | Array[untyped] | String | Integer | Float | bool | nil
29
12
 
13
+ class DocumentMetadata
30
14
  attr_accessor title: String?
31
15
  attr_accessor description: String?
32
16
  attr_accessor keywords: Array[String]?
@@ -39,30 +23,10 @@ module HtmlToMarkdown
39
23
  attr_accessor twitter_card: Hash[String, String]?
40
24
  attr_accessor meta_tags: Hash[String, String]?
41
25
 
42
- def initialize: (?title: String, ?description: String, keywords: Array[String], ?author: String, ?canonical_url: String, ?base_href: String, ?language: String, ?text_direction: TextDirection, open_graph: Hash[String, String], twitter_card: Hash[String, String], meta_tags: Hash[String, String]) -> void
26
+ def initialize: (?title: String, ?description: String, ?keywords: Array[String], ?author: String, ?canonical_url: String, ?base_href: String, ?language: String, ?text_direction: TextDirection, ?open_graph: Hash[String, String], ?twitter_card: Hash[String, String], ?meta_tags: Hash[String, String]) -> void
43
27
  end
44
28
 
45
29
  class HeaderMetadata
46
- # Header element metadata with hierarchy tracking.
47
- #
48
- # Captures heading elements (h1-h6) with their text content, identifiers,
49
- # and position in the document structure.
50
- #
51
- # # Examples
52
- #
53
- # ```
54
- # let header = HeaderMetadata {
55
- # level: 1,
56
- # text: "Main Title".to_string(),
57
- # id: Some("main-title".to_string()),
58
- # depth: 0,
59
- # html_offset: 145,
60
- # };
61
- #
62
- # assert_eq!(header.level, 1);
63
- # assert!(header.is_valid());
64
- # ```
65
-
66
30
  attr_reader level: Integer
67
31
  attr_reader text: String
68
32
  attr_reader id: String
@@ -74,26 +38,6 @@ module HtmlToMarkdown
74
38
  end
75
39
 
76
40
  class LinkMetadata
77
- # Hyperlink metadata with categorization and attributes.
78
- #
79
- # Represents `<a>` elements with parsed href values, text content, and link type classification.
80
- #
81
- # # Examples
82
- #
83
- # ```
84
- # let link = LinkMetadata {
85
- # href: "https://example.com".to_string(),
86
- # text: "Example".to_string(),
87
- # title: Some("Visit Example".to_string()),
88
- # link_type: LinkType::External,
89
- # rel: vec!["nofollow".to_string()],
90
- # attributes: Default::default(),
91
- # };
92
- #
93
- # assert_eq!(link.link_type, LinkType::External);
94
- # assert_eq!(link.text, "Example");
95
- # ```
96
-
97
41
  attr_reader href: String
98
42
  attr_reader text: String
99
43
  attr_reader title: String
@@ -102,30 +46,9 @@ module HtmlToMarkdown
102
46
  attr_reader attributes: Hash[String, String]
103
47
 
104
48
  def initialize: (href: String, text: String, ?title: String, link_type: LinkType, rel: Array[String], attributes: Hash[String, String]) -> void
105
- def self.classify_link: (String href) -> LinkType
106
49
  end
107
50
 
108
51
  class ImageMetadata
109
- # Image metadata with source and dimensions.
110
- #
111
- # Captures `<img>` elements and inline `<svg>` elements with metadata
112
- # for image analysis and optimization.
113
- #
114
- # # Examples
115
- #
116
- # ```
117
- # let img = ImageMetadata {
118
- # src: "https://example.com/image.jpg".to_string(),
119
- # alt: Some("An example image".to_string()),
120
- # title: Some("Example".to_string()),
121
- # dimensions: Some((800, 600)),
122
- # image_type: ImageType::External,
123
- # attributes: Default::default(),
124
- # };
125
- #
126
- # assert_eq!(img.image_type, ImageType::External);
127
- # ```
128
-
129
52
  attr_reader src: String
130
53
  attr_reader alt: String
131
54
  attr_reader title: String
@@ -137,23 +60,6 @@ module HtmlToMarkdown
137
60
  end
138
61
 
139
62
  class StructuredData
140
- # Structured data block (JSON-LD, Microdata, or RDFa).
141
- #
142
- # Represents machine-readable structured data found in the document.
143
- # JSON-LD blocks are collected as raw JSON strings for flexibility.
144
- #
145
- # # Examples
146
- #
147
- # ```
148
- # let schema = StructuredData {
149
- # data_type: StructuredDataType::JsonLd,
150
- # raw_json: r#"{"@context":"https://schema.org","@type":"Article"}"#.to_string(),
151
- # schema_type: Some("Article".to_string()),
152
- # };
153
- #
154
- # assert_eq!(schema.data_type, StructuredDataType::JsonLd);
155
- # ```
156
-
157
63
  attr_reader data_type: StructuredDataType
158
64
  attr_reader raw_json: String
159
65
  attr_reader schema_type: String
@@ -162,51 +68,16 @@ module HtmlToMarkdown
162
68
  end
163
69
 
164
70
  class HtmlMetadata
165
- # Comprehensive metadata extraction result from HTML document.
166
- #
167
- # Contains all extracted metadata types in a single structure,
168
- # suitable for serialization and transmission across language boundaries.
169
- #
170
- # # Examples
171
- #
172
- # ```
173
- # let metadata = HtmlMetadata {
174
- # document: Default::default(),
175
- # headers: Vec::new(),
176
- # links: Vec::new(),
177
- # images: Vec::new(),
178
- # structured_data: Vec::new(),
179
- # };
180
- #
181
- # assert!(metadata.headers.is_empty());
182
- # ```
183
-
184
71
  attr_accessor document: DocumentMetadata?
185
72
  attr_accessor headers: Array[HeaderMetadata]?
186
73
  attr_accessor links: Array[LinkMetadata]?
187
74
  attr_accessor images: Array[ImageMetadata]?
188
75
  attr_accessor structured_data: Array[StructuredData]?
189
76
 
190
- def initialize: (document: DocumentMetadata, headers: Array[HeaderMetadata], links: Array[LinkMetadata], images: Array[ImageMetadata], structured_data: Array[StructuredData]) -> void
77
+ def initialize: (?document: DocumentMetadata, ?headers: Array[HeaderMetadata], ?links: Array[LinkMetadata], ?images: Array[ImageMetadata], ?structured_data: Array[StructuredData]) -> void
191
78
  end
192
79
 
193
80
  class ConversionOptions
194
- # Main conversion options for HTML to Markdown conversion.
195
- #
196
- # Use [`ConversionOptions::builder()`] to construct, or [`Default::default()`] for defaults.
197
- #
198
- # # Example
199
- #
200
- # ```text
201
- # use html_to_markdown_rs::ConversionOptions;
202
- #
203
- # let options = ConversionOptions::builder()
204
- # .heading_style(HeadingStyle::Atx)
205
- # .wrap(true)
206
- # .wrap_width(100)
207
- # .build();
208
- # ```
209
-
210
81
  attr_accessor heading_style: HeadingStyle?
211
82
  attr_accessor list_indent_type: ListIndentType?
212
83
  attr_accessor list_indent_width: Integer?
@@ -220,6 +91,7 @@ module HtmlToMarkdown
220
91
  attr_accessor autolinks: bool?
221
92
  attr_accessor default_title: bool?
222
93
  attr_accessor br_in_tables: bool?
94
+ attr_accessor compact_tables: bool?
223
95
  attr_accessor highlight_style: HighlightStyle?
224
96
  attr_accessor extract_metadata: bool?
225
97
  attr_accessor whitespace_mode: WhitespaceMode?
@@ -249,34 +121,11 @@ module HtmlToMarkdown
249
121
  attr_accessor exclude_selectors: Array[String]?
250
122
  attr_accessor visitor: VisitorHandle?
251
123
 
252
- def initialize: (heading_style: HeadingStyle, list_indent_type: ListIndentType, list_indent_width: Integer, bullets: String, strong_em_symbol: String, escape_asterisks: bool, escape_underscores: bool, escape_misc: bool, escape_ascii: bool, code_language: String, autolinks: bool, default_title: bool, br_in_tables: bool, highlight_style: HighlightStyle, extract_metadata: bool, whitespace_mode: WhitespaceMode, strip_newlines: bool, wrap: bool, wrap_width: Integer, convert_as_inline: bool, sub_symbol: String, sup_symbol: String, newline_style: NewlineStyle, code_block_style: CodeBlockStyle, keep_inline_images_in: Array[String], preprocessing: PreprocessingOptions, encoding: String, debug: bool, strip_tags: Array[String], preserve_tags: Array[String], skip_images: bool, link_style: LinkStyle, output_format: OutputFormat, include_document_structure: bool, extract_images: bool, max_image_size: Integer, capture_svg: bool, infer_dimensions: bool, ?max_depth: Integer, exclude_selectors: Array[String], ?visitor: VisitorHandle) -> void
253
- def apply_update: (ConversionOptionsUpdate update) -> void
124
+ def initialize: (?heading_style: HeadingStyle, ?list_indent_type: ListIndentType, ?list_indent_width: Integer, ?bullets: String, ?strong_em_symbol: String, ?escape_asterisks: bool, ?escape_underscores: bool, ?escape_misc: bool, ?escape_ascii: bool, ?code_language: String, ?autolinks: bool, ?default_title: bool, ?br_in_tables: bool, ?compact_tables: bool, ?highlight_style: HighlightStyle, ?extract_metadata: bool, ?whitespace_mode: WhitespaceMode, ?strip_newlines: bool, ?wrap: bool, ?wrap_width: Integer, ?convert_as_inline: bool, ?sub_symbol: String, ?sup_symbol: String, ?newline_style: NewlineStyle, ?code_block_style: CodeBlockStyle, ?keep_inline_images_in: Array[String], ?preprocessing: PreprocessingOptions, ?encoding: String, ?debug: bool, ?strip_tags: Array[String], ?preserve_tags: Array[String], ?skip_images: bool, ?link_style: LinkStyle, ?output_format: OutputFormat, ?include_document_structure: bool, ?extract_images: bool, ?max_image_size: Integer, ?capture_svg: bool, ?infer_dimensions: bool, ?max_depth: Integer, ?exclude_selectors: Array[String], ?visitor: VisitorHandle) -> void
254
125
  def self.default: () -> ConversionOptions
255
- def self.builder: () -> ConversionOptionsBuilder
256
- def self.from_update: (ConversionOptionsUpdate update) -> ConversionOptions
257
- def self.from: (ConversionOptionsUpdate update) -> ConversionOptions
258
- end
259
-
260
- class ConversionOptionsBuilder
261
- # Builder for [`ConversionOptions`].
262
- #
263
- # All fields start with default values. Call `.build()` to produce the final options.
264
-
265
- def strip_tags: (Array[String] tags) -> ConversionOptionsBuilder
266
- def preserve_tags: (Array[String] tags) -> ConversionOptionsBuilder
267
- def keep_inline_images_in: (Array[String] tags) -> ConversionOptionsBuilder
268
- def exclude_selectors: (Array[String] selectors) -> ConversionOptionsBuilder
269
- def visitor: (?VisitorHandle visitor) -> ConversionOptionsBuilder
270
- def preprocessing: (PreprocessingOptions preprocessing) -> ConversionOptionsBuilder
271
- def build: () -> ConversionOptions
272
126
  end
273
127
 
274
128
  class ConversionOptionsUpdate
275
- # Partial update for `ConversionOptions`.
276
- #
277
- # Uses `Option<T>` fields for selective updates. Bindings use this to construct
278
- # options from language-native types. Prefer [`ConversionOptionsBuilder`] for Rust code.
279
-
280
129
  attr_accessor heading_style: HeadingStyle?
281
130
  attr_accessor list_indent_type: ListIndentType?
282
131
  attr_accessor list_indent_width: Integer?
@@ -290,6 +139,7 @@ module HtmlToMarkdown
290
139
  attr_accessor autolinks: bool?
291
140
  attr_accessor default_title: bool?
292
141
  attr_accessor br_in_tables: bool?
142
+ attr_accessor compact_tables: bool?
293
143
  attr_accessor highlight_style: HighlightStyle?
294
144
  attr_accessor extract_metadata: bool?
295
145
  attr_accessor whitespace_mode: WhitespaceMode?
@@ -319,31 +169,20 @@ module HtmlToMarkdown
319
169
  attr_accessor exclude_selectors: Array[String]?
320
170
  attr_accessor visitor: VisitorHandle?
321
171
 
322
- def initialize: (?heading_style: HeadingStyle, ?list_indent_type: ListIndentType, ?list_indent_width: Integer, ?bullets: String, ?strong_em_symbol: String, ?escape_asterisks: bool, ?escape_underscores: bool, ?escape_misc: bool, ?escape_ascii: bool, ?code_language: String, ?autolinks: bool, ?default_title: bool, ?br_in_tables: bool, ?highlight_style: HighlightStyle, ?extract_metadata: bool, ?whitespace_mode: WhitespaceMode, ?strip_newlines: bool, ?wrap: bool, ?wrap_width: Integer, ?convert_as_inline: bool, ?sub_symbol: String, ?sup_symbol: String, ?newline_style: NewlineStyle, ?code_block_style: CodeBlockStyle, ?keep_inline_images_in: Array[String], ?preprocessing: PreprocessingOptionsUpdate, ?encoding: String, ?debug: bool, ?strip_tags: Array[String], ?preserve_tags: Array[String], ?skip_images: bool, ?link_style: LinkStyle, ?output_format: OutputFormat, ?include_document_structure: bool, ?extract_images: bool, ?max_image_size: Integer, ?capture_svg: bool, ?infer_dimensions: bool, ?max_depth: Integer?, ?exclude_selectors: Array[String], ?visitor: VisitorHandle) -> void
172
+ def initialize: (?heading_style: HeadingStyle, ?list_indent_type: ListIndentType, ?list_indent_width: Integer, ?bullets: String, ?strong_em_symbol: String, ?escape_asterisks: bool, ?escape_underscores: bool, ?escape_misc: bool, ?escape_ascii: bool, ?code_language: String, ?autolinks: bool, ?default_title: bool, ?br_in_tables: bool, ?compact_tables: bool, ?highlight_style: HighlightStyle, ?extract_metadata: bool, ?whitespace_mode: WhitespaceMode, ?strip_newlines: bool, ?wrap: bool, ?wrap_width: Integer, ?convert_as_inline: bool, ?sub_symbol: String, ?sup_symbol: String, ?newline_style: NewlineStyle, ?code_block_style: CodeBlockStyle, ?keep_inline_images_in: Array[String], ?preprocessing: PreprocessingOptionsUpdate, ?encoding: String, ?debug: bool, ?strip_tags: Array[String], ?preserve_tags: Array[String], ?skip_images: bool, ?link_style: LinkStyle, ?output_format: OutputFormat, ?include_document_structure: bool, ?extract_images: bool, ?max_image_size: Integer, ?capture_svg: bool, ?infer_dimensions: bool, ?max_depth: Integer?, ?exclude_selectors: Array[String], ?visitor: VisitorHandle) -> void
323
173
  end
324
174
 
325
175
  class PreprocessingOptions
326
- # HTML preprocessing options for document cleanup before conversion.
327
-
328
176
  attr_accessor enabled: bool?
329
177
  attr_accessor preset: PreprocessingPreset?
330
178
  attr_accessor remove_navigation: bool?
331
179
  attr_accessor remove_forms: bool?
332
180
 
333
- def initialize: (enabled: bool, preset: PreprocessingPreset, remove_navigation: bool, remove_forms: bool) -> void
334
- def apply_update: (PreprocessingOptionsUpdate update) -> void
181
+ def initialize: (?enabled: bool, ?preset: PreprocessingPreset, ?remove_navigation: bool, ?remove_forms: bool) -> void
335
182
  def self.default: () -> PreprocessingOptions
336
- def self.from_update: (PreprocessingOptionsUpdate update) -> PreprocessingOptions
337
- def self.from: (PreprocessingOptionsUpdate update) -> PreprocessingOptions
338
183
  end
339
184
 
340
185
  class PreprocessingOptionsUpdate
341
- # Partial update for `PreprocessingOptions`.
342
- #
343
- # This struct uses `Option<T>` to represent optional fields that can be selectively updated.
344
- # Only specified fields (Some values) will override existing options; None values leave the
345
- # corresponding fields unchanged when applied via [`PreprocessingOptions::apply_update`].
346
-
347
186
  attr_accessor enabled: bool?
348
187
  attr_accessor preset: PreprocessingPreset?
349
188
  attr_accessor remove_navigation: bool?
@@ -353,10 +192,6 @@ module HtmlToMarkdown
353
192
  end
354
193
 
355
194
  class DocumentStructure
356
- # A structured document tree representing the semantic content of an HTML document.
357
- #
358
- # Uses a flat node array with index-based parent/child references for efficient traversal.
359
-
360
195
  attr_reader nodes: Array[DocumentNode]
361
196
  attr_reader source_format: String
362
197
 
@@ -364,8 +199,6 @@ module HtmlToMarkdown
364
199
  end
365
200
 
366
201
  class DocumentNode
367
- # A single node in the document tree.
368
-
369
202
  attr_reader id: String
370
203
  attr_reader content: NodeContent
371
204
  attr_reader parent: Integer
@@ -377,10 +210,6 @@ module HtmlToMarkdown
377
210
  end
378
211
 
379
212
  class TextAnnotation
380
- # An inline text annotation with byte-range offsets.
381
- #
382
- # Annotations describe formatting (bold, italic, etc.) and links within a node's text content.
383
-
384
213
  attr_reader start: Integer
385
214
  attr_reader end: Integer
386
215
  attr_reader kind: AnnotationKind
@@ -389,21 +218,6 @@ module HtmlToMarkdown
389
218
  end
390
219
 
391
220
  class ConversionResult
392
- # The primary result of HTML conversion and extraction.
393
- #
394
- # Contains the converted text output, optional structured document tree,
395
- # metadata, extracted tables, images, and processing warnings.
396
- #
397
- # # Example
398
- #
399
- # ```text
400
- # use html_to_markdown_rs::{convert, ConversionOptions};
401
- #
402
- # let result = convert("<h1>Hello</h1><p>World</p>", None)?;
403
- # assert!(result.content.is_some());
404
- # assert!(result.warnings.is_empty());
405
- # ```
406
-
407
221
  attr_accessor content: String?
408
222
  attr_accessor document: DocumentStructure?
409
223
  attr_accessor metadata: HtmlMetadata?
@@ -411,22 +225,18 @@ module HtmlToMarkdown
411
225
  attr_accessor images: Array[String]?
412
226
  attr_accessor warnings: Array[ProcessingWarning]?
413
227
 
414
- def initialize: (?content: String, ?document: DocumentStructure, metadata: HtmlMetadata, tables: Array[TableData], images: Array[String], warnings: Array[ProcessingWarning]) -> void
228
+ def initialize: (?content: String, ?document: DocumentStructure, ?metadata: HtmlMetadata, ?tables: Array[TableData], ?images: Array[String], ?warnings: Array[ProcessingWarning]) -> void
415
229
  end
416
230
 
417
231
  class TableGrid
418
- # A structured table grid with cell-level data including spans.
419
-
420
232
  attr_accessor rows: Integer?
421
233
  attr_accessor cols: Integer?
422
234
  attr_accessor cells: Array[GridCell]?
423
235
 
424
- def initialize: (rows: Integer, cols: Integer, cells: Array[GridCell]) -> void
236
+ def initialize: (?rows: Integer, ?cols: Integer, ?cells: Array[GridCell]) -> void
425
237
  end
426
238
 
427
239
  class GridCell
428
- # A single cell in a table grid.
429
-
430
240
  attr_reader content: String
431
241
  attr_reader row: Integer
432
242
  attr_reader col: Integer
@@ -438,8 +248,6 @@ module HtmlToMarkdown
438
248
  end
439
249
 
440
250
  class TableData
441
- # A top-level extracted table with both structured data and markdown representation.
442
-
443
251
  attr_reader grid: TableGrid
444
252
  attr_reader markdown: String
445
253
 
@@ -447,8 +255,6 @@ module HtmlToMarkdown
447
255
  end
448
256
 
449
257
  class ProcessingWarning
450
- # A non-fatal warning generated during HTML processing.
451
-
452
258
  attr_reader message: String
453
259
  attr_reader kind: WarningKind
454
260
 
@@ -456,20 +262,9 @@ module HtmlToMarkdown
456
262
  end
457
263
 
458
264
  class VisitorHandle
459
- # Type alias for a visitor handle (`Arc`-wrapped `Mutex` for thread-safe shared mutation).
460
- #
461
- # `Send + Sync` so that types embedding a `VisitorHandle` (e.g. `ConversionOptions`)
462
- # can be shared across threads — required by callers that stash configs inside
463
- # axum/rmcp/tokio Send-bound contexts.
464
-
465
265
  end
466
266
 
467
267
  class NodeContext
468
- # Context information passed to all visitor methods.
469
- #
470
- # Provides comprehensive metadata about the current node being visited,
471
- # including its type, attributes, position in the DOM tree, and parent context.
472
-
473
268
  attr_reader node_type: NodeType
474
269
  attr_reader tag_name: String
475
270
  attr_reader attributes: Hash[String, String]
@@ -482,128 +277,72 @@ module HtmlToMarkdown
482
277
  end
483
278
 
484
279
  class TextDirection
485
- # Text directionality of document content.
486
- #
487
- # Corresponds to the HTML `dir` attribute and `bdi` element directionality.
488
280
  type value = :left_to_right | :right_to_left | :auto
489
281
  end
490
282
 
491
283
  class LinkType
492
- # Link classification based on href value and document context.
493
- #
494
- # Used to categorize links during extraction for filtering and analysis.
495
284
  type value = :anchor | :internal | :external | :email | :phone | :other
496
285
  end
497
286
 
498
287
  class ImageType
499
- # Image source classification for proper handling and processing.
500
- #
501
- # Determines whether an image is embedded (data URI), inline SVG, external, or relative.
502
288
  type value = :data_uri | :inline_svg | :external | :relative
503
289
  end
504
290
 
505
291
  class StructuredDataType
506
- # Structured data format type.
507
- #
508
- # Identifies the schema/format used for structured data markup.
509
292
  type value = :json_ld | :microdata | :r_d_fa
510
293
  end
511
294
 
512
295
  class PreprocessingPreset
513
- # HTML preprocessing aggressiveness level.
514
- #
515
- # Controls the extent of cleanup performed before conversion. Higher levels remove more elements.
516
296
  type value = :minimal | :standard | :aggressive
517
297
  end
518
298
 
519
299
  class HeadingStyle
520
- # Heading style options for Markdown output.
521
- #
522
- # Controls how headings (h1-h6) are rendered in the output Markdown.
523
300
  type value = :underlined | :atx | :atx_closed
524
301
  end
525
302
 
526
303
  class ListIndentType
527
- # List indentation character type.
528
- #
529
- # Controls whether list items are indented with spaces or tabs.
530
304
  type value = :spaces | :tabs
531
305
  end
532
306
 
533
307
  class WhitespaceMode
534
- # Whitespace handling strategy during conversion.
535
- #
536
- # Determines how sequences of whitespace characters (spaces, tabs, newlines) are processed.
537
308
  type value = :normalized | :strict
538
309
  end
539
310
 
540
311
  class NewlineStyle
541
- # Line break syntax in Markdown output.
542
- #
543
- # Controls how soft line breaks (from `<br>` or line breaks in source) are rendered.
544
312
  type value = :spaces | :backslash
545
313
  end
546
314
 
547
315
  class CodeBlockStyle
548
- # Code block fence style in Markdown output.
549
- #
550
- # Determines how code blocks (`<pre><code>`) are rendered in Markdown.
551
316
  type value = :indented | :backticks | :tildes
552
317
  end
553
318
 
554
319
  class HighlightStyle
555
- # Highlight rendering style for `<mark>` elements.
556
- #
557
- # Controls how highlighted text is rendered in Markdown output.
558
320
  type value = :double_equal | :html | :bold | :none
559
321
  end
560
322
 
561
323
  class LinkStyle
562
- # Link rendering style in Markdown output.
563
- #
564
- # Controls whether links and images use inline `[text](url)` syntax or
565
- # reference-style `[text][1]` syntax with definitions collected at the end.
566
324
  type value = :inline | :reference
567
325
  end
568
326
 
569
327
  class OutputFormat
570
- # Output format for conversion.
571
- #
572
- # Specifies the target markup language format for the conversion output.
573
328
  type value = :markdown | :djot | :plain
574
329
  end
575
330
 
576
331
  class NodeContent
577
- # The semantic content type of a document node.
578
- #
579
- # Uses internally tagged representation (`"node_type": "heading"`) for JSON serialization.
580
332
  end
581
333
 
582
334
  class AnnotationKind
583
- # The type of an inline text annotation.
584
- #
585
- # Uses internally tagged representation (`"annotation_type": "bold"`) for JSON serialization.
586
335
  end
587
336
 
588
337
  class WarningKind
589
- # Categories of processing warnings.
590
338
  type value = :image_extraction_failed | :encoding_fallback | :truncated_input | :malformed_html | :sanitization_applied | :depth_limit_exceeded
591
339
  end
592
340
 
593
341
  class NodeType
594
- # Node type enumeration covering all HTML element types.
595
- #
596
- # This enum categorizes all HTML elements that the converter recognizes,
597
- # providing a coarse-grained classification for visitor dispatch.
598
342
  type value = :text | :element | :heading | :paragraph | :div | :blockquote | :pre | :hr | :list | :list_item | :definition_list | :definition_term | :definition_description | :table | :table_row | :table_cell | :table_header | :table_body | :table_head | :table_foot | :link | :image | :strong | :em | :code | :strikethrough | :underline | :subscript | :superscript | :mark | :small | :br | :span | :article | :section | :nav | :aside | :header | :footer | :main | :figure | :figcaption | :time | :details | :summary | :form | :input | :select | :option | :button | :textarea | :label | :fieldset | :legend | :audio | :video | :picture | :source | :iframe | :svg | :canvas | :ruby | :rt | :rp | :abbr | :kbd | :samp | :var | :cite | :q | :del | :ins | :data | :meter | :progress | :output | :template | :slot | :html | :head | :body | :title | :meta | :link_tag | :style | :script | :base | :custom
599
343
  end
600
344
 
601
345
  class VisitResult
602
- # Result of a visitor callback.
603
- #
604
- # Allows visitors to control the conversion flow by either proceeding
605
- # with default behavior, providing custom output, skipping elements,
606
- # preserving HTML, or signaling errors.
607
346
  end
608
347
 
609
348
  def self.convert: (String html, ?ConversionOptions options) -> ConversionResult
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html-to-markdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.4.1
4
+ version: 3.5.0
5
5
  platform: aarch64-linux
6
6
  authors:
7
7
  - Kreuzberg Team
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-05-14 00:00:00.000000000 Z
11
+ date: 2026-05-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0.9'
27
+ - !ruby/object:Gem::Dependency
28
+ name: sorbet-runtime
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '0.5'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '0.5'
27
41
  description: High-performance HTML to Markdown converter
28
42
  email:
29
43
  executables: []