RubyGems - html-to-markdown - Versions diffs - 3.4.0-aarch64-linux - Mend

html-to-markdown 3.4.0-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

checksums.yaml +7 -0
data/Steepfile +6 -0
data/lib/bin/html-to-markdown +0 -0
data/lib/html_to_markdown/native.rb +59 -0
data/lib/html_to_markdown/version.rb +10 -0
data/lib/html_to_markdown.rb +13 -0
data/lib/html_to_markdown_rb.so +0 -0
data/sig/html_to_markdown/cli.rbs +24 -0
data/sig/html_to_markdown/cli_proxy.rbs +48 -0
data/sig/open3.rbs +12 -0
data/sig/types.rbs +609 -0
metadata +54 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: '092b4d1ad3d0ea4ac57bfe807760729d6cd676066dd05fc5f7fd3ecfeb3cb1c7'
+  data.tar.gz: d488926fb2483f76d356eb12082bb0f426a21b7d8bde7a981297a14abc7542a7
+SHA512:
+  metadata.gz: 1d1cd3a0b1135303d143374ec4bac13cf69d6dc1448bf29c90431ee7de7cce0e228aa568604575c39b2fb1237a32a48f76aad79ec486739910a7bb907a1d640d
+  data.tar.gz: 0a58dbc37be917e28cbc14f2d97f6ed51db6191ebddf1229ab49fa78f5dcdf2623c67762dff40a99adc44fb0de339317ea10123312f50469e8a865d061998f31

data/Steepfile ADDED Viewed

@@ -0,0 +1,6 @@
+# frozen_string_literal: true
+target :lib do # Steep type-check target named :lib
+  signature 'sig' # RBS signature files are read from sig/
+  check 'lib' # Ruby sources under lib/ are the code that gets type-checked
+end

data/lib/bin/html-to-markdown ADDED Viewed

Binary file

data/lib/html_to_markdown/native.rb ADDED Viewed

@@ -0,0 +1,59 @@
+# This file is auto-generated by alef — DO NOT EDIT.
+# alef:hash:b54e7bb2ab55cc6c25c9cac0e62ec66c35fd2d1956ef9ba5e3dc9e7ba5e666a5
+# To regenerate: alef generate
+# To verify freshness: alef verify --exit-code
+# Issues & docs: https://github.com/kreuzberg-dev/alef
+# frozen_string_literal: true
+require 'json'
+require 'html_to_markdown_rb'
+module HtmlToMarkdown
+  # Re-export all public module functions from the native extension
+  HtmlToMarkdownRs.methods(false).each do |m|
+    define_singleton_method(m) { |*args, **kwargs, &blk| HtmlToMarkdownRs.public_send(m, *args, **kwargs, &blk) }
+  end
+  # Re-export all constants (classes, structs, etc.) from the native extension
+  HtmlToMarkdownRs.constants.each do |c|
+    const_set(c, HtmlToMarkdownRs.const_get(c)) unless const_defined?(c)
+  end
+end
+# Add accessor methods to Hash-based internally-tagged enum instances
+class Hash
+  # Support internally-tagged enum accessors like format.excel, format.email, etc.
+  # Also support direct field access like format.sheet_count
+  # rubocop:disable Metrics/CyclomaticComplexity
+  def method_missing(method_name, *args, &block)
+    # Try symbol key first (how Magnus converts JSON keys)
+    return self[method_name] if key?(method_name)
+    # Try string key
+    return self[method_name.to_s] if key?(method_name.to_s)
+    # Check if this hash has a 'format_type' field (indicating an internally-tagged enum)
+    format_type = self[:'format_type'] || self['format_type']
+    return super unless format_type
+    # If the method name matches the format_type (snake_case), extract and return the variant's wrapped data
+    # Internally-tagged enums store variant data in the '_0' field (from alef's struct variant conversion)
+    # This allows format.excel to return the ExcelMetadata hash with sheet_count, sheet_names, etc.
+    snake_case_method = method_name.to_s.downcase
+    if snake_case_method == format_type.to_s.downcase
+      return self[:'_0'] || self['_0'] || self
+    end
+    super
+  end
+  # rubocop:enable Metrics/CyclomaticComplexity
+  def respond_to_missing?(method_name, include_private = false)
+    return true if key?(method_name) || key?(method_name.to_s)
+    format_type = self[:'format_type'] || self['format_type']
+    return false unless format_type
+    snake_case_method = method_name.to_s.downcase
+    snake_case_method == format_type.to_s.downcase || super
+  end
+end

data/lib/html_to_markdown/version.rb ADDED Viewed

@@ -0,0 +1,10 @@
+# This file is auto-generated by alef — DO NOT EDIT.
+# alef:hash:9c58cf63849e82246f03b4fcc3996c264d47f2b2c27e0e8ba6b93eb4a84cb279
+# To regenerate: alef generate
+# To verify freshness: alef verify --exit-code
+# Issues & docs: https://github.com/kreuzberg-dev/alef
+# frozen_string_literal: true
+module HtmlToMarkdown
+  VERSION = '3.4.0' # released gem version; same value as the version in the gem specification
+end

data/lib/html_to_markdown.rb ADDED Viewed

@@ -0,0 +1,13 @@
+# This file is auto-generated by alef — DO NOT EDIT.
+# alef:hash:b671355c68864d5f935b91f875ab29144d9543baad5a955cd926ab9881762a19
+# To regenerate: alef generate
+# To verify freshness: alef verify --exit-code
+# Issues & docs: https://github.com/kreuzberg-dev/alef
+# frozen_string_literal: true
+require_relative 'html_to_markdown/version'
+require_relative 'html_to_markdown/native'
+module HtmlToMarkdown
+  # Intentionally empty: the native extension's methods and constants are re-exported into this module by html_to_markdown/native, required above
+end

data/lib/html_to_markdown_rb.so ADDED Viewed

Binary file

data/sig/html_to_markdown/cli.rbs ADDED Viewed

@@ -0,0 +1,24 @@
+module HtmlToMarkdown
+  # Type signature for the command-line entry point.
+  module CLI
+    # Run the CLI with the given arguments.
+    #
+    # Declared with `self?.` because the method is a module_function: that form
+    # describes a public singleton method (CLI.run) together with a private instance
+    # method of the same name, instead of two separately maintained declarations where
+    # the instance variant was typed as public.
+    #
+    # @param argv Command-line arguments (defaults to ARGV)
+    # @param stdout Output stream for standard output
+    # @param stderr Output stream for standard error
+    # @return Exit code (0 for success, non-zero for failure)
+    def self?.run: (
+      ?Array[String] argv,
+      ?stdout: IO,
+      ?stderr: IO
+    ) -> Integer
+  end
+end

data/sig/html_to_markdown/cli_proxy.rbs ADDED Viewed

@@ -0,0 +1,48 @@
+module HtmlToMarkdown
+  # Type signature for the proxy that locates and runs the bundled CLI binary.
+  module CLIProxy
+    # Base error class
+    class Error < StandardError
+    end
+    # Error when CLI binary is not found
+    class MissingBinaryError < Error
+    end
+    # Error when CLI execution fails
+    class CLIExecutionError < Error
+      attr_reader stderr: String
+      attr_reader status: Integer?
+      def initialize: (String message, stderr: String, status: Integer?) -> void
+    end
+    # Module functions. Each is declared once with `self?.`, the RBS form for
+    # module_function: a public singleton method plus a private instance method of the
+    # same name. This replaces the duplicated `def self.x` / `def x` pairs, whose
+    # instance variants were typed as public.
+    # Execute CLI with given arguments
+    def self?.call: (Array[String] argv) -> String
+    # Find the CLI binary in search paths
+    def self?.find_cli_binary: () -> Pathname
+    # Get root path of the gem
+    def self?.root_path: () -> Pathname
+    # Get lib path of the gem
+    def self?.lib_path: () -> Pathname
+    # Get search paths for CLI binary
+    def self?.search_paths: (String binary_name) -> Array[Pathname]
+    # Get error message for missing binary
+    def self?.missing_binary_message: () -> String
+  end
+end

data/sig/open3.rbs ADDED Viewed

@@ -0,0 +1,12 @@
+# Minimal type signature for the Open3 standard library; only capture3 is declared here
+module Open3
+  # Execute command and capture stdout, stderr, and status
+  #
+  # @param cmd Command to execute
+  # @param args Command arguments (NOTE(review): the env Hash and keyword options such as stdin_data: that Ruby's Open3.capture3 accepts are not modelled)
+  # @return Array containing stdout (String), stderr (String), and status (Process::Status)
+  def self.capture3: (
+    String cmd,
+    *String args
+  ) -> [String, String, Process::Status]
+end

data/sig/types.rbs ADDED Viewed

@@ -0,0 +1,609 @@
+# This file is auto-generated by alef — DO NOT EDIT.
+# alef:hash:da88db156d77eefe37cfd0ca53ea75c07abbc5d3ebb7ad977060f871af4c9ff3
+# To regenerate: alef generate
+# To verify freshness: alef verify --exit-code
+# Issues & docs: https://github.com/kreuzberg-dev/alef
+module HtmlToMarkdown
+  VERSION: String
+  class DocumentMetadata
+    # Document-level metadata extracted from `<head>` and top-level elements.
+    #
+    # Contains all metadata typically used by search engines, social media platforms,
+    # and browsers for document indexing and presentation.
+    #
+    # # Examples
+    #
+    # ```
+    # let doc = DocumentMetadata {
+    #     title: Some("My Article".to_string()),
+    #     description: Some("A great article about Rust".to_string()),
+    #     keywords: vec!["rust".to_string(), "programming".to_string()],
+    #     ..Default::default()
+    # };
+    #
+    # assert_eq!(doc.title, Some("My Article".to_string()));
+    # ```
+    attr_accessor title: String?
+    attr_accessor description: String?
+    attr_accessor keywords: Array[String]?
+    attr_accessor author: String?
+    attr_accessor canonical_url: String?
+    attr_accessor base_href: String?
+    attr_accessor language: String?
+    attr_accessor text_direction: TextDirection?
+    attr_accessor open_graph: Hash[String, String]?
+    attr_accessor twitter_card: Hash[String, String]?
+    attr_accessor meta_tags: Hash[String, String]?
+    def initialize: (?title: String, ?description: String, keywords: Array[String], ?author: String, ?canonical_url: String, ?base_href: String, ?language: String, ?text_direction: TextDirection, open_graph: Hash[String, String], twitter_card: Hash[String, String], meta_tags: Hash[String, String]) -> void
+  end
+  class HeaderMetadata
+    # Header element metadata with hierarchy tracking.
+    #
+    # Captures heading elements (h1-h6) with their text content, identifiers,
+    # and position in the document structure.
+    #
+    # # Examples
+    #
+    # ```
+    # let header = HeaderMetadata {
+    #     level: 1,
+    #     text: "Main Title".to_string(),
+    #     id: Some("main-title".to_string()),
+    #     depth: 0,
+    #     html_offset: 145,
+    # };
+    #
+    # assert_eq!(header.level, 1);
+    # assert!(header.is_valid());
+    # ```
+    attr_reader level: Integer
+    attr_reader text: String
+    # Optional field (`?id:` in initialize, `Some(...)` in the example above), so the
+    # reader is nilable.
+    attr_reader id: String?
+    attr_reader depth: Integer
+    attr_reader html_offset: Integer
+    def initialize: (level: Integer, text: String, ?id: String, depth: Integer, html_offset: Integer) -> void
+    def is_valid: () -> bool
+  end
+  class LinkMetadata
+    # Hyperlink metadata with categorization and attributes.
+    #
+    # Represents `<a>` elements with parsed href values, text content, and link type classification.
+    #
+    # # Examples
+    #
+    # ```
+    # let link = LinkMetadata {
+    #     href: "https://example.com".to_string(),
+    #     text: "Example".to_string(),
+    #     title: Some("Visit Example".to_string()),
+    #     link_type: LinkType::External,
+    #     rel: vec!["nofollow".to_string()],
+    #     attributes: Default::default(),
+    # };
+    #
+    # assert_eq!(link.link_type, LinkType::External);
+    # assert_eq!(link.text, "Example");
+    # ```
+    attr_reader href: String
+    attr_reader text: String
+    # Optional field (`?title:` in initialize, `Some(...)` in the example above), so the
+    # reader is nilable.
+    attr_reader title: String?
+    attr_reader link_type: LinkType
+    attr_reader rel: Array[String]
+    attr_reader attributes: Hash[String, String]
+    def initialize: (href: String, text: String, ?title: String, link_type: LinkType, rel: Array[String], attributes: Hash[String, String]) -> void
+    def self.classify_link: (String href) -> LinkType
+  end
+  class ImageMetadata
+    # Image metadata with source and dimensions.
+    #
+    # Captures `<img>` elements and inline `<svg>` elements with metadata
+    # for image analysis and optimization.
+    #
+    # # Examples
+    #
+    # ```
+    # let img = ImageMetadata {
+    #     src: "https://example.com/image.jpg".to_string(),
+    #     alt: Some("An example image".to_string()),
+    #     title: Some("Example".to_string()),
+    #     dimensions: Some((800, 600)),
+    #     image_type: ImageType::External,
+    #     attributes: Default::default(),
+    # };
+    #
+    # assert_eq!(img.image_type, ImageType::External);
+    # ```
+    attr_reader src: String
+    # alt, title and dimensions are optional fields (`?alt:`, `?title:`, `?dimensions:`
+    # in initialize, `Some(...)` in the example above), so their readers are nilable.
+    attr_reader alt: String?
+    attr_reader title: String?
+    # NOTE(review): the example shows a two-element tuple; presumably [width, height]. Confirm.
+    attr_reader dimensions: Array[Integer]?
+    attr_reader image_type: ImageType
+    attr_reader attributes: Hash[String, String]
+    def initialize: (src: String, ?alt: String, ?title: String, ?dimensions: Array[Integer], image_type: ImageType, attributes: Hash[String, String]) -> void
+  end
+  class StructuredData
+    # Structured data block (JSON-LD, Microdata, or RDFa).
+    #
+    # Represents machine-readable structured data found in the document.
+    # JSON-LD blocks are collected as raw JSON strings for flexibility.
+    #
+    # # Examples
+    #
+    # ```
+    # let schema = StructuredData {
+    #     data_type: StructuredDataType::JsonLd,
+    #     raw_json: r#"{"@context":"https://schema.org","@type":"Article"}"#.to_string(),
+    #     schema_type: Some("Article".to_string()),
+    # };
+    #
+    # assert_eq!(schema.data_type, StructuredDataType::JsonLd);
+    # ```
+    attr_reader data_type: StructuredDataType
+    attr_reader raw_json: String
+    # Optional field (`?schema_type:` in initialize, `Some(...)` in the example above),
+    # so the reader is nilable.
+    attr_reader schema_type: String?
+    def initialize: (data_type: StructuredDataType, raw_json: String, ?schema_type: String) -> void
+  end
+  class HtmlMetadata
+    # Comprehensive metadata extraction result from HTML document.
+    #
+    # Contains all extracted metadata types in a single structure,
+    # suitable for serialization and transmission across language boundaries.
+    #
+    # # Examples
+    #
+    # ```
+    # let metadata = HtmlMetadata {
+    #     document: Default::default(),
+    #     headers: Vec::new(),
+    #     links: Vec::new(),
+    #     images: Vec::new(),
+    #     structured_data: Vec::new(),
+    # };
+    #
+    # assert!(metadata.headers.is_empty());
+    # ```
+    attr_accessor document: DocumentMetadata?
+    attr_accessor headers: Array[HeaderMetadata]?
+    attr_accessor links: Array[LinkMetadata]?
+    attr_accessor images: Array[ImageMetadata]?
+    attr_accessor structured_data: Array[StructuredData]?
+    def initialize: (document: DocumentMetadata, headers: Array[HeaderMetadata], links: Array[LinkMetadata], images: Array[ImageMetadata], structured_data: Array[StructuredData]) -> void
+  end
+  class ConversionOptions
+    # Main conversion options for HTML to Markdown conversion.
+    #
+    # Use [`ConversionOptions::builder()`] to construct, or [`Default::default()`] for defaults.
+    #
+    # # Example
+    #
+    # ```text
+    # use html_to_markdown_rs::ConversionOptions;
+    #
+    # let options = ConversionOptions::builder()
+    #     .heading_style(HeadingStyle::Atx)
+    #     .wrap(true)
+    #     .wrap_width(100)
+    #     .build();
+    # ```
+    attr_accessor heading_style: HeadingStyle?
+    attr_accessor list_indent_type: ListIndentType?
+    attr_accessor list_indent_width: Integer?
+    attr_accessor bullets: String?
+    attr_accessor strong_em_symbol: String?
+    attr_accessor escape_asterisks: bool?
+    attr_accessor escape_underscores: bool?
+    attr_accessor escape_misc: bool?
+    attr_accessor escape_ascii: bool?
+    attr_accessor code_language: String?
+    attr_accessor autolinks: bool?
+    attr_accessor default_title: bool?
+    attr_accessor br_in_tables: bool?
+    attr_accessor highlight_style: HighlightStyle?
+    attr_accessor extract_metadata: bool?
+    attr_accessor whitespace_mode: WhitespaceMode?
+    attr_accessor strip_newlines: bool?
+    attr_accessor wrap: bool?
+    attr_accessor wrap_width: Integer?
+    attr_accessor convert_as_inline: bool?
+    attr_accessor sub_symbol: String?
+    attr_accessor sup_symbol: String?
+    attr_accessor newline_style: NewlineStyle?
+    attr_accessor code_block_style: CodeBlockStyle?
+    attr_accessor keep_inline_images_in: Array[String]?
+    attr_accessor preprocessing: PreprocessingOptions?
+    attr_accessor encoding: String?
+    attr_accessor debug: bool?
+    attr_accessor strip_tags: Array[String]?
+    attr_accessor preserve_tags: Array[String]?
+    attr_accessor skip_images: bool?
+    attr_accessor link_style: LinkStyle?
+    attr_accessor output_format: OutputFormat?
+    attr_accessor include_document_structure: bool?
+    attr_accessor extract_images: bool?
+    attr_accessor max_image_size: Integer?
+    attr_accessor capture_svg: bool?
+    attr_accessor infer_dimensions: bool?
+    attr_accessor max_depth: Integer?
+    attr_accessor exclude_selectors: Array[String]?
+    attr_accessor visitor: VisitorHandle?
+    def initialize: (heading_style: HeadingStyle, list_indent_type: ListIndentType, list_indent_width: Integer, bullets: String, strong_em_symbol: String, escape_asterisks: bool, escape_underscores: bool, escape_misc: bool, escape_ascii: bool, code_language: String, autolinks: bool, default_title: bool, br_in_tables: bool, highlight_style: HighlightStyle, extract_metadata: bool, whitespace_mode: WhitespaceMode, strip_newlines: bool, wrap: bool, wrap_width: Integer, convert_as_inline: bool, sub_symbol: String, sup_symbol: String, newline_style: NewlineStyle, code_block_style: CodeBlockStyle, keep_inline_images_in: Array[String], preprocessing: PreprocessingOptions, encoding: String, debug: bool, strip_tags: Array[String], preserve_tags: Array[String], skip_images: bool, link_style: LinkStyle, output_format: OutputFormat, include_document_structure: bool, extract_images: bool, max_image_size: Integer, capture_svg: bool, infer_dimensions: bool, ?max_depth: Integer, exclude_selectors: Array[String], ?visitor: VisitorHandle) -> void
+    def apply_update: (ConversionOptionsUpdate update) -> void
+    def self.default: () -> ConversionOptions
+    def self.builder: () -> ConversionOptionsBuilder
+    def self.from_update: (ConversionOptionsUpdate update) -> ConversionOptions
+    def self.from: (ConversionOptionsUpdate update) -> ConversionOptions
+  end
+  class ConversionOptionsBuilder
+    # Builder for [`ConversionOptions`].
+    #
+    # All fields start with default values. Call `.build()` to produce the final options.
+    def strip_tags: (Array[String] tags) -> ConversionOptionsBuilder
+    def preserve_tags: (Array[String] tags) -> ConversionOptionsBuilder
+    def keep_inline_images_in: (Array[String] tags) -> ConversionOptionsBuilder
+    def exclude_selectors: (Array[String] selectors) -> ConversionOptionsBuilder
+    def visitor: (?VisitorHandle visitor) -> ConversionOptionsBuilder
+    def preprocessing: (PreprocessingOptions preprocessing) -> ConversionOptionsBuilder
+    def build: () -> ConversionOptions
+  end
+  class ConversionOptionsUpdate
+    # Partial update for `ConversionOptions`.
+    #
+    # Uses `Option<T>` fields for selective updates. Bindings use this to construct
+    # options from language-native types. Prefer [`ConversionOptionsBuilder`] for Rust code.
+    attr_accessor heading_style: HeadingStyle?
+    attr_accessor list_indent_type: ListIndentType?
+    attr_accessor list_indent_width: Integer?
+    attr_accessor bullets: String?
+    attr_accessor strong_em_symbol: String?
+    attr_accessor escape_asterisks: bool?
+    attr_accessor escape_underscores: bool?
+    attr_accessor escape_misc: bool?
+    attr_accessor escape_ascii: bool?
+    attr_accessor code_language: String?
+    attr_accessor autolinks: bool?
+    attr_accessor default_title: bool?
+    attr_accessor br_in_tables: bool?
+    attr_accessor highlight_style: HighlightStyle?
+    attr_accessor extract_metadata: bool?
+    attr_accessor whitespace_mode: WhitespaceMode?
+    attr_accessor strip_newlines: bool?
+    attr_accessor wrap: bool?
+    attr_accessor wrap_width: Integer?
+    attr_accessor convert_as_inline: bool?
+    attr_accessor sub_symbol: String?
+    attr_accessor sup_symbol: String?
+    attr_accessor newline_style: NewlineStyle?
+    attr_accessor code_block_style: CodeBlockStyle?
+    attr_accessor keep_inline_images_in: Array[String]?
+    attr_accessor preprocessing: PreprocessingOptionsUpdate?
+    attr_accessor encoding: String?
+    attr_accessor debug: bool?
+    attr_accessor strip_tags: Array[String]?
+    attr_accessor preserve_tags: Array[String]?
+    attr_accessor skip_images: bool?
+    attr_accessor link_style: LinkStyle?
+    attr_accessor output_format: OutputFormat?
+    attr_accessor include_document_structure: bool?
+    attr_accessor extract_images: bool?
+    attr_accessor max_image_size: Integer?
+    attr_accessor capture_svg: bool?
+    attr_accessor infer_dimensions: bool?
+    attr_accessor max_depth: Integer?
+    attr_accessor exclude_selectors: Array[String]?
+    attr_accessor visitor: VisitorHandle?
+    def initialize: (?heading_style: HeadingStyle, ?list_indent_type: ListIndentType, ?list_indent_width: Integer, ?bullets: String, ?strong_em_symbol: String, ?escape_asterisks: bool, ?escape_underscores: bool, ?escape_misc: bool, ?escape_ascii: bool, ?code_language: String, ?autolinks: bool, ?default_title: bool, ?br_in_tables: bool, ?highlight_style: HighlightStyle, ?extract_metadata: bool, ?whitespace_mode: WhitespaceMode, ?strip_newlines: bool, ?wrap: bool, ?wrap_width: Integer, ?convert_as_inline: bool, ?sub_symbol: String, ?sup_symbol: String, ?newline_style: NewlineStyle, ?code_block_style: CodeBlockStyle, ?keep_inline_images_in: Array[String], ?preprocessing: PreprocessingOptionsUpdate, ?encoding: String, ?debug: bool, ?strip_tags: Array[String], ?preserve_tags: Array[String], ?skip_images: bool, ?link_style: LinkStyle, ?output_format: OutputFormat, ?include_document_structure: bool, ?extract_images: bool, ?max_image_size: Integer, ?capture_svg: bool, ?infer_dimensions: bool, ?max_depth: Integer?, ?exclude_selectors: Array[String], ?visitor: VisitorHandle) -> void
+  end
+  class PreprocessingOptions
+    # HTML preprocessing options for document cleanup before conversion.
+    attr_accessor enabled: bool?
+    attr_accessor preset: PreprocessingPreset?
+    attr_accessor remove_navigation: bool?
+    attr_accessor remove_forms: bool?
+    def initialize: (enabled: bool, preset: PreprocessingPreset, remove_navigation: bool, remove_forms: bool) -> void
+    def apply_update: (PreprocessingOptionsUpdate update) -> void
+    def self.default: () -> PreprocessingOptions
+    def self.from_update: (PreprocessingOptionsUpdate update) -> PreprocessingOptions
+    def self.from: (PreprocessingOptionsUpdate update) -> PreprocessingOptions
+  end
+  class PreprocessingOptionsUpdate
+    # Partial update for `PreprocessingOptions`.
+    #
+    # This struct uses `Option<T>` to represent optional fields that can be selectively updated.
+    # Only specified fields (Some values) will override existing options; None values leave the
+    # corresponding fields unchanged when applied via [`PreprocessingOptions::apply_update`].
+    attr_accessor enabled: bool?
+    attr_accessor preset: PreprocessingPreset?
+    attr_accessor remove_navigation: bool?
+    attr_accessor remove_forms: bool?
+    def initialize: (?enabled: bool, ?preset: PreprocessingPreset, ?remove_navigation: bool, ?remove_forms: bool) -> void
+  end
+  class DocumentStructure
+    # A structured document tree representing the semantic content of an HTML document.
+    #
+    # Uses a flat node array with index-based parent/child references for efficient traversal.
+    attr_reader nodes: Array[DocumentNode]
+    # Optional in initialize (`?source_format:`), so the reader is declared nilable.
+    # NOTE(review): assumes an omitted value is exposed as nil rather than a default string. Confirm.
+    attr_reader source_format: String?
+    def initialize: (nodes: Array[DocumentNode], ?source_format: String) -> void
+  end
+  class DocumentNode
+    # A single node in the document tree.
+    attr_reader id: String
+    attr_reader content: NodeContent
+    # parent and attributes are optional in initialize (`?parent:`, `?attributes:`), so
+    # their readers are declared nilable.
+    # NOTE(review): assumes omitted values are exposed as nil. Confirm against the extension.
+    attr_reader parent: Integer?
+    attr_reader children: Array[Integer]
+    attr_reader annotations: Array[TextAnnotation]
+    attr_reader attributes: Hash[String, String]?
+    def initialize: (id: String, content: NodeContent, ?parent: Integer, children: Array[Integer], annotations: Array[TextAnnotation], ?attributes: Hash[String, String]) -> void
+  end
+  class TextAnnotation
+    # An inline text annotation with byte-range offsets.
+    #
+    # Annotations describe formatting (bold, italic, etc.) and links within a node's text content.
+    attr_reader start: Integer
+    attr_reader end: Integer
+    attr_reader kind: AnnotationKind
+    def initialize: (start: Integer, end: Integer, kind: AnnotationKind) -> void
+  end
+  class ConversionResult
+    # The primary result of HTML conversion and extraction.
+    #
+    # Contains the converted text output, optional structured document tree,
+    # metadata, extracted tables, images, and processing warnings.
+    #
+    # # Example
+    #
+    # ```text
+    # use html_to_markdown_rs::{convert, ConversionOptions};
+    #
+    # let result = convert("<h1>Hello</h1><p>World</p>", None)?;
+    # assert!(result.content.is_some());
+    # assert!(result.warnings.is_empty());
+    # ```
+    attr_accessor content: String?
+    attr_accessor document: DocumentStructure?
+    attr_accessor metadata: HtmlMetadata?
+    attr_accessor tables: Array[TableData]?
+    attr_accessor images: Array[String]?
+    attr_accessor warnings: Array[ProcessingWarning]?
+    def initialize: (?content: String, ?document: DocumentStructure, metadata: HtmlMetadata, tables: Array[TableData], images: Array[String], warnings: Array[ProcessingWarning]) -> void
+  end
+  class TableGrid
+    # A structured table grid with cell-level data including spans.
+    attr_accessor rows: Integer?
+    attr_accessor cols: Integer?
+    attr_accessor cells: Array[GridCell]?
+    def initialize: (rows: Integer, cols: Integer, cells: Array[GridCell]) -> void
+  end
+  class GridCell
+    # A single cell in a table grid.
+    attr_reader content: String
+    attr_reader row: Integer
+    attr_reader col: Integer
+    attr_reader row_span: Integer
+    attr_reader col_span: Integer
+    attr_reader is_header: bool
+    def initialize: (content: String, row: Integer, col: Integer, row_span: Integer, col_span: Integer, is_header: bool) -> void
+  end
+  class TableData
+    # A top-level extracted table with both structured data and markdown representation.
+    attr_reader grid: TableGrid
+    attr_reader markdown: String
+    def initialize: (grid: TableGrid, markdown: String) -> void
+  end
+  class ProcessingWarning
+    # A non-fatal warning generated during HTML processing.
+    attr_reader message: String
+    attr_reader kind: WarningKind
+    def initialize: (message: String, kind: WarningKind) -> void
+  end
+  class VisitorHandle
+    # Type alias for a visitor handle (Rc-wrapped `RefCell` for interior mutability).
+    #
+    # This allows visitors to be passed around and shared while still being mutable.
+  end
+  class NodeContext
+    # Context information passed to all visitor methods.
+    #
+    # Provides comprehensive metadata about the current node being visited,
+    # including its type, attributes, position in the DOM tree, and parent context.
+    attr_reader node_type: NodeType
+    attr_reader tag_name: String
+    attr_reader attributes: Hash[String, String]
+    attr_reader depth: Integer
+    attr_reader index_in_parent: Integer
+    # Optional in initialize (`?parent_tag:`), so the reader is declared nilable.
+    # NOTE(review): presumably nil for a node without a parent element. Confirm.
+    attr_reader parent_tag: String?
+    attr_reader is_inline: bool
+    def initialize: (node_type: NodeType, tag_name: String, attributes: Hash[String, String], depth: Integer, index_in_parent: Integer, ?parent_tag: String, is_inline: bool) -> void
+  end
+  class TextDirection
+    # Text directionality of document content.
+    #
+    # Corresponds to the HTML `dir` attribute and `bdi` element directionality.
+    type instance = :left_to_right | :right_to_left | :auto
+  end
+  class LinkType
+    # Link classification based on href value and document context.
+    #
+    # Used to categorize links during extraction for filtering and analysis.
+    type instance = :anchor | :internal | :external | :email | :phone | :other
+  end
+  class ImageType
+    # Image source classification for proper handling and processing.
+    #
+    # Determines whether an image is embedded (data URI), inline SVG, external, or relative.
+    type instance = :data_uri | :inline_svg | :external | :relative
+  end
+  class StructuredDataType
+    # Structured data format type.
+    #
+    # Identifies the schema/format used for structured data markup.
+    type instance = :json_ld | :microdata | :r_d_fa
+  end
+  class PreprocessingPreset
+    # HTML preprocessing aggressiveness level.
+    #
+    # Controls the extent of cleanup performed before conversion. Higher levels remove more elements.
+    type instance = :minimal | :standard | :aggressive
+  end
+  class HeadingStyle
+    # Heading style options for Markdown output.
+    #
+    # Controls how headings (h1-h6) are rendered in the output Markdown.
+    type instance = :underlined | :atx | :atx_closed
+  end
+  class ListIndentType
+    # List indentation character type.
+    #
+    # Controls whether list items are indented with spaces or tabs.
+    type instance = :spaces | :tabs
+  end
+  class WhitespaceMode
+    # Whitespace handling strategy during conversion.
+    #
+    # Determines how sequences of whitespace characters (spaces, tabs, newlines) are processed.
+    type instance = :normalized | :strict
+  end
+  class NewlineStyle
+    # Line break syntax in Markdown output.
+    #
+    # Controls how soft line breaks (from `<br>` or line breaks in source) are rendered.
+    type instance = :spaces | :backslash
+  end
+  class CodeBlockStyle
+    # Code block fence style in Markdown output.
+    #
+    # Determines how code blocks (`<pre><code>`) are rendered in Markdown.
+    type instance = :indented | :backticks | :tildes
+  end
+  class HighlightStyle
+    # Highlight rendering style for `<mark>` elements.
+    #
+    # Controls how highlighted text is rendered in Markdown output.
+    type instance = :double_equal | :html | :bold | :none
+  end
+  class LinkStyle
+    # Link rendering style in Markdown output.
+    #
+    # Controls whether links and images use inline `[text](url)` syntax or
+    # reference-style `[text][1]` syntax with definitions collected at the end.
+    type instance = :inline | :reference
+  end
+  class OutputFormat
+    # Output format for conversion.
+    #
+    # Specifies the target markup language format for the conversion output.
+    type instance = :markdown | :djot | :plain
+  end
+  class NodeContent
+    # The semantic content type of a document node.
+    #
+    # Uses internally tagged representation (`"node_type": "heading"`) for JSON serialization.
+  end
+  class AnnotationKind
+    # The type of an inline text annotation.
+    #
+    # Uses internally tagged representation (`"annotation_type": "bold"`) for JSON serialization.
+  end
+  class WarningKind
+    # Categories of processing warnings.
+    type instance = :image_extraction_failed | :encoding_fallback | :truncated_input | :malformed_html | :sanitization_applied | :depth_limit_exceeded
+  end
+  class NodeType
+    # Node type enumeration covering all HTML element types.
+    #
+    # This enum categorizes all HTML elements that the converter recognizes,
+    # providing a coarse-grained classification for visitor dispatch.
+    type instance = :text | :element | :heading | :paragraph | :div | :blockquote | :pre | :hr | :list | :list_item | :definition_list | :definition_term | :definition_description | :table | :table_row | :table_cell | :table_header | :table_body | :table_head | :table_foot | :link | :image | :strong | :em | :code | :strikethrough | :underline | :subscript | :superscript | :mark | :small | :br | :span | :article | :section | :nav | :aside | :header | :footer | :main | :figure | :figcaption | :time | :details | :summary | :form | :input | :select | :option | :button | :textarea | :label | :fieldset | :legend | :audio | :video | :picture | :source | :iframe | :svg | :canvas | :ruby | :rt | :rp | :abbr | :kbd | :samp | :var | :cite | :q | :del | :ins | :data | :meter | :progress | :output | :template | :slot | :html | :head | :body | :title | :meta | :link_tag | :style | :script | :base | :custom
+  end
+  class VisitResult
+    # Result of a visitor callback.
+    #
+    # Allows visitors to control the conversion flow by either proceeding
+    # with default behavior, providing custom output, skipping elements,
+    # preserving HTML, or signaling errors.
+  end
+  def self.convert: (String html, ?ConversionOptions options) -> ConversionResult
+end

metadata ADDED Viewed

@@ -0,0 +1,54 @@
+--- !ruby/object:Gem::Specification
+name: html-to-markdown
+version: !ruby/object:Gem::Version
+  version: 3.4.0
+platform: aarch64-linux
+authors:
+- Kreuzberg Team
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2026-05-09 00:00:00.000000000 Z
+dependencies: []
+description: High-performance HTML to Markdown converter
+email:
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- Steepfile
+- lib/bin/html-to-markdown
+- lib/html_to_markdown.rb
+- lib/html_to_markdown/native.rb
+- lib/html_to_markdown/version.rb
+- lib/html_to_markdown_rb.so
+- sig/html_to_markdown/cli.rbs
+- sig/html_to_markdown/cli_proxy.rbs
+- sig/open3.rbs
+- sig/types.rbs
+homepage: https://github.com/kreuzberg-dev/html-to-markdown
+licenses:
+- MIT
+metadata:
+  keywords: html,markdown,converter
+  rubygems_mfa_required: 'true'
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: 3.2.0
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubygems_version: 3.5.22
+signing_key:
+specification_version: 4
+summary: High-performance HTML to Markdown converter
+test_files: []