RubyGems - html-to-markdown - Versions diffs - 2.30.0 → 3.0.0 - Mend

html-to-markdown 2.30.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (166) hide show

data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs DELETED Viewed

@@ -1,313 +0,0 @@
-//! Spatial layout analysis and table reconstruction for hOCR conversion
-use crate::hocr::spatial::{self, HocrWord};
-use crate::hocr::types::{HocrElement, HocrElementType};
-pub fn is_bullet_paragraph(element: &HocrElement, text: &str) -> bool {
-    if element.element_type != HocrElementType::OcrPar {
-        return false;
-    }
-    let trimmed = text.trim_start();
-    if trimmed.is_empty() {
-        return false;
-    }
-    if matches!(trimmed.chars().next(), Some('•' | '●' | '-' | '+' | '*')) {
-        return true;
-    }
-    let mut chars = trimmed.chars().peekable();
-    let mut digit_count = 0;
-    while let Some(&ch) = chars.peek() {
-        if ch.is_ascii_digit() {
-            digit_count += 1;
-            chars.next();
-        } else {
-            break;
-        }
-    }
-    if digit_count > 0 {
-        if let Some(&ch) = chars.peek() {
-            if (ch == '.' || ch == ')') && chars.clone().nth(1).is_some_and(char::is_whitespace) {
-                return true;
-            }
-        }
-    }
-    false
-}
-/// Try to detect and reconstruct a table from an element's word children
-///
-/// Returns Some(markdown) if table structure detected, None otherwise
-pub fn try_spatial_table_reconstruction(element: &HocrElement) -> Option<String> {
-    let mut words = Vec::new();
-    collect_words(element, &mut words);
-    if words.len() < 6 {
-        return None;
-    }
-    let table = spatial::reconstruct_table(&words, 50, 0.5);
-    if table.is_empty() || table[0].is_empty() {
-        return None;
-    }
-    if let Some(cleaned_table) = post_process_table(table) {
-        let markdown = spatial::table_to_markdown(&cleaned_table);
-        if !markdown.is_empty() {
-            return Some(markdown);
-        }
-    }
-    None
-}
-/// Collect all word elements recursively from an element tree
-fn collect_words(element: &HocrElement, words: &mut Vec<HocrWord>) {
-    if element.element_type == HocrElementType::OcrxWord {
-        if let Some(bbox) = element.properties.bbox {
-            let confidence = element.properties.x_wconf.unwrap_or(0.0);
-            words.push(HocrWord {
-                text: element.text.clone(),
-                left: bbox.x1,
-                top: bbox.y1,
-                width: bbox.width(),
-                height: bbox.height(),
-                confidence,
-            });
-        }
-    }
-    for child in &element.children {
-        collect_words(child, words);
-    }
-}
-fn post_process_table(mut table: Vec<Vec<String>>) -> Option<Vec<Vec<String>>> {
-    table.retain(|row| row.iter().any(|cell| !cell.trim().is_empty()));
-    if table.is_empty() {
-        return None;
-    }
-    let mut non_empty = 0;
-    let mut long_cells = 0;
-    for row in &table {
-        for cell in row {
-            let trimmed = cell.trim();
-            if trimmed.is_empty() {
-                continue;
-            }
-            non_empty += 1;
-            if trimmed.chars().count() > 60 {
-                long_cells += 1;
-            }
-        }
-    }
-    if non_empty > 0 && long_cells * 3 > non_empty * 2 {
-        return None;
-    }
-    let data_start = table
-        .iter()
-        .enumerate()
-        .find_map(|(idx, row)| {
-            let digit_cells = row
-                .iter()
-                .filter(|cell| cell.chars().any(|c| c.is_ascii_digit()))
-                .count();
-            if digit_cells >= 3 { Some(idx) } else { None }
-        })
-        .unwrap_or(0);
-    let mut header_rows = if data_start > 0 {
-        table[..data_start].to_vec()
-    } else {
-        Vec::new()
-    };
-    let mut data_rows = table[data_start..].to_vec();
-    if header_rows.len() > 2 {
-        header_rows = header_rows[header_rows.len() - 2..].to_vec();
-    }
-    if header_rows.is_empty() {
-        if data_rows.len() < 2 {
-            return None;
-        }
-        header_rows.push(data_rows[0].clone());
-        data_rows = data_rows[1..].to_vec();
-    }
-    let column_count = header_rows
-        .first()
-        .or_else(|| data_rows.first())
-        .map_or(0, std::vec::Vec::len);
-    if column_count == 0 {
-        return None;
-    }
-    let mut header = vec![String::new(); column_count];
-    for row in &header_rows {
-        for (idx, cell) in row.iter().enumerate() {
-            let trimmed = cell.trim();
-            if trimmed.is_empty() {
-                continue;
-            }
-            if !header[idx].is_empty() {
-                header[idx].push(' ');
-            }
-            header[idx].push_str(trimmed);
-        }
-    }
-    let mut processed = Vec::new();
-    processed.push(header);
-    processed.extend(data_rows);
-    if processed.len() <= 1 {
-        return None;
-    }
-    let mut col = 0;
-    while col < processed[0].len() {
-        let header_text = processed[0][col].trim().to_string();
-        let data_empty = processed[1..]
-            .iter()
-            .all(|row| row.get(col).is_none_or(|cell| cell.trim().is_empty()));
-        if data_empty {
-            merge_header_only_column(&mut processed, col, header_text);
-        } else {
-            col += 1;
-        }
-        if processed.is_empty() || processed[0].is_empty() {
-            return None;
-        }
-    }
-    if processed[0].len() < 2 || processed.len() <= 1 {
-        return None;
-    }
-    for cell in &mut processed[0] {
-        normalize_header_cell(cell);
-    }
-    for row in processed.iter_mut().skip(1) {
-        for cell in row.iter_mut() {
-            normalize_data_cell(cell);
-        }
-    }
-    Some(processed)
-}
-#[allow(clippy::trivially_copy_pass_by_ref)]
-fn merge_header_only_column(table: &mut [Vec<String>], col: usize, header_text: String) {
-    if table.is_empty() || table[0].is_empty() {
-        return;
-    }
-    let trimmed = header_text.trim();
-    if trimmed.is_empty() && table.len() > 1 {
-        for row in table.iter_mut() {
-            row.remove(col);
-        }
-        return;
-    }
-    if !trimmed.is_empty() {
-        if col > 0 {
-            let mut target = col - 1;
-            while target > 0 && table[0][target].trim().is_empty() {
-                target -= 1;
-            }
-            if !table[0][target].trim().is_empty() || target == 0 {
-                if !table[0][target].is_empty() {
-                    table[0][target].push(' ');
-                }
-                table[0][target].push_str(trimmed);
-                for row in table.iter_mut() {
-                    row.remove(col);
-                }
-                return;
-            }
-        }
-        if col + 1 < table[0].len() {
-            if table[0][col + 1].trim().is_empty() {
-                table[0][col + 1] = trimmed.to_string();
-            } else {
-                let mut updated = trimmed.to_string();
-                updated.push(' ');
-                updated.push_str(table[0][col + 1].trim());
-                table[0][col + 1] = updated;
-            }
-            for row in table.iter_mut() {
-                row.remove(col);
-            }
-            return;
-        }
-    }
-    for row in table.iter_mut() {
-        row.remove(col);
-    }
-}
-fn normalize_header_cell(cell: &mut String) {
-    let mut text = cell.trim().replace("  ", " ");
-    if text.contains("(Q)") {
-        text = text.replace("(Q)", "(Ω)");
-    }
-    if text.contains("icorr") && text.contains("(A/cm)") && !text.contains("^2") {
-        text = text.replace("(A/cm)", "(A/cm^2)");
-    }
-    if text.eq_ignore_ascii_case("be (V/dec)") {
-        text = "bc (V/dec)".to_string();
-    }
-    if text.starts_with("Polarization resistance") {
-        if text.contains("(Ω)") {
-            text = text.replace("(Ω) rate", "(Ω)");
-        } else {
-            text.push_str(" (Ω)");
-        }
-    }
-    if text.starts_with("Corrosion") && text.contains("mm/year") {
-        text = "Corrosion rate (mm/year)".to_string();
-    }
-    *cell = text;
-}
-fn normalize_data_cell(cell: &mut String) {
-    let mut text = cell.trim().to_string();
-    if text.is_empty() {
-        cell.clear();
-        return;
-    }
-    for ch in ['\u{2014}', '\u{2013}', '\u{2212}'] {
-        text = text.replace(ch, "-");
-    }
-    if text.starts_with("- ") {
-        text = format!("-{}", text[2..].trim_start());
-    }
-    text = text.replace("- ", "-");
-    text = text.replace(" -", "-");
-    text = text.replace("E-", "e-").replace("E+", "e+");
-    if text == "-" {
-        text.clear();
-    }
-    *cell = text;
-}

data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs DELETED Viewed

@@ -1,26 +0,0 @@
-//! hOCR to Markdown conversion module
-//!
-//! Converts structured hOCR elements to Markdown while preserving document hierarchy.
-//!
-//! This module is organized into several submodules:
-//! - `core`: Main conversion functions and entry points
-//! - `elements`: Element-specific conversion logic
-//! - `hierarchy`: Document hierarchy and code block detection
-//! - `layout`: Spatial layout analysis and table reconstruction
-//! - `output`: Output formatting utilities
-#![allow(clippy::branches_sharing_code, clippy::option_if_let_else)]
-mod code_analysis;
-mod core;
-mod elements;
-mod hierarchy;
-mod keywords;
-mod layout;
-mod output;
-// Re-export public API
-pub use core::{convert_to_markdown, convert_to_markdown_with_options};
-// Re-export commonly used types from spatial module for downstream use
-pub use super::spatial::HocrWord;

data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs DELETED Viewed

@@ -1,78 +0,0 @@
-//! Output formatting utilities for hOCR to Markdown conversion
-use crate::hocr::types::{HocrElement, HocrElementType};
-#[derive(Default)]
-pub struct ConvertContext {
-    pub last_heading: Option<String>,
-}
-pub fn ensure_trailing_blank_line(output: &mut String) {
-    while output.ends_with("\n\n\n") {
-        output.pop();
-    }
-    if output.ends_with("\n\n") {
-        return;
-    }
-    if output.ends_with('\n') {
-        output.push('\n');
-    } else {
-        output.push_str("\n\n");
-    }
-}
-pub fn collapse_extra_newlines(output: &mut String) {
-    let mut collapsed = String::with_capacity(output.len());
-    let mut newline_count = 0;
-    for ch in output.chars() {
-        if ch == '\n' {
-            newline_count += 1;
-            if newline_count <= 2 {
-                collapsed.push('\n');
-            }
-        } else {
-            newline_count = 0;
-            collapsed.push(ch);
-        }
-    }
-    if collapsed.len() != output.len() {
-        *output = collapsed;
-    }
-}
-pub fn element_text_content(element: &HocrElement) -> String {
-    let mut output = String::new();
-    collect_text_tokens(element, &mut output);
-    output
-}
-fn collect_text_tokens(element: &HocrElement, output: &mut String) {
-    if element.element_type == HocrElementType::OcrxWord {
-        let trimmed = element.text.trim();
-        if !trimmed.is_empty() {
-            if !output.is_empty() {
-                output.push(' ');
-            }
-            output.push_str(trimmed);
-        }
-    }
-    for child in &element.children {
-        collect_text_tokens(child, output);
-    }
-}
-pub fn collect_line_words(element: &HocrElement, words: &mut Vec<String>) {
-    if element.element_type == HocrElementType::OcrxWord {
-        let trimmed = element.text.trim();
-        if !trimmed.is_empty() {
-            words.push(trimmed.to_string());
-        }
-    }
-    for child in &element.children {
-        collect_line_words(child, words);
-    }
-}

data/vendor/html-to-markdown-rs/src/hocr/extractor.rs DELETED Viewed

@@ -1,232 +0,0 @@
-#![allow(clippy::option_if_let_else)]
-//! hOCR element extraction
-//!
-//! Extracts structured hOCR elements from HTML DOM.
-use super::parser::parse_properties;
-use super::types::{HocrElement, HocrElementType, HocrMetadata};
-/// Extract complete hOCR document structure from HTML DOM
-///
-/// Parses an HTML document containing hOCR annotations and extracts all hOCR elements
-/// along with document metadata.
-///
-/// # Arguments
-///
-/// * `dom` - The parsed HTML DOM (from the astral-tl parser)
-/// * `debug` - Enable debug logging for property parsing
-///
-/// # Returns
-///
-/// A tuple containing:
-/// * `Vec<HocrElement>` - All top-level hOCR elements with their full hierarchies
-/// * `HocrMetadata` - Document metadata from `<head>` meta tags
-///
-/// # hOCR 1.2 Compliance
-///
-/// Supports all 40 element types:
-/// - Logical structure (12): `ocr_title`, `ocr_chapter`, `ocr_section`, `ocr_par`, etc.
-/// - Typesetting (6): `ocr_page`, `ocr_carea`, `ocr_line`, etc.
-/// - Float elements (13): `ocr_image`, `ocr_table`, `ocr_math`, etc.
-/// - Inline elements (6): `ocr_dropcap`, `ocr_glyph`, etc.
-/// - Engine-specific (3): `ocrx_block`, `ocrx_line`, `ocrx_word`
-///
-/// Extracts all 20+ properties from title attributes (bbox, `x_wconf`, baseline, order, etc.)
-/// and all 5 metadata fields (ocr-system, ocr-capabilities, ocr-langs, etc.)
-///
-/// # Example
-///
-/// ```rust
-/// use html_to_markdown_rs::hocr::extract_hocr_document;
-///
-/// let html = r#"<div class="ocr_page" title="bbox 0 0 1000 1500">
-///     <p class="ocr_par" title="bbox 100 100 900 200">
-///         <span class="ocrx_word" title="bbox 100 100 150 130; x_wconf 95">Hello</span>
-///     </p>
-/// </div>"#;
-/// let dom = tl::parse(html, tl::ParserOptions::default()).unwrap();
-/// let (elements, metadata) = extract_hocr_document(&dom);
-/// ```
-#[must_use]
-pub fn extract_hocr_document(dom: &tl::VDom) -> (Vec<HocrElement>, HocrMetadata) {
-    let parser = dom.parser();
-    let mut elements = Vec::new();
-    let metadata = extract_metadata(dom);
-    for child_handle in dom.children() {
-        collect_hocr_elements(child_handle, parser, &mut elements);
-    }
-    (elements, metadata)
-}
-/// Recursively collect hOCR elements from DOM tree
-#[allow(clippy::trivially_copy_pass_by_ref)]
-fn collect_hocr_elements(node_handle: &tl::NodeHandle, parser: &tl::Parser, elements: &mut Vec<HocrElement>) {
-    if let Some(element) = extract_element(node_handle, parser) {
-        elements.push(element);
-    } else if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
-        let children = tag.children();
-        for child_handle in children.top().iter() {
-            collect_hocr_elements(child_handle, parser, elements);
-        }
-    }
-}
-/// Extract hOCR metadata from HTML head (or from orphaned meta tags after sanitization)
-pub(crate) fn extract_metadata(dom: &tl::VDom) -> HocrMetadata {
-    let mut metadata = HocrMetadata::default();
-    let parser = dom.parser();
-    fn extract_from_meta_tag(meta_tag: &tl::HTMLTag, metadata: &mut HocrMetadata) {
-        let attrs = meta_tag.attributes();
-        if let (Some(name), Some(content)) = (attrs.get("name").flatten(), attrs.get("content").flatten()) {
-            let name_str = name.as_utf8_str();
-            let content_str = content.as_utf8_str().to_string();
-            match name_str.as_ref() {
-                "ocr-system" => metadata.ocr_system = Some(content_str),
-                "ocr-capabilities" => {
-                    metadata.ocr_capabilities = content_str
-                        .split_whitespace()
-                        .map(std::string::ToString::to_string)
-                        .collect();
-                }
-                "ocr-number-of-pages" => {
-                    metadata.ocr_number_of_pages = content_str.parse().ok();
-                }
-                "ocr-langs" => {
-                    metadata.ocr_langs = content_str
-                        .split_whitespace()
-                        .map(std::string::ToString::to_string)
-                        .collect();
-                }
-                "ocr-scripts" => {
-                    metadata.ocr_scripts = content_str
-                        .split_whitespace()
-                        .map(std::string::ToString::to_string)
-                        .collect();
-                }
-                _ => {}
-            }
-        }
-    }
-    #[allow(clippy::trivially_copy_pass_by_ref)]
-    fn find_meta_tags<'a>(node_handle: &tl::NodeHandle, parser: &'a tl::Parser<'a>, metadata: &mut HocrMetadata) {
-        if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
-            let tag_name = tag.name().as_utf8_str();
-            if tag_name == "meta" {
-                extract_from_meta_tag(tag, metadata);
-            }
-            let children = tag.children();
-            for child_handle in children.top().iter() {
-                find_meta_tags(child_handle, parser, metadata);
-            }
-        }
-    }
-    for child_handle in dom.children() {
-        find_meta_tags(child_handle, parser, &mut metadata);
-    }
-    metadata
-}
-/// Extract a single hOCR element and its children
-#[allow(clippy::trivially_copy_pass_by_ref)]
-fn extract_element(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> Option<HocrElement> {
-    if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
-        let attrs = tag.attributes();
-        let class_attr = attrs.get("class").flatten()?;
-        let classes = class_attr.as_utf8_str();
-        if !classes.as_ref().contains("ocr") {
-            return None;
-        }
-        let element_type = classes.split_whitespace().find_map(HocrElementType::from_class)?;
-        let properties = if let Some(title) = attrs.get("title").flatten() {
-            parse_properties(&title.as_utf8_str())
-        } else {
-            Default::default()
-        };
-        let mut text = String::new();
-        let mut children = Vec::new();
-        let tag_children = tag.children();
-        for child_handle in tag_children.top().iter() {
-            if let Some(tl::Node::Raw(bytes)) = child_handle.get(parser) {
-                text.push_str(&bytes.as_utf8_str());
-            } else if let Some(child_element) = extract_element(child_handle, parser) {
-                children.push(child_element);
-            }
-        }
-        Some(HocrElement {
-            element_type,
-            properties,
-            text: text.trim().to_string(),
-            children,
-        })
-    } else {
-        None
-    }
-}
-#[cfg(test)]
-mod tests {
-    use super::*;
-    #[test]
-    fn test_extract_simple_word() {
-        let html = r#"<span class="ocrx_word" title="bbox 100 50 150 80; x_wconf 95">Hello</span>"#;
-        let dom = tl::parse(html, tl::ParserOptions::default()).unwrap();
-        let parser = dom.parser();
-        let element = extract_element(&dom.children()[0], parser).unwrap();
-        assert!(matches!(element.element_type, HocrElementType::OcrxWord));
-        assert_eq!(element.text, "Hello");
-        assert!(element.properties.bbox.is_some());
-        assert_eq!(element.properties.x_wconf, Some(95.0));
-    }
-    #[test]
-    fn test_extract_paragraph() {
-        let html = r#"<p class="ocr_par" title="bbox 0 0 200 100">
-            <span class="ocrx_word" title="bbox 10 10 50 30; x_wconf 90">First</span>
-            <span class="ocrx_word" title="bbox 60 10 100 30; x_wconf 92">Word</span>
-        </p>"#;
-        let dom = tl::parse(html, tl::ParserOptions::default()).unwrap();
-        let parser = dom.parser();
-        let element = extract_element(&dom.children()[0], parser).unwrap();
-        assert!(matches!(element.element_type, HocrElementType::OcrPar));
-        assert_eq!(element.children.len(), 2);
-        assert!(matches!(element.children[0].element_type, HocrElementType::OcrxWord));
-    }
-    #[test]
-    fn test_extract_metadata() {
-        let html = r#"<!DOCTYPE html>
-<html>
-<head>
-<meta name="ocr-system" content="tesseract 4.1.1" />
-<meta name="ocr-capabilities" content="ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
-<meta name="ocr-number-of-pages" content="5" />
-</head>
-<body>
-<div class="ocr_page"></div>
-</body>
-</html>"#;
-        let dom = tl::parse(html, tl::ParserOptions::default()).unwrap();
-        let (_, metadata) = extract_hocr_document(&dom);
-        assert_eq!(metadata.ocr_system, Some("tesseract 4.1.1".to_string()));
-        assert!(metadata.ocr_capabilities.contains(&"ocr_page".to_string()));
-        assert_eq!(metadata.ocr_number_of_pages, Some(5));
-    }
-}

data/vendor/html-to-markdown-rs/src/hocr/mod.rs DELETED Viewed

@@ -1,42 +0,0 @@
-#![allow(clippy::cast_precision_loss, clippy::cast_sign_loss, clippy::unused_self)]
-//! hOCR 1.2 document processing.
-//!
-//! **Deprecated since 2.30.0**: hOCR support will be removed in v3.
-//!
-//! Complete hOCR 1.2 specification support for extracting structured content from OCR documents.
-//!
-//! ## Features
-//!
-//! - **Full Element Support**: All 40+ hOCR 1.2 element types
-//! - **Complete Property Parsing**: All 20+ hOCR properties (bbox, baseline, fonts, etc.)
-//! - **Document Structure**: Logical hierarchy (paragraphs, sections, chapters)
-//! - **Spatial Table Reconstruction**: Automatic table detection from bbox coordinates
-//! - **Metadata Extraction**: OCR system info, capabilities, languages
-//!
-//! ## Modules
-//!
-//! - [`types`]: Core hOCR element and property types
-//! - [`parser`]: Property parsing from title attributes
-//! - [`extractor`]: DOM to hOCR element tree extraction
-//! - [`converter`]: hOCR to Markdown conversion
-//! - [`spatial`]: Spatial table reconstruction from bounding boxes
-#[allow(deprecated)]
-pub mod converter;
-#[allow(deprecated)]
-pub mod extractor;
-#[allow(deprecated)]
-pub mod parser;
-#[allow(deprecated)]
-pub mod spatial;
-#[allow(deprecated)]
-pub mod types;
-#[deprecated(since = "2.30.0", note = "hOCR support will be removed in v3.")]
-pub use converter::{convert_to_markdown, convert_to_markdown_with_options};
-#[deprecated(since = "2.30.0", note = "hOCR support will be removed in v3.")]
-pub use extractor::extract_hocr_document;
-#[deprecated(since = "2.30.0", note = "hOCR support will be removed in v3.")]
-pub use spatial::{HocrWord, extract_hocr_words, reconstruct_table, table_to_markdown};
-#[deprecated(since = "2.30.0", note = "hOCR support will be removed in v3.")]
-pub use types::{BBox, Baseline, HocrElement, HocrElementType, HocrMetadata, HocrProperties};