parsekit 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/ext/parsekit/src/format_detector.rs +233 -0
 - data/ext/parsekit/src/lib.rs +1 -0
 - data/ext/parsekit/src/parser.rs +111 -181
 - data/lib/parsekit/NATIVE_API.md +125 -0
 - data/lib/parsekit/parsekit.bundle +0 -0
 - data/lib/parsekit/parser.rb +155 -104
 - data/lib/parsekit/version.rb +1 -1
 - data/lib/parsekit.rb +32 -0
 - metadata +3 -1
 
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
2 
     | 
    
         
             
            SHA256:
         
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: e77e605d938d5b0b89c7814d1360f4c505415c54efbf8ffe9f2f7d4c564d917e
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: 6b86f57b2dce1231cae704b4d35c7562807ab77b001860b6fa5bbcdc9844781f
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: a3f7089e8bd3e84cb2e14614cb78c3b3132d4d93a3c95d5cdcfa6c63723fe2dfce3a01bf0ee27255be7ff036bd0e438492434ded72853772e57b65faf7bded9b
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: c84b03d65471f50d6ec72eaa21269b5fc1c5e40e0cefa923cc71d50b802734d64c8296e5e5e6a76ca2f5d388a568119ed09474ae622534aaef03e0a96109dee3
         
     | 
| 
         @@ -0,0 +1,233 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            use std::path::Path;
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            /// Represents a detected file format
         
     | 
| 
      
 4 
     | 
    
         
            +
            #[derive(Debug, Clone, PartialEq)]
         
     | 
| 
      
 5 
     | 
    
         
            +
            pub enum FileFormat {
         
     | 
| 
      
 6 
     | 
    
         
            +
                Pdf,
         
     | 
| 
      
 7 
     | 
    
         
            +
                Docx,
         
     | 
| 
      
 8 
     | 
    
         
            +
                Xlsx,
         
     | 
| 
      
 9 
     | 
    
         
            +
                Xls,
         
     | 
| 
      
 10 
     | 
    
         
            +
                Pptx,
         
     | 
| 
      
 11 
     | 
    
         
            +
                Png,
         
     | 
| 
      
 12 
     | 
    
         
            +
                Jpeg,
         
     | 
| 
      
 13 
     | 
    
         
            +
                Tiff,
         
     | 
| 
      
 14 
     | 
    
         
            +
                Bmp,
         
     | 
| 
      
 15 
     | 
    
         
            +
                Json,
         
     | 
| 
      
 16 
     | 
    
         
            +
                Xml,
         
     | 
| 
      
 17 
     | 
    
         
            +
                Html,
         
     | 
| 
      
 18 
     | 
    
         
            +
                Text,
         
     | 
| 
      
 19 
     | 
    
         
            +
                Unknown,
         
     | 
| 
      
 20 
     | 
    
         
            +
            }
         
     | 
| 
      
 21 
     | 
    
         
            +
             
     | 
| 
      
 22 
     | 
    
         
            +
            impl FileFormat {
         
     | 
| 
      
 23 
     | 
    
         
            +
                /// Convert to Ruby symbol representation
         
     | 
| 
      
 24 
     | 
    
         
            +
                pub fn to_symbol(&self) -> &'static str {
         
     | 
| 
      
 25 
     | 
    
         
            +
                    match self {
         
     | 
| 
      
 26 
     | 
    
         
            +
                        FileFormat::Pdf => "pdf",
         
     | 
| 
      
 27 
     | 
    
         
            +
                        FileFormat::Docx => "docx",
         
     | 
| 
      
 28 
     | 
    
         
            +
                        FileFormat::Xlsx => "xlsx",
         
     | 
| 
      
 29 
     | 
    
         
            +
                        FileFormat::Xls => "xls",
         
     | 
| 
      
 30 
     | 
    
         
            +
                        FileFormat::Pptx => "pptx",
         
     | 
| 
      
 31 
     | 
    
         
            +
                        FileFormat::Png => "png",
         
     | 
| 
      
 32 
     | 
    
         
            +
                        FileFormat::Jpeg => "jpeg",
         
     | 
| 
      
 33 
     | 
    
         
            +
                        FileFormat::Tiff => "tiff",
         
     | 
| 
      
 34 
     | 
    
         
            +
                        FileFormat::Bmp => "bmp",
         
     | 
| 
      
 35 
     | 
    
         
            +
                        FileFormat::Json => "json",
         
     | 
| 
      
 36 
     | 
    
         
            +
                        FileFormat::Xml => "xml",
         
     | 
| 
      
 37 
     | 
    
         
            +
                        FileFormat::Html => "xml", // HTML is treated as XML in Ruby
         
     | 
| 
      
 38 
     | 
    
         
            +
                        FileFormat::Text => "text",
         
     | 
| 
      
 39 
     | 
    
         
            +
                        FileFormat::Unknown => "unknown",
         
     | 
| 
      
 40 
     | 
    
         
            +
                    }
         
     | 
| 
      
 41 
     | 
    
         
            +
                }
         
     | 
| 
      
 42 
     | 
    
         
            +
            }
         
     | 
| 
      
 43 
     | 
    
         
            +
             
     | 
| 
      
 44 
     | 
    
         
            +
            /// Central format detection logic
         
     | 
| 
      
 45 
     | 
    
         
            +
            pub struct FormatDetector;
         
     | 
| 
      
 46 
     | 
    
         
            +
             
     | 
| 
      
 47 
     | 
    
         
            +
            impl FormatDetector {
         
     | 
| 
      
 48 
     | 
    
         
            +
                /// Detect format from filename and content
         
     | 
| 
      
 49 
     | 
    
         
            +
                /// Prioritizes content detection over extension when both are available
         
     | 
| 
      
 50 
     | 
    
         
            +
                pub fn detect(filename: Option<&str>, content: Option<&[u8]>) -> FileFormat {
         
     | 
| 
      
 51 
     | 
    
         
            +
                    // First try content-based detection if content is provided
         
     | 
| 
      
 52 
     | 
    
         
            +
                    if let Some(data) = content {
         
     | 
| 
      
 53 
     | 
    
         
            +
                        let format = Self::detect_from_content(data);
         
     | 
| 
      
 54 
     | 
    
         
            +
                        // If we got a definitive format from content, use it
         
     | 
| 
      
 55 
     | 
    
         
            +
                        if !matches!(format, FileFormat::Text | FileFormat::Unknown) {
         
     | 
| 
      
 56 
     | 
    
         
            +
                            return format;
         
     | 
| 
      
 57 
     | 
    
         
            +
                        }
         
     | 
| 
      
 58 
     | 
    
         
            +
                    }
         
     | 
| 
      
 59 
     | 
    
         
            +
                    
         
     | 
| 
      
 60 
     | 
    
         
            +
                    // Fall back to extension-based detection
         
     | 
| 
      
 61 
     | 
    
         
            +
                    if let Some(name) = filename {
         
     | 
| 
      
 62 
     | 
    
         
            +
                        let ext_format = Self::detect_from_extension(name);
         
     | 
| 
      
 63 
     | 
    
         
            +
                        if ext_format != FileFormat::Unknown {
         
     | 
| 
      
 64 
     | 
    
         
            +
                            return ext_format;
         
     | 
| 
      
 65 
     | 
    
         
            +
                        }
         
     | 
| 
      
 66 
     | 
    
         
            +
                    }
         
     | 
| 
      
 67 
     | 
    
         
            +
                    
         
     | 
| 
      
 68 
     | 
    
         
            +
                    // If content detection returned Text and no extension match, return Text
         
     | 
| 
      
 69 
     | 
    
         
            +
                    if let Some(data) = content {
         
     | 
| 
      
 70 
     | 
    
         
            +
                        let format = Self::detect_from_content(data);
         
     | 
| 
      
 71 
     | 
    
         
            +
                        if format == FileFormat::Text {
         
     | 
| 
      
 72 
     | 
    
         
            +
                            return FileFormat::Text;
         
     | 
| 
      
 73 
     | 
    
         
            +
                        }
         
     | 
| 
      
 74 
     | 
    
         
            +
                    }
         
     | 
| 
      
 75 
     | 
    
         
            +
                    
         
     | 
| 
      
 76 
     | 
    
         
            +
                    FileFormat::Unknown
         
     | 
| 
      
 77 
     | 
    
         
            +
                }
         
     | 
| 
      
 78 
     | 
    
         
            +
                
         
     | 
| 
      
 79 
     | 
    
         
            +
                /// Detect format from file extension
         
     | 
| 
      
 80 
     | 
    
         
            +
                pub fn detect_from_extension(filename: &str) -> FileFormat {
         
     | 
| 
      
 81 
     | 
    
         
            +
                    let path = Path::new(filename);
         
     | 
| 
      
 82 
     | 
    
         
            +
                    let ext = match path.extension().and_then(|s| s.to_str()) {
         
     | 
| 
      
 83 
     | 
    
         
            +
                        Some(e) => e.to_lowercase(),
         
     | 
| 
      
 84 
     | 
    
         
            +
                        None => return FileFormat::Unknown,
         
     | 
| 
      
 85 
     | 
    
         
            +
                    };
         
     | 
| 
      
 86 
     | 
    
         
            +
                    
         
     | 
| 
      
 87 
     | 
    
         
            +
                    match ext.as_str() {
         
     | 
| 
      
 88 
     | 
    
         
            +
                        "pdf" => FileFormat::Pdf,
         
     | 
| 
      
 89 
     | 
    
         
            +
                        "docx" => FileFormat::Docx,
         
     | 
| 
      
 90 
     | 
    
         
            +
                        "xlsx" => FileFormat::Xlsx,
         
     | 
| 
      
 91 
     | 
    
         
            +
                        "xls" => FileFormat::Xls,
         
     | 
| 
      
 92 
     | 
    
         
            +
                        "pptx" => FileFormat::Pptx,
         
     | 
| 
      
 93 
     | 
    
         
            +
                        "png" => FileFormat::Png,
         
     | 
| 
      
 94 
     | 
    
         
            +
                        "jpg" | "jpeg" => FileFormat::Jpeg,
         
     | 
| 
      
 95 
     | 
    
         
            +
                        "tiff" | "tif" => FileFormat::Tiff,
         
     | 
| 
      
 96 
     | 
    
         
            +
                        "bmp" => FileFormat::Bmp,
         
     | 
| 
      
 97 
     | 
    
         
            +
                        "json" => FileFormat::Json,
         
     | 
| 
      
 98 
     | 
    
         
            +
                        "xml" => FileFormat::Xml,
         
     | 
| 
      
 99 
     | 
    
         
            +
                        "html" | "htm" => FileFormat::Html,
         
     | 
| 
      
 100 
     | 
    
         
            +
                        "txt" | "text" | "md" | "markdown" | "csv" => FileFormat::Text,
         
     | 
| 
      
 101 
     | 
    
         
            +
                        _ => FileFormat::Unknown,
         
     | 
| 
      
 102 
     | 
    
         
            +
                    }
         
     | 
| 
      
 103 
     | 
    
         
            +
                }
         
     | 
| 
      
 104 
     | 
    
         
            +
                
         
     | 
| 
      
 105 
     | 
    
         
            +
                /// Detect format from file content (magic bytes)
         
     | 
| 
      
 106 
     | 
    
         
            +
                pub fn detect_from_content(data: &[u8]) -> FileFormat {
         
     | 
| 
      
 107 
     | 
    
         
            +
                    if data.is_empty() {
         
     | 
| 
      
 108 
     | 
    
         
            +
                        return FileFormat::Text; // Empty files are treated as text
         
     | 
| 
      
 109 
     | 
    
         
            +
                    }
         
     | 
| 
      
 110 
     | 
    
         
            +
                    
         
     | 
| 
      
 111 
     | 
    
         
            +
                    // PDF
         
     | 
| 
      
 112 
     | 
    
         
            +
                    if data.len() >= 4 && data.starts_with(b"%PDF") {
         
     | 
| 
      
 113 
     | 
    
         
            +
                        return FileFormat::Pdf;
         
     | 
| 
      
 114 
     | 
    
         
            +
                    }
         
     | 
| 
      
 115 
     | 
    
         
            +
                    
         
     | 
| 
      
 116 
     | 
    
         
            +
                    // PNG
         
     | 
| 
      
 117 
     | 
    
         
            +
                    if data.len() >= 8 && data.starts_with(&[0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A]) {
         
     | 
| 
      
 118 
     | 
    
         
            +
                        return FileFormat::Png;
         
     | 
| 
      
 119 
     | 
    
         
            +
                    }
         
     | 
| 
      
 120 
     | 
    
         
            +
                    
         
     | 
| 
      
 121 
     | 
    
         
            +
                    // JPEG
         
     | 
| 
      
 122 
     | 
    
         
            +
                    if data.len() >= 3 && data.starts_with(&[0xFF, 0xD8, 0xFF]) {
         
     | 
| 
      
 123 
     | 
    
         
            +
                        return FileFormat::Jpeg;
         
     | 
| 
      
 124 
     | 
    
         
            +
                    }
         
     | 
| 
      
 125 
     | 
    
         
            +
                    
         
     | 
| 
      
 126 
     | 
    
         
            +
                    // BMP
         
     | 
| 
      
 127 
     | 
    
         
            +
                    if data.len() >= 2 && data.starts_with(b"BM") {
         
     | 
| 
      
 128 
     | 
    
         
            +
                        return FileFormat::Bmp;
         
     | 
| 
      
 129 
     | 
    
         
            +
                    }
         
     | 
| 
      
 130 
     | 
    
         
            +
                    
         
     | 
| 
      
 131 
     | 
    
         
            +
                    // TIFF (little-endian or big-endian)
         
     | 
| 
      
 132 
     | 
    
         
            +
                    if data.len() >= 4 {
         
     | 
| 
      
 133 
     | 
    
         
            +
                        if data.starts_with(b"II\x2A\x00") || data.starts_with(b"MM\x00\x2A") {
         
     | 
| 
      
 134 
     | 
    
         
            +
                            return FileFormat::Tiff;
         
     | 
| 
      
 135 
     | 
    
         
            +
                        }
         
     | 
| 
      
 136 
     | 
    
         
            +
                    }
         
     | 
| 
      
 137 
     | 
    
         
            +
                    
         
     | 
| 
      
 138 
     | 
    
         
            +
                    // OLE Compound Document (old Excel/Word)
         
     | 
| 
      
 139 
     | 
    
         
            +
                    if data.len() >= 4 && data.starts_with(&[0xD0, 0xCF, 0x11, 0xE0]) {
         
     | 
| 
      
 140 
     | 
    
         
            +
                        return FileFormat::Xls; // Old Office format, usually Excel
         
     | 
| 
      
 141 
     | 
    
         
            +
                    }
         
     | 
| 
      
 142 
     | 
    
         
            +
                    
         
     | 
| 
      
 143 
     | 
    
         
            +
                    // ZIP archive (could be DOCX, XLSX, PPTX)
         
     | 
| 
      
 144 
     | 
    
         
            +
                    if data.len() >= 2 && data.starts_with(b"PK") {
         
     | 
| 
      
 145 
     | 
    
         
            +
                        return Self::detect_office_format(data);
         
     | 
| 
      
 146 
     | 
    
         
            +
                    }
         
     | 
| 
      
 147 
     | 
    
         
            +
                    
         
     | 
| 
      
 148 
     | 
    
         
            +
                    // XML
         
     | 
| 
      
 149 
     | 
    
         
            +
                    if data.len() >= 5 {
         
     | 
| 
      
 150 
     | 
    
         
            +
                        let start = String::from_utf8_lossy(&data[0..5.min(data.len())]);
         
     | 
| 
      
 151 
     | 
    
         
            +
                        if start.starts_with("<?xml") || start.starts_with("<!") {
         
     | 
| 
      
 152 
     | 
    
         
            +
                            return FileFormat::Xml;
         
     | 
| 
      
 153 
     | 
    
         
            +
                        }
         
     | 
| 
      
 154 
     | 
    
         
            +
                    }
         
     | 
| 
      
 155 
     | 
    
         
            +
                    
         
     | 
| 
      
 156 
     | 
    
         
            +
                    // HTML
         
     | 
| 
      
 157 
     | 
    
         
            +
                    if data.len() >= 14 {
         
     | 
| 
      
 158 
     | 
    
         
            +
                        let start = String::from_utf8_lossy(&data[0..14.min(data.len())]).to_lowercase();
         
     | 
| 
      
 159 
     | 
    
         
            +
                        if start.contains("<!doctype") || start.contains("<html") {
         
     | 
| 
      
 160 
     | 
    
         
            +
                            return FileFormat::Html;
         
     | 
| 
      
 161 
     | 
    
         
            +
                        }
         
     | 
| 
      
 162 
     | 
    
         
            +
                    }
         
     | 
| 
      
 163 
     | 
    
         
            +
                    
         
     | 
| 
      
 164 
     | 
    
         
            +
                    // JSON
         
     | 
| 
      
 165 
     | 
    
         
            +
                    if let Some(&first_non_ws) = data.iter().find(|&&b| !b" \t\n\r".contains(&b)) {
         
     | 
| 
      
 166 
     | 
    
         
            +
                        if first_non_ws == b'{' || first_non_ws == b'[' {
         
     | 
| 
      
 167 
     | 
    
         
            +
                            return FileFormat::Json;
         
     | 
| 
      
 168 
     | 
    
         
            +
                        }
         
     | 
| 
      
 169 
     | 
    
         
            +
                    }
         
     | 
| 
      
 170 
     | 
    
         
            +
                    
         
     | 
| 
      
 171 
     | 
    
         
            +
                    // Default to text for unrecognized formats
         
     | 
| 
      
 172 
     | 
    
         
            +
                    FileFormat::Text
         
     | 
| 
      
 173 
     | 
    
         
            +
                }
         
     | 
| 
      
 174 
     | 
    
         
            +
                
         
     | 
| 
      
 175 
     | 
    
         
            +
                /// Detect specific Office format from ZIP data
         
     | 
| 
      
 176 
     | 
    
         
            +
                fn detect_office_format(data: &[u8]) -> FileFormat {
         
     | 
| 
      
 177 
     | 
    
         
            +
                    // Look for Office-specific directory names in first 2KB of ZIP
         
     | 
| 
      
 178 
     | 
    
         
            +
                    let check_len = 2000.min(data.len());
         
     | 
| 
      
 179 
     | 
    
         
            +
                    let content = String::from_utf8_lossy(&data[0..check_len]);
         
     | 
| 
      
 180 
     | 
    
         
            +
                    
         
     | 
| 
      
 181 
     | 
    
         
            +
                    // Check for format-specific markers
         
     | 
| 
      
 182 
     | 
    
         
            +
                    if content.contains("word/") || content.contains("word/_rels") {
         
     | 
| 
      
 183 
     | 
    
         
            +
                        FileFormat::Docx
         
     | 
| 
      
 184 
     | 
    
         
            +
                    } else if content.contains("xl/") || content.contains("xl/_rels") {
         
     | 
| 
      
 185 
     | 
    
         
            +
                        FileFormat::Xlsx
         
     | 
| 
      
 186 
     | 
    
         
            +
                    } else if content.contains("ppt/") || content.contains("ppt/_rels") {
         
     | 
| 
      
 187 
     | 
    
         
            +
                        FileFormat::Pptx
         
     | 
| 
      
 188 
     | 
    
         
            +
                    } else {
         
     | 
| 
      
 189 
     | 
    
         
            +
                        // Default to XLSX for generic ZIP (most common Office format)
         
     | 
| 
      
 190 
     | 
    
         
            +
                        FileFormat::Xlsx
         
     | 
| 
      
 191 
     | 
    
         
            +
                    }
         
     | 
| 
      
 192 
     | 
    
         
            +
                }
         
     | 
| 
      
 193 
     | 
    
         
            +
                
         
     | 
| 
      
 194 
     | 
    
         
            +
                
         
     | 
| 
      
 195 
     | 
    
         
            +
                /// Get all supported extensions
         
     | 
| 
      
 196 
     | 
    
         
            +
                pub fn supported_extensions() -> Vec<&'static str> {
         
     | 
| 
      
 197 
     | 
    
         
            +
                    vec![
         
     | 
| 
      
 198 
     | 
    
         
            +
                        "pdf", "docx", "xlsx", "xls", "pptx",
         
     | 
| 
      
 199 
     | 
    
         
            +
                        "png", "jpg", "jpeg", "tiff", "tif", "bmp",
         
     | 
| 
      
 200 
     | 
    
         
            +
                        "json", "xml", "html", "htm",
         
     | 
| 
      
 201 
     | 
    
         
            +
                        "txt", "text", "md", "markdown", "csv"
         
     | 
| 
      
 202 
     | 
    
         
            +
                    ]
         
     | 
| 
      
 203 
     | 
    
         
            +
                }
         
     | 
| 
      
 204 
     | 
    
         
            +
            }
         
     | 
| 
      
 205 
     | 
    
         
            +
             
     | 
| 
      
 206 
     | 
    
         
            +
            #[cfg(test)]
         
     | 
| 
      
 207 
     | 
    
         
            +
            mod tests {
         
     | 
| 
      
 208 
     | 
    
         
            +
                use super::*;
         
     | 
| 
      
 209 
     | 
    
         
            +
                
         
     | 
| 
      
 210 
     | 
    
         
            +
                #[test]
         
     | 
| 
      
 211 
     | 
    
         
            +
                fn test_detect_pdf() {
         
     | 
| 
      
 212 
     | 
    
         
            +
                    let pdf_data = b"%PDF-1.5\n";
         
     | 
| 
      
 213 
     | 
    
         
            +
                    assert_eq!(FormatDetector::detect_from_content(pdf_data), FileFormat::Pdf);
         
     | 
| 
      
 214 
     | 
    
         
            +
                }
         
     | 
| 
      
 215 
     | 
    
         
            +
                
         
     | 
| 
      
 216 
     | 
    
         
            +
                #[test]
         
     | 
| 
      
 217 
     | 
    
         
            +
                fn test_detect_png() {
         
     | 
| 
      
 218 
     | 
    
         
            +
                    let png_data = &[0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A];
         
     | 
| 
      
 219 
     | 
    
         
            +
                    assert_eq!(FormatDetector::detect_from_content(png_data), FileFormat::Png);
         
     | 
| 
      
 220 
     | 
    
         
            +
                }
         
     | 
| 
      
 221 
     | 
    
         
            +
                
         
     | 
| 
      
 222 
     | 
    
         
            +
                #[test]
         
     | 
| 
      
 223 
     | 
    
         
            +
                fn test_detect_from_extension() {
         
     | 
| 
      
 224 
     | 
    
         
            +
                    assert_eq!(FormatDetector::detect_from_extension("document.pdf"), FileFormat::Pdf);
         
     | 
| 
      
 225 
     | 
    
         
            +
                    assert_eq!(FormatDetector::detect_from_extension("Document.PDF"), FileFormat::Pdf);
         
     | 
| 
      
 226 
     | 
    
         
            +
                    assert_eq!(FormatDetector::detect_from_extension("data.xlsx"), FileFormat::Xlsx);
         
     | 
| 
      
 227 
     | 
    
         
            +
                }
         
     | 
| 
      
 228 
     | 
    
         
            +
                
         
     | 
| 
      
 229 
     | 
    
         
            +
                #[test]
         
     | 
| 
      
 230 
     | 
    
         
            +
                fn test_empty_data() {
         
     | 
| 
      
 231 
     | 
    
         
            +
                    assert_eq!(FormatDetector::detect_from_content(&[]), FileFormat::Text);
         
     | 
| 
      
 232 
     | 
    
         
            +
                }
         
     | 
| 
      
 233 
     | 
    
         
            +
            }
         
     | 
    
        data/ext/parsekit/src/lib.rs
    CHANGED
    
    
    
        data/ext/parsekit/src/parser.rs
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            use magnus::{
         
     | 
| 
       2 
2 
     | 
    
         
             
                function, method, prelude::*, scan_args, Error, Module, RHash, RModule, Ruby, Value,
         
     | 
| 
       3 
3 
     | 
    
         
             
            };
         
     | 
| 
       4 
     | 
    
         
            -
            use  
     | 
| 
      
 4 
     | 
    
         
            +
            use crate::format_detector::{FileFormat, FormatDetector};
         
     | 
| 
       5 
5 
     | 
    
         | 
| 
       6 
6 
     | 
    
         
             
            #[derive(Debug, Clone)]
         
     | 
| 
       7 
7 
     | 
    
         
             
            #[magnus::wrap(class = "ParseKit::Parser", free_immediately, size)]
         
     | 
| 
         @@ -28,6 +28,33 @@ impl Default for ParserConfig { 
     | 
|
| 
       28 
28 
     | 
    
         
             
                }
         
     | 
| 
       29 
29 
     | 
    
         
             
            }
         
     | 
| 
       30 
30 
     | 
    
         | 
| 
      
 31 
     | 
    
         
            +
            // Error handling helpers
         
     | 
| 
      
 32 
     | 
    
         
            +
            impl Parser {
         
     | 
| 
      
 33 
     | 
    
         
            +
                /// Create a RuntimeError with formatted message
         
     | 
| 
      
 34 
     | 
    
         
            +
                fn runtime_error<E: std::fmt::Display>(context: &str, err: E) -> Error {
         
     | 
| 
      
 35 
     | 
    
         
            +
                    Error::new(
         
     | 
| 
      
 36 
     | 
    
         
            +
                        Ruby::get().unwrap().exception_runtime_error(),
         
     | 
| 
      
 37 
     | 
    
         
            +
                        format!("{}: {}", context, err),
         
     | 
| 
      
 38 
     | 
    
         
            +
                    )
         
     | 
| 
      
 39 
     | 
    
         
            +
                }
         
     | 
| 
      
 40 
     | 
    
         
            +
                
         
     | 
| 
      
 41 
     | 
    
         
            +
                /// Create an ArgumentError with message
         
     | 
| 
      
 42 
     | 
    
         
            +
                fn argument_error(msg: &str) -> Error {
         
     | 
| 
      
 43 
     | 
    
         
            +
                    Error::new(
         
     | 
| 
      
 44 
     | 
    
         
            +
                        Ruby::get().unwrap().exception_arg_error(),
         
     | 
| 
      
 45 
     | 
    
         
            +
                        msg.to_string(),
         
     | 
| 
      
 46 
     | 
    
         
            +
                    )
         
     | 
| 
      
 47 
     | 
    
         
            +
                }
         
     | 
| 
      
 48 
     | 
    
         
            +
                
         
     | 
| 
      
 49 
     | 
    
         
            +
                /// Create an IOError with formatted message
         
     | 
| 
      
 50 
     | 
    
         
            +
                fn io_error<E: std::fmt::Display>(context: &str, err: E) -> Error {
         
     | 
| 
      
 51 
     | 
    
         
            +
                    Error::new(
         
     | 
| 
      
 52 
     | 
    
         
            +
                        Ruby::get().unwrap().exception_io_error(),
         
     | 
| 
      
 53 
     | 
    
         
            +
                        format!("{}: {}", context, err),
         
     | 
| 
      
 54 
     | 
    
         
            +
                    )
         
     | 
| 
      
 55 
     | 
    
         
            +
                }
         
     | 
| 
      
 56 
     | 
    
         
            +
            }
         
     | 
| 
      
 57 
     | 
    
         
            +
             
     | 
| 
       31 
58 
     | 
    
         
             
            impl Parser {
         
     | 
| 
       32 
59 
     | 
    
         
             
                /// Create a new Parser instance with optional configuration
         
     | 
| 
       33 
60 
     | 
    
         
             
                fn new(ruby: &Ruby, args: &[Value]) -> Result<Self, Error> {
         
     | 
| 
         @@ -58,73 +85,49 @@ impl Parser { 
     | 
|
| 
       58 
85 
     | 
    
         
             
                fn parse_bytes_internal(&self, data: Vec<u8>, filename: Option<&str>) -> Result<String, Error> {
         
     | 
| 
       59 
86 
     | 
    
         
             
                    // Check size limit
         
     | 
| 
       60 
87 
     | 
    
         
             
                    if data.len() > self.config.max_size {
         
     | 
| 
       61 
     | 
    
         
            -
                        return Err( 
     | 
| 
       62 
     | 
    
         
            -
                             
     | 
| 
       63 
     | 
    
         
            -
                            format!(
         
     | 
| 
       64 
     | 
    
         
            -
                                 
     | 
| 
       65 
     | 
    
         
            -
                                data.len(),
         
     | 
| 
       66 
     | 
    
         
            -
                                self.config.max_size
         
     | 
| 
       67 
     | 
    
         
            -
                            ),
         
     | 
| 
      
 88 
     | 
    
         
            +
                        return Err(Self::runtime_error(
         
     | 
| 
      
 89 
     | 
    
         
            +
                            "File size exceeds limit",
         
     | 
| 
      
 90 
     | 
    
         
            +
                            format!("{} bytes exceeds maximum allowed size of {} bytes", 
         
     | 
| 
      
 91 
     | 
    
         
            +
                                data.len(), self.config.max_size)
         
     | 
| 
       68 
92 
     | 
    
         
             
                        ));
         
     | 
| 
       69 
93 
     | 
    
         
             
                    }
         
     | 
| 
       70 
94 
     | 
    
         | 
| 
       71 
     | 
    
         
            -
                    //  
     | 
| 
       72 
     | 
    
         
            -
                    let  
     | 
| 
       73 
     | 
    
         
            -
             
     | 
| 
       74 
     | 
    
         
            -
                     
     | 
| 
       75 
     | 
    
         
            -
             
     | 
| 
       76 
     | 
    
         
            -
                    };
         
     | 
| 
       77 
     | 
    
         
            -
             
     | 
| 
       78 
     | 
    
         
            -
                    match file_type.as_str() {
         
     | 
| 
       79 
     | 
    
         
            -
                        "pdf" => self.parse_pdf(data),
         
     | 
| 
       80 
     | 
    
         
            -
                        "docx" => self.parse_docx(data),
         
     | 
| 
       81 
     | 
    
         
            -
                        "pptx" => self.parse_pptx(data),
         
     | 
| 
       82 
     | 
    
         
            -
                        "xlsx" | "xls" => self.parse_xlsx(data),
         
     | 
| 
       83 
     | 
    
         
            -
                        "json" => self.parse_json(data),
         
     | 
| 
       84 
     | 
    
         
            -
                        "xml" | "html" => self.parse_xml(data),
         
     | 
| 
       85 
     | 
    
         
            -
                        "png" | "jpg" | "jpeg" | "tiff" | "bmp" => self.ocr_image(data),
         
     | 
| 
       86 
     | 
    
         
            -
                        "txt" | "text" => self.parse_text(data),
         
     | 
| 
       87 
     | 
    
         
            -
                        _ => self.parse_text(data), // Default to text parsing
         
     | 
| 
       88 
     | 
    
         
            -
                    }
         
     | 
| 
      
 95 
     | 
    
         
            +
                    // Use centralized format detection
         
     | 
| 
      
 96 
     | 
    
         
            +
                    let format = FormatDetector::detect(filename, Some(&data));
         
     | 
| 
      
 97 
     | 
    
         
            +
                    
         
     | 
| 
      
 98 
     | 
    
         
            +
                    // Use centralized dispatch
         
     | 
| 
      
 99 
     | 
    
         
            +
                    self.dispatch_to_parser(format, data)
         
     | 
| 
       89 
100 
     | 
    
         
             
                }
         
     | 
| 
       90 
     | 
    
         
            -
             
     | 
| 
       91 
     | 
    
         
            -
                ///  
     | 
| 
       92 
     | 
    
         
            -
                fn  
     | 
| 
       93 
     | 
    
         
            -
                     
     | 
| 
       94 
     | 
    
         
            -
             
     | 
| 
       95 
     | 
    
         
            -
                         
     | 
| 
       96 
     | 
    
         
            -
                         
     | 
| 
      
 101 
     | 
    
         
            +
                
         
     | 
| 
      
 102 
     | 
    
         
            +
                /// Centralized dispatch logic - routes format to appropriate parser
         
     | 
| 
      
 103 
     | 
    
         
            +
                fn dispatch_to_parser(&self, format: FileFormat, data: Vec<u8>) -> Result<String, Error> {
         
     | 
| 
      
 104 
     | 
    
         
            +
                    match format {
         
     | 
| 
      
 105 
     | 
    
         
            +
                        FileFormat::Pdf => self.parse_pdf(data),
         
     | 
| 
      
 106 
     | 
    
         
            +
                        FileFormat::Docx => self.parse_docx(data),
         
     | 
| 
      
 107 
     | 
    
         
            +
                        FileFormat::Pptx => self.parse_pptx(data),
         
     | 
| 
      
 108 
     | 
    
         
            +
                        FileFormat::Xlsx | FileFormat::Xls => self.parse_xlsx(data),
         
     | 
| 
      
 109 
     | 
    
         
            +
                        FileFormat::Json => self.parse_json(data),
         
     | 
| 
      
 110 
     | 
    
         
            +
                        FileFormat::Xml | FileFormat::Html => self.parse_xml(data),
         
     | 
| 
      
 111 
     | 
    
         
            +
                        FileFormat::Png | FileFormat::Jpeg | FileFormat::Tiff | FileFormat::Bmp => self.ocr_image(data),
         
     | 
| 
      
 112 
     | 
    
         
            +
                        FileFormat::Text | FileFormat::Unknown => self.parse_text(data),
         
     | 
| 
       97 
113 
     | 
    
         
             
                    }
         
     | 
| 
       98 
114 
     | 
    
         
             
                }
         
     | 
| 
       99 
115 
     | 
    
         | 
| 
       100 
     | 
    
         
            -
                ///  
     | 
| 
       101 
     | 
    
         
            -
                fn  
     | 
| 
       102 
     | 
    
         
            -
                     
     | 
| 
       103 
     | 
    
         
            -
             
     | 
| 
       104 
     | 
    
         
            -
                     
     | 
| 
       105 
     | 
    
         
            -
                         
     | 
| 
       106 
     | 
    
         
            -
                         
     | 
| 
       107 
     | 
    
         
            -
                        // This is a simplified check - both DOCX and XLSX are ZIP files
         
     | 
| 
       108 
     | 
    
         
            -
                        // For now, default to xlsx as it's more commonly parsed
         
     | 
| 
       109 
     | 
    
         
            -
                        "xlsx".to_string() // Office Open XML format (could also be DOCX)
         
     | 
| 
       110 
     | 
    
         
            -
                    } else if data.starts_with(&[0xD0, 0xCF, 0x11, 0xE0]) {
         
     | 
| 
       111 
     | 
    
         
            -
                        "xls".to_string() // Old Excel format
         
     | 
| 
       112 
     | 
    
         
            -
                    } else if data.starts_with(&[0x89, 0x50, 0x4E, 0x47]) {
         
     | 
| 
       113 
     | 
    
         
            -
                        "png".to_string() // PNG signature
         
     | 
| 
       114 
     | 
    
         
            -
                    } else if data.starts_with(&[0xFF, 0xD8, 0xFF]) {
         
     | 
| 
       115 
     | 
    
         
            -
                        "jpg".to_string() // JPEG signature
         
     | 
| 
       116 
     | 
    
         
            -
                    } else if data.starts_with(b"BM") {
         
     | 
| 
       117 
     | 
    
         
            -
                        "bmp".to_string() // BMP signature
         
     | 
| 
       118 
     | 
    
         
            -
                    } else if data.starts_with(b"II\x2A\x00") || data.starts_with(b"MM\x00\x2A") {
         
     | 
| 
       119 
     | 
    
         
            -
                        "tiff".to_string() // TIFF signature (little-endian or big-endian)
         
     | 
| 
       120 
     | 
    
         
            -
                    } else if data.starts_with(b"<?xml") || data.starts_with(b"<html") {
         
     | 
| 
       121 
     | 
    
         
            -
                        "xml".to_string()
         
     | 
| 
       122 
     | 
    
         
            -
                    } else if data.starts_with(b"{") || data.starts_with(b"[") {
         
     | 
| 
       123 
     | 
    
         
            -
                        "json".to_string()
         
     | 
| 
       124 
     | 
    
         
            -
                    } else {
         
     | 
| 
       125 
     | 
    
         
            -
                        "txt".to_string()
         
     | 
| 
      
 116 
     | 
    
         
            +
                /// Ruby-accessible method to detect format from bytes
         
     | 
| 
      
 117 
     | 
    
         
            +
                fn detect_format_from_bytes(&self, data: Vec<u8>) -> String {
         
     | 
| 
      
 118 
     | 
    
         
            +
                    let format = FormatDetector::detect_from_content(&data);
         
     | 
| 
      
 119 
     | 
    
         
            +
                    // For compatibility with Ruby tests, return "xlsx" for old Excel
         
     | 
| 
      
 120 
     | 
    
         
            +
                    match format {
         
     | 
| 
      
 121 
     | 
    
         
            +
                        FileFormat::Xls => "xlsx".to_string(),  // Compatibility with existing tests
         
     | 
| 
      
 122 
     | 
    
         
            +
                        _ => format.to_symbol().to_string(),
         
     | 
| 
       126 
123 
     | 
    
         
             
                    }
         
     | 
| 
       127 
124 
     | 
    
         
             
                }
         
     | 
| 
      
 125 
     | 
    
         
            +
                
         
     | 
| 
      
 126 
     | 
    
         
            +
                /// Ruby-accessible method to detect format from filename
         
     | 
| 
      
 127 
     | 
    
         
            +
                fn detect_format_from_filename(&self, filename: String) -> String {
         
     | 
| 
      
 128 
     | 
    
         
            +
                    let format = FormatDetector::detect_from_extension(&filename);
         
     | 
| 
      
 129 
     | 
    
         
            +
                    format.to_symbol().to_string()
         
     | 
| 
      
 130 
     | 
    
         
            +
                }
         
     | 
| 
       128 
131 
     | 
    
         | 
| 
       129 
132 
     | 
    
         
             
                /// Perform OCR on image data using Tesseract
         
     | 
| 
       130 
133 
     | 
    
         
             
                fn ocr_image(&self, data: Vec<u8>) -> Result<String, Error> {
         
     | 
| 
         @@ -191,20 +194,12 @@ impl Parser { 
     | 
|
| 
       191 
194 
     | 
    
         
             
                    };
         
     | 
| 
       192 
195 
     | 
    
         | 
| 
       193 
196 
     | 
    
         
             
                    if let Err(e) = init_result {
         
     | 
| 
       194 
     | 
    
         
            -
                        return Err( 
     | 
| 
       195 
     | 
    
         
            -
                            Ruby::get().unwrap().exception_runtime_error(),
         
     | 
| 
       196 
     | 
    
         
            -
                            format!("Failed to initialize Tesseract: {:?}", e),
         
     | 
| 
       197 
     | 
    
         
            -
                        ))
         
     | 
| 
      
 197 
     | 
    
         
            +
                        return Err(Self::runtime_error("Failed to initialize Tesseract", e));
         
     | 
| 
       198 
198 
     | 
    
         
             
                    }
         
     | 
| 
       199 
199 
     | 
    
         | 
| 
       200 
200 
     | 
    
         
             
                    // Load the image from bytes
         
     | 
| 
       201 
     | 
    
         
            -
                    let img =  
     | 
| 
       202 
     | 
    
         
            -
                         
     | 
| 
       203 
     | 
    
         
            -
                        Err(e) => return Err(Error::new(
         
     | 
| 
       204 
     | 
    
         
            -
                            Ruby::get().unwrap().exception_runtime_error(),
         
     | 
| 
       205 
     | 
    
         
            -
                            format!("Failed to load image: {}", e),
         
     | 
| 
       206 
     | 
    
         
            -
                        ))
         
     | 
| 
       207 
     | 
    
         
            -
                    };
         
     | 
| 
      
 201 
     | 
    
         
            +
                    let img = image::load_from_memory(&data)
         
     | 
| 
      
 202 
     | 
    
         
            +
                        .map_err(|e| Self::runtime_error("Failed to load image", e))?;
         
     | 
| 
       208 
203 
     | 
    
         | 
| 
       209 
204 
     | 
    
         
             
                    // Convert to RGBA8 format
         
     | 
| 
       210 
205 
     | 
    
         
             
                    let rgba_img = img.to_rgba8();
         
     | 
| 
         @@ -212,27 +207,18 @@ impl Parser { 
     | 
|
| 
       212 
207 
     | 
    
         
             
                    let raw_data = rgba_img.into_raw();
         
     | 
| 
       213 
208 
     | 
    
         | 
| 
       214 
209 
     | 
    
         
             
                    // Set image data
         
     | 
| 
       215 
     | 
    
         
            -
                     
     | 
| 
      
 210 
     | 
    
         
            +
                    tesseract.set_image(
         
     | 
| 
       216 
211 
     | 
    
         
             
                        &raw_data,
         
     | 
| 
       217 
212 
     | 
    
         
             
                        width as i32,
         
     | 
| 
       218 
213 
     | 
    
         
             
                        height as i32,
         
     | 
| 
       219 
214 
     | 
    
         
             
                        4,  // bytes per pixel (RGBA)
         
     | 
| 
       220 
215 
     | 
    
         
             
                        (width * 4) as i32,  // bytes per line
         
     | 
| 
       221 
     | 
    
         
            -
                    )  
     | 
| 
       222 
     | 
    
         
            -
                        return Err(Error::new(
         
     | 
| 
       223 
     | 
    
         
            -
                            Ruby::get().unwrap().exception_runtime_error(),
         
     | 
| 
       224 
     | 
    
         
            -
                            format!("Failed to set image: {}", e),
         
     | 
| 
       225 
     | 
    
         
            -
                        ))
         
     | 
| 
       226 
     | 
    
         
            -
                    }
         
     | 
| 
      
 216 
     | 
    
         
            +
                    ).map_err(|e| Self::runtime_error("Failed to set image", e))?;
         
     | 
| 
       227 
217 
     | 
    
         | 
| 
       228 
218 
     | 
    
         
             
                    // Extract text
         
     | 
| 
       229 
     | 
    
         
            -
                     
     | 
| 
       230 
     | 
    
         
            -
                         
     | 
| 
       231 
     | 
    
         
            -
                         
     | 
| 
       232 
     | 
    
         
            -
                            Ruby::get().unwrap().exception_runtime_error(),
         
     | 
| 
       233 
     | 
    
         
            -
                            format!("Failed to perform OCR: {}", e),
         
     | 
| 
       234 
     | 
    
         
            -
                        )),
         
     | 
| 
       235 
     | 
    
         
            -
                    }
         
     | 
| 
      
 219 
     | 
    
         
            +
                    tesseract.get_utf8_text()
         
     | 
| 
      
 220 
     | 
    
         
            +
                        .map(|text| text.trim().to_string())
         
     | 
| 
      
 221 
     | 
    
         
            +
                        .map_err(|e| Self::runtime_error("Failed to perform OCR", e))
         
     | 
| 
       236 
222 
     | 
    
         
             
                }
         
     | 
| 
       237 
223 
     | 
    
         | 
| 
       238 
224 
     | 
    
         | 
| 
         @@ -242,51 +228,31 @@ impl Parser { 
     | 
|
| 
       242 
228 
     | 
    
         | 
| 
       243 
229 
     | 
    
         
             
                    // Try to load the PDF from memory
         
     | 
| 
       244 
230 
     | 
    
         
             
                    // The magic parameter helps MuPDF identify the file type
         
     | 
| 
       245 
     | 
    
         
            -
                     
     | 
| 
       246 
     | 
    
         
            -
                         
     | 
| 
       247 
     | 
    
         
            -
             
     | 
| 
       248 
     | 
    
         
            -
             
     | 
| 
       249 
     | 
    
         
            -
             
     | 
| 
       250 
     | 
    
         
            -
             
     | 
| 
       251 
     | 
    
         
            -
             
     | 
| 
       252 
     | 
    
         
            -
             
     | 
| 
       253 
     | 
    
         
            -
             
     | 
| 
       254 
     | 
    
         
            -
             
     | 
| 
       255 
     | 
    
         
            -
             
     | 
| 
       256 
     | 
    
         
            -
             
     | 
| 
       257 
     | 
    
         
            -
             
     | 
| 
       258 
     | 
    
         
            -
                             
     | 
| 
       259 
     | 
    
         
            -
             
     | 
| 
       260 
     | 
    
         
            -
             
     | 
| 
       261 
     | 
    
         
            -
             
     | 
| 
       262 
     | 
    
         
            -
                                match doc.load_page(page_num) {
         
     | 
| 
       263 
     | 
    
         
            -
                                    Ok(page) => {
         
     | 
| 
       264 
     | 
    
         
            -
                                        // Extract text from the page
         
     | 
| 
       265 
     | 
    
         
            -
                                        match page.to_text() {
         
     | 
| 
       266 
     | 
    
         
            -
                                            Ok(text) => {
         
     | 
| 
       267 
     | 
    
         
            -
                                                all_text.push_str(&text);
         
     | 
| 
       268 
     | 
    
         
            -
                                                all_text.push('\n');
         
     | 
| 
       269 
     | 
    
         
            -
                                            }
         
     | 
| 
       270 
     | 
    
         
            -
                                            Err(_) => continue,
         
     | 
| 
       271 
     | 
    
         
            -
                                        }
         
     | 
| 
       272 
     | 
    
         
            -
                                    }
         
     | 
| 
       273 
     | 
    
         
            -
                                    Err(_) => continue,
         
     | 
| 
       274 
     | 
    
         
            -
                                }
         
     | 
| 
       275 
     | 
    
         
            -
                            }
         
     | 
| 
       276 
     | 
    
         
            -
             
     | 
| 
       277 
     | 
    
         
            -
                            if all_text.is_empty() {
         
     | 
| 
       278 
     | 
    
         
            -
                                Ok(
         
     | 
| 
       279 
     | 
    
         
            -
                                    "PDF contains no extractable text (might be scanned/image-based)"
         
     | 
| 
       280 
     | 
    
         
            -
                                        .to_string(),
         
     | 
| 
       281 
     | 
    
         
            -
                                )
         
     | 
| 
       282 
     | 
    
         
            -
                            } else {
         
     | 
| 
       283 
     | 
    
         
            -
                                Ok(all_text.trim().to_string())
         
     | 
| 
      
 231 
     | 
    
         
            +
                    let doc = Document::from_bytes(&data, "pdf")
         
     | 
| 
      
 232 
     | 
    
         
            +
                        .map_err(|e| Self::runtime_error("Failed to parse PDF", e))?;
         
     | 
| 
      
 233 
     | 
    
         
            +
                    
         
     | 
| 
      
 234 
     | 
    
         
            +
                    let mut all_text = String::new();
         
     | 
| 
      
 235 
     | 
    
         
            +
             
     | 
| 
      
 236 
     | 
    
         
            +
                    // Get page count
         
     | 
| 
      
 237 
     | 
    
         
            +
                    let page_count = doc.page_count()
         
     | 
| 
      
 238 
     | 
    
         
            +
                        .map_err(|e| Self::runtime_error("Failed to get page count", e))?;
         
     | 
| 
      
 239 
     | 
    
         
            +
             
     | 
| 
      
 240 
     | 
    
         
            +
                    // Iterate through pages
         
     | 
| 
      
 241 
     | 
    
         
            +
                    for page_num in 0..page_count {
         
     | 
| 
      
 242 
     | 
    
         
            +
                        // Continue on page errors rather than failing entirely
         
     | 
| 
      
 243 
     | 
    
         
            +
                        if let Ok(page) = doc.load_page(page_num) {
         
     | 
| 
      
 244 
     | 
    
         
            +
                            // Extract text from the page
         
     | 
| 
      
 245 
     | 
    
         
            +
                            if let Ok(text) = page.to_text() {
         
     | 
| 
      
 246 
     | 
    
         
            +
                                all_text.push_str(&text);
         
     | 
| 
      
 247 
     | 
    
         
            +
                                all_text.push('\n');
         
     | 
| 
       284 
248 
     | 
    
         
             
                            }
         
     | 
| 
       285 
249 
     | 
    
         
             
                        }
         
     | 
| 
       286 
     | 
    
         
            -
             
     | 
| 
       287 
     | 
    
         
            -
             
     | 
| 
       288 
     | 
    
         
            -
             
     | 
| 
       289 
     | 
    
         
            -
                        )) 
     | 
| 
      
 250 
     | 
    
         
            +
                    }
         
     | 
| 
      
 251 
     | 
    
         
            +
             
     | 
| 
      
 252 
     | 
    
         
            +
                    if all_text.is_empty() {
         
     | 
| 
      
 253 
     | 
    
         
            +
                        Ok("PDF contains no extractable text (might be scanned/image-based)".to_string())
         
     | 
| 
      
 254 
     | 
    
         
            +
                    } else {
         
     | 
| 
      
 255 
     | 
    
         
            +
                        Ok(all_text.trim().to_string())
         
     | 
| 
       290 
256 
     | 
    
         
             
                    }
         
     | 
| 
       291 
257 
     | 
    
         
             
                }
         
     | 
| 
       292 
258 
     | 
    
         | 
| 
         @@ -322,10 +288,7 @@ impl Parser { 
     | 
|
| 
       322 
288 
     | 
    
         | 
| 
       323 
289 
     | 
    
         
             
                            Ok(result.trim().to_string())
         
     | 
| 
       324 
290 
     | 
    
         
             
                        }
         
     | 
| 
       325 
     | 
    
         
            -
                        Err(e) => Err( 
     | 
| 
       326 
     | 
    
         
            -
                            Ruby::get().unwrap().exception_runtime_error(),
         
     | 
| 
       327 
     | 
    
         
            -
                            format!("Failed to parse DOCX file: {}", e),
         
     | 
| 
       328 
     | 
    
         
            -
                        )),
         
     | 
| 
      
 291 
     | 
    
         
            +
                        Err(e) => Err(Self::runtime_error("Failed to parse DOCX file", e)),
         
     | 
| 
       329 
292 
     | 
    
         
             
                    }
         
     | 
| 
       330 
293 
     | 
    
         
             
                }
         
     | 
| 
       331 
294 
     | 
    
         | 
| 
         @@ -335,15 +298,8 @@ impl Parser { 
     | 
|
| 
       335 
298 
     | 
    
         
             
                    use zip::ZipArchive;
         
     | 
| 
       336 
299 
     | 
    
         | 
| 
       337 
300 
     | 
    
         
             
                    let cursor = Cursor::new(data);
         
     | 
| 
       338 
     | 
    
         
            -
                    let mut archive =  
     | 
| 
       339 
     | 
    
         
            -
                         
     | 
| 
       340 
     | 
    
         
            -
                        Err(e) => {
         
     | 
| 
       341 
     | 
    
         
            -
                            return Err(Error::new(
         
     | 
| 
       342 
     | 
    
         
            -
                                Ruby::get().unwrap().exception_runtime_error(),
         
     | 
| 
       343 
     | 
    
         
            -
                                format!("Failed to open PPTX as ZIP: {}", e),
         
     | 
| 
       344 
     | 
    
         
            -
                            ))
         
     | 
| 
       345 
     | 
    
         
            -
                        }
         
     | 
| 
       346 
     | 
    
         
            -
                    };
         
     | 
| 
      
 301 
     | 
    
         
            +
                    let mut archive = ZipArchive::new(cursor)
         
     | 
| 
      
 302 
     | 
    
         
            +
                        .map_err(|e| Self::runtime_error("Failed to open PPTX as ZIP", e))?;
         
     | 
| 
       347 
303 
     | 
    
         | 
| 
       348 
304 
     | 
    
         
             
                    let mut all_text = Vec::new();
         
     | 
| 
       349 
305 
     | 
    
         
             
                    let mut slide_numbers = Vec::new();
         
     | 
| 
         @@ -492,10 +448,7 @@ impl Parser { 
     | 
|
| 
       492 
448 
     | 
    
         | 
| 
       493 
449 
     | 
    
         
             
                            Ok(result)
         
     | 
| 
       494 
450 
     | 
    
         
             
                        }
         
     | 
| 
       495 
     | 
    
         
            -
                        Err(e) => Err( 
     | 
| 
       496 
     | 
    
         
            -
                            Ruby::get().unwrap().exception_runtime_error(),
         
     | 
| 
       497 
     | 
    
         
            -
                            format!("Failed to parse Excel file: {}", e),
         
     | 
| 
       498 
     | 
    
         
            -
                        )),
         
     | 
| 
      
 451 
     | 
    
         
            +
                        Err(e) => Err(Self::runtime_error("Failed to parse Excel file", e)),
         
     | 
| 
       499 
452 
     | 
    
         
             
                    }
         
     | 
| 
       500 
453 
     | 
    
         
             
                }
         
     | 
| 
       501 
454 
     | 
    
         | 
| 
         @@ -527,10 +480,7 @@ impl Parser { 
     | 
|
| 
       527 
480 
     | 
    
         
             
                            }
         
     | 
| 
       528 
481 
     | 
    
         
             
                            Ok(Event::Eof) => break,
         
     | 
| 
       529 
482 
     | 
    
         
             
                            Err(e) => {
         
     | 
| 
       530 
     | 
    
         
            -
                                return Err( 
     | 
| 
       531 
     | 
    
         
            -
                                    Ruby::get().unwrap().exception_runtime_error(),
         
     | 
| 
       532 
     | 
    
         
            -
                                    format!("XML parse error: {}", e),
         
     | 
| 
       533 
     | 
    
         
            -
                                ))
         
     | 
| 
      
 483 
     | 
    
         
            +
                                return Err(Self::runtime_error("XML parse error", e))
         
     | 
| 
       534 
484 
     | 
    
         
             
                            }
         
     | 
| 
       535 
485 
     | 
    
         
             
                            _ => {}
         
     | 
| 
       536 
486 
     | 
    
         
             
                        }
         
     | 
| 
         @@ -557,10 +507,7 @@ impl Parser { 
     | 
|
| 
       557 
507 
     | 
    
         
             
                /// Parse input string (for text content)
         
     | 
| 
       558 
508 
     | 
    
         
             
                fn parse(&self, input: String) -> Result<String, Error> {
         
     | 
| 
       559 
509 
     | 
    
         
             
                    if input.is_empty() {
         
     | 
| 
       560 
     | 
    
         
            -
                        return Err( 
     | 
| 
       561 
     | 
    
         
            -
                            Ruby::get().unwrap().exception_arg_error(),
         
     | 
| 
       562 
     | 
    
         
            -
                            "Input cannot be empty",
         
     | 
| 
       563 
     | 
    
         
            -
                        ));
         
     | 
| 
      
 510 
     | 
    
         
            +
                        return Err(Self::argument_error("Input cannot be empty"));
         
     | 
| 
       564 
511 
     | 
    
         
             
                    }
         
     | 
| 
       565 
512 
     | 
    
         | 
| 
       566 
513 
     | 
    
         
             
                    // For string input, just return cleaned text
         
     | 
| 
         @@ -576,12 +523,8 @@ impl Parser { 
     | 
|
| 
       576 
523 
     | 
    
         
             
                fn parse_file(&self, path: String) -> Result<String, Error> {
         
     | 
| 
       577 
524 
     | 
    
         
             
                    use std::fs;
         
     | 
| 
       578 
525 
     | 
    
         | 
| 
       579 
     | 
    
         
            -
                    let data = fs::read(&path) 
     | 
| 
       580 
     | 
    
         
            -
                         
     | 
| 
       581 
     | 
    
         
            -
                            Ruby::get().unwrap().exception_io_error(),
         
     | 
| 
       582 
     | 
    
         
            -
                            format!("Failed to read file: {}", e),
         
     | 
| 
       583 
     | 
    
         
            -
                        )
         
     | 
| 
       584 
     | 
    
         
            -
                    })?;
         
     | 
| 
      
 526 
     | 
    
         
            +
                    let data = fs::read(&path)
         
     | 
| 
      
 527 
     | 
    
         
            +
                        .map_err(|e| Self::io_error("Failed to read file", e))?;
         
     | 
| 
       585 
528 
     | 
    
         | 
| 
       586 
529 
     | 
    
         
             
                    self.parse_bytes_internal(data, Some(&path))
         
     | 
| 
       587 
530 
     | 
    
         
             
                }
         
     | 
| 
         @@ -589,10 +532,7 @@ impl Parser { 
     | 
|
| 
       589 
532 
     | 
    
         
             
                /// Parse bytes from Ruby
         
     | 
| 
       590 
533 
     | 
    
         
             
                fn parse_bytes(&self, data: Vec<u8>) -> Result<String, Error> {
         
     | 
| 
       591 
534 
     | 
    
         
             
                    if data.is_empty() {
         
     | 
| 
       592 
     | 
    
         
            -
                        return Err( 
     | 
| 
       593 
     | 
    
         
            -
                            Ruby::get().unwrap().exception_arg_error(),
         
     | 
| 
       594 
     | 
    
         
            -
                            "Data cannot be empty",
         
     | 
| 
       595 
     | 
    
         
            -
                        ));
         
     | 
| 
      
 535 
     | 
    
         
            +
                        return Err(Self::argument_error("Data cannot be empty"));
         
     | 
| 
       596 
536 
     | 
    
         
             
                    }
         
     | 
| 
       597 
537 
     | 
    
         | 
| 
       598 
538 
     | 
    
         
             
                    self.parse_bytes_internal(data, None)
         
     | 
| 
         @@ -616,25 +556,11 @@ impl Parser { 
     | 
|
| 
       616 
556 
     | 
    
         | 
| 
       617 
557 
     | 
    
         
             
                /// Check supported file types
         
     | 
| 
       618 
558 
     | 
    
         
             
                fn supported_formats() -> Vec<String> {
         
     | 
| 
       619 
     | 
    
         
            -
                     
     | 
| 
       620 
     | 
    
         
            -
             
     | 
| 
       621 
     | 
    
         
            -
                         
     | 
| 
       622 
     | 
    
         
            -
                         
     | 
| 
       623 
     | 
    
         
            -
                         
     | 
| 
       624 
     | 
    
         
            -
                        "htm".to_string(), // HTML files (alternative extension)
         
     | 
| 
       625 
     | 
    
         
            -
                        "md".to_string(),  // Markdown files
         
     | 
| 
       626 
     | 
    
         
            -
                        "docx".to_string(),
         
     | 
| 
       627 
     | 
    
         
            -
                        "pptx".to_string(),
         
     | 
| 
       628 
     | 
    
         
            -
                        "xlsx".to_string(),
         
     | 
| 
       629 
     | 
    
         
            -
                        "xls".to_string(),
         
     | 
| 
       630 
     | 
    
         
            -
                        "csv".to_string(),
         
     | 
| 
       631 
     | 
    
         
            -
                        "pdf".to_string(),  // Text extraction via MuPDF
         
     | 
| 
       632 
     | 
    
         
            -
                        "png".to_string(),  // OCR via Tesseract
         
     | 
| 
       633 
     | 
    
         
            -
                        "jpg".to_string(),  // OCR via Tesseract
         
     | 
| 
       634 
     | 
    
         
            -
                        "jpeg".to_string(), // OCR via Tesseract
         
     | 
| 
       635 
     | 
    
         
            -
                        "tiff".to_string(), // OCR via Tesseract
         
     | 
| 
       636 
     | 
    
         
            -
                        "bmp".to_string(),  // OCR via Tesseract
         
     | 
| 
       637 
     | 
    
         
            -
                    ]
         
     | 
| 
      
 559 
     | 
    
         
            +
                    // Use the centralized list from FormatDetector
         
     | 
| 
      
 560 
     | 
    
         
            +
                    FormatDetector::supported_extensions()
         
     | 
| 
      
 561 
     | 
    
         
            +
                        .iter()
         
     | 
| 
      
 562 
     | 
    
         
            +
                        .map(|&s| s.to_string())
         
     | 
| 
      
 563 
     | 
    
         
            +
                        .collect()
         
     | 
| 
       638 
564 
     | 
    
         
             
                }
         
     | 
| 
       639 
565 
     | 
    
         | 
| 
       640 
566 
     | 
    
         
             
                /// Detect if file extension is supported
         
     | 
| 
         @@ -688,6 +614,10 @@ pub fn init(_ruby: &Ruby, module: RModule) -> Result<(), Error> { 
     | 
|
| 
       688 
614 
     | 
    
         
             
                class.define_method("parse_xml", method!(Parser::parse_xml, 1))?;
         
     | 
| 
       689 
615 
     | 
    
         
             
                class.define_method("parse_text", method!(Parser::parse_text, 1))?;
         
     | 
| 
       690 
616 
     | 
    
         
             
                class.define_method("ocr_image", method!(Parser::ocr_image, 1))?;
         
     | 
| 
      
 617 
     | 
    
         
            +
                
         
     | 
| 
      
 618 
     | 
    
         
            +
                // Format detection methods
         
     | 
| 
      
 619 
     | 
    
         
            +
                class.define_method("detect_format_from_bytes", method!(Parser::detect_format_from_bytes, 1))?;
         
     | 
| 
      
 620 
     | 
    
         
            +
                class.define_method("detect_format_from_filename", method!(Parser::detect_format_from_filename, 1))?;
         
     | 
| 
       691 
621 
     | 
    
         | 
| 
       692 
622 
     | 
    
         
             
                // Class methods
         
     | 
| 
       693 
623 
     | 
    
         
             
                class.define_singleton_method("supported_formats", function!(Parser::supported_formats, 0))?;
         
     | 
| 
         @@ -0,0 +1,125 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # ParseKit Native API Documentation
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            This document describes the methods implemented in the Rust native extension for ParseKit::Parser.
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
            ## Instance Methods
         
     | 
| 
      
 6 
     | 
    
         
            +
             
     | 
| 
      
 7 
     | 
    
         
            +
            ### `initialize(options = {})`
         
     | 
| 
      
 8 
     | 
    
         
            +
            Initialize a new Parser instance with optional configuration.
         
     | 
| 
      
 9 
     | 
    
         
            +
             
     | 
| 
      
 10 
     | 
    
         
            +
            **Parameters:**
         
     | 
| 
      
 11 
     | 
    
         
            +
            - `options` [Hash] Configuration options
         
     | 
| 
      
 12 
     | 
    
         
            +
              - `:encoding` [String] Input encoding (default: UTF-8)
         
     | 
| 
      
 13 
     | 
    
         
            +
              - `:strict_mode` [Boolean] Enable strict parsing mode (default: false)
         
     | 
| 
      
 14 
     | 
    
         
            +
              - `:max_depth` [Integer] Maximum nesting depth (default: 100)
         
     | 
| 
      
 15 
     | 
    
         
            +
              - `:max_size` [Integer] Maximum file size in bytes (default: 100MB)
         
     | 
| 
      
 16 
     | 
    
         
            +
             
     | 
| 
      
 17 
     | 
    
         
            +
            ### `parse(input)`
         
     | 
| 
      
 18 
     | 
    
         
            +
            Parse an input string (for text content).
         
     | 
| 
      
 19 
     | 
    
         
            +
             
     | 
| 
      
 20 
     | 
    
         
            +
            **Parameters:**
         
     | 
| 
      
 21 
     | 
    
         
            +
            - `input` [String] The input to parse
         
     | 
| 
      
 22 
     | 
    
         
            +
             
     | 
| 
      
 23 
     | 
    
         
            +
            **Returns:**
         
     | 
| 
      
 24 
     | 
    
         
            +
            - [String] The parsed result
         
     | 
| 
      
 25 
     | 
    
         
            +
             
     | 
| 
      
 26 
     | 
    
         
            +
            **Raises:**
         
     | 
| 
      
 27 
     | 
    
         
            +
            - `ArgumentError` If input is empty
         
     | 
| 
      
 28 
     | 
    
         
            +
             
     | 
| 
      
 29 
     | 
    
         
            +
            ### `parse_file(path)`
         
     | 
| 
      
 30 
     | 
    
         
            +
            Parse a file (supports PDF, Office documents, text files, images with OCR).
         
     | 
| 
      
 31 
     | 
    
         
            +
             
     | 
| 
      
 32 
     | 
    
         
            +
            **Parameters:**
         
     | 
| 
      
 33 
     | 
    
         
            +
            - `path` [String] Path to the file to parse
         
     | 
| 
      
 34 
     | 
    
         
            +
             
     | 
| 
      
 35 
     | 
    
         
            +
            **Returns:**
         
     | 
| 
      
 36 
     | 
    
         
            +
            - [String] The extracted text content
         
     | 
| 
      
 37 
     | 
    
         
            +
             
     | 
| 
      
 38 
     | 
    
         
            +
            **Raises:**
         
     | 
| 
      
 39 
     | 
    
         
            +
            - `IOError` If file cannot be read
         
     | 
| 
      
 40 
     | 
    
         
            +
            - `RuntimeError` If parsing fails
         
     | 
| 
      
 41 
     | 
    
         
            +
             
     | 
| 
      
 42 
     | 
    
         
            +
            ### `parse_bytes(data)`
         
     | 
| 
      
 43 
     | 
    
         
            +
            Parse binary data.
         
     | 
| 
      
 44 
     | 
    
         
            +
             
     | 
| 
      
 45 
     | 
    
         
            +
            **Parameters:**
         
     | 
| 
      
 46 
     | 
    
         
            +
            - `data` [Array<Integer>] Binary data as byte array
         
     | 
| 
      
 47 
     | 
    
         
            +
             
     | 
| 
      
 48 
     | 
    
         
            +
            **Returns:**
         
     | 
| 
      
 49 
     | 
    
         
            +
            - [String] The extracted text content
         
     | 
| 
      
 50 
     | 
    
         
            +
             
     | 
| 
      
 51 
     | 
    
         
            +
            **Raises:**
         
     | 
| 
      
 52 
     | 
    
         
            +
            - `ArgumentError` If data is empty
         
     | 
| 
      
 53 
     | 
    
         
            +
            - `RuntimeError` If parsing fails
         
     | 
| 
      
 54 
     | 
    
         
            +
             
     | 
| 
      
 55 
     | 
    
         
            +
            ### `config`
         
     | 
| 
      
 56 
     | 
    
         
            +
            Get the current parser configuration.
         
     | 
| 
      
 57 
     | 
    
         
            +
             
     | 
| 
      
 58 
     | 
    
         
            +
            **Returns:**
         
     | 
| 
      
 59 
     | 
    
         
            +
            - [Hash] The parser configuration including encoding, strict_mode, max_depth, and max_size
         
     | 
| 
      
 60 
     | 
    
         
            +
             
     | 
| 
      
 61 
     | 
    
         
            +
            ### `supports_file?(path)`
         
     | 
| 
      
 62 
     | 
    
         
            +
            Check if a file format is supported.
         
     | 
| 
      
 63 
     | 
    
         
            +
             
     | 
| 
      
 64 
     | 
    
         
            +
            **Parameters:**
         
     | 
| 
      
 65 
     | 
    
         
            +
            - `path` [String] File path to check
         
     | 
| 
      
 66 
     | 
    
         
            +
             
     | 
| 
      
 67 
     | 
    
         
            +
            **Returns:**
         
     | 
| 
      
 68 
     | 
    
         
            +
            - [Boolean] True if the file format is supported
         
     | 
| 
      
 69 
     | 
    
         
            +
             
     | 
| 
      
 70 
     | 
    
         
            +
            ### `strict_mode?`
         
     | 
| 
      
 71 
     | 
    
         
            +
            Check if strict mode is enabled.
         
     | 
| 
      
 72 
     | 
    
         
            +
             
     | 
| 
      
 73 
     | 
    
         
            +
            **Returns:**
         
     | 
| 
      
 74 
     | 
    
         
            +
            - [Boolean] True if strict mode is enabled
         
     | 
| 
      
 75 
     | 
    
         
            +
             
     | 
| 
      
 76 
     | 
    
         
            +
            ## Format-Specific Parsers
         
     | 
| 
      
 77 
     | 
    
         
            +
             
     | 
| 
      
 78 
     | 
    
         
            +
            These methods are also available but typically called internally via `parse_file` or `parse_bytes`:
         
     | 
| 
      
 79 
     | 
    
         
            +
             
     | 
| 
      
 80 
     | 
    
         
            +
            ### `parse_pdf(data)`
         
     | 
| 
      
 81 
     | 
    
         
            +
            Parse PDF files using MuPDF (statically linked).
         
     | 
| 
      
 82 
     | 
    
         
            +
             
     | 
| 
      
 83 
     | 
    
         
            +
            ### `parse_docx(data)`
         
     | 
| 
      
 84 
     | 
    
         
            +
            Parse Microsoft Word documents.
         
     | 
| 
      
 85 
     | 
    
         
            +
             
     | 
| 
      
 86 
     | 
    
         
            +
            ### `parse_pptx(data)`
         
     | 
| 
      
 87 
     | 
    
         
            +
            Parse Microsoft PowerPoint presentations.
         
     | 
| 
      
 88 
     | 
    
         
            +
             
     | 
| 
      
 89 
     | 
    
         
            +
            ### `parse_xlsx(data)`
         
     | 
| 
      
 90 
     | 
    
         
            +
            Parse Microsoft Excel spreadsheets.
         
     | 
| 
      
 91 
     | 
    
         
            +
             
     | 
| 
      
 92 
     | 
    
         
            +
            ### `parse_json(data)`
         
     | 
| 
      
 93 
     | 
    
         
            +
            Parse and pretty-print JSON data.
         
     | 
| 
      
 94 
     | 
    
         
            +
             
     | 
| 
      
 95 
     | 
    
         
            +
            ### `parse_xml(data)`
         
     | 
| 
      
 96 
     | 
    
         
            +
            Parse XML/HTML files and extract text content.
         
     | 
| 
      
 97 
     | 
    
         
            +
             
     | 
| 
      
 98 
     | 
    
         
            +
            ### `parse_text(data)`
         
     | 
| 
      
 99 
     | 
    
         
            +
            Parse plain text files.
         
     | 
| 
      
 100 
     | 
    
         
            +
             
     | 
| 
      
 101 
     | 
    
         
            +
            ### `ocr_image(data)`
         
     | 
| 
      
 102 
     | 
    
         
            +
            Perform OCR on images (PNG, JPEG, TIFF, BMP) using Tesseract.
         
     | 
| 
      
 103 
     | 
    
         
            +
             
     | 
| 
      
 104 
     | 
    
         
            +
            ## Class Methods
         
     | 
| 
      
 105 
     | 
    
         
            +
             
     | 
| 
      
 106 
     | 
    
         
            +
            ### `Parser.supported_formats`
         
     | 
| 
      
 107 
     | 
    
         
            +
            Get list of supported file formats.
         
     | 
| 
      
 108 
     | 
    
         
            +
             
     | 
| 
      
 109 
     | 
    
         
            +
            **Returns:**
         
     | 
| 
      
 110 
     | 
    
         
            +
            - [Array<String>] List of supported file extensions
         
     | 
| 
      
 111 
     | 
    
         
            +
             
     | 
| 
      
 112 
     | 
    
         
            +
            **Example:**
         
     | 
| 
      
 113 
     | 
    
         
            +
            ```ruby
         
     | 
| 
      
 114 
     | 
    
         
            +
            ParseKit::Parser.supported_formats
         
     | 
| 
      
 115 
     | 
    
         
            +
            # => ["txt", "json", "xml", "html", "docx", "xlsx", "xls", "csv", "pdf", "png", "jpg", "jpeg", "tiff", "bmp", ...]
         
     | 
| 
      
 116 
     | 
    
         
            +
            ```
         
     | 
| 
      
 117 
     | 
    
         
            +
             
     | 
| 
      
 118 
     | 
    
         
            +
            ## Implementation Notes
         
     | 
| 
      
 119 
     | 
    
         
            +
             
     | 
| 
      
 120 
     | 
    
         
            +
            All these methods are implemented in Rust via the native extension. The Ruby layer (`lib/parsekit/parser.rb`) provides additional convenience methods and helpers that wrap these native methods.
         
     | 
| 
      
 121 
     | 
    
         
            +
             
     | 
| 
      
 122 
     | 
    
         
            +
            The native extension uses:
         
     | 
| 
      
 123 
     | 
    
         
            +
            - **MuPDF** for PDF parsing (statically linked)
         
     | 
| 
      
 124 
     | 
    
         
            +
            - **Tesseract** for OCR functionality (bundled)
         
     | 
| 
      
 125 
     | 
    
         
            +
            - **Various Rust crates** for Office document parsing (docx-rs, calamine, etc.)
         
     | 
| 
         Binary file 
     | 
    
        data/lib/parsekit/parser.rb
    CHANGED
    
    | 
         @@ -3,65 +3,24 @@ 
     | 
|
| 
       3 
3 
     | 
    
         
             
            module ParseKit
         
     | 
| 
       4 
4 
     | 
    
         
             
              # Ruby wrapper for the native Parser class
         
     | 
| 
       5 
5 
     | 
    
         
             
              #
         
     | 
| 
       6 
     | 
    
         
            -
              #  
     | 
| 
       7 
     | 
    
         
            -
              #  
     | 
| 
      
 6 
     | 
    
         
            +
              # This class provides document parsing capabilities through a native Rust extension.
         
     | 
| 
      
 7 
     | 
    
         
            +
              # For documentation of native methods, see NATIVE_API.md
         
     | 
| 
      
 8 
     | 
    
         
            +
              #
         
     | 
| 
      
 9 
     | 
    
         
            +
              # The Ruby layer provides convenience methods and helpers while the Rust
         
     | 
| 
      
 10 
     | 
    
         
            +
              # extension handles the actual parsing of PDF, Office documents, images (OCR), etc.
         
     | 
| 
       8 
11 
     | 
    
         
             
              class Parser
         
     | 
| 
       9 
     | 
    
         
            -
                #  
     | 
| 
       10 
     | 
    
         
            -
                #  
     | 
| 
       11 
     | 
    
         
            -
                
         
     | 
| 
       12 
     | 
    
         
            -
                #  
     | 
| 
       13 
     | 
    
         
            -
                #  
     | 
| 
       14 
     | 
    
         
            -
                #  
     | 
| 
       15 
     | 
    
         
            -
                #  
     | 
| 
       16 
     | 
    
         
            -
                # 
     | 
| 
       17 
     | 
    
         
            -
                #  
     | 
| 
       18 
     | 
    
         
            -
                
         
     | 
| 
       19 
     | 
    
         
            -
                 
     | 
| 
       20 
     | 
    
         
            -
                #  
     | 
| 
       21 
     | 
    
         
            -
                # @return [String] The parsed result
         
     | 
| 
       22 
     | 
    
         
            -
                # @raise [ArgumentError] If input is empty
         
     | 
| 
       23 
     | 
    
         
            -
                # def parse(input)
         
     | 
| 
       24 
     | 
    
         
            -
                #   # Implemented in native extension
         
     | 
| 
       25 
     | 
    
         
            -
                # end
         
     | 
| 
       26 
     | 
    
         
            -
                
         
     | 
| 
       27 
     | 
    
         
            -
                # Parse a file (supports PDF, Office documents, text files)
         
     | 
| 
       28 
     | 
    
         
            -
                # @param path [String] Path to the file to parse
         
     | 
| 
       29 
     | 
    
         
            -
                # @return [String] The extracted text content
         
     | 
| 
       30 
     | 
    
         
            -
                # @raise [IOError] If file cannot be read
         
     | 
| 
       31 
     | 
    
         
            -
                # @raise [RuntimeError] If parsing fails
         
     | 
| 
       32 
     | 
    
         
            -
                # def parse_file(path)
         
     | 
| 
       33 
     | 
    
         
            -
                #   # Implemented in native extension
         
     | 
| 
       34 
     | 
    
         
            -
                # end
         
     | 
| 
       35 
     | 
    
         
            -
                
         
     | 
| 
       36 
     | 
    
         
            -
                # Parse binary data
         
     | 
| 
       37 
     | 
    
         
            -
                # @param data [Array<Integer>] Binary data as byte array
         
     | 
| 
       38 
     | 
    
         
            -
                # @return [String] The extracted text content
         
     | 
| 
       39 
     | 
    
         
            -
                # @raise [ArgumentError] If data is empty
         
     | 
| 
       40 
     | 
    
         
            -
                # @raise [RuntimeError] If parsing fails
         
     | 
| 
       41 
     | 
    
         
            -
                # def parse_bytes(data)
         
     | 
| 
       42 
     | 
    
         
            -
                #   # Implemented in native extension
         
     | 
| 
       43 
     | 
    
         
            -
                # end
         
     | 
| 
       44 
     | 
    
         
            -
                
         
     | 
| 
       45 
     | 
    
         
            -
                # Get the current configuration
         
     | 
| 
       46 
     | 
    
         
            -
                # @return [Hash] The parser configuration
         
     | 
| 
       47 
     | 
    
         
            -
                # def config
         
     | 
| 
       48 
     | 
    
         
            -
                #   # Implemented in native extension
         
     | 
| 
       49 
     | 
    
         
            -
                # end
         
     | 
| 
       50 
     | 
    
         
            -
                
         
     | 
| 
       51 
     | 
    
         
            -
                # Check if a file format is supported
         
     | 
| 
       52 
     | 
    
         
            -
                # @param path [String] File path to check
         
     | 
| 
       53 
     | 
    
         
            -
                # @return [Boolean] True if the file format is supported
         
     | 
| 
       54 
     | 
    
         
            -
                # def supports_file?(path)
         
     | 
| 
       55 
     | 
    
         
            -
                #   # Implemented in native extension
         
     | 
| 
       56 
     | 
    
         
            -
                # end
         
     | 
| 
       57 
     | 
    
         
            -
                
         
     | 
| 
       58 
     | 
    
         
            -
                # Get list of supported file formats
         
     | 
| 
       59 
     | 
    
         
            -
                # @return [Array<String>] List of supported file extensions
         
     | 
| 
       60 
     | 
    
         
            -
                # def self.supported_formats
         
     | 
| 
       61 
     | 
    
         
            -
                #   # Implemented in native extension
         
     | 
| 
       62 
     | 
    
         
            -
                # end
         
     | 
| 
       63 
     | 
    
         
            -
                
         
     | 
| 
       64 
     | 
    
         
            -
                # Ruby-level helper methods
         
     | 
| 
      
 12 
     | 
    
         
            +
                # Native methods implemented in Rust:
         
     | 
| 
      
 13 
     | 
    
         
            +
                # - initialize(options = {})
         
     | 
| 
      
 14 
     | 
    
         
            +
                # - parse(input)
         
     | 
| 
      
 15 
     | 
    
         
            +
                # - parse_file(path)
         
     | 
| 
      
 16 
     | 
    
         
            +
                # - parse_bytes(data)
         
     | 
| 
      
 17 
     | 
    
         
            +
                # - config
         
     | 
| 
      
 18 
     | 
    
         
            +
                # - supports_file?(path)
         
     | 
| 
      
 19 
     | 
    
         
            +
                # - strict_mode?
         
     | 
| 
      
 20 
     | 
    
         
            +
                # - parse_pdf, parse_docx, parse_xlsx, parse_pptx, parse_json, parse_xml, parse_text, ocr_image
         
     | 
| 
      
 21 
     | 
    
         
            +
                # See NATIVE_API.md for detailed documentation
         
     | 
| 
      
 22 
     | 
    
         
            +
                
         
     | 
| 
      
 23 
     | 
    
         
            +
                # Ruby convenience methods and helpers
         
     | 
| 
       65 
24 
     | 
    
         | 
| 
       66 
25 
     | 
    
         
             
                # Create a parser with strict mode enabled
         
     | 
| 
       67 
26 
     | 
    
         
             
                # @param options [Hash] Additional options
         
     | 
| 
         @@ -81,6 +40,7 @@ module ParseKit 
     | 
|
| 
       81 
40 
     | 
    
         
             
                end
         
     | 
| 
       82 
41 
     | 
    
         | 
| 
       83 
42 
     | 
    
         
             
                # Detect format from file path
         
     | 
| 
      
 43 
     | 
    
         
            +
                # @deprecated Use the native format detection in parse_file instead
         
     | 
| 
       84 
44 
     | 
    
         
             
                # @param path [String] File path
         
     | 
| 
       85 
45 
     | 
    
         
             
                # @return [Symbol, nil] Format symbol or nil if unknown
         
     | 
| 
       86 
46 
     | 
    
         
             
                def detect_format(path)
         
     | 
| 
         @@ -101,67 +61,134 @@ module ParseKit 
     | 
|
| 
       101 
61 
     | 
    
         
             
                end
         
     | 
| 
       102 
62 
     | 
    
         | 
| 
       103 
63 
     | 
    
         
             
                # Detect format from binary data
         
     | 
| 
      
 64 
     | 
    
         
            +
                # @deprecated Use the native format detection in parse_bytes instead
         
     | 
| 
       104 
65 
     | 
    
         
             
                # @param data [String, Array<Integer>] Binary data
         
     | 
| 
       105 
66 
     | 
    
         
             
                # @return [Symbol] Format symbol
         
     | 
| 
       106 
67 
     | 
    
         
             
                def detect_format_from_bytes(data)
         
     | 
| 
       107 
68 
     | 
    
         
             
                  # Convert to bytes if string
         
     | 
| 
       108 
69 
     | 
    
         
             
                  bytes = data.is_a?(String) ? data.bytes : data
         
     | 
| 
       109 
     | 
    
         
            -
                  return :text if bytes.empty?
         
     | 
| 
       110 
     | 
    
         
            -
                  
         
     | 
| 
       111 
     | 
    
         
            -
                  # Check magic bytes
         
     | 
| 
       112 
     | 
    
         
            -
                   
     | 
| 
       113 
     | 
    
         
            -
             
     | 
| 
       114 
     | 
    
         
            -
                   
     | 
| 
       115 
     | 
    
         
            -
                     
     | 
| 
       116 
     | 
    
         
            -
             
     | 
| 
       117 
     | 
    
         
            -
             
     | 
| 
       118 
     | 
    
         
            -
                   
     | 
| 
      
 70 
     | 
    
         
            +
                  return :text if bytes.empty?  # Return :text for empty data
         
     | 
| 
      
 71 
     | 
    
         
            +
                  
         
     | 
| 
      
 72 
     | 
    
         
            +
                  # Check magic bytes for various formats
         
     | 
| 
      
 73 
     | 
    
         
            +
                  
         
     | 
| 
      
 74 
     | 
    
         
            +
                  # PDF
         
     | 
| 
      
 75 
     | 
    
         
            +
                  if bytes.size >= 4 && bytes[0..3] == [0x25, 0x50, 0x44, 0x46]  # %PDF
         
     | 
| 
      
 76 
     | 
    
         
            +
                    return :pdf
         
     | 
| 
      
 77 
     | 
    
         
            +
                  end
         
     | 
| 
      
 78 
     | 
    
         
            +
                  
         
     | 
| 
      
 79 
     | 
    
         
            +
                  # PNG
         
     | 
| 
      
 80 
     | 
    
         
            +
                  if bytes.size >= 8 && bytes[0..7] == [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A]
         
     | 
| 
      
 81 
     | 
    
         
            +
                    return :png
         
     | 
| 
      
 82 
     | 
    
         
            +
                  end
         
     | 
| 
      
 83 
     | 
    
         
            +
                  
         
     | 
| 
      
 84 
     | 
    
         
            +
                  # JPEG
         
     | 
| 
      
 85 
     | 
    
         
            +
                  if bytes.size >= 3 && bytes[0..2] == [0xFF, 0xD8, 0xFF]
         
     | 
| 
      
 86 
     | 
    
         
            +
                    return :jpeg
         
     | 
| 
      
 87 
     | 
    
         
            +
                  end
         
     | 
| 
      
 88 
     | 
    
         
            +
                  
         
     | 
| 
      
 89 
     | 
    
         
            +
                  # BMP
         
     | 
| 
      
 90 
     | 
    
         
            +
                  if bytes.size >= 2 && bytes[0..1] == [0x42, 0x4D]  # BM
         
     | 
| 
      
 91 
     | 
    
         
            +
                    return :bmp
         
     | 
| 
      
 92 
     | 
    
         
            +
                  end
         
     | 
| 
      
 93 
     | 
    
         
            +
                  
         
     | 
| 
      
 94 
     | 
    
         
            +
                  # TIFF (little-endian or big-endian)
         
     | 
| 
      
 95 
     | 
    
         
            +
                  if bytes.size >= 4
         
     | 
| 
      
 96 
     | 
    
         
            +
                    if bytes[0..3] == [0x49, 0x49, 0x2A, 0x00]  # II*\0 (little-endian)
         
     | 
| 
      
 97 
     | 
    
         
            +
                      return :tiff
         
     | 
| 
      
 98 
     | 
    
         
            +
                    elsif bytes[0..3] == [0x4D, 0x4D, 0x00, 0x2A]  # MM\0* (big-endian)
         
     | 
| 
      
 99 
     | 
    
         
            +
                      return :tiff
         
     | 
| 
      
 100 
     | 
    
         
            +
                    end
         
     | 
| 
      
 101 
     | 
    
         
            +
                  end
         
     | 
| 
      
 102 
     | 
    
         
            +
                  
         
     | 
| 
      
 103 
     | 
    
         
            +
                  # OLE Compound Document (old Excel/Word) - return :xlsx for compatibility
         
     | 
| 
      
 104 
     | 
    
         
            +
                  if bytes.size >= 4 && bytes[0..3] == [0xD0, 0xCF, 0x11, 0xE0]
         
     | 
| 
      
 105 
     | 
    
         
            +
                    return :xlsx  # Return :xlsx for compatibility with existing tests
         
     | 
| 
      
 106 
     | 
    
         
            +
                  end
         
     | 
| 
      
 107 
     | 
    
         
            +
                  
         
     | 
| 
      
 108 
     | 
    
         
            +
                  # ZIP archive (could be DOCX, XLSX, PPTX)
         
     | 
| 
      
 109 
     | 
    
         
            +
                  if bytes.size >= 2 && bytes[0..1] == [0x50, 0x4B]  # PK
         
     | 
| 
      
 110 
     | 
    
         
            +
                    # Try to determine the specific Office format by checking ZIP contents
         
     | 
| 
      
 111 
     | 
    
         
            +
                    # For now, we'll need to inspect the ZIP structure
         
     | 
| 
      
 112 
     | 
    
         
            +
                    return detect_office_format_from_zip(bytes)
         
     | 
| 
      
 113 
     | 
    
         
            +
                  end
         
     | 
| 
      
 114 
     | 
    
         
            +
                  
         
     | 
| 
      
 115 
     | 
    
         
            +
                  # XML
         
     | 
| 
      
 116 
     | 
    
         
            +
                  if bytes.size >= 5
         
     | 
| 
      
 117 
     | 
    
         
            +
                    first_chars = bytes[0..4].pack('C*')
         
     | 
| 
      
 118 
     | 
    
         
            +
                    if first_chars == '<?xml' || first_chars.start_with?('<!')
         
     | 
| 
      
 119 
     | 
    
         
            +
                      return :xml
         
     | 
| 
      
 120 
     | 
    
         
            +
                    end
         
     | 
| 
      
 121 
     | 
    
         
            +
                  end
         
     | 
| 
      
 122 
     | 
    
         
            +
                  
         
     | 
| 
      
 123 
     | 
    
         
            +
                  # HTML
         
     | 
| 
      
 124 
     | 
    
         
            +
                  if bytes.size >= 14
         
     | 
| 
      
 125 
     | 
    
         
            +
                    first_chars = bytes[0..13].pack('C*').downcase
         
     | 
| 
      
 126 
     | 
    
         
            +
                    if first_chars.include?('<!doctype') || first_chars.include?('<html')
         
     | 
| 
      
 127 
     | 
    
         
            +
                      return :xml  # HTML is treated as XML
         
     | 
| 
      
 128 
     | 
    
         
            +
                    end
         
     | 
| 
      
 129 
     | 
    
         
            +
                  end
         
     | 
| 
      
 130 
     | 
    
         
            +
                  
         
     | 
| 
      
 131 
     | 
    
         
            +
                  # JSON
         
     | 
| 
      
 132 
     | 
    
         
            +
                  if bytes.size > 0
         
     | 
| 
      
 133 
     | 
    
         
            +
                    first_char = bytes[0]
         
     | 
| 
      
 134 
     | 
    
         
            +
                    # Skip whitespace
         
     | 
| 
      
 135 
     | 
    
         
            +
                    idx = 0
         
     | 
| 
      
 136 
     | 
    
         
            +
                    while idx < bytes.size && [0x20, 0x09, 0x0A, 0x0D].include?(bytes[idx])
         
     | 
| 
      
 137 
     | 
    
         
            +
                      idx += 1
         
     | 
| 
      
 138 
     | 
    
         
            +
                    end
         
     | 
| 
      
 139 
     | 
    
         
            +
                    
         
     | 
| 
      
 140 
     | 
    
         
            +
                    if idx < bytes.size
         
     | 
| 
      
 141 
     | 
    
         
            +
                      first_non_ws = bytes[idx]
         
     | 
| 
      
 142 
     | 
    
         
            +
                      if first_non_ws == 0x7B || first_non_ws == 0x5B  # { or [
         
     | 
| 
      
 143 
     | 
    
         
            +
                        return :json
         
     | 
| 
      
 144 
     | 
    
         
            +
                      end
         
     | 
| 
      
 145 
     | 
    
         
            +
                    end
         
     | 
| 
      
 146 
     | 
    
         
            +
                  end
         
     | 
| 
      
 147 
     | 
    
         
            +
                  
         
     | 
| 
      
 148 
     | 
    
         
            +
                  # Default to text if not recognized
         
     | 
| 
      
 149 
     | 
    
         
            +
                  :text
         
     | 
| 
      
 150 
     | 
    
         
            +
                end
         
     | 
| 
      
 151 
     | 
    
         
            +
                
         
     | 
| 
      
 152 
     | 
    
         
            +
                # Detect specific Office format from ZIP data
         
     | 
| 
      
 153 
     | 
    
         
            +
                # @param bytes [Array<Integer>] ZIP file bytes
         
     | 
| 
      
 154 
     | 
    
         
            +
                # @return [Symbol] :docx, :xlsx, :pptx, or :unknown
         
     | 
| 
      
 155 
     | 
    
         
            +
                def detect_office_format_from_zip(bytes)
         
     | 
| 
      
 156 
     | 
    
         
            +
                  # This is a simplified detection - in practice you'd parse the ZIP
         
     | 
| 
      
 157 
     | 
    
         
            +
                  # For the test, we'll check for known patterns in the ZIP structure
         
     | 
| 
      
 158 
     | 
    
         
            +
                  
         
     | 
| 
      
 159 
     | 
    
         
            +
                  # Convert bytes to string for pattern matching
         
     | 
| 
      
 160 
     | 
    
         
            +
                  content = bytes[0..2000].pack('C*')  # Check first 2KB
         
     | 
| 
      
 161 
     | 
    
         
            +
                  
         
     | 
| 
      
 162 
     | 
    
         
            +
                  # Look for Office-specific directory names in the ZIP
         
     | 
| 
      
 163 
     | 
    
         
            +
                  if content.include?('word/') || content.include?('word/_rels')
         
     | 
| 
      
 164 
     | 
    
         
            +
                    :docx
         
     | 
| 
      
 165 
     | 
    
         
            +
                  elsif content.include?('xl/') || content.include?('xl/_rels')
         
     | 
| 
       119 
166 
     | 
    
         
             
                    :xlsx
         
     | 
| 
       120 
     | 
    
         
            -
                  elsif  
     | 
| 
       121 
     | 
    
         
            -
                    : 
     | 
| 
       122 
     | 
    
         
            -
                  elsif bytes[0..4] == [0x3C, 0x68, 0x74, 0x6D, 0x6C]  # <html
         
     | 
| 
       123 
     | 
    
         
            -
                    :xml
         
     | 
| 
       124 
     | 
    
         
            -
                  elsif bytes[0] == 0x7B || bytes[0] == 0x5B  # { or [
         
     | 
| 
       125 
     | 
    
         
            -
                    :json
         
     | 
| 
      
 167 
     | 
    
         
            +
                  elsif content.include?('ppt/') || content.include?('ppt/_rels')
         
     | 
| 
      
 168 
     | 
    
         
            +
                    :pptx
         
     | 
| 
       126 
169 
     | 
    
         
             
                  else
         
     | 
| 
       127 
     | 
    
         
            -
                     
     | 
| 
      
 170 
     | 
    
         
            +
                    # Default to xlsx for generic ZIP
         
     | 
| 
      
 171 
     | 
    
         
            +
                    :xlsx
         
     | 
| 
       128 
172 
     | 
    
         
             
                  end
         
     | 
| 
       129 
173 
     | 
    
         
             
                end
         
     | 
| 
       130 
174 
     | 
    
         | 
| 
       131 
175 
     | 
    
         
             
                # Parse file using format-specific parser
         
     | 
| 
       132 
     | 
    
         
            -
                # This method  
     | 
| 
      
 176 
     | 
    
         
            +
                # This method delegates to parse_file which uses centralized dispatch in Rust
         
     | 
| 
       133 
177 
     | 
    
         
             
                # @param path [String] File path
         
     | 
| 
       134 
178 
     | 
    
         
             
                # @return [String] Parsed content
         
     | 
| 
       135 
179 
     | 
    
         
             
                def parse_file_routed(path)
         
     | 
| 
       136 
     | 
    
         
            -
                   
     | 
| 
       137 
     | 
    
         
            -
                   
     | 
| 
       138 
     | 
    
         
            -
                  
         
     | 
| 
       139 
     | 
    
         
            -
                  case format
         
     | 
| 
       140 
     | 
    
         
            -
                  when :docx then parse_docx(data)
         
     | 
| 
       141 
     | 
    
         
            -
                  when :xlsx then parse_xlsx(data) 
         
     | 
| 
       142 
     | 
    
         
            -
                  when :pdf then parse_pdf(data)
         
     | 
| 
       143 
     | 
    
         
            -
                  when :json then parse_json(data)
         
     | 
| 
       144 
     | 
    
         
            -
                  when :xml then parse_xml(data)
         
     | 
| 
       145 
     | 
    
         
            -
                  else parse_text(data)
         
     | 
| 
       146 
     | 
    
         
            -
                  end
         
     | 
| 
      
 180 
     | 
    
         
            +
                  # Simply delegate to parse_file which already has dispatch logic
         
     | 
| 
      
 181 
     | 
    
         
            +
                  parse_file(path)
         
     | 
| 
       147 
182 
     | 
    
         
             
                end
         
     | 
| 
       148 
183 
     | 
    
         | 
| 
       149 
184 
     | 
    
         
             
                # Parse bytes using format-specific parser
         
     | 
| 
       150 
     | 
    
         
            -
                # This method  
     | 
| 
      
 185 
     | 
    
         
            +
                # This method delegates to parse_bytes which uses centralized dispatch in Rust
         
     | 
| 
       151 
186 
     | 
    
         
             
                # @param data [String, Array<Integer>] Binary data
         
     | 
| 
       152 
187 
     | 
    
         
             
                # @return [String] Parsed content
         
     | 
| 
       153 
188 
     | 
    
         
             
                def parse_bytes_routed(data)
         
     | 
| 
       154 
     | 
    
         
            -
                   
     | 
| 
      
 189 
     | 
    
         
            +
                  # Simply delegate to parse_bytes which already has dispatch logic
         
     | 
| 
       155 
190 
     | 
    
         
             
                  bytes = data.is_a?(String) ? data.bytes : data
         
     | 
| 
       156 
     | 
    
         
            -
                  
         
     | 
| 
       157 
     | 
    
         
            -
                  case format
         
     | 
| 
       158 
     | 
    
         
            -
                  when :docx then parse_docx(bytes)
         
     | 
| 
       159 
     | 
    
         
            -
                  when :xlsx then parse_xlsx(bytes)
         
     | 
| 
       160 
     | 
    
         
            -
                  when :pdf then parse_pdf(bytes)
         
     | 
| 
       161 
     | 
    
         
            -
                  when :json then parse_json(bytes)
         
     | 
| 
       162 
     | 
    
         
            -
                  when :xml then parse_xml(bytes)
         
     | 
| 
       163 
     | 
    
         
            -
                  else parse_text(bytes)
         
     | 
| 
       164 
     | 
    
         
            -
                  end
         
     | 
| 
      
 191 
     | 
    
         
            +
                  parse_bytes(bytes)
         
     | 
| 
       165 
192 
     | 
    
         
             
                end
         
     | 
| 
       166 
193 
     | 
    
         | 
| 
       167 
194 
     | 
    
         
             
                # Parse with a block for processing results
         
     | 
| 
         @@ -178,25 +205,49 @@ module ParseKit 
     | 
|
| 
       178 
205 
     | 
    
         
             
                # @param input [String] The input to validate
         
     | 
| 
       179 
206 
     | 
    
         
             
                # @return [Boolean] True if input is valid
         
     | 
| 
       180 
207 
     | 
    
         
             
                def valid_input?(input)
         
     | 
| 
       181 
     | 
    
         
            -
                   
     | 
| 
       182 
     | 
    
         
            -
                  return false if input.empty?
         
     | 
| 
       183 
     | 
    
         
            -
                  true
         
     | 
| 
      
 208 
     | 
    
         
            +
                  input.is_a?(String) && !input.empty?
         
     | 
| 
       184 
209 
     | 
    
         
             
                end
         
     | 
| 
       185 
210 
     | 
    
         | 
| 
       186 
211 
     | 
    
         
             
                # Validate file before parsing
         
     | 
| 
       187 
212 
     | 
    
         
             
                # @param path [String] The file path to validate
         
     | 
| 
       188 
213 
     | 
    
         
             
                # @return [Boolean] True if file exists and format is supported
         
     | 
| 
       189 
214 
     | 
    
         
             
                def valid_file?(path)
         
     | 
| 
      
 215 
     | 
    
         
            +
                  return false if path.nil? || path.empty?
         
     | 
| 
       190 
216 
     | 
    
         
             
                  return false unless File.exist?(path)
         
     | 
| 
      
 217 
     | 
    
         
            +
                  return false if File.directory?(path)
         
     | 
| 
       191 
218 
     | 
    
         
             
                  supports_file?(path)
         
     | 
| 
       192 
219 
     | 
    
         
             
                end
         
     | 
| 
       193 
220 
     | 
    
         | 
| 
       194 
221 
     | 
    
         
             
                # Get file extension
         
     | 
| 
       195 
222 
     | 
    
         
             
                # @param path [String] File path
         
     | 
| 
       196 
     | 
    
         
            -
                # @return [String, nil] File extension in lowercase
         
     | 
| 
      
 223 
     | 
    
         
            +
                # @return [String, nil] File extension in lowercase without leading dot
         
     | 
| 
       197 
224 
     | 
    
         
             
                def file_extension(path)
         
     | 
| 
       198 
     | 
    
         
            -
                   
     | 
| 
       199 
     | 
    
         
            -
                   
     | 
| 
      
 225 
     | 
    
         
            +
                  return nil if path.nil? || path.empty?
         
     | 
| 
      
 226 
     | 
    
         
            +
                  
         
     | 
| 
      
 227 
     | 
    
         
            +
                  # Handle trailing whitespace
         
     | 
| 
      
 228 
     | 
    
         
            +
                  clean_path = path.strip
         
     | 
| 
      
 229 
     | 
    
         
            +
                  
         
     | 
| 
      
 230 
     | 
    
         
            +
                  # Handle trailing slashes (directory indicator)
         
     | 
| 
      
 231 
     | 
    
         
            +
                  return nil if clean_path.end_with?('/')
         
     | 
| 
      
 232 
     | 
    
         
            +
                  
         
     | 
| 
      
 233 
     | 
    
         
            +
                  # Get the extension
         
     | 
| 
      
 234 
     | 
    
         
            +
                  ext = File.extname(clean_path)
         
     | 
| 
      
 235 
     | 
    
         
            +
                  
         
     | 
| 
      
 236 
     | 
    
         
            +
                  # Handle special cases
         
     | 
| 
      
 237 
     | 
    
         
            +
                  if ext.empty?
         
     | 
| 
      
 238 
     | 
    
         
            +
                    # Check for hidden files like .gitignore (the whole name after dot is the "extension")
         
     | 
| 
      
 239 
     | 
    
         
            +
                    basename = File.basename(clean_path)
         
     | 
| 
      
 240 
     | 
    
         
            +
                    if basename.start_with?('.') && basename.length > 1 && !basename[1..-1].include?('.')
         
     | 
| 
      
 241 
     | 
    
         
            +
                      return basename[1..-1].downcase
         
     | 
| 
      
 242 
     | 
    
         
            +
                    end
         
     | 
| 
      
 243 
     | 
    
         
            +
                    return nil
         
     | 
| 
      
 244 
     | 
    
         
            +
                  elsif ext == '.'
         
     | 
| 
      
 245 
     | 
    
         
            +
                    # File ends with a dot but no extension
         
     | 
| 
      
 246 
     | 
    
         
            +
                    return nil
         
     | 
| 
      
 247 
     | 
    
         
            +
                  else
         
     | 
| 
      
 248 
     | 
    
         
            +
                    # Normal extension, remove the dot and downcase
         
     | 
| 
      
 249 
     | 
    
         
            +
                    ext[1..-1].downcase
         
     | 
| 
      
 250 
     | 
    
         
            +
                  end
         
     | 
| 
       200 
251 
     | 
    
         
             
                end
         
     | 
| 
       201 
252 
     | 
    
         
             
              end
         
     | 
| 
       202 
253 
     | 
    
         
             
            end
         
     | 
    
        data/lib/parsekit/version.rb
    CHANGED
    
    
    
        data/lib/parsekit.rb
    CHANGED
    
    | 
         @@ -14,6 +14,22 @@ require_relative "parsekit/parser" 
     | 
|
| 
       14 
14 
     | 
    
         | 
| 
       15 
15 
     | 
    
         
             
            # ParseKit is a Ruby document parsing toolkit with PDF and OCR support
         
     | 
| 
       16 
16 
     | 
    
         
             
            module ParseKit
         
     | 
| 
      
 17 
     | 
    
         
            +
              # Supported file formats and their extensions
         
     | 
| 
      
 18 
     | 
    
         
            +
              SUPPORTED_FORMATS = {
         
     | 
| 
      
 19 
     | 
    
         
            +
                pdf: ['.pdf'],
         
     | 
| 
      
 20 
     | 
    
         
            +
                docx: ['.docx'],
         
     | 
| 
      
 21 
     | 
    
         
            +
                xlsx: ['.xlsx'],
         
     | 
| 
      
 22 
     | 
    
         
            +
                xls: ['.xls'],
         
     | 
| 
      
 23 
     | 
    
         
            +
                pptx: ['.pptx'],
         
     | 
| 
      
 24 
     | 
    
         
            +
                png: ['.png'],
         
     | 
| 
      
 25 
     | 
    
         
            +
                jpeg: ['.jpg', '.jpeg'],
         
     | 
| 
      
 26 
     | 
    
         
            +
                tiff: ['.tiff', '.tif'],
         
     | 
| 
      
 27 
     | 
    
         
            +
                bmp: ['.bmp'],
         
     | 
| 
      
 28 
     | 
    
         
            +
                json: ['.json'],
         
     | 
| 
      
 29 
     | 
    
         
            +
                xml: ['.xml', '.html'],
         
     | 
| 
      
 30 
     | 
    
         
            +
                text: ['.txt', '.md', '.csv']
         
     | 
| 
      
 31 
     | 
    
         
            +
              }.freeze
         
     | 
| 
      
 32 
     | 
    
         
            +
             
     | 
| 
       17 
33 
     | 
    
         
             
              class << self
         
     | 
| 
       18 
34 
     | 
    
         
             
                # The parse_file and parse_bytes methods are defined in the native extension
         
     | 
| 
       19 
35 
     | 
    
         
             
                # We just need to document them here or add wrapper logic if needed
         
     | 
| 
         @@ -50,6 +66,22 @@ module ParseKit 
     | 
|
| 
       50 
66 
     | 
    
         
             
                  Parser.new.supports_file?(path)
         
     | 
| 
       51 
67 
     | 
    
         
             
                end
         
     | 
| 
       52 
68 
     | 
    
         | 
| 
      
 69 
     | 
    
         
            +
                # Detect file format from filename/extension
         
     | 
| 
      
 70 
     | 
    
         
            +
                # @param filename [String, nil] The filename to check
         
     | 
| 
      
 71 
     | 
    
         
            +
                # @return [Symbol] The detected format, or :unknown
         
     | 
| 
      
 72 
     | 
    
         
            +
                def detect_format(filename)
         
     | 
| 
      
 73 
     | 
    
         
            +
                  return :unknown if filename.nil? || filename.empty?
         
     | 
| 
      
 74 
     | 
    
         
            +
                  
         
     | 
| 
      
 75 
     | 
    
         
            +
                  ext = File.extname(filename).downcase
         
     | 
| 
      
 76 
     | 
    
         
            +
                  return :unknown if ext.empty?
         
     | 
| 
      
 77 
     | 
    
         
            +
                  
         
     | 
| 
      
 78 
     | 
    
         
            +
                  SUPPORTED_FORMATS.each do |format, extensions|
         
     | 
| 
      
 79 
     | 
    
         
            +
                    return format if extensions.include?(ext)
         
     | 
| 
      
 80 
     | 
    
         
            +
                  end
         
     | 
| 
      
 81 
     | 
    
         
            +
                  
         
     | 
| 
      
 82 
     | 
    
         
            +
                  :unknown
         
     | 
| 
      
 83 
     | 
    
         
            +
                end
         
     | 
| 
      
 84 
     | 
    
         
            +
                
         
     | 
| 
       53 
85 
     | 
    
         
             
                # Get the native library version
         
     | 
| 
       54 
86 
     | 
    
         
             
                # @return [String] Version of the native library
         
     | 
| 
       55 
87 
     | 
    
         
             
                def native_version
         
     | 
    
        metadata
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: parsekit
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 0.1. 
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.1.1
         
     | 
| 
       5 
5 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       6 
6 
     | 
    
         
             
            authors:
         
     | 
| 
       7 
7 
     | 
    
         
             
            - Chris Petersen
         
     | 
| 
         @@ -96,9 +96,11 @@ files: 
     | 
|
| 
       96 
96 
     | 
    
         
             
            - ext/parsekit/Cargo.toml
         
     | 
| 
       97 
97 
     | 
    
         
             
            - ext/parsekit/extconf.rb
         
     | 
| 
       98 
98 
     | 
    
         
             
            - ext/parsekit/src/error.rs
         
     | 
| 
      
 99 
     | 
    
         
            +
            - ext/parsekit/src/format_detector.rs
         
     | 
| 
       99 
100 
     | 
    
         
             
            - ext/parsekit/src/lib.rs
         
     | 
| 
       100 
101 
     | 
    
         
             
            - ext/parsekit/src/parser.rs
         
     | 
| 
       101 
102 
     | 
    
         
             
            - lib/parsekit.rb
         
     | 
| 
      
 103 
     | 
    
         
            +
            - lib/parsekit/NATIVE_API.md
         
     | 
| 
       102 
104 
     | 
    
         
             
            - lib/parsekit/error.rb
         
     | 
| 
       103 
105 
     | 
    
         
             
            - lib/parsekit/parsekit.bundle
         
     | 
| 
       104 
106 
     | 
    
         
             
            - lib/parsekit/parser.rb
         
     |