RubyGems - parsekit - Versions diffs - 0.1.0 → 0.1.2 - Mend

parsekit 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +2 -2
data/README.md +1 -1
data/ext/parsekit/Cargo.toml +1 -1
data/ext/parsekit/src/format_detector.rs +233 -0
data/ext/parsekit/src/lib.rs +1 -0
data/ext/parsekit/src/parser.rs +111 -181
data/lib/parsekit/NATIVE_API.md +125 -0
data/lib/parsekit/parsekit.bundle +0 -0
data/lib/parsekit/parser.rb +155 -104
data/lib/parsekit/version.rb +1 -1
data/lib/parsekit.rb +32 -0
metadata +7 -5

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 35c0708c088075c883b3b35c7d76f1573f29a19bf65ac0b89b636a5b76cee662
-  data.tar.gz: b1ddf9260329239c3a1e791f3ed3249b3577cb210f4c898677316fe55cc951f4
+  metadata.gz: 6ad6eb42fb7e96fa944f30245b2c7be51bf4ce1a0f7766749309676b225b17df
+  data.tar.gz: deb56ea394ac3fedc840e890e4d27de14585661233f19eeaae06baf7be1b1e90
 SHA512:
-  metadata.gz: 2fe76f5b28927e3989502b0ea5f084f5bfc265aae9a65aaba47349e3e540e8150612d75f8f4ddcdc38be7edd9ae7edbf42220ba95b42a535dbc200503759c419
-  data.tar.gz: e5b9e8eff90f8583f8289bea5100ac43434978ebba814bf9198fb92cc622a9b4fa6e99e28fe2ed31ffa0040c3ac48a38c8361bc1994200059a23d040440a64cc
+  metadata.gz: dc88b902dd12008a6936f4d62f5d4651544a3f463b725a15d385b919141e93873bd809436e6b9b008baa7b310d149becb2106a29ca103736f6525e09bef871d6
+  data.tar.gz: 9cbc5464a5cbe06a241d2253cde81da82c7eb75742654b7753c91a922acc87125f81a33c3e77d0d107a1435e8946a860e12388e44fa84dc887d9bb4bf9d2d3a2

data/CHANGELOG.md CHANGED Viewed

@@ -49,5 +49,5 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Rust edition 2021
 - Cross-compilation support for multiple platforms
-[Unreleased]: https://github.com/cpetersen/parsekit/compare/v0.1.0...HEAD
-[0.1.0]: https://github.com/cpetersen/parsekit/releases/tag/v0.1.0
+[Unreleased]: https://github.com/scientist-labs/parsekit/compare/v0.1.0...HEAD
+[0.1.0]: https://github.com/scientist-labs/parsekit/releases/tag/v0.1.0

data/README.md CHANGED Viewed

@@ -186,7 +186,7 @@ ParseKit uses a hybrid Ruby/Rust architecture:
 ## Contributing
-Bug reports and pull requests are welcome on GitHub at https://github.com/cpetersen/parsekit.
+Bug reports and pull requests are welcome on GitHub at https://github.com/scientist-labs/parsekit.
 ## License

data/ext/parsekit/Cargo.toml CHANGED Viewed

@@ -21,7 +21,7 @@ image = "0.25"  # Image processing library (match rusty-tesseract's version)
 calamine = "0.30"  # Excel parsing
 docx-rs = "0.4"  # Word document parsing
 quick-xml = "0.38"  # XML parsing
-zip = "2.1"  # ZIP archive handling for PPTX
+zip = "5.0"  # ZIP archive handling for PPTX
 serde_json = "1.0"  # JSON parsing
 regex = "1.10"  # Text parsing
 encoding_rs = "0.8"  # Encoding detection

data/ext/parsekit/src/format_detector.rs ADDED Viewed

@@ -0,0 +1,233 @@
+use std::path::Path;
+/// Represents a detected file format.
+///
+/// The enum carries no data, so it is cheap to copy, compare and use as a
+/// hash-map key. `Unknown` is the result when nothing identifies the input.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum FileFormat {
+    /// PDF document (`.pdf`, `%PDF` signature).
+    Pdf,
+    /// Word document (`.docx`).
+    Docx,
+    /// Excel workbook (`.xlsx`).
+    Xlsx,
+    /// Legacy Excel workbook (`.xls`); also reported for OLE compound documents.
+    Xls,
+    /// PowerPoint presentation (`.pptx`).
+    Pptx,
+    /// PNG image (`.png`).
+    Png,
+    /// JPEG image (`.jpg`, `.jpeg`).
+    Jpeg,
+    /// TIFF image (`.tiff`, `.tif`).
+    Tiff,
+    /// BMP image (`.bmp`).
+    Bmp,
+    /// JSON document (`.json`).
+    Json,
+    /// XML document (`.xml`).
+    Xml,
+    /// HTML document (`.html`, `.htm`).
+    Html,
+    /// Plain text (`.txt`, `.text`, `.md`, `.markdown`, `.csv`) and the
+    /// fallback for content with no recognised signature.
+    Text,
+    /// No format could be determined.
+    Unknown,
+}
+impl FileFormat {
+    /// Returns the lowercase name under which this format is reported to Ruby.
+    ///
+    /// `Html` shares the `"xml"` name with `Xml` because HTML is treated as
+    /// XML in Ruby.
+    pub fn to_symbol(&self) -> &'static str {
+        match self {
+            // Markup: both variants are reported under one name
+            Self::Xml | Self::Html => "xml",
+            Self::Json => "json",
+            // Documents
+            Self::Pdf => "pdf",
+            Self::Docx => "docx",
+            Self::Pptx => "pptx",
+            Self::Xlsx => "xlsx",
+            Self::Xls => "xls",
+            // Images
+            Self::Png => "png",
+            Self::Jpeg => "jpeg",
+            Self::Tiff => "tiff",
+            Self::Bmp => "bmp",
+            // Fallbacks
+            Self::Text => "text",
+            Self::Unknown => "unknown",
+        }
+    }
+}
+/// Central format detection logic.
+///
+/// A unit struct with no state: every detection routine is an associated
+/// function called as `FormatDetector::...`.
+pub struct FormatDetector;
+impl FormatDetector {
+    /// Detect format from an optional filename and optional content.
+    ///
+    /// Resolution order:
+    /// 1. A specific (non-`Text`) format sniffed from `content` wins.
+    /// 2. Otherwise the extension of `filename` is used if it is recognised.
+    /// 3. Otherwise `Text` is returned when content was supplied and sniffed as text.
+    /// 4. Otherwise `Unknown`.
+    pub fn detect(filename: Option<&str>, content: Option<&[u8]>) -> FileFormat {
+        // Sniff the content once: the verdict is needed both for the first
+        // check and for the final text fallback.
+        let sniffed = content.map(Self::detect_from_content);
+        let content_is_text = matches!(sniffed, Some(FileFormat::Text));
+        // If we got a definitive format from content, use it
+        if let Some(format) = sniffed {
+            if !matches!(format, FileFormat::Text | FileFormat::Unknown) {
+                return format;
+            }
+        }
+        // Fall back to extension-based detection
+        let by_extension = filename.map_or(FileFormat::Unknown, Self::detect_from_extension);
+        if by_extension != FileFormat::Unknown {
+            return by_extension;
+        }
+        // Content looked like text and no extension matched: report text
+        if content_is_text {
+            FileFormat::Text
+        } else {
+            FileFormat::Unknown
+        }
+    }
+    /// Detect format from the file extension alone.
+    ///
+    /// The extension is lowercased before matching, so `Document.PDF` and
+    /// `document.pdf` give the same result. A name without an extension, or
+    /// with an unrecognised one, yields `FileFormat::Unknown`.
+    pub fn detect_from_extension(filename: &str) -> FileFormat {
+        let lowered = Path::new(filename)
+            .extension()
+            .and_then(|os| os.to_str())
+            .map(str::to_lowercase);
+        match lowered.as_deref() {
+            Some("pdf") => FileFormat::Pdf,
+            Some("docx") => FileFormat::Docx,
+            Some("xlsx") => FileFormat::Xlsx,
+            Some("xls") => FileFormat::Xls,
+            Some("pptx") => FileFormat::Pptx,
+            Some("png") => FileFormat::Png,
+            Some("jpg" | "jpeg") => FileFormat::Jpeg,
+            Some("tiff" | "tif") => FileFormat::Tiff,
+            Some("bmp") => FileFormat::Bmp,
+            Some("json") => FileFormat::Json,
+            Some("xml") => FileFormat::Xml,
+            Some("html" | "htm") => FileFormat::Html,
+            Some("txt" | "text" | "md" | "markdown" | "csv") => FileFormat::Text,
+            Some(_) | None => FileFormat::Unknown,
+        }
+    }
+    /// Detect format from file content (magic bytes, then leading markup).
+    ///
+    /// Binary signatures are tested first, in a fixed order. If none matches,
+    /// leading whitespace (space, tab, CR, LF) is skipped and the remaining
+    /// bytes are checked for XML, HTML and JSON openers. Anything else,
+    /// including empty input, is reported as `FileFormat::Text`; this function
+    /// never returns `FileFormat::Unknown`.
+    pub fn detect_from_content(data: &[u8]) -> FileFormat {
+        if data.is_empty() {
+            return FileFormat::Text; // Empty files are treated as text
+        }
+        // `starts_with` is simply false when the input is shorter than the
+        // signature, so no separate length checks are needed below.
+        // PDF
+        if data.starts_with(b"%PDF") {
+            return FileFormat::Pdf;
+        }
+        // PNG
+        if data.starts_with(&[0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A]) {
+            return FileFormat::Png;
+        }
+        // JPEG
+        if data.starts_with(&[0xFF, 0xD8, 0xFF]) {
+            return FileFormat::Jpeg;
+        }
+        // BMP
+        // NOTE(review): two-byte signature; plain text starting with "BM" also matches.
+        if data.starts_with(b"BM") {
+            return FileFormat::Bmp;
+        }
+        // TIFF (little-endian or big-endian)
+        if data.starts_with(b"II\x2A\x00") || data.starts_with(b"MM\x00\x2A") {
+            return FileFormat::Tiff;
+        }
+        // OLE Compound Document (old Excel/Word)
+        if data.starts_with(&[0xD0, 0xCF, 0x11, 0xE0]) {
+            return FileFormat::Xls; // Old Office format, usually Excel
+        }
+        // ZIP archive (could be DOCX, XLSX, PPTX)
+        if data.starts_with(b"PK") {
+            return Self::detect_office_format(data);
+        }
+        // Text-based formats: skip leading whitespace once and reuse the
+        // trimmed view for the XML, HTML and JSON checks.
+        let text_start = data
+            .iter()
+            .position(|b| !b" \t\n\r".contains(b))
+            .unwrap_or(data.len());
+        let text = &data[text_start..];
+        // XML declaration
+        if text.starts_with(b"<?xml") {
+            return FileFormat::Xml;
+        }
+        // HTML: must be tested before the generic "<!" XML check, because
+        // "<!DOCTYPE html" also starts with "<!". 14 is the length of
+        // "<!doctype html"; the comparison is ASCII case-insensitive.
+        let head = text[..text.len().min(14)].to_ascii_lowercase();
+        if head.starts_with(b"<!doctype html") || head.windows(5).any(|w| w == b"<html") {
+            return FileFormat::Html;
+        }
+        // Other markup declarations (comments, non-HTML doctypes) count as XML
+        if text.starts_with(b"<!") {
+            return FileFormat::Xml;
+        }
+        // JSON if the first non-whitespace byte opens an object or array;
+        // default to text for unrecognized formats
+        match text.first() {
+            Some(b'{') | Some(b'[') => FileFormat::Json,
+            _ => FileFormat::Text,
+        }
+    }
+    /// Detect specific Office format from ZIP data.
+    ///
+    /// Scans at most the first 2000 bytes for the directory prefixes `word/`,
+    /// `xl/` and `ppt/`, in that order, and returns the first match. When none
+    /// is found the archive is reported as `FileFormat::Xlsx`.
+    fn detect_office_format(data: &[u8]) -> FileFormat {
+        /// True when `needle` occurs as a contiguous byte run in `haystack`.
+        fn contains(haystack: &[u8], needle: &[u8]) -> bool {
+            haystack.windows(needle.len()).any(|w| w == needle)
+        }
+        // Look for Office-specific directory names near the start of the ZIP.
+        // NOTE(review): assumes an entry name under one of these directories
+        // appears within the first 2000 bytes - confirm for large archives.
+        let head = &data[..data.len().min(2000)];
+        // A match on "word/" already covers "word/_rels" (likewise for the
+        // other two), so only the short prefixes are tested.
+        if contains(head, b"word/") {
+            FileFormat::Docx
+        } else if contains(head, b"xl/") {
+            FileFormat::Xlsx
+        } else if contains(head, b"ppt/") {
+            FileFormat::Pptx
+        } else {
+            // Default to XLSX for generic ZIP (most common Office format)
+            FileFormat::Xlsx
+        }
+    }
+    /// Lists every extension that extension-based detection recognises,
+    /// grouped as documents, images, structured text and plain text.
+    pub fn supported_extensions() -> Vec<&'static str> {
+        const DOCUMENTS: [&str; 5] = ["pdf", "docx", "xlsx", "xls", "pptx"];
+        const IMAGES: [&str; 6] = ["png", "jpg", "jpeg", "tiff", "tif", "bmp"];
+        const STRUCTURED: [&str; 4] = ["json", "xml", "html", "htm"];
+        const PLAIN_TEXT: [&str; 5] = ["txt", "text", "md", "markdown", "csv"];
+        [&DOCUMENTS[..], &IMAGES[..], &STRUCTURED[..], &PLAIN_TEXT[..]].concat()
+    }
+}
+#[cfg(test)]
+mod tests {
+    use super::*;
+    /// A `%PDF` header is recognised from content alone.
+    #[test]
+    fn test_detect_pdf() {
+        let detected = FormatDetector::detect_from_content(b"%PDF-1.5\n");
+        assert_eq!(detected, FileFormat::Pdf);
+    }
+    /// The full eight-byte PNG signature is recognised.
+    #[test]
+    fn test_detect_png() {
+        const PNG_SIGNATURE: [u8; 8] = [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A];
+        let detected = FormatDetector::detect_from_content(&PNG_SIGNATURE);
+        assert_eq!(detected, FileFormat::Png);
+    }
+    /// Extensions map to formats regardless of letter case.
+    #[test]
+    fn test_detect_from_extension() {
+        let cases = [
+            ("document.pdf", FileFormat::Pdf),
+            ("Document.PDF", FileFormat::Pdf),
+            ("data.xlsx", FileFormat::Xlsx),
+        ];
+        for (name, expected) in &cases {
+            assert_eq!(FormatDetector::detect_from_extension(name), *expected);
+        }
+    }
+    /// Empty input is classified as text.
+    #[test]
+    fn test_empty_data() {
+        let empty: &[u8] = &[];
+        assert_eq!(FormatDetector::detect_from_content(empty), FileFormat::Text);
+    }
+}

data/ext/parsekit/src/lib.rs CHANGED Viewed

@@ -2,6 +2,7 @@ use magnus::{function, prelude::*, Error, Ruby};
 mod parser;
 mod error;
+mod format_detector;
 /// Initialize the ParseKit module and its submodules
 #[magnus::init]

data/ext/parsekit/src/parser.rs CHANGED Viewed

@@ -1,7 +1,7 @@
 use magnus::{
     function, method, prelude::*, scan_args, Error, Module, RHash, RModule, Ruby, Value,
 };
-use std::path::Path;
+use crate::format_detector::{FileFormat, FormatDetector};
 #[derive(Debug, Clone)]
 #[magnus::wrap(class = "ParseKit::Parser", free_immediately, size)]
@@ -28,6 +28,33 @@ impl Default for ParserConfig {
     }
 }
+// Helpers that wrap a message in the matching Ruby exception class.
+// Each one panics if `Ruby::get()` fails (the result is unwrapped).
+// NOTE(review): presumably that only happens off a Ruby thread - confirm against magnus.
+impl Parser {
+    /// Builds a Ruby `RuntimeError` with the message `"<context>: <err>"`.
+    fn runtime_error<E>(context: &str, err: E) -> Error
+    where
+        E: std::fmt::Display,
+    {
+        let class = Ruby::get().unwrap().exception_runtime_error();
+        Error::new(class, format!("{}: {}", context, err))
+    }
+    /// Builds a Ruby `ArgumentError` carrying `msg` unchanged.
+    fn argument_error(msg: &str) -> Error {
+        let class = Ruby::get().unwrap().exception_arg_error();
+        Error::new(class, String::from(msg))
+    }
+    /// Builds a Ruby `IOError` with the message `"<context>: <err>"`.
+    fn io_error<E>(context: &str, err: E) -> Error
+    where
+        E: std::fmt::Display,
+    {
+        let class = Ruby::get().unwrap().exception_io_error();
+        Error::new(class, format!("{}: {}", context, err))
+    }
+}
 impl Parser {
     /// Create a new Parser instance with optional configuration
     fn new(ruby: &Ruby, args: &[Value]) -> Result<Self, Error> {
@@ -58,73 +85,49 @@ impl Parser {
     fn parse_bytes_internal(&self, data: Vec<u8>, filename: Option<&str>) -> Result<String, Error> {
         // Check size limit
         if data.len() > self.config.max_size {
-            return Err(Error::new(
-                Ruby::get().unwrap().exception_runtime_error(),
-                format!(
-                    "File size {} exceeds maximum allowed size {}",
-                    data.len(),
-                    self.config.max_size
-                ),
+            return Err(Self::runtime_error(
+                "File size exceeds limit",
+                format!("{} bytes exceeds maximum allowed size of {} bytes",
+                    data.len(), self.config.max_size)
             ));
         }
-        // Detect file type from extension or content
-        let file_type = if let Some(name) = filename {
-            Self::detect_type_from_filename(name)
-        } else {
-            Self::detect_type_from_content(&data)
-        };
-        match file_type.as_str() {
-            "pdf" => self.parse_pdf(data),
-            "docx" => self.parse_docx(data),
-            "pptx" => self.parse_pptx(data),
-            "xlsx" | "xls" => self.parse_xlsx(data),
-            "json" => self.parse_json(data),
-            "xml" | "html" => self.parse_xml(data),
-            "png" | "jpg" | "jpeg" | "tiff" | "bmp" => self.ocr_image(data),
-            "txt" | "text" => self.parse_text(data),
-            _ => self.parse_text(data), // Default to text parsing
-        }
+        // Use centralized format detection
+        let format = FormatDetector::detect(filename, Some(&data));
+        // Use centralized dispatch
+        self.dispatch_to_parser(format, data)
     }
-    /// Detect file type from filename extension
-    fn detect_type_from_filename(filename: &str) -> String {
-        let path = Path::new(filename);
-        match path.extension().and_then(|s| s.to_str()) {
-            Some(ext) => ext.to_lowercase(),
-            None => "txt".to_string(),
+    /// Centralized dispatch logic - routes format to appropriate parser
+    ///
+    /// Takes ownership of `data` and hands it to exactly one parser:
+    /// `Xlsx` and `Xls` share `parse_xlsx`, `Xml` and `Html` share `parse_xml`,
+    /// every image format goes to `ocr_image`, and `Text` / `Unknown` fall
+    /// back to `parse_text`. The chosen parser's result is returned as is.
+    fn dispatch_to_parser(&self, format: FileFormat, data: Vec<u8>) -> Result<String, Error> {
+        match format {
+            FileFormat::Pdf => self.parse_pdf(data),
+            FileFormat::Docx => self.parse_docx(data),
+            FileFormat::Pptx => self.parse_pptx(data),
+            FileFormat::Xlsx | FileFormat::Xls => self.parse_xlsx(data),
+            FileFormat::Json => self.parse_json(data),
+            FileFormat::Xml | FileFormat::Html => self.parse_xml(data),
+            FileFormat::Png | FileFormat::Jpeg | FileFormat::Tiff | FileFormat::Bmp => self.ocr_image(data),
+            FileFormat::Text | FileFormat::Unknown => self.parse_text(data),
         }
     }
-    /// Detect file type from content (basic detection)
-    fn detect_type_from_content(data: &[u8]) -> String {
-        if data.starts_with(b"%PDF") {
-            "pdf".to_string()
-        } else if data.starts_with(b"PK") {
-            // PK is the ZIP signature - could be DOCX or XLSX
-            // Try to differentiate by looking for common patterns
-            // This is a simplified check - both DOCX and XLSX are ZIP files
-            // For now, default to xlsx as it's more commonly parsed
-            "xlsx".to_string() // Office Open XML format (could also be DOCX)
-        } else if data.starts_with(&[0xD0, 0xCF, 0x11, 0xE0]) {
-            "xls".to_string() // Old Excel format
-        } else if data.starts_with(&[0x89, 0x50, 0x4E, 0x47]) {
-            "png".to_string() // PNG signature
-        } else if data.starts_with(&[0xFF, 0xD8, 0xFF]) {
-            "jpg".to_string() // JPEG signature
-        } else if data.starts_with(b"BM") {
-            "bmp".to_string() // BMP signature
-        } else if data.starts_with(b"II\x2A\x00") || data.starts_with(b"MM\x00\x2A") {
-            "tiff".to_string() // TIFF signature (little-endian or big-endian)
-        } else if data.starts_with(b"<?xml") || data.starts_with(b"<html") {
-            "xml".to_string()
-        } else if data.starts_with(b"{") || data.starts_with(b"[") {
-            "json".to_string()
-        } else {
-            "txt".to_string()
+    /// Ruby-accessible method to detect format from bytes
+    ///
+    /// Only the content is inspected (no filename is involved). The result is
+    /// the name from `FileFormat::to_symbol`, except that `Xls` is reported
+    /// as `"xlsx"` rather than `"xls"`.
+    fn detect_format_from_bytes(&self, data: Vec<u8>) -> String {
+        let format = FormatDetector::detect_from_content(&data);
+        // For compatibility with Ruby tests, return "xlsx" for old Excel
+        match format {
+            FileFormat::Xls => "xlsx".to_string(),  // Compatibility with existing tests
+            _ => format.to_symbol().to_string(),
         }
     }
+    /// Ruby-accessible method reporting the format implied by a filename's
+    /// extension, using the name from `FileFormat::to_symbol`
+    fn detect_format_from_filename(&self, filename: String) -> String {
+        FormatDetector::detect_from_extension(&filename)
+            .to_symbol()
+            .to_owned()
+    }
     /// Perform OCR on image data using Tesseract
     fn ocr_image(&self, data: Vec<u8>) -> Result<String, Error> {
@@ -191,20 +194,12 @@ impl Parser {
         };
         if let Err(e) = init_result {
-            return Err(Error::new(
-                Ruby::get().unwrap().exception_runtime_error(),
-                format!("Failed to initialize Tesseract: {:?}", e),
-            ))
+            return Err(Self::runtime_error("Failed to initialize Tesseract", e));
         }
         // Load the image from bytes
-        let img = match image::load_from_memory(&data) {
-            Ok(img) => img,
-            Err(e) => return Err(Error::new(
-                Ruby::get().unwrap().exception_runtime_error(),
-                format!("Failed to load image: {}", e),
-            ))
-        };
+        let img = image::load_from_memory(&data)
+            .map_err(|e| Self::runtime_error("Failed to load image", e))?;
         // Convert to RGBA8 format
         let rgba_img = img.to_rgba8();
@@ -212,27 +207,18 @@ impl Parser {
         let raw_data = rgba_img.into_raw();
         // Set image data
-        if let Err(e) = tesseract.set_image(
+        tesseract.set_image(
             &raw_data,
             width as i32,
             height as i32,
             4,  // bytes per pixel (RGBA)
             (width * 4) as i32,  // bytes per line
-        ) {
-            return Err(Error::new(
-                Ruby::get().unwrap().exception_runtime_error(),
-                format!("Failed to set image: {}", e),
-            ))
-        }
+        ).map_err(|e| Self::runtime_error("Failed to set image", e))?;
         // Extract text
-        match tesseract.get_utf8_text() {
-            Ok(text) => Ok(text.trim().to_string()),
-            Err(e) => Err(Error::new(
-                Ruby::get().unwrap().exception_runtime_error(),
-                format!("Failed to perform OCR: {}", e),
-            )),
-        }
+        tesseract.get_utf8_text()
+            .map(|text| text.trim().to_string())
+            .map_err(|e| Self::runtime_error("Failed to perform OCR", e))
     }
@@ -242,51 +228,31 @@ impl Parser {
         // Try to load the PDF from memory
         // The magic parameter helps MuPDF identify the file type
-        match Document::from_bytes(&data, "pdf") {
-            Ok(doc) => {
-                let mut all_text = String::new();
-                // Get page count - this returns a Result
-                let page_count = match doc.page_count() {
-                    Ok(count) => count,
-                    Err(e) => {
-                        return Err(Error::new(
-                            Ruby::get().unwrap().exception_runtime_error(),
-                            format!("Failed to get page count: {}", e),
-                        ))
-                    }
-                };
-                // Iterate through pages
-                for page_num in 0..page_count {
-                    match doc.load_page(page_num) {
-                        Ok(page) => {
-                            // Extract text from the page
-                            match page.to_text() {
-                                Ok(text) => {
-                                    all_text.push_str(&text);
-                                    all_text.push('\n');
-                                }
-                                Err(_) => continue,
-                            }
-                        }
-                        Err(_) => continue,
-                    }
-                }
-                if all_text.is_empty() {
-                    Ok(
-                        "PDF contains no extractable text (might be scanned/image-based)"
-                            .to_string(),
-                    )
-                } else {
-                    Ok(all_text.trim().to_string())
+        let doc = Document::from_bytes(&data, "pdf")
+            .map_err(|e| Self::runtime_error("Failed to parse PDF", e))?;
+        let mut all_text = String::new();
+        // Get page count
+        let page_count = doc.page_count()
+            .map_err(|e| Self::runtime_error("Failed to get page count", e))?;
+        // Iterate through pages
+        for page_num in 0..page_count {
+            // Continue on page errors rather than failing entirely
+            if let Ok(page) = doc.load_page(page_num) {
+                // Extract text from the page
+                if let Ok(text) = page.to_text() {
+                    all_text.push_str(&text);
+                    all_text.push('\n');
                 }
             }
-            Err(e) => Err(Error::new(
-                Ruby::get().unwrap().exception_runtime_error(),
-                format!("Failed to parse PDF: {}", e),
-            )),
+        }
+        if all_text.is_empty() {
+            Ok("PDF contains no extractable text (might be scanned/image-based)".to_string())
+        } else {
+            Ok(all_text.trim().to_string())
         }
     }
@@ -322,10 +288,7 @@ impl Parser {
                 Ok(result.trim().to_string())
             }
-            Err(e) => Err(Error::new(
-                Ruby::get().unwrap().exception_runtime_error(),
-                format!("Failed to parse DOCX file: {}", e),
-            )),
+            Err(e) => Err(Self::runtime_error("Failed to parse DOCX file", e)),
         }
     }
@@ -335,15 +298,8 @@ impl Parser {
         use zip::ZipArchive;
         let cursor = Cursor::new(data);
-        let mut archive = match ZipArchive::new(cursor) {
-            Ok(archive) => archive,
-            Err(e) => {
-                return Err(Error::new(
-                    Ruby::get().unwrap().exception_runtime_error(),
-                    format!("Failed to open PPTX as ZIP: {}", e),
-                ))
-            }
-        };
+        let mut archive = ZipArchive::new(cursor)
+            .map_err(|e| Self::runtime_error("Failed to open PPTX as ZIP", e))?;
         let mut all_text = Vec::new();
         let mut slide_numbers = Vec::new();
@@ -492,10 +448,7 @@ impl Parser {
                 Ok(result)
             }
-            Err(e) => Err(Error::new(
-                Ruby::get().unwrap().exception_runtime_error(),
-                format!("Failed to parse Excel file: {}", e),
-            )),
+            Err(e) => Err(Self::runtime_error("Failed to parse Excel file", e)),
         }
     }
@@ -527,10 +480,7 @@ impl Parser {
                 }
                 Ok(Event::Eof) => break,
                 Err(e) => {
-                    return Err(Error::new(
-                        Ruby::get().unwrap().exception_runtime_error(),
-                        format!("XML parse error: {}", e),
-                    ))
+                    return Err(Self::runtime_error("XML parse error", e))
                 }
                 _ => {}
             }
@@ -557,10 +507,7 @@ impl Parser {
     /// Parse input string (for text content)
     fn parse(&self, input: String) -> Result<String, Error> {
         if input.is_empty() {
-            return Err(Error::new(
-                Ruby::get().unwrap().exception_arg_error(),
-                "Input cannot be empty",
-            ));
+            return Err(Self::argument_error("Input cannot be empty"));
         }
         // For string input, just return cleaned text
@@ -576,12 +523,8 @@ impl Parser {
     fn parse_file(&self, path: String) -> Result<String, Error> {
         use std::fs;
-        let data = fs::read(&path).map_err(|e| {
-            Error::new(
-                Ruby::get().unwrap().exception_io_error(),
-                format!("Failed to read file: {}", e),
-            )
-        })?;
+        let data = fs::read(&path)
+            .map_err(|e| Self::io_error("Failed to read file", e))?;
         self.parse_bytes_internal(data, Some(&path))
     }
@@ -589,10 +532,7 @@ impl Parser {
     /// Parse bytes from Ruby
     fn parse_bytes(&self, data: Vec<u8>) -> Result<String, Error> {
         if data.is_empty() {
-            return Err(Error::new(
-                Ruby::get().unwrap().exception_arg_error(),
-                "Data cannot be empty",
-            ));
+            return Err(Self::argument_error("Data cannot be empty"));
         }
         self.parse_bytes_internal(data, None)
@@ -616,25 +556,11 @@ impl Parser {
     /// Check supported file types
     fn supported_formats() -> Vec<String> {
-        vec![
-            "txt".to_string(),
-            "json".to_string(),
-            "xml".to_string(),
-            "html".to_string(),
-            "htm".to_string(), // HTML files (alternative extension)
-            "md".to_string(),  // Markdown files
-            "docx".to_string(),
-            "pptx".to_string(),
-            "xlsx".to_string(),
-            "xls".to_string(),
-            "csv".to_string(),
-            "pdf".to_string(),  // Text extraction via MuPDF
-            "png".to_string(),  // OCR via Tesseract
-            "jpg".to_string(),  // OCR via Tesseract
-            "jpeg".to_string(), // OCR via Tesseract
-            "tiff".to_string(), // OCR via Tesseract
-            "bmp".to_string(),  // OCR via Tesseract
-        ]
+        // Use the centralized list from FormatDetector
+        FormatDetector::supported_extensions()
+            .iter()
+            .map(|&s| s.to_string())
+            .collect()
     }
     /// Detect if file extension is supported
@@ -688,6 +614,10 @@ pub fn init(_ruby: &Ruby, module: RModule) -> Result<(), Error> {
     class.define_method("parse_xml", method!(Parser::parse_xml, 1))?;
     class.define_method("parse_text", method!(Parser::parse_text, 1))?;
     class.define_method("ocr_image", method!(Parser::ocr_image, 1))?;
+    // Format detection methods
+    class.define_method("detect_format_from_bytes", method!(Parser::detect_format_from_bytes, 1))?;
+    class.define_method("detect_format_from_filename", method!(Parser::detect_format_from_filename, 1))?;
     // Class methods
     class.define_singleton_method("supported_formats", function!(Parser::supported_formats, 0))?;

data/lib/parsekit/NATIVE_API.md ADDED Viewed

@@ -0,0 +1,125 @@
+# ParseKit Native API Documentation
+This document describes the methods implemented in the Rust native extension for ParseKit::Parser.
+## Instance Methods
+### `initialize(options = {})`
+Initialize a new Parser instance with optional configuration.
+**Parameters:**
+- `options` [Hash] Configuration options
+  - `:encoding` [String] Input encoding (default: UTF-8)
+  - `:strict_mode` [Boolean] Enable strict parsing mode (default: false)
+  - `:max_depth` [Integer] Maximum nesting depth (default: 100)
+  - `:max_size` [Integer] Maximum file size in bytes (default: 100MB)
+### `parse(input)`
+Parse an input string (for text content).
+**Parameters:**
+- `input` [String] The input to parse
+**Returns:**
+- [String] The parsed result
+**Raises:**
+- `ArgumentError` If input is empty
+### `parse_file(path)`
+Parse a file (supports PDF, Office documents, text files, images with OCR).
+**Parameters:**
+- `path` [String] Path to the file to parse
+**Returns:**
+- [String] The extracted text content
+**Raises:**
+- `IOError` If file cannot be read
+- `RuntimeError` If parsing fails
+### `parse_bytes(data)`
+Parse binary data.
+**Parameters:**
+- `data` [Array<Integer>] Binary data as byte array
+**Returns:**
+- [String] The extracted text content
+**Raises:**
+- `ArgumentError` If data is empty
+- `RuntimeError` If parsing fails
+### `config`
+Get the current parser configuration.
+**Returns:**
+- [Hash] The parser configuration including encoding, strict_mode, max_depth, and max_size
+### `supports_file?(path)`
+Check if a file format is supported.
+**Parameters:**
+- `path` [String] File path to check
+**Returns:**
+- [Boolean] True if the file format is supported
+### `strict_mode?`
+Check if strict mode is enabled.
+**Returns:**
+- [Boolean] True if strict mode is enabled
+## Format-Specific Parsers
+These methods are also available but typically called internally via `parse_file` or `parse_bytes`:
+### `parse_pdf(data)`
+Parse PDF files using MuPDF (statically linked).
+### `parse_docx(data)`
+Parse Microsoft Word documents.
+### `parse_pptx(data)`
+Parse Microsoft PowerPoint presentations.
+### `parse_xlsx(data)`
+Parse Microsoft Excel spreadsheets.
+### `parse_json(data)`
+Parse and pretty-print JSON data.
+### `parse_xml(data)`
+Parse XML/HTML files and extract text content.
+### `parse_text(data)`
+Parse plain text files.
+### `ocr_image(data)`
+Perform OCR on images (PNG, JPEG, TIFF, BMP) using Tesseract.
+## Class Methods
+### `Parser.supported_formats`
+Get list of supported file formats.
+**Returns:**
+- [Array<String>] List of supported file extensions
+**Example:**
+```ruby
+ParseKit::Parser.supported_formats
+# => ["pdf", "docx", "xlsx", "xls", "pptx", "png", "jpg", "jpeg", "tiff", "tif", "bmp", "json", "xml", "html", "htm", "txt", "text", "md", "markdown", "csv"]
+```
+## Implementation Notes
+All these methods are implemented in Rust via the native extension. The Ruby layer (`lib/parsekit/parser.rb`) provides additional convenience methods and helpers that wrap these native methods.
+The native extension uses:
+- **MuPDF** for PDF parsing (statically linked)
+- **Tesseract** for OCR functionality (bundled)
+- **Various Rust crates** for Office document parsing (docx-rs, calamine, etc.)

data/lib/parsekit/parsekit.bundle CHANGED Viewed

Binary file

data/lib/parsekit/parser.rb CHANGED Viewed

@@ -3,65 +3,24 @@
 module ParseKit
   # Ruby wrapper for the native Parser class
   #
-  # The Ruby layer now handles format detection and routing to specific parsers,
-  # while Rust provides the actual parsing implementations.
+  # This class provides document parsing capabilities through a native Rust extension.
+  # For documentation of native methods, see NATIVE_API.md
+  #
+  # The Ruby layer provides convenience methods and helpers while the Rust
+  # extension handles the actual parsing of PDF, Office documents, images (OCR), etc.
   class Parser
-    # These methods are implemented in the native extension
-    # and are documented here for YARD
-    # Initialize a new Parser instance
-    # @param options [Hash] Configuration options
-    # @option options [String] :encoding Input encoding (default: UTF-8)
-    # def initialize(options = {})
-    #   # Implemented in native extension
-    # end
-    # Parse an input string (for text content)
-    # @param input [String] The input to parse
-    # @return [String] The parsed result
-    # @raise [ArgumentError] If input is empty
-    # def parse(input)
-    #   # Implemented in native extension
-    # end
-    # Parse a file (supports PDF, Office documents, text files)
-    # @param path [String] Path to the file to parse
-    # @return [String] The extracted text content
-    # @raise [IOError] If file cannot be read
-    # @raise [RuntimeError] If parsing fails
-    # def parse_file(path)
-    #   # Implemented in native extension
-    # end
-    # Parse binary data
-    # @param data [Array<Integer>] Binary data as byte array
-    # @return [String] The extracted text content
-    # @raise [ArgumentError] If data is empty
-    # @raise [RuntimeError] If parsing fails
-    # def parse_bytes(data)
-    #   # Implemented in native extension
-    # end
-    # Get the current configuration
-    # @return [Hash] The parser configuration
-    # def config
-    #   # Implemented in native extension
-    # end
-    # Check if a file format is supported
-    # @param path [String] File path to check
-    # @return [Boolean] True if the file format is supported
-    # def supports_file?(path)
-    #   # Implemented in native extension
-    # end
-    # Get list of supported file formats
-    # @return [Array<String>] List of supported file extensions
-    # def self.supported_formats
-    #   # Implemented in native extension
-    # end
-    # Ruby-level helper methods
+    # Native methods implemented in Rust:
+    # - initialize(options = {})
+    # - parse(input)
+    # - parse_file(path)
+    # - parse_bytes(data)
+    # - config
+    # - supports_file?(path)
+    # - strict_mode?
+    # - parse_pdf, parse_docx, parse_xlsx, parse_pptx, parse_json, parse_xml, parse_text, ocr_image
+    # See NATIVE_API.md for detailed documentation
+    # Ruby convenience methods and helpers
     # Create a parser with strict mode enabled
     # @param options [Hash] Additional options
@@ -81,6 +40,7 @@ module ParseKit
     end
     # Detect format from file path
+    # @deprecated Use the native format detection in parse_file instead
     # @param path [String] File path
     # @return [Symbol, nil] Format symbol or nil if unknown
     def detect_format(path)
@@ -101,67 +61,134 @@ module ParseKit
     end
     # Detect format from binary data
+    # @deprecated Use the native format detection in parse_bytes instead
     # @param data [String, Array<Integer>] Binary data
     # @return [Symbol] Format symbol
     def detect_format_from_bytes(data)
       # Convert to bytes if string
       bytes = data.is_a?(String) ? data.bytes : data
-      return :text if bytes.empty?
-      # Check magic bytes
-      if bytes[0..3] == [0x25, 0x50, 0x44, 0x46]  # %PDF
-        :pdf
-      elsif bytes[0..1] == [0x50, 0x4B]  # PK (ZIP archive)
-        # Could be DOCX or XLSX, default to xlsx for now
-        # In the future, could inspect ZIP contents to determine
-        :xlsx
-      elsif bytes[0..3] == [0xD0, 0xCF, 0x11, 0xE0]  # Old Excel
+      return :text if bytes.empty?  # Return :text for empty data
+      # Check magic bytes for various formats
+      # PDF
+      if bytes.size >= 4 && bytes[0..3] == [0x25, 0x50, 0x44, 0x46]  # %PDF
+        return :pdf
+      end
+      # PNG
+      if bytes.size >= 8 && bytes[0..7] == [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A]
+        return :png
+      end
+      # JPEG
+      if bytes.size >= 3 && bytes[0..2] == [0xFF, 0xD8, 0xFF]
+        return :jpeg
+      end
+      # BMP
+      if bytes.size >= 2 && bytes[0..1] == [0x42, 0x4D]  # BM
+        return :bmp
+      end
+      # TIFF (little-endian or big-endian)
+      if bytes.size >= 4
+        if bytes[0..3] == [0x49, 0x49, 0x2A, 0x00]  # II*\0 (little-endian)
+          return :tiff
+        elsif bytes[0..3] == [0x4D, 0x4D, 0x00, 0x2A]  # MM\0* (big-endian)
+          return :tiff
+        end
+      end
+      # OLE Compound Document (old Excel/Word) - return :xlsx for compatibility
+      if bytes.size >= 4 && bytes[0..3] == [0xD0, 0xCF, 0x11, 0xE0]
+        return :xlsx  # Return :xlsx for compatibility with existing tests
+      end
+      # ZIP archive (could be DOCX, XLSX, PPTX)
+      if bytes.size >= 2 && bytes[0..1] == [0x50, 0x4B]  # PK
+        # Try to determine the specific Office format by checking ZIP contents
+        # For now, we'll need to inspect the ZIP structure
+        return detect_office_format_from_zip(bytes)
+      end
+      # XML
+      if bytes.size >= 5
+        first_chars = bytes[0..4].pack('C*')
+        if first_chars == '<?xml' || first_chars.start_with?('<!')
+          return :xml
+        end
+      end
+      # HTML
+      if bytes.size >= 14
+        first_chars = bytes[0..13].pack('C*').downcase
+        if first_chars.include?('<!doctype') || first_chars.include?('<html')
+          return :xml  # HTML is treated as XML
+        end
+      end
+      # JSON
+      if bytes.size > 0
+        first_char = bytes[0]
+        # Skip whitespace
+        idx = 0
+        while idx < bytes.size && [0x20, 0x09, 0x0A, 0x0D].include?(bytes[idx])
+          idx += 1
+        end
+        if idx < bytes.size
+          first_non_ws = bytes[idx]
+          if first_non_ws == 0x7B || first_non_ws == 0x5B  # { or [
+            return :json
+          end
+        end
+      end
+      # Default to text if not recognized
+      :text
+    end
+    # Detect specific Office format from ZIP data
+    # @param bytes [Array<Integer>] ZIP file bytes
+    # @return [Symbol] :docx, :xlsx, or :pptx (falls back to :xlsx when no Office directory name is found)
+    def detect_office_format_from_zip(bytes)
+      # This is a simplified detection - in practice you'd parse the ZIP
+      # For the test, we'll check for known patterns in the ZIP structure
+      # Convert bytes to string for pattern matching
+      content = bytes[0..2000].pack('C*')  # Check first 2KB
+      # Look for Office-specific directory names in the ZIP
+      if content.include?('word/') || content.include?('word/_rels')
+        :docx
+      elsif content.include?('xl/') || content.include?('xl/_rels')
         :xlsx
-      elsif bytes[0..4] == [0x3C, 0x3F, 0x78, 0x6D, 0x6C]  # <?xml
-        :xml
-      elsif bytes[0..4] == [0x3C, 0x68, 0x74, 0x6D, 0x6C]  # <html
-        :xml
-      elsif bytes[0] == 0x7B || bytes[0] == 0x5B  # { or [
-        :json
+      elsif content.include?('ppt/') || content.include?('ppt/_rels')
+        :pptx
       else
-        :text
+        # Default to xlsx for generic ZIP
+        :xlsx
       end
     end
     # Parse file using format-specific parser
-    # This method now detects format and routes to the appropriate parser
+    # This method delegates to parse_file which uses centralized dispatch in Rust
     # @param path [String] File path
     # @return [String] Parsed content
     def parse_file_routed(path)
-      format = detect_format(path)
-      data = File.read(path, mode: 'rb').bytes
-      case format
-      when :docx then parse_docx(data)
-      when :xlsx then parse_xlsx(data)
-      when :pdf then parse_pdf(data)
-      when :json then parse_json(data)
-      when :xml then parse_xml(data)
-      else parse_text(data)
-      end
+      # Simply delegate to parse_file which already has dispatch logic
+      parse_file(path)
     end
     # Parse bytes using format-specific parser
-    # This method detects format and routes to the appropriate parser
+    # This method delegates to parse_bytes which uses centralized dispatch in Rust
     # @param data [String, Array<Integer>] Binary data
     # @return [String] Parsed content
     def parse_bytes_routed(data)
-      format = detect_format_from_bytes(data)
+      # Simply delegate to parse_bytes which already has dispatch logic
       bytes = data.is_a?(String) ? data.bytes : data
-      case format
-      when :docx then parse_docx(bytes)
-      when :xlsx then parse_xlsx(bytes)
-      when :pdf then parse_pdf(bytes)
-      when :json then parse_json(bytes)
-      when :xml then parse_xml(bytes)
-      else parse_text(bytes)
-      end
+      parse_bytes(bytes)
     end
     # Parse with a block for processing results
@@ -178,25 +205,49 @@ module ParseKit
     # @param input [String] The input to validate
     # @return [Boolean] True if input is valid
     def valid_input?(input)
-      return false unless input.is_a?(String)
-      return false if input.empty?
-      true
+      input.is_a?(String) && !input.empty?
     end
     # Validate file before parsing
     # @param path [String] The file path to validate
     # @return [Boolean] True if file exists and format is supported
     def valid_file?(path)
+      return false if path.nil? || path.empty?
       return false unless File.exist?(path)
+      return false if File.directory?(path)
       supports_file?(path)
     end
     # Get file extension
     # @param path [String] File path
-    # @return [String, nil] File extension in lowercase
+    # @return [String, nil] File extension in lowercase without leading dot
     def file_extension(path)
-      ext = File.extname(path)
-      ext.empty? ? nil : ext[1..].downcase
+      return nil if path.nil? || path.empty?
+      # Handle trailing whitespace
+      clean_path = path.strip
+      # Handle trailing slashes (directory indicator)
+      return nil if clean_path.end_with?('/')
+      # Get the extension
+      ext = File.extname(clean_path)
+      # Handle special cases
+      if ext.empty?
+        # Check for hidden files like .gitignore (the whole name after dot is the "extension")
+        basename = File.basename(clean_path)
+        if basename.start_with?('.') && basename.length > 1 && !basename[1..-1].include?('.')
+          return basename[1..-1].downcase
+        end
+        return nil
+      elsif ext == '.'
+        # File ends with a dot but no extension
+        return nil
+      else
+        # Normal extension, remove the dot and downcase
+        ext[1..-1].downcase
+      end
     end
   end
 end

data/lib/parsekit/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module ParseKit
-  VERSION = "0.1.0"
+  VERSION = "0.1.2"
 end

data/lib/parsekit.rb CHANGED Viewed

@@ -14,6 +14,22 @@ require_relative "parsekit/parser"
 # ParseKit is a Ruby document parsing toolkit with PDF and OCR support
 module ParseKit
+  # Supported file formats and their extensions
+  SUPPORTED_FORMATS = {
+    pdf: ['.pdf'],
+    docx: ['.docx'],
+    xlsx: ['.xlsx'],
+    xls: ['.xls'],
+    pptx: ['.pptx'],
+    png: ['.png'],
+    jpeg: ['.jpg', '.jpeg'],
+    tiff: ['.tiff', '.tif'],
+    bmp: ['.bmp'],
+    json: ['.json'],
+    xml: ['.xml', '.html'],
+    text: ['.txt', '.md', '.csv']
+  }.freeze
   class << self
     # The parse_file and parse_bytes methods are defined in the native extension
     # We just need to document them here or add wrapper logic if needed
@@ -50,6 +66,22 @@ module ParseKit
       Parser.new.supports_file?(path)
     end
+    # Detect file format from filename/extension
+    # @param filename [String, nil] The filename to check
+    # @return [Symbol] The detected format, or :unknown
+    def detect_format(filename)
+      return :unknown if filename.nil? || filename.empty?
+      ext = File.extname(filename).downcase
+      return :unknown if ext.empty?
+      SUPPORTED_FORMATS.each do |format, extensions|
+        return format if extensions.include?(ext)
+      end
+      :unknown
+    end
     # Get the native library version
     # @return [String] Version of the native library
     def native_version

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: parsekit
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.2
 platform: ruby
 authors:
 - Chris Petersen
@@ -96,20 +96,22 @@ files:
 - ext/parsekit/Cargo.toml
 - ext/parsekit/extconf.rb
 - ext/parsekit/src/error.rs
+- ext/parsekit/src/format_detector.rs
 - ext/parsekit/src/lib.rs
 - ext/parsekit/src/parser.rs
 - lib/parsekit.rb
+- lib/parsekit/NATIVE_API.md
 - lib/parsekit/error.rb
 - lib/parsekit/parsekit.bundle
 - lib/parsekit/parser.rb
 - lib/parsekit/version.rb
-homepage: https://github.com/cpetersen/parsekit
+homepage: https://github.com/scientist-labs/parsekit
 licenses:
 - MIT
 metadata:
-  homepage_uri: https://github.com/cpetersen/parsekit
-  source_code_uri: https://github.com/cpetersen/parsekit
-  changelog_uri: https://github.com/cpetersen/parsekit/blob/main/CHANGELOG.md
+  homepage_uri: https://github.com/scientist-labs/parsekit
+  source_code_uri: https://github.com/scientist-labs/parsekit
+  changelog_uri: https://github.com/scientist-labs/parsekit/blob/main/CHANGELOG.md
 post_install_message:
 rdoc_options: []
 require_paths: