parsekit-bin 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 0a6447155b1ecdb5426c2051e680a12a6b0091a43539d46c711443a5bc98f4bd
4
+ data.tar.gz: 1dc5570e92dbc491b50669d8dcc68aec57ca9249dd6f31fd82c5767c1109beea
5
+ SHA512:
6
+ metadata.gz: 681e2892319f1f8ccf51982836224fae0b38946c54dd44eaf184fca8deefc1b21875ce9359523663d09610eb2cfeec1efe6425d897584d6d8bbc9fc3ca51196b
7
+ data.tar.gz: 891180dcb47736714c2f33c42bb13032bf8f6fddcf09e663915edcb525c8715d137ac9309b1821702c7c8b8918f3aa552b291e9a340bf169e69da6e5b257ac5d
data/CHANGELOG.md ADDED
@@ -0,0 +1,53 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [Unreleased]
9
+
10
+ ### Added
11
+ - Nothing yet
12
+
13
+ ### Changed
14
+ - Nothing yet
15
+
16
+ ### Deprecated
17
+ - Nothing yet
18
+
19
+ ### Removed
20
+ - Nothing yet
21
+
22
+ ### Fixed
23
+ - Nothing yet
24
+
25
+ ### Security
26
+ - Nothing yet
27
+
28
+ ## [0.1.0] - 2024-08-09
29
+
30
+ ### Added
31
+ - Initial release of parsekit
32
+ - Basic parser functionality with Ruby bindings via Magnus
33
+ - Support for parsing strings and files
34
+ - Configurable parser with options (strict_mode, max_depth, encoding)
35
+ - Parser class with instance methods
36
+ - Module-level convenience methods
37
+ - Error handling with custom error classes
38
+ - Thread-safe parsing operations
39
+ - Cross-platform support (Linux, macOS, Windows)
40
+ - Ruby 3.0+ support
41
+ - Comprehensive test suite with RSpec
42
+ - CI/CD with GitHub Actions
43
+ - Documentation and examples
44
+ - Integration with ruby-nlp ecosystem
45
+
46
+ ### Technical Details
47
+ - Built with Magnus 0.7 for Ruby-Rust bindings
48
+ - Uses rb_sys 0.9 for build system integration
49
+ - Rust edition 2021
50
+ - Cross-compilation support for multiple platforms
51
+
52
+ [Unreleased]: https://github.com/scientist-labs/parsekit/compare/v0.1.0...HEAD
53
+ [0.1.0]: https://github.com/scientist-labs/parsekit/releases/tag/v0.1.0
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2024 Your Name
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,195 @@
1
+ <img src="/docs/assets/parsekit-wide.png" alt="parsekit" height="80px">
2
+
3
+ [![Gem Version](https://badge.fury.io/rb/parsekit-bin.svg)](https://badge.fury.io/rb/parsekit-bin)
4
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
5
+
6
+ Native Ruby bindings for the [parser-core](https://crates.io/crates/parser-core) Rust crate, providing high-performance document parsing and text extraction capabilities through Magnus. This gem wraps parser-core to extract text from PDFs, Office documents (DOCX, XLSX), images (with OCR), and more. Part of the ruby-nlp ecosystem.
7
+
8
+ ## Features
9
+
10
+ - 📄 **Document Parsing**: Extract text from PDFs, Office documents (DOCX, XLSX)
11
+ - 🖼️ **OCR Support**: Extract text from images using Tesseract OCR
12
+ - 🚀 **High Performance**: Native Rust performance with Ruby convenience
13
+ - 🔧 **Unified API**: Single interface for multiple document formats
14
+ - 📦 **Cross-Platform**: Works on Linux, macOS, and Windows
15
+ - 🧪 **Well Tested**: Comprehensive test suite with RSpec
16
+
17
+ ## Installation
18
+
19
+ Add this line to your application's Gemfile:
20
+
21
+ ```ruby
22
+ gem 'parsekit-bin'
23
+ ```
24
+
25
+ And then execute:
26
+
27
+ $ bundle install
28
+
29
+ Or install it yourself as:
30
+
31
+ ```bash
32
+ gem install parsekit-bin
33
+ ```
34
+
35
+ ### Requirements
36
+
37
+ - Ruby >= 3.0.0
38
+ - Rust toolchain (stable)
39
+ - C compiler (for linking)
40
+
41
+ That's it! ParseKit bundles all the native libraries it needs, including Tesseract for OCR, so beyond the build tools listed above you don't need to install any additional system dependencies.
42
+
43
+ ## Usage
44
+
45
+ ### Basic Usage
46
+
47
+ ```ruby
48
+ require 'parsekit'
49
+
50
+ # Parse a PDF file
51
+ text = ParseKit.parse_file("document.pdf")
52
+ puts text # Extracted text from the PDF
53
+
54
+ # Parse an Excel file
55
+ text = ParseKit.parse_file("spreadsheet.xlsx")
56
+ puts text # Extracted text from all sheets
57
+
58
+ # Parse binary data directly
59
+ file_data = File.binread("document.pdf")
60
+ text = ParseKit.parse_bytes(file_data)
61
+ puts text
62
+
63
+ # Parse with a Parser instance
64
+ parser = ParseKit::Parser.new
65
+ text = parser.parse_file("report.docx")
66
+ puts text
67
+ ```
68
+
69
+ ### Module-Level Convenience Methods
70
+
71
+ ```ruby
72
+ # Parse files directly
73
+ content = ParseKit.parse_file('document.pdf')
74
+
75
+ # Parse bytes
76
+ data = File.read('document.pdf', mode: 'rb')
77
+ content = ParseKit.parse_bytes(data.bytes)
78
+
79
+ # Check supported formats
80
+ formats = ParseKit.supported_formats
81
+ # => ["txt", "json", "xml", "html", "docx", "xlsx", "xls", "csv", "pdf", "png", "jpg", "jpeg", "tiff", "bmp"]
82
+
83
+ # Check if a file is supported
84
+ ParseKit.supports_file?('document.pdf') # => true
85
+ ```
86
+
87
+ ### Configuration Options
88
+
89
+ ```ruby
90
+ # Create parser with options
91
+ parser = ParseKit::Parser.new(
92
+ strict_mode: true,
93
+ max_size: 50 * 1024 * 1024, # 50MB limit
94
+ encoding: 'UTF-8'
95
+ )
96
+
97
+ # Or use the strict convenience method
98
+ parser = ParseKit::Parser.strict
99
+ ```
100
+
101
+ ### Format-Specific Parsing
102
+
103
+ ```ruby
104
+ parser = ParseKit::Parser.new
105
+
106
+ # Direct access to format-specific parsers
107
+ pdf_data = File.read('document.pdf', mode: 'rb').bytes
108
+ pdf_text = parser.parse_pdf(pdf_data)
109
+
110
+ image_data = File.read('image.png', mode: 'rb').bytes
111
+ ocr_text = parser.ocr_image(image_data)
112
+
113
+ excel_data = File.read('data.xlsx', mode: 'rb').bytes
114
+ excel_text = parser.parse_xlsx(excel_data)
115
+ ```
116
+
117
+ ## Supported Formats
118
+
119
+ | Format | Extensions | Method | Notes |
120
+ |--------|------------|--------|-------|
121
+ | PDF | .pdf | `parse_pdf` | Text extraction via MuPDF |
122
+ | Word | .docx | `parse_docx` | Office Open XML format |
123
+ | Excel | .xlsx, .xls | `parse_xlsx` | Both modern and legacy formats |
124
+ | PowerPoint | .pptx | `parse_pptx` | Text extraction from slides and notes |
125
+ | Images | .png, .jpg, .jpeg, .tiff, .bmp | `ocr_image` | OCR via bundled Tesseract |
126
+ | JSON | .json | `parse_json` | Pretty-printed output |
127
+ | XML/HTML | .xml, .html | `parse_xml` | Extracts text content |
128
+ | Text | .txt, .csv, .md | `parse_text` | With encoding detection |
129
+
130
+ ## Performance
131
+
132
+ ParseKit is built with performance in mind:
133
+
134
+ - Native Rust implementation for speed
135
+ - Statically linked C libraries (MuPDF, Tesseract) compiled with optimizations
136
+ - Efficient memory usage with streaming where possible
137
+ - Configurable size limits to prevent memory issues
138
+
139
+ ## Development
140
+
141
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests.
142
+
143
+ To compile the Rust extension:
144
+
145
+ ```bash
146
+ rake compile
147
+ ```
148
+
149
+ To run tests with coverage:
150
+
151
+ ```bash
152
+ rake dev:coverage
153
+ ```
154
+
155
+ ### OCR Mode Configuration
156
+
157
+ By default, ParseKit bundles Tesseract for zero-dependency OCR support. Advanced users who already have Tesseract installed system-wide and want faster gem installation can use system mode:
158
+
159
+ **Using system Tesseract during installation:**
160
+ ```bash
161
+ gem install parsekit-bin -- --no-default-features
162
+ ```
163
+
164
+ **For development with system Tesseract:**
165
+ ```bash
166
+ rake compile CARGO_FEATURES="" # Disables bundled-tesseract feature
167
+ ```
168
+
169
+ **System Tesseract requirements:**
170
+ - **macOS**: `brew install tesseract`
171
+ - **Ubuntu/Debian**: `sudo apt-get install libtesseract-dev`
172
+ - **Fedora/RHEL**: `sudo dnf install tesseract-devel`
173
+
174
+ The bundled mode adds ~1-3 minutes to initial gem installation but provides a completely self-contained experience with no external dependencies.
175
+
176
+ ## Architecture
177
+
178
+ ParseKit uses a hybrid Ruby/Rust architecture:
179
+
180
+ - **Ruby Layer**: Provides convenient API and format detection
181
+ - **Rust Layer**: Implements high-performance parsing using:
182
+ - MuPDF for PDF text extraction (statically linked)
183
+ - tesseract-rs for OCR (with bundled Tesseract by default)
184
+ - Pure Rust libraries for DOCX/XLSX parsing
185
+ - Magnus for Ruby-Rust FFI bindings
186
+
187
+ ## Contributing
188
+
189
+ Bug reports and pull requests are welcome on GitHub at https://github.com/scientist-labs/parsekit.
190
+
191
+ ## License
192
+
193
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
194
+
195
+ Note: This gem includes statically linked versions of MuPDF (AGPL/Commercial) and Tesseract (Apache 2.0). Please review their respective licenses for compliance with your use case.
@@ -0,0 +1,36 @@
1
+ [package]
2
+ name = "parsekit"
3
+ version = "0.1.0"
4
+ edition = "2021"
5
+ authors = ["Your Name <your.email@example.com>"]
6
+ license = "MIT"
7
+ publish = false
8
+
9
+ [lib]
10
+ crate-type = ["cdylib"]
11
+ name = "parsekit"
12
+
13
+ [dependencies]
14
+ magnus = { version = "0.8", features = ["rb-sys"] }
15
+ # Document parsing - testing embedded C libraries
16
+ # MuPDF builds from source and statically links
17
+ mupdf = { version = "0.5", default-features = false, features = [] }
18
+ # OCR - Using tesseract-rs for both system and bundled modes
19
+ tesseract-rs = "0.1" # Tesseract with optional bundling
20
+ image = "0.25" # Image processing library (match rusty-tesseract's version)
21
+ calamine = "0.31" # Excel parsing
22
+ docx-rs = "0.4" # Word document parsing
23
+ quick-xml = "0.38" # XML parsing
24
+ zip = "5.0" # ZIP archive handling for PPTX
25
+ serde_json = "1.0" # JSON parsing
26
+ regex = "1.10" # Text parsing
27
+ encoding_rs = "0.8" # Encoding detection
28
+
29
+ [features]
30
+ default = ["bundled-tesseract"]
31
+ bundled-tesseract = []
32
+
33
+ [profile.release]
34
+ opt-level = 3
35
+ lto = true
36
+ codegen-units = 1
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "mkmf"
4
+ require "rb_sys/mkmf"
5
+
6
+ create_rust_makefile("parsekit/parsekit")
@@ -0,0 +1,45 @@
1
+ use magnus::{Error, RModule, Ruby, Module};
2
+
3
+ /// Custom error types for ParseKit
4
+ #[derive(Debug)]
5
+ #[allow(dead_code)]
6
+ pub enum ParserError {
7
+ ParseError(String),
8
+ ConfigError(String),
9
+ IoError(String),
10
+ }
11
+
12
+ impl ParserError {
13
+ /// Convert to Magnus Error
14
+ #[allow(dead_code)]
15
+ pub fn to_error(&self) -> Error {
16
+ match self {
17
+ ParserError::ParseError(msg) => {
18
+ Error::new(Ruby::get().unwrap().exception_runtime_error(), msg.clone())
19
+ }
20
+ ParserError::ConfigError(msg) => {
21
+ Error::new(Ruby::get().unwrap().exception_arg_error(), msg.clone())
22
+ }
23
+ ParserError::IoError(msg) => {
24
+ Error::new(Ruby::get().unwrap().exception_io_error(), msg.clone())
25
+ }
26
+ }
27
+ }
28
+ }
29
+
30
+ /// Initialize error classes
31
+ /// For simplicity, we'll just create Ruby classes that inherit from Object,
32
+ /// and document that they should be treated as exceptions
33
+ pub fn init(_ruby: &Ruby, module: RModule) -> Result<(), Error> {
34
+ // For now, just create placeholder classes
35
+ // In a real implementation, you'd want to properly set up exception classes
36
+ // but Magnus 0.7's API for this is complex
37
+
38
+ // Define error classes as regular Ruby classes
39
+ // Users can still rescue them by name in Ruby code
40
+ let _error = module.define_class("Error", Ruby::get().unwrap().class_object())?;
41
+ let _parse_error = module.define_class("ParseError", Ruby::get().unwrap().class_object())?;
42
+ let _config_error = module.define_class("ConfigError", Ruby::get().unwrap().class_object())?;
43
+
44
+ Ok(())
45
+ }
@@ -0,0 +1,233 @@
1
+ use std::path::Path;
2
+
3
/// A document format recognised by ParseKit's detection logic.
#[derive(Debug, Clone, PartialEq)]
pub enum FileFormat {
    Pdf,
    Docx,
    Xlsx,
    Xls,
    Pptx,
    Png,
    Jpeg,
    Tiff,
    Bmp,
    Json,
    Xml,
    Html,
    Text,
    /// Neither the content nor the extension identified the format.
    Unknown,
}

impl FileFormat {
    /// Name of the format as exposed to Ruby.
    ///
    /// `Html` intentionally maps to `"xml"`: HTML is treated as XML on the
    /// Ruby side.
    pub fn to_symbol(&self) -> &'static str {
        match self {
            FileFormat::Pdf => "pdf",
            FileFormat::Docx => "docx",
            FileFormat::Xlsx => "xlsx",
            FileFormat::Xls => "xls",
            FileFormat::Pptx => "pptx",
            FileFormat::Png => "png",
            FileFormat::Jpeg => "jpeg",
            FileFormat::Tiff => "tiff",
            FileFormat::Bmp => "bmp",
            FileFormat::Json => "json",
            FileFormat::Xml => "xml",
            FileFormat::Html => "xml", // HTML is treated as XML in Ruby
            FileFormat::Text => "text",
            FileFormat::Unknown => "unknown",
        }
    }
}

/// Central format detection logic.
pub struct FormatDetector;

impl FormatDetector {
    /// Detect the format from an optional filename and optional content.
    ///
    /// Resolution order:
    /// 1. A specific (non-text) format sniffed from `content` wins.
    /// 2. Otherwise a recognised extension of `filename` is used.
    /// 3. Otherwise, if content was supplied and looked like text, `Text`.
    /// 4. Otherwise `Unknown`.
    pub fn detect(filename: Option<&str>, content: Option<&[u8]>) -> FileFormat {
        // Sniff the content once and reuse the answer for steps 1 and 3.
        let sniffed = content.map(Self::detect_from_content);

        if let Some(format) = &sniffed {
            if !matches!(format, FileFormat::Text | FileFormat::Unknown) {
                return format.clone();
            }
        }

        if let Some(name) = filename {
            let by_extension = Self::detect_from_extension(name);
            if by_extension != FileFormat::Unknown {
                return by_extension;
            }
        }

        match sniffed {
            Some(FileFormat::Text) => FileFormat::Text,
            _ => FileFormat::Unknown,
        }
    }

    /// Detect the format from the file extension (case-insensitive).
    ///
    /// Returns `Unknown` when there is no extension, it is not valid UTF-8,
    /// or it is not one of the recognised extensions.
    pub fn detect_from_extension(filename: &str) -> FileFormat {
        let ext = match Path::new(filename).extension().and_then(|s| s.to_str()) {
            Some(e) => e.to_lowercase(),
            None => return FileFormat::Unknown,
        };

        match ext.as_str() {
            "pdf" => FileFormat::Pdf,
            "docx" => FileFormat::Docx,
            "xlsx" => FileFormat::Xlsx,
            "xls" => FileFormat::Xls,
            "pptx" => FileFormat::Pptx,
            "png" => FileFormat::Png,
            "jpg" | "jpeg" => FileFormat::Jpeg,
            "tiff" | "tif" => FileFormat::Tiff,
            "bmp" => FileFormat::Bmp,
            "json" => FileFormat::Json,
            "xml" => FileFormat::Xml,
            "html" | "htm" => FileFormat::Html,
            "txt" | "text" | "md" | "markdown" | "csv" => FileFormat::Text,
            _ => FileFormat::Unknown,
        }
    }

    /// Detect the format from leading bytes of the content.
    ///
    /// Never returns `Unknown`: empty or unrecognised content is reported as
    /// `Text`.
    pub fn detect_from_content(data: &[u8]) -> FileFormat {
        if data.is_empty() {
            return FileFormat::Text; // Empty files are treated as text
        }

        // Binary signatures. `starts_with` already handles inputs shorter
        // than the signature, so no separate length checks are needed.
        if data.starts_with(b"%PDF") {
            return FileFormat::Pdf;
        }
        if data.starts_with(&[0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A]) {
            return FileFormat::Png;
        }
        if data.starts_with(&[0xFF, 0xD8, 0xFF]) {
            return FileFormat::Jpeg;
        }
        if data.starts_with(b"BM") {
            return FileFormat::Bmp;
        }
        // TIFF (little-endian or big-endian)
        if data.starts_with(b"II\x2A\x00") || data.starts_with(b"MM\x00\x2A") {
            return FileFormat::Tiff;
        }
        // OLE Compound Document (old Excel/Word); reported as legacy Excel.
        if data.starts_with(&[0xD0, 0xCF, 0x11, 0xE0]) {
            return FileFormat::Xls;
        }
        // ZIP archive (could be DOCX, XLSX, PPTX)
        if data.starts_with(b"PK") {
            return Self::detect_office_format(data);
        }

        // Markup. The first 14 bytes are lowercased once for the
        // case-insensitive HTML checks.
        let head = String::from_utf8_lossy(&data[..data.len().min(14)]).to_lowercase();

        if data.starts_with(b"<?xml") {
            return FileFormat::Xml;
        }
        // An HTML doctype must be recognised before the generic `<!` rule
        // below, otherwise `<!DOCTYPE html>` documents are reported as XML.
        if head.starts_with("<!doctype html") {
            return FileFormat::Html;
        }
        // Any other leading declaration/comment is treated as XML. Inputs
        // shorter than five bytes are left to the remaining checks.
        if data.len() >= 5 && data.starts_with(b"<!") {
            return FileFormat::Xml;
        }
        // No minimum length here, so short documents such as
        // `<html></html>` are recognised too.
        if head.contains("<!doctype") || head.contains("<html") {
            return FileFormat::Html;
        }

        // JSON: first non-whitespace byte opens an object or array.
        if let Some(&first_non_ws) = data.iter().find(|&&b| !b" \t\n\r".contains(&b)) {
            if first_non_ws == b'{' || first_non_ws == b'[' {
                return FileFormat::Json;
            }
        }

        // Default to text for unrecognized formats
        FileFormat::Text
    }

    /// Narrow a ZIP container down to a specific Office format by looking
    /// for well-known directory names in the first 2000 bytes.
    fn detect_office_format(data: &[u8]) -> FileFormat {
        let prefix = String::from_utf8_lossy(&data[..data.len().min(2000)]);

        if prefix.contains("word/") {
            FileFormat::Docx
        } else if prefix.contains("xl/") {
            FileFormat::Xlsx
        } else if prefix.contains("ppt/") {
            FileFormat::Pptx
        } else {
            // Default to XLSX for generic ZIP (most common Office format)
            FileFormat::Xlsx
        }
    }

    /// All file extensions recognised by [`FormatDetector::detect_from_extension`].
    pub fn supported_extensions() -> Vec<&'static str> {
        vec![
            "pdf", "docx", "xlsx", "xls", "pptx",
            "png", "jpg", "jpeg", "tiff", "tif", "bmp",
            "json", "xml", "html", "htm",
            "txt", "text", "md", "markdown", "csv",
        ]
    }
}
205
+
206
#[cfg(test)]
mod tests {
    use super::*;

    /// Content starting with the `%PDF` signature is reported as PDF.
    #[test]
    fn test_detect_pdf() {
        let detected = FormatDetector::detect_from_content(b"%PDF-1.5\n");
        assert_eq!(detected, FileFormat::Pdf);
    }

    /// The eight-byte PNG signature alone is enough to report PNG.
    #[test]
    fn test_detect_png() {
        let signature: [u8; 8] = [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A];
        let detected = FormatDetector::detect_from_content(&signature);
        assert_eq!(detected, FileFormat::Png);
    }

    /// Extension lookup is case-insensitive.
    #[test]
    fn test_detect_from_extension() {
        let cases = [
            ("document.pdf", FileFormat::Pdf),
            ("Document.PDF", FileFormat::Pdf),
            ("data.xlsx", FileFormat::Xlsx),
        ];

        for (filename, expected) in cases {
            assert_eq!(FormatDetector::detect_from_extension(filename), expected);
        }
    }

    /// Empty content is treated as text.
    #[test]
    fn test_empty_data() {
        let nothing: &[u8] = &[];
        assert_eq!(FormatDetector::detect_from_content(nothing), FileFormat::Text);
    }
}
@@ -0,0 +1,25 @@
1
+ use magnus::{function, prelude::*, Error, Ruby};
2
+
3
+ mod parser;
4
+ mod error;
5
+ mod format_detector;
6
+
7
+ /// Initialize the ParseKit module and its submodules
8
+ #[magnus::init]
9
+ fn init(ruby: &Ruby) -> Result<(), Error> {
10
+ let module = ruby.define_module("ParseKit")?;
11
+
12
+ // Initialize submodules
13
+ parser::init(ruby, module)?;
14
+ error::init(ruby, module)?;
15
+
16
+ // Add module-level methods
17
+ module.define_singleton_method("version", function!(version, 0))?;
18
+
19
+ Ok(())
20
+ }
21
+
22
+ /// Return the version of the parsekit gem
23
+ fn version() -> String {
24
+ env!("CARGO_PKG_VERSION").to_string()
25
+ }