parsekit 0.1.0.pre.1 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 02b091ecd1da29c68d59afb1089f1756cef350252b260b531ef82a06fb163c65
4
- data.tar.gz: e34663b8f849a907ede07b357ad3c5b21a614c16ea767fb5b735c3422bd66aa7
3
+ metadata.gz: e77e605d938d5b0b89c7814d1360f4c505415c54efbf8ffe9f2f7d4c564d917e
4
+ data.tar.gz: 6b86f57b2dce1231cae704b4d35c7562807ab77b001860b6fa5bbcdc9844781f
5
5
  SHA512:
6
- metadata.gz: b476aad0a9c9a711fce10d3a22dedd64e6ac82597c1d5d501d3ced7a46982d8f65b5bf44b513c3daabc5c5115a4b6278a0bea911b4dc9b1667010467e1cad8c9
7
- data.tar.gz: f1d2adeb0bf8199b5ce397537b8b40577a79e954dddf33f4e4ff2fe418791cddb2f41adf79654c05c9d37e6ef0b1e99526f390560b5321feb711982d2218372d
6
+ metadata.gz: a3f7089e8bd3e84cb2e14614cb78c3b3132d4d93a3c95d5cdcfa6c63723fe2dfce3a01bf0ee27255be7ff036bd0e438492434ded72853772e57b65faf7bded9b
7
+ data.tar.gz: c84b03d65471f50d6ec72eaa21269b5fc1c5e40e0cefa923cc71d50b802734d64c8296e5e5e6a76ca2f5d388a568119ed09474ae622534aaef03e0a96109dee3
data/README.md CHANGED
@@ -1,14 +1,13 @@
1
- # ParseKit
1
+ <img src="/docs/assets/parsekit-wide.png" alt="parsekit" height="80px">
2
2
 
3
- [![CI](https://github.com/cpetersen/parsekit/actions/workflows/ci.yml/badge.svg)](https://github.com/cpetersen/parsekit/actions/workflows/ci.yml)
4
3
  [![Gem Version](https://badge.fury.io/rb/parsekit.svg)](https://badge.fury.io/rb/parsekit)
5
4
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
6
5
 
7
- Native Ruby bindings for the [parser-core](https://crates.io/crates/parser-core) Rust crate, providing high-performance document parsing and text extraction capabilities through Magnus. This gem wraps parser-core to extract text from PDFs, Office documents (DOCX, XLSX, PPTX), images (with OCR), and more. Part of the ruby-nlp ecosystem.
6
+ Native Ruby bindings for the [parser-core](https://crates.io/crates/parser-core) Rust crate, providing high-performance document parsing and text extraction capabilities through Magnus. This gem wraps parser-core to extract text from PDFs, Office documents (DOCX, XLSX, PPTX), images (with OCR), and more. Part of the ruby-nlp ecosystem.
8
7
 
9
8
  ## Features
10
9
 
11
- - 📄 **Document Parsing**: Extract text from PDFs, Office documents (DOCX, XLSX, PPTX)
10
+ - 📄 **Document Parsing**: Extract text from PDFs, Office documents (DOCX, XLSX, PPTX)
12
11
  - 🖼️ **OCR Support**: Extract text from images using Tesseract OCR
13
12
  - 🚀 **High Performance**: Native Rust performance with Ruby convenience
14
13
  - 🔧 **Unified API**: Single interface for multiple document formats
@@ -38,13 +37,8 @@ gem install parsekit
38
37
  - Ruby >= 3.0.0
39
38
  - Rust toolchain (stable)
40
39
  - C compiler (for linking)
41
- - System libraries for document parsing:
42
- - **macOS**: `brew install leptonica tesseract poppler`
43
- - **Ubuntu/Debian**: `sudo apt-get install libleptonica-dev libtesseract-dev libpoppler-cpp-dev`
44
- - **Fedora/RHEL**: `sudo dnf install leptonica-devel tesseract-devel poppler-cpp-devel`
45
- - **Windows**: See [DEPENDENCIES.md](DEPENDENCIES.md) for MSYS2 instructions
46
40
 
47
- For detailed installation instructions and troubleshooting, see [DEPENDENCIES.md](DEPENDENCIES.md).
41
+ That's it! ParseKit bundles all necessary libraries including Tesseract for OCR, so you don't need to install any system dependencies.
48
42
 
49
43
  ## Usage
50
44
 
@@ -57,10 +51,6 @@ require 'parsekit'
57
51
  text = ParseKit.parse_file("document.pdf")
58
52
  puts text # Extracted text from the PDF
59
53
 
60
- # Parse an Office document
61
- text = ParseKit.parse_file("presentation.pptx")
62
- puts text # Extracted text from all slides
63
-
64
54
  # Parse an Excel file
65
55
  text = ParseKit.parse_file("spreadsheet.xlsx")
66
56
  puts text # Extracted text from all sheets
@@ -131,7 +121,8 @@ excel_text = parser.parse_xlsx(excel_data)
131
121
  | PDF | .pdf | `parse_pdf` | Text extraction via MuPDF |
132
122
  | Word | .docx | `parse_docx` | Office Open XML format |
133
123
  | Excel | .xlsx, .xls | `parse_xlsx` | Both modern and legacy formats |
134
- | Images | .png, .jpg, .jpeg, .tiff, .bmp | `ocr_image` | OCR via embedded Tesseract |
124
+ | PowerPoint | .pptx | `parse_pptx` | Text extraction from slides and notes |
125
+ | Images | .png, .jpg, .jpeg, .tiff, .bmp | `ocr_image` | OCR via bundled Tesseract |
135
126
  | JSON | .json | `parse_json` | Pretty-printed output |
136
127
  | XML/HTML | .xml, .html | `parse_xml` | Extracts text content |
137
128
  | Text | .txt, .csv, .md | `parse_text` | With encoding detection |
@@ -161,6 +152,27 @@ To run tests with coverage:
161
152
  rake dev:coverage
162
153
  ```
163
154
 
155
+ ### OCR Mode Configuration
156
+
157
+ By default, ParseKit bundles Tesseract for zero-dependency OCR support. Advanced users who already have Tesseract installed system-wide and want faster gem installation can use system mode:
158
+
159
+ **Using system Tesseract during installation:**
160
+ ```bash
161
+ gem install parsekit -- --no-default-features
162
+ ```
163
+
164
+ **For development with system Tesseract:**
165
+ ```bash
166
+ rake compile CARGO_FEATURES="" # Disables bundled-tesseract feature
167
+ ```
168
+
169
+ **System Tesseract requirements:**
170
+ - **macOS**: `brew install tesseract`
171
+ - **Ubuntu/Debian**: `sudo apt-get install libtesseract-dev`
172
+ - **Fedora/RHEL**: `sudo dnf install tesseract-devel`
173
+
174
+ The bundled mode adds ~1-3 minutes to initial gem installation but provides a completely self-contained experience with no external dependencies.
175
+
164
176
  ## Architecture
165
177
 
166
178
  ParseKit uses a hybrid Ruby/Rust architecture:
@@ -168,7 +180,7 @@ ParseKit uses a hybrid Ruby/Rust architecture:
168
180
  - **Ruby Layer**: Provides convenient API and format detection
169
181
  - **Rust Layer**: Implements high-performance parsing using:
170
182
  - MuPDF for PDF text extraction (statically linked)
171
- - rusty-tesseract for OCR (with embedded Tesseract)
183
+ - tesseract-rs for OCR (with bundled Tesseract by default)
172
184
  - Pure Rust libraries for DOCX/XLSX parsing
173
185
  - Magnus for Ruby-Rust FFI bindings
174
186
 
@@ -180,4 +192,4 @@ Bug reports and pull requests are welcome on GitHub at https://github.com/cpeter
180
192
 
181
193
  The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
182
194
 
183
- Note: This gem includes statically linked versions of MuPDF (AGPL/Commercial) and Tesseract (Apache 2.0). Please review their respective licenses for compliance with your use case.
195
+ Note: This gem includes statically linked versions of MuPDF (AGPL/Commercial) and Tesseract (Apache 2.0). Please review their respective licenses for compliance with your use case.
@@ -11,24 +11,26 @@ crate-type = ["cdylib"]
11
11
  name = "parsekit"
12
12
 
13
13
  [dependencies]
14
- magnus = { version = "0.7", features = ["rb-sys"] }
14
+ magnus = { version = "0.8", features = ["rb-sys"] }
15
15
  # Document parsing - testing embedded C libraries
16
16
  # MuPDF builds from source and statically links
17
17
  mupdf = { version = "0.5", default-features = false, features = [] }
18
- # OCR - Tesseract with image loading support
19
- rusty-tesseract = "1.1" # Tesseract wrapper with image loading
18
+ # OCR - Using tesseract-rs for both system and bundled modes
19
+ tesseract-rs = "0.1" # Tesseract with optional bundling
20
20
  image = "0.25" # Image processing library (version originally chosen to match rusty-tesseract, since replaced by tesseract-rs)
21
- calamine = "0.26" # Excel parsing
21
+ calamine = "0.30" # Excel parsing
22
22
  docx-rs = "0.4" # Word document parsing
23
- quick-xml = "0.36" # XML parsing
23
+ quick-xml = "0.38" # XML parsing
24
+ zip = "2.1" # ZIP archive handling for PPTX
24
25
  serde_json = "1.0" # JSON parsing
25
26
  regex = "1.10" # Text parsing
26
27
  encoding_rs = "0.8" # Encoding detection
27
28
 
28
29
  [features]
29
- default = []
30
+ default = ["bundled-tesseract"]
31
+ bundled-tesseract = []
30
32
 
31
33
  [profile.release]
32
34
  opt-level = 3
33
35
  lto = true
34
- codegen-units = 1
36
+ codegen-units = 1
@@ -1,4 +1,4 @@
1
- use magnus::{exception, Error, RModule, Ruby, Module};
1
+ use magnus::{Error, RModule, Ruby, Module};
2
2
 
3
3
  /// Custom error types for ParseKit
4
4
  #[derive(Debug)]
@@ -15,13 +15,13 @@ impl ParserError {
15
15
  pub fn to_error(&self) -> Error {
16
16
  match self {
17
17
  ParserError::ParseError(msg) => {
18
- Error::new(exception::runtime_error(), msg.clone())
18
+ Error::new(Ruby::get().unwrap().exception_runtime_error(), msg.clone())
19
19
  }
20
20
  ParserError::ConfigError(msg) => {
21
- Error::new(exception::arg_error(), msg.clone())
21
+ Error::new(Ruby::get().unwrap().exception_arg_error(), msg.clone())
22
22
  }
23
23
  ParserError::IoError(msg) => {
24
- Error::new(exception::io_error(), msg.clone())
24
+ Error::new(Ruby::get().unwrap().exception_io_error(), msg.clone())
25
25
  }
26
26
  }
27
27
  }
@@ -37,9 +37,9 @@ pub fn init(_ruby: &Ruby, module: RModule) -> Result<(), Error> {
37
37
 
38
38
  // Define error classes as regular Ruby classes
39
39
  // Users can still rescue them by name in Ruby code
40
- let _error = module.define_class("Error", magnus::class::object())?;
41
- let _parse_error = module.define_class("ParseError", magnus::class::object())?;
42
- let _config_error = module.define_class("ConfigError", magnus::class::object())?;
40
+ let _error = module.define_class("Error", Ruby::get().unwrap().class_object())?;
41
+ let _parse_error = module.define_class("ParseError", Ruby::get().unwrap().class_object())?;
42
+ let _config_error = module.define_class("ConfigError", Ruby::get().unwrap().class_object())?;
43
43
 
44
44
  Ok(())
45
45
  }
@@ -0,0 +1,233 @@
1
+ use std::path::Path;
2
+
3
+ /// Represents a detected file format
4
+ #[derive(Debug, Clone, PartialEq)]
5
+ pub enum FileFormat {
6
+ Pdf,
7
+ Docx,
8
+ Xlsx,
9
+ Xls,
10
+ Pptx,
11
+ Png,
12
+ Jpeg,
13
+ Tiff,
14
+ Bmp,
15
+ Json,
16
+ Xml,
17
+ Html,
18
+ Text,
19
+ Unknown,
20
+ }
21
+
22
+ impl FileFormat {
23
+ /// Convert to Ruby symbol representation
24
+ pub fn to_symbol(&self) -> &'static str {
25
+ match self {
26
+ FileFormat::Pdf => "pdf",
27
+ FileFormat::Docx => "docx",
28
+ FileFormat::Xlsx => "xlsx",
29
+ FileFormat::Xls => "xls",
30
+ FileFormat::Pptx => "pptx",
31
+ FileFormat::Png => "png",
32
+ FileFormat::Jpeg => "jpeg",
33
+ FileFormat::Tiff => "tiff",
34
+ FileFormat::Bmp => "bmp",
35
+ FileFormat::Json => "json",
36
+ FileFormat::Xml => "xml",
37
+ FileFormat::Html => "xml", // HTML is treated as XML in Ruby
38
+ FileFormat::Text => "text",
39
+ FileFormat::Unknown => "unknown",
40
+ }
41
+ }
42
+ }
43
+
44
+ /// Central format detection logic
45
+ pub struct FormatDetector;
46
+
47
+ impl FormatDetector {
48
+ /// Detect format from filename and content
49
+ /// Prioritizes content detection over extension when both are available
50
+ pub fn detect(filename: Option<&str>, content: Option<&[u8]>) -> FileFormat {
51
+ // First try content-based detection if content is provided
52
+ if let Some(data) = content {
53
+ let format = Self::detect_from_content(data);
54
+ // If we got a definitive format from content, use it
55
+ if !matches!(format, FileFormat::Text | FileFormat::Unknown) {
56
+ return format;
57
+ }
58
+ }
59
+
60
+ // Fall back to extension-based detection
61
+ if let Some(name) = filename {
62
+ let ext_format = Self::detect_from_extension(name);
63
+ if ext_format != FileFormat::Unknown {
64
+ return ext_format;
65
+ }
66
+ }
67
+
68
+ // If content detection returned Text and no extension match, return Text
69
+ if let Some(data) = content {
70
+ let format = Self::detect_from_content(data);
71
+ if format == FileFormat::Text {
72
+ return FileFormat::Text;
73
+ }
74
+ }
75
+
76
+ FileFormat::Unknown
77
+ }
78
+
79
+ /// Detect format from file extension
80
+ pub fn detect_from_extension(filename: &str) -> FileFormat {
81
+ let path = Path::new(filename);
82
+ let ext = match path.extension().and_then(|s| s.to_str()) {
83
+ Some(e) => e.to_lowercase(),
84
+ None => return FileFormat::Unknown,
85
+ };
86
+
87
+ match ext.as_str() {
88
+ "pdf" => FileFormat::Pdf,
89
+ "docx" => FileFormat::Docx,
90
+ "xlsx" => FileFormat::Xlsx,
91
+ "xls" => FileFormat::Xls,
92
+ "pptx" => FileFormat::Pptx,
93
+ "png" => FileFormat::Png,
94
+ "jpg" | "jpeg" => FileFormat::Jpeg,
95
+ "tiff" | "tif" => FileFormat::Tiff,
96
+ "bmp" => FileFormat::Bmp,
97
+ "json" => FileFormat::Json,
98
+ "xml" => FileFormat::Xml,
99
+ "html" | "htm" => FileFormat::Html,
100
+ "txt" | "text" | "md" | "markdown" | "csv" => FileFormat::Text,
101
+ _ => FileFormat::Unknown,
102
+ }
103
+ }
104
+
105
+ /// Detect format from file content (magic bytes)
106
+ pub fn detect_from_content(data: &[u8]) -> FileFormat {
107
+ if data.is_empty() {
108
+ return FileFormat::Text; // Empty files are treated as text
109
+ }
110
+
111
+ // PDF
112
+ if data.len() >= 4 && data.starts_with(b"%PDF") {
113
+ return FileFormat::Pdf;
114
+ }
115
+
116
+ // PNG
117
+ if data.len() >= 8 && data.starts_with(&[0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A]) {
118
+ return FileFormat::Png;
119
+ }
120
+
121
+ // JPEG
122
+ if data.len() >= 3 && data.starts_with(&[0xFF, 0xD8, 0xFF]) {
123
+ return FileFormat::Jpeg;
124
+ }
125
+
126
+ // BMP
127
+ if data.len() >= 2 && data.starts_with(b"BM") {
128
+ return FileFormat::Bmp;
129
+ }
130
+
131
+ // TIFF (little-endian or big-endian)
132
+ if data.len() >= 4 {
133
+ if data.starts_with(b"II\x2A\x00") || data.starts_with(b"MM\x00\x2A") {
134
+ return FileFormat::Tiff;
135
+ }
136
+ }
137
+
138
+ // OLE Compound Document (old Excel/Word)
139
+ if data.len() >= 4 && data.starts_with(&[0xD0, 0xCF, 0x11, 0xE0]) {
140
+ return FileFormat::Xls; // Old Office format, usually Excel
141
+ }
142
+
143
+ // ZIP archive (could be DOCX, XLSX, PPTX)
144
+ if data.len() >= 2 && data.starts_with(b"PK") {
145
+ return Self::detect_office_format(data);
146
+ }
147
+
148
+ // XML
149
+ if data.len() >= 5 {
150
+ let start = String::from_utf8_lossy(&data[0..5.min(data.len())]);
151
+ if start.starts_with("<?xml") || start.starts_with("<!") {
152
+ return FileFormat::Xml;
153
+ }
154
+ }
155
+
156
+ // HTML
157
+ if data.len() >= 14 {
158
+ let start = String::from_utf8_lossy(&data[0..14.min(data.len())]).to_lowercase();
159
+ if start.contains("<!doctype") || start.contains("<html") {
160
+ return FileFormat::Html;
161
+ }
162
+ }
163
+
164
+ // JSON
165
+ if let Some(&first_non_ws) = data.iter().find(|&&b| !b" \t\n\r".contains(&b)) {
166
+ if first_non_ws == b'{' || first_non_ws == b'[' {
167
+ return FileFormat::Json;
168
+ }
169
+ }
170
+
171
+ // Default to text for unrecognized formats
172
+ FileFormat::Text
173
+ }
174
+
175
+ /// Detect specific Office format from ZIP data
176
+ fn detect_office_format(data: &[u8]) -> FileFormat {
177
+ // Look for Office-specific directory names in first 2KB of ZIP
178
+ let check_len = 2000.min(data.len());
179
+ let content = String::from_utf8_lossy(&data[0..check_len]);
180
+
181
+ // Check for format-specific markers
182
+ if content.contains("word/") || content.contains("word/_rels") {
183
+ FileFormat::Docx
184
+ } else if content.contains("xl/") || content.contains("xl/_rels") {
185
+ FileFormat::Xlsx
186
+ } else if content.contains("ppt/") || content.contains("ppt/_rels") {
187
+ FileFormat::Pptx
188
+ } else {
189
+ // Default to XLSX for generic ZIP (most common Office format)
190
+ FileFormat::Xlsx
191
+ }
192
+ }
193
+
194
+
195
+ /// Get all supported extensions
196
+ pub fn supported_extensions() -> Vec<&'static str> {
197
+ vec![
198
+ "pdf", "docx", "xlsx", "xls", "pptx",
199
+ "png", "jpg", "jpeg", "tiff", "tif", "bmp",
200
+ "json", "xml", "html", "htm",
201
+ "txt", "text", "md", "markdown", "csv"
202
+ ]
203
+ }
204
+ }
205
+
206
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a minimal ZIP-looking buffer: a local file header signature
    /// followed by `entry` where a file name would appear.
    fn zip_with(entry: &str) -> Vec<u8> {
        let mut bytes = b"PK\x03\x04\x14\x00\x00\x00\x08\x00".to_vec();
        bytes.extend_from_slice(entry.as_bytes());
        bytes
    }

    #[test]
    fn test_detect_pdf() {
        let pdf_data = b"%PDF-1.5\n";
        assert_eq!(FormatDetector::detect_from_content(pdf_data), FileFormat::Pdf);
    }

    #[test]
    fn test_detect_png() {
        let png_data = &[0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A];
        assert_eq!(FormatDetector::detect_from_content(png_data), FileFormat::Png);
    }

    #[test]
    fn test_detect_jpeg_and_tiff() {
        assert_eq!(
            FormatDetector::detect_from_content(&[0xFF, 0xD8, 0xFF, 0xE0]),
            FileFormat::Jpeg
        );
        assert_eq!(FormatDetector::detect_from_content(b"II\x2A\x00rest"), FileFormat::Tiff);
        assert_eq!(FormatDetector::detect_from_content(b"MM\x00\x2Arest"), FileFormat::Tiff);
    }

    #[test]
    fn test_detect_json_with_leading_whitespace() {
        assert_eq!(FormatDetector::detect_from_content(b" \n\t[1, 2]"), FileFormat::Json);
        assert_eq!(FormatDetector::detect_from_content(b"{\"a\": 1}"), FileFormat::Json);
    }

    #[test]
    fn test_detect_office_formats() {
        assert_eq!(
            FormatDetector::detect_from_content(&zip_with("word/document.xml")),
            FileFormat::Docx
        );
        assert_eq!(
            FormatDetector::detect_from_content(&zip_with("xl/workbook.xml")),
            FileFormat::Xlsx
        );
        assert_eq!(
            FormatDetector::detect_from_content(&zip_with("ppt/slides/slide1.xml")),
            FileFormat::Pptx
        );
        // A ZIP without Office markers falls back to XLSX.
        assert_eq!(
            FormatDetector::detect_from_content(&zip_with("misc/readme.txt")),
            FileFormat::Xlsx
        );
    }

    #[test]
    fn test_detect_from_extension() {
        assert_eq!(FormatDetector::detect_from_extension("document.pdf"), FileFormat::Pdf);
        assert_eq!(FormatDetector::detect_from_extension("Document.PDF"), FileFormat::Pdf);
        assert_eq!(FormatDetector::detect_from_extension("data.xlsx"), FileFormat::Xlsx);
        assert_eq!(FormatDetector::detect_from_extension("noext"), FileFormat::Unknown);
    }

    #[test]
    fn test_detect_prefers_content_over_extension() {
        assert_eq!(
            FormatDetector::detect(Some("misnamed.txt"), Some(&b"%PDF-1.7"[..])),
            FileFormat::Pdf
        );
    }

    #[test]
    fn test_detect_falls_back_to_extension_then_text() {
        // Content sniffs as plain text, so the extension decides.
        assert_eq!(
            FormatDetector::detect(Some("data.csv"), Some(&b"a,b,c"[..])),
            FileFormat::Text
        );
        // No usable extension: text content is still reported as text.
        assert_eq!(FormatDetector::detect(None, Some(&b"plain"[..])), FileFormat::Text);
        // Nothing to go on at all.
        assert_eq!(FormatDetector::detect(None, None), FileFormat::Unknown);
    }

    #[test]
    fn test_html_symbol_is_xml() {
        assert_eq!(FileFormat::Html.to_symbol(), "xml");
    }

    #[test]
    fn test_empty_data() {
        assert_eq!(FormatDetector::detect_from_content(&[]), FileFormat::Text);
    }
}
@@ -2,6 +2,7 @@ use magnus::{function, prelude::*, Error, Ruby};
2
2
 
3
3
  mod parser;
4
4
  mod error;
5
+ mod format_detector;
5
6
 
6
7
  /// Initialize the ParseKit module and its submodules
7
8
  #[magnus::init]