parsekit 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +2 -2
- data/README.md +1 -1
- data/ext/parsekit/Cargo.toml +1 -1
- data/ext/parsekit/src/format_detector.rs +233 -0
- data/ext/parsekit/src/lib.rs +1 -0
- data/ext/parsekit/src/parser.rs +111 -181
- data/lib/parsekit/NATIVE_API.md +125 -0
- data/lib/parsekit/parsekit.bundle +0 -0
- data/lib/parsekit/parser.rb +155 -104
- data/lib/parsekit/version.rb +1 -1
- data/lib/parsekit.rb +32 -0
- metadata +7 -5
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 6ad6eb42fb7e96fa944f30245b2c7be51bf4ce1a0f7766749309676b225b17df
|
|
4
|
+
data.tar.gz: deb56ea394ac3fedc840e890e4d27de14585661233f19eeaae06baf7be1b1e90
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: dc88b902dd12008a6936f4d62f5d4651544a3f463b725a15d385b919141e93873bd809436e6b9b008baa7b310d149becb2106a29ca103736f6525e09bef871d6
|
|
7
|
+
data.tar.gz: 9cbc5464a5cbe06a241d2253cde81da82c7eb75742654b7753c91a922acc87125f81a33c3e77d0d107a1435e8946a860e12388e44fa84dc887d9bb4bf9d2d3a2
|
data/CHANGELOG.md
CHANGED
|
@@ -49,5 +49,5 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
49
49
|
- Rust edition 2021
|
|
50
50
|
- Cross-compilation support for multiple platforms
|
|
51
51
|
|
|
52
|
-
[Unreleased]: https://github.com/
|
|
53
|
-
[0.1.0]: https://github.com/
|
|
52
|
+
[Unreleased]: https://github.com/scientist-labs/parsekit/compare/v0.1.0...HEAD
|
|
53
|
+
[0.1.0]: https://github.com/scientist-labs/parsekit/releases/tag/v0.1.0
|
data/README.md
CHANGED
|
@@ -186,7 +186,7 @@ ParseKit uses a hybrid Ruby/Rust architecture:
|
|
|
186
186
|
|
|
187
187
|
## Contributing
|
|
188
188
|
|
|
189
|
-
Bug reports and pull requests are welcome on GitHub at https://github.com/
|
|
189
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/scientist-labs/parsekit.
|
|
190
190
|
|
|
191
191
|
## License
|
|
192
192
|
|
data/ext/parsekit/Cargo.toml
CHANGED
|
@@ -21,7 +21,7 @@ image = "0.25" # Image processing library (match rusty-tesseract's version)
|
|
|
21
21
|
calamine = "0.30" # Excel parsing
|
|
22
22
|
docx-rs = "0.4" # Word document parsing
|
|
23
23
|
quick-xml = "0.38" # XML parsing
|
|
24
|
-
zip = "
|
|
24
|
+
zip = "5.0" # ZIP archive handling for PPTX
|
|
25
25
|
serde_json = "1.0" # JSON parsing
|
|
26
26
|
regex = "1.10" # Text parsing
|
|
27
27
|
encoding_rs = "0.8" # Encoding detection
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
use std::path::Path;
|
|
2
|
+
|
|
3
|
+
/// Represents a detected file format.
///
/// The enum is a plain tag without payload, so it is `Copy` and can be
/// compared, hashed, and passed by value freely.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum FileFormat {
    Pdf,
    Docx,
    Xlsx,
    Xls,
    Pptx,
    Png,
    Jpeg,
    Tiff,
    Bmp,
    Json,
    Xml,
    Html,
    Text,
    Unknown,
}

impl FileFormat {
    /// Convert to the string used for the Ruby symbol representation.
    ///
    /// `Html` intentionally maps to `"xml"`: HTML is treated as XML on the
    /// Ruby side.
    pub fn to_symbol(&self) -> &'static str {
        match self {
            FileFormat::Pdf => "pdf",
            FileFormat::Docx => "docx",
            FileFormat::Xlsx => "xlsx",
            FileFormat::Xls => "xls",
            FileFormat::Pptx => "pptx",
            FileFormat::Png => "png",
            FileFormat::Jpeg => "jpeg",
            FileFormat::Tiff => "tiff",
            FileFormat::Bmp => "bmp",
            FileFormat::Json => "json",
            FileFormat::Xml => "xml",
            FileFormat::Html => "xml", // HTML is treated as XML in Ruby
            FileFormat::Text => "text",
            FileFormat::Unknown => "unknown",
        }
    }
}

/// Outcome of sniffing a byte buffer, before any filename hint is applied.
enum ContentGuess {
    /// The bytes positively identify this format.
    Definite(FileFormat),
    /// A ZIP container whose sniffed prefix names no Office directory.
    UnidentifiedZip,
    /// Nothing recognisable (or empty input); treated as plain text.
    PlainText,
}

/// Central format detection logic
pub struct FormatDetector;

impl FormatDetector {
    /// Number of leading bytes of a ZIP container searched for Office
    /// directory names (`word/`, `xl/`, `ppt/`).
    const ZIP_SNIFF_LEN: usize = 2000;

    /// Bytes skipped as insignificant leading whitespace in textual formats.
    const WHITESPACE: &'static [u8] = b" \t\n\r";

    /// Detect format from filename and content.
    ///
    /// Content wins whenever it positively identifies a format. The filename
    /// extension is consulted when:
    ///
    /// * no content is given, or the content looks like plain text; or
    /// * the content is a ZIP container in which no Office directory name was
    ///   found. The bytes then only prove "some ZIP", so a `.docx`, `.xlsx`
    ///   or `.pptx` extension decides; any other (or missing) extension keeps
    ///   the historical default of `Xlsx`.
    ///
    /// Returns `FileFormat::Unknown` only when there is no content and the
    /// extension is missing or unrecognised.
    pub fn detect(filename: Option<&str>, content: Option<&[u8]>) -> FileFormat {
        let by_extension = filename
            .map(Self::detect_from_extension)
            .filter(|format| *format != FileFormat::Unknown);

        // Content is sniffed exactly once; every branch below reuses the result.
        match content.map(Self::classify) {
            Some(ContentGuess::Definite(format)) => format,
            Some(ContentGuess::UnidentifiedZip) => match by_extension {
                Some(format @ (FileFormat::Docx | FileFormat::Xlsx | FileFormat::Pptx)) => format,
                _ => FileFormat::Xlsx,
            },
            Some(ContentGuess::PlainText) => by_extension.unwrap_or(FileFormat::Text),
            None => by_extension.unwrap_or(FileFormat::Unknown),
        }
    }

    /// Detect format from file extension (case-insensitive).
    ///
    /// Returns `FileFormat::Unknown` when the name has no extension or the
    /// extension is not one of [`FormatDetector::supported_extensions`].
    pub fn detect_from_extension(filename: &str) -> FileFormat {
        let ext = match Path::new(filename).extension().and_then(|s| s.to_str()) {
            Some(e) => e.to_lowercase(),
            None => return FileFormat::Unknown,
        };

        match ext.as_str() {
            "pdf" => FileFormat::Pdf,
            "docx" => FileFormat::Docx,
            "xlsx" => FileFormat::Xlsx,
            "xls" => FileFormat::Xls,
            "pptx" => FileFormat::Pptx,
            "png" => FileFormat::Png,
            "jpg" | "jpeg" => FileFormat::Jpeg,
            "tiff" | "tif" => FileFormat::Tiff,
            "bmp" => FileFormat::Bmp,
            "json" => FileFormat::Json,
            "xml" => FileFormat::Xml,
            "html" | "htm" => FileFormat::Html,
            "txt" | "text" | "md" | "markdown" | "csv" => FileFormat::Text,
            _ => FileFormat::Unknown,
        }
    }

    /// Detect format from file content (magic bytes and leading markup).
    ///
    /// Never returns `FileFormat::Unknown`: empty or unrecognised input is
    /// reported as `Text`, and a ZIP container without an Office directory
    /// name in its first bytes is reported as `Xlsx`.
    pub fn detect_from_content(data: &[u8]) -> FileFormat {
        match Self::classify(data) {
            ContentGuess::Definite(format) => format,
            // Default to XLSX for generic ZIP (most common Office format)
            ContentGuess::UnidentifiedZip => FileFormat::Xlsx,
            ContentGuess::PlainText => FileFormat::Text,
        }
    }

    /// Get all supported extensions
    pub fn supported_extensions() -> Vec<&'static str> {
        vec![
            "pdf", "docx", "xlsx", "xls", "pptx",
            "png", "jpg", "jpeg", "tiff", "tif", "bmp",
            "json", "xml", "html", "htm",
            "txt", "text", "md", "markdown", "csv",
        ]
    }

    /// Sniff `data` and report how confident the result is.
    fn classify(data: &[u8]) -> ContentGuess {
        use ContentGuess::{Definite, PlainText, UnidentifiedZip};

        if data.is_empty() {
            return PlainText; // Empty files are treated as text
        }

        // Binary signatures. `starts_with` is already false for inputs shorter
        // than the signature, so no separate length checks are needed.
        if data.starts_with(b"%PDF") {
            return Definite(FileFormat::Pdf);
        }
        if data.starts_with(&[0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A]) {
            return Definite(FileFormat::Png);
        }
        if data.starts_with(&[0xFF, 0xD8, 0xFF]) {
            return Definite(FileFormat::Jpeg);
        }
        if data.starts_with(b"BM") {
            return Definite(FileFormat::Bmp);
        }
        // TIFF (little-endian or big-endian)
        if data.starts_with(b"II\x2A\x00") || data.starts_with(b"MM\x00\x2A") {
            return Definite(FileFormat::Tiff);
        }
        // OLE Compound Document: old Office format, usually Excel
        if data.starts_with(&[0xD0, 0xCF, 0x11, 0xE0]) {
            return Definite(FileFormat::Xls);
        }
        // ZIP archive (could be DOCX, XLSX, PPTX)
        if data.starts_with(b"PK") {
            return Self::office_marker(data).map_or(UnidentifiedZip, Definite);
        }

        let body = Self::skip_leading_whitespace(data);

        // HTML is tested before generic XML so that `<!DOCTYPE html>` is not
        // swallowed by the `<!` rule below. Matching is anchored at the first
        // non-whitespace byte so that text merely *containing* "<html" (for
        // example inside a JSON string) is not misreported.
        if Self::starts_with_ignore_ascii_case(body, b"<!doctype html")
            || Self::starts_with_ignore_ascii_case(body, b"<html")
        {
            return Definite(FileFormat::Html);
        }

        // XML: declaration or `<!…` construct at the very start, or a
        // non-HTML doctype after leading whitespace.
        if data.starts_with(b"<?xml")
            || data.starts_with(b"<!")
            || Self::starts_with_ignore_ascii_case(body, b"<!doctype")
        {
            return Definite(FileFormat::Xml);
        }

        // JSON: first significant byte opens an object or an array.
        if matches!(body.first(), Some(b'{' | b'[')) {
            return Definite(FileFormat::Json);
        }

        // Default to text for unrecognized formats
        PlainText
    }

    /// Look for Office-specific directory names near the start of a ZIP
    /// container. Returns `None` when none is found in the sniffed prefix.
    fn office_marker(data: &[u8]) -> Option<FileFormat> {
        let prefix = &data[..data.len().min(Self::ZIP_SNIFF_LEN)];

        if Self::contains_bytes(prefix, b"word/") {
            Some(FileFormat::Docx)
        } else if Self::contains_bytes(prefix, b"xl/") {
            Some(FileFormat::Xlsx)
        } else if Self::contains_bytes(prefix, b"ppt/") {
            Some(FileFormat::Pptx)
        } else {
            None
        }
    }

    /// Return `data` without its leading space, tab, LF and CR bytes.
    fn skip_leading_whitespace(data: &[u8]) -> &[u8] {
        let start = data
            .iter()
            .position(|byte| !Self::WHITESPACE.contains(byte))
            .unwrap_or(data.len());
        &data[start..]
    }

    /// ASCII case-insensitive `starts_with` for byte slices.
    fn starts_with_ignore_ascii_case(data: &[u8], prefix: &[u8]) -> bool {
        data.get(..prefix.len())
            .map_or(false, |head| head.eq_ignore_ascii_case(prefix))
    }

    /// Whether the non-empty `needle` occurs anywhere in `haystack`.
    fn contains_bytes(haystack: &[u8], needle: &[u8]) -> bool {
        haystack.windows(needle.len()).any(|window| window == needle)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_detect_pdf() {
        let pdf_data = b"%PDF-1.5\n";
        assert_eq!(FormatDetector::detect_from_content(pdf_data), FileFormat::Pdf);
    }

    #[test]
    fn test_detect_png() {
        let png_data: &[u8] = &[0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A];
        assert_eq!(FormatDetector::detect_from_content(png_data), FileFormat::Png);
    }

    #[test]
    fn test_detect_from_extension() {
        assert_eq!(FormatDetector::detect_from_extension("document.pdf"), FileFormat::Pdf);
        assert_eq!(FormatDetector::detect_from_extension("Document.PDF"), FileFormat::Pdf);
        assert_eq!(FormatDetector::detect_from_extension("data.xlsx"), FileFormat::Xlsx);
        assert_eq!(FormatDetector::detect_from_extension("README"), FileFormat::Unknown);
    }

    #[test]
    fn test_empty_data() {
        assert_eq!(FormatDetector::detect_from_content(&[]), FileFormat::Text);
    }

    #[test]
    fn test_html_doctype_is_html() {
        assert_eq!(
            FormatDetector::detect_from_content(b"<!DOCTYPE html><html></html>"),
            FileFormat::Html
        );
        assert_eq!(FormatDetector::detect_from_content(b"<html></html>"), FileFormat::Html);
    }

    #[test]
    fn test_json_mentioning_html_is_json() {
        assert_eq!(
            FormatDetector::detect_from_content(b"{\"a\": \"<html>\"}"),
            FileFormat::Json
        );
    }

    #[test]
    fn test_unidentified_zip_uses_office_extension() {
        let zip: &[u8] = b"PK\x03\x04 opaque compressed payload";
        assert_eq!(FormatDetector::detect(Some("a.docx"), Some(zip)), FileFormat::Docx);
        assert_eq!(FormatDetector::detect(None, Some(zip)), FileFormat::Xlsx);
    }
}
|
data/ext/parsekit/src/lib.rs
CHANGED
data/ext/parsekit/src/parser.rs
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
use magnus::{
|
|
2
2
|
function, method, prelude::*, scan_args, Error, Module, RHash, RModule, Ruby, Value,
|
|
3
3
|
};
|
|
4
|
-
use
|
|
4
|
+
use crate::format_detector::{FileFormat, FormatDetector};
|
|
5
5
|
|
|
6
6
|
#[derive(Debug, Clone)]
|
|
7
7
|
#[magnus::wrap(class = "ParseKit::Parser", free_immediately, size)]
|
|
@@ -28,6 +28,33 @@ impl Default for ParserConfig {
|
|
|
28
28
|
}
|
|
29
29
|
}
|
|
30
30
|
|
|
31
|
+
// Error handling helpers
|
|
32
|
+
impl Parser {
|
|
33
|
+
/// Create a RuntimeError with formatted message
|
|
34
|
+
fn runtime_error<E: std::fmt::Display>(context: &str, err: E) -> Error {
|
|
35
|
+
Error::new(
|
|
36
|
+
Ruby::get().unwrap().exception_runtime_error(),
|
|
37
|
+
format!("{}: {}", context, err),
|
|
38
|
+
)
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
/// Create an ArgumentError with message
|
|
42
|
+
fn argument_error(msg: &str) -> Error {
|
|
43
|
+
Error::new(
|
|
44
|
+
Ruby::get().unwrap().exception_arg_error(),
|
|
45
|
+
msg.to_string(),
|
|
46
|
+
)
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
/// Create an IOError with formatted message
|
|
50
|
+
fn io_error<E: std::fmt::Display>(context: &str, err: E) -> Error {
|
|
51
|
+
Error::new(
|
|
52
|
+
Ruby::get().unwrap().exception_io_error(),
|
|
53
|
+
format!("{}: {}", context, err),
|
|
54
|
+
)
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
|
|
31
58
|
impl Parser {
|
|
32
59
|
/// Create a new Parser instance with optional configuration
|
|
33
60
|
fn new(ruby: &Ruby, args: &[Value]) -> Result<Self, Error> {
|
|
@@ -58,73 +85,49 @@ impl Parser {
|
|
|
58
85
|
fn parse_bytes_internal(&self, data: Vec<u8>, filename: Option<&str>) -> Result<String, Error> {
|
|
59
86
|
// Check size limit
|
|
60
87
|
if data.len() > self.config.max_size {
|
|
61
|
-
return Err(
|
|
62
|
-
|
|
63
|
-
format!(
|
|
64
|
-
|
|
65
|
-
data.len(),
|
|
66
|
-
self.config.max_size
|
|
67
|
-
),
|
|
88
|
+
return Err(Self::runtime_error(
|
|
89
|
+
"File size exceeds limit",
|
|
90
|
+
format!("{} bytes exceeds maximum allowed size of {} bytes",
|
|
91
|
+
data.len(), self.config.max_size)
|
|
68
92
|
));
|
|
69
93
|
}
|
|
70
94
|
|
|
71
|
-
//
|
|
72
|
-
let
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
};
|
|
77
|
-
|
|
78
|
-
match file_type.as_str() {
|
|
79
|
-
"pdf" => self.parse_pdf(data),
|
|
80
|
-
"docx" => self.parse_docx(data),
|
|
81
|
-
"pptx" => self.parse_pptx(data),
|
|
82
|
-
"xlsx" | "xls" => self.parse_xlsx(data),
|
|
83
|
-
"json" => self.parse_json(data),
|
|
84
|
-
"xml" | "html" => self.parse_xml(data),
|
|
85
|
-
"png" | "jpg" | "jpeg" | "tiff" | "bmp" => self.ocr_image(data),
|
|
86
|
-
"txt" | "text" => self.parse_text(data),
|
|
87
|
-
_ => self.parse_text(data), // Default to text parsing
|
|
88
|
-
}
|
|
95
|
+
// Use centralized format detection
|
|
96
|
+
let format = FormatDetector::detect(filename, Some(&data));
|
|
97
|
+
|
|
98
|
+
// Use centralized dispatch
|
|
99
|
+
self.dispatch_to_parser(format, data)
|
|
89
100
|
}
|
|
90
|
-
|
|
91
|
-
///
|
|
92
|
-
fn
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
101
|
+
|
|
102
|
+
/// Centralized dispatch logic - routes format to appropriate parser
|
|
103
|
+
fn dispatch_to_parser(&self, format: FileFormat, data: Vec<u8>) -> Result<String, Error> {
|
|
104
|
+
match format {
|
|
105
|
+
FileFormat::Pdf => self.parse_pdf(data),
|
|
106
|
+
FileFormat::Docx => self.parse_docx(data),
|
|
107
|
+
FileFormat::Pptx => self.parse_pptx(data),
|
|
108
|
+
FileFormat::Xlsx | FileFormat::Xls => self.parse_xlsx(data),
|
|
109
|
+
FileFormat::Json => self.parse_json(data),
|
|
110
|
+
FileFormat::Xml | FileFormat::Html => self.parse_xml(data),
|
|
111
|
+
FileFormat::Png | FileFormat::Jpeg | FileFormat::Tiff | FileFormat::Bmp => self.ocr_image(data),
|
|
112
|
+
FileFormat::Text | FileFormat::Unknown => self.parse_text(data),
|
|
97
113
|
}
|
|
98
114
|
}
|
|
99
115
|
|
|
100
|
-
///
|
|
101
|
-
fn
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
// This is a simplified check - both DOCX and XLSX are ZIP files
|
|
108
|
-
// For now, default to xlsx as it's more commonly parsed
|
|
109
|
-
"xlsx".to_string() // Office Open XML format (could also be DOCX)
|
|
110
|
-
} else if data.starts_with(&[0xD0, 0xCF, 0x11, 0xE0]) {
|
|
111
|
-
"xls".to_string() // Old Excel format
|
|
112
|
-
} else if data.starts_with(&[0x89, 0x50, 0x4E, 0x47]) {
|
|
113
|
-
"png".to_string() // PNG signature
|
|
114
|
-
} else if data.starts_with(&[0xFF, 0xD8, 0xFF]) {
|
|
115
|
-
"jpg".to_string() // JPEG signature
|
|
116
|
-
} else if data.starts_with(b"BM") {
|
|
117
|
-
"bmp".to_string() // BMP signature
|
|
118
|
-
} else if data.starts_with(b"II\x2A\x00") || data.starts_with(b"MM\x00\x2A") {
|
|
119
|
-
"tiff".to_string() // TIFF signature (little-endian or big-endian)
|
|
120
|
-
} else if data.starts_with(b"<?xml") || data.starts_with(b"<html") {
|
|
121
|
-
"xml".to_string()
|
|
122
|
-
} else if data.starts_with(b"{") || data.starts_with(b"[") {
|
|
123
|
-
"json".to_string()
|
|
124
|
-
} else {
|
|
125
|
-
"txt".to_string()
|
|
116
|
+
/// Ruby-accessible method to detect format from bytes
|
|
117
|
+
fn detect_format_from_bytes(&self, data: Vec<u8>) -> String {
|
|
118
|
+
let format = FormatDetector::detect_from_content(&data);
|
|
119
|
+
// For compatibility with Ruby tests, return "xlsx" for old Excel
|
|
120
|
+
match format {
|
|
121
|
+
FileFormat::Xls => "xlsx".to_string(), // Compatibility with existing tests
|
|
122
|
+
_ => format.to_symbol().to_string(),
|
|
126
123
|
}
|
|
127
124
|
}
|
|
125
|
+
|
|
126
|
+
/// Ruby-accessible method to detect format from filename
|
|
127
|
+
fn detect_format_from_filename(&self, filename: String) -> String {
|
|
128
|
+
let format = FormatDetector::detect_from_extension(&filename);
|
|
129
|
+
format.to_symbol().to_string()
|
|
130
|
+
}
|
|
128
131
|
|
|
129
132
|
/// Perform OCR on image data using Tesseract
|
|
130
133
|
fn ocr_image(&self, data: Vec<u8>) -> Result<String, Error> {
|
|
@@ -191,20 +194,12 @@ impl Parser {
|
|
|
191
194
|
};
|
|
192
195
|
|
|
193
196
|
if let Err(e) = init_result {
|
|
194
|
-
return Err(
|
|
195
|
-
Ruby::get().unwrap().exception_runtime_error(),
|
|
196
|
-
format!("Failed to initialize Tesseract: {:?}", e),
|
|
197
|
-
))
|
|
197
|
+
return Err(Self::runtime_error("Failed to initialize Tesseract", e));
|
|
198
198
|
}
|
|
199
199
|
|
|
200
200
|
// Load the image from bytes
|
|
201
|
-
let img =
|
|
202
|
-
|
|
203
|
-
Err(e) => return Err(Error::new(
|
|
204
|
-
Ruby::get().unwrap().exception_runtime_error(),
|
|
205
|
-
format!("Failed to load image: {}", e),
|
|
206
|
-
))
|
|
207
|
-
};
|
|
201
|
+
let img = image::load_from_memory(&data)
|
|
202
|
+
.map_err(|e| Self::runtime_error("Failed to load image", e))?;
|
|
208
203
|
|
|
209
204
|
// Convert to RGBA8 format
|
|
210
205
|
let rgba_img = img.to_rgba8();
|
|
@@ -212,27 +207,18 @@ impl Parser {
|
|
|
212
207
|
let raw_data = rgba_img.into_raw();
|
|
213
208
|
|
|
214
209
|
// Set image data
|
|
215
|
-
|
|
210
|
+
tesseract.set_image(
|
|
216
211
|
&raw_data,
|
|
217
212
|
width as i32,
|
|
218
213
|
height as i32,
|
|
219
214
|
4, // bytes per pixel (RGBA)
|
|
220
215
|
(width * 4) as i32, // bytes per line
|
|
221
|
-
)
|
|
222
|
-
return Err(Error::new(
|
|
223
|
-
Ruby::get().unwrap().exception_runtime_error(),
|
|
224
|
-
format!("Failed to set image: {}", e),
|
|
225
|
-
))
|
|
226
|
-
}
|
|
216
|
+
).map_err(|e| Self::runtime_error("Failed to set image", e))?;
|
|
227
217
|
|
|
228
218
|
// Extract text
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
Ruby::get().unwrap().exception_runtime_error(),
|
|
233
|
-
format!("Failed to perform OCR: {}", e),
|
|
234
|
-
)),
|
|
235
|
-
}
|
|
219
|
+
tesseract.get_utf8_text()
|
|
220
|
+
.map(|text| text.trim().to_string())
|
|
221
|
+
.map_err(|e| Self::runtime_error("Failed to perform OCR", e))
|
|
236
222
|
}
|
|
237
223
|
|
|
238
224
|
|
|
@@ -242,51 +228,31 @@ impl Parser {
|
|
|
242
228
|
|
|
243
229
|
// Try to load the PDF from memory
|
|
244
230
|
// The magic parameter helps MuPDF identify the file type
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
match doc.load_page(page_num) {
|
|
263
|
-
Ok(page) => {
|
|
264
|
-
// Extract text from the page
|
|
265
|
-
match page.to_text() {
|
|
266
|
-
Ok(text) => {
|
|
267
|
-
all_text.push_str(&text);
|
|
268
|
-
all_text.push('\n');
|
|
269
|
-
}
|
|
270
|
-
Err(_) => continue,
|
|
271
|
-
}
|
|
272
|
-
}
|
|
273
|
-
Err(_) => continue,
|
|
274
|
-
}
|
|
275
|
-
}
|
|
276
|
-
|
|
277
|
-
if all_text.is_empty() {
|
|
278
|
-
Ok(
|
|
279
|
-
"PDF contains no extractable text (might be scanned/image-based)"
|
|
280
|
-
.to_string(),
|
|
281
|
-
)
|
|
282
|
-
} else {
|
|
283
|
-
Ok(all_text.trim().to_string())
|
|
231
|
+
let doc = Document::from_bytes(&data, "pdf")
|
|
232
|
+
.map_err(|e| Self::runtime_error("Failed to parse PDF", e))?;
|
|
233
|
+
|
|
234
|
+
let mut all_text = String::new();
|
|
235
|
+
|
|
236
|
+
// Get page count
|
|
237
|
+
let page_count = doc.page_count()
|
|
238
|
+
.map_err(|e| Self::runtime_error("Failed to get page count", e))?;
|
|
239
|
+
|
|
240
|
+
// Iterate through pages
|
|
241
|
+
for page_num in 0..page_count {
|
|
242
|
+
// Continue on page errors rather than failing entirely
|
|
243
|
+
if let Ok(page) = doc.load_page(page_num) {
|
|
244
|
+
// Extract text from the page
|
|
245
|
+
if let Ok(text) = page.to_text() {
|
|
246
|
+
all_text.push_str(&text);
|
|
247
|
+
all_text.push('\n');
|
|
284
248
|
}
|
|
285
249
|
}
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
))
|
|
250
|
+
}
|
|
251
|
+
|
|
252
|
+
if all_text.is_empty() {
|
|
253
|
+
Ok("PDF contains no extractable text (might be scanned/image-based)".to_string())
|
|
254
|
+
} else {
|
|
255
|
+
Ok(all_text.trim().to_string())
|
|
290
256
|
}
|
|
291
257
|
}
|
|
292
258
|
|
|
@@ -322,10 +288,7 @@ impl Parser {
|
|
|
322
288
|
|
|
323
289
|
Ok(result.trim().to_string())
|
|
324
290
|
}
|
|
325
|
-
Err(e) => Err(
|
|
326
|
-
Ruby::get().unwrap().exception_runtime_error(),
|
|
327
|
-
format!("Failed to parse DOCX file: {}", e),
|
|
328
|
-
)),
|
|
291
|
+
Err(e) => Err(Self::runtime_error("Failed to parse DOCX file", e)),
|
|
329
292
|
}
|
|
330
293
|
}
|
|
331
294
|
|
|
@@ -335,15 +298,8 @@ impl Parser {
|
|
|
335
298
|
use zip::ZipArchive;
|
|
336
299
|
|
|
337
300
|
let cursor = Cursor::new(data);
|
|
338
|
-
let mut archive =
|
|
339
|
-
|
|
340
|
-
Err(e) => {
|
|
341
|
-
return Err(Error::new(
|
|
342
|
-
Ruby::get().unwrap().exception_runtime_error(),
|
|
343
|
-
format!("Failed to open PPTX as ZIP: {}", e),
|
|
344
|
-
))
|
|
345
|
-
}
|
|
346
|
-
};
|
|
301
|
+
let mut archive = ZipArchive::new(cursor)
|
|
302
|
+
.map_err(|e| Self::runtime_error("Failed to open PPTX as ZIP", e))?;
|
|
347
303
|
|
|
348
304
|
let mut all_text = Vec::new();
|
|
349
305
|
let mut slide_numbers = Vec::new();
|
|
@@ -492,10 +448,7 @@ impl Parser {
|
|
|
492
448
|
|
|
493
449
|
Ok(result)
|
|
494
450
|
}
|
|
495
|
-
Err(e) => Err(
|
|
496
|
-
Ruby::get().unwrap().exception_runtime_error(),
|
|
497
|
-
format!("Failed to parse Excel file: {}", e),
|
|
498
|
-
)),
|
|
451
|
+
Err(e) => Err(Self::runtime_error("Failed to parse Excel file", e)),
|
|
499
452
|
}
|
|
500
453
|
}
|
|
501
454
|
|
|
@@ -527,10 +480,7 @@ impl Parser {
|
|
|
527
480
|
}
|
|
528
481
|
Ok(Event::Eof) => break,
|
|
529
482
|
Err(e) => {
|
|
530
|
-
return Err(
|
|
531
|
-
Ruby::get().unwrap().exception_runtime_error(),
|
|
532
|
-
format!("XML parse error: {}", e),
|
|
533
|
-
))
|
|
483
|
+
return Err(Self::runtime_error("XML parse error", e))
|
|
534
484
|
}
|
|
535
485
|
_ => {}
|
|
536
486
|
}
|
|
@@ -557,10 +507,7 @@ impl Parser {
|
|
|
557
507
|
/// Parse input string (for text content)
|
|
558
508
|
fn parse(&self, input: String) -> Result<String, Error> {
|
|
559
509
|
if input.is_empty() {
|
|
560
|
-
return Err(
|
|
561
|
-
Ruby::get().unwrap().exception_arg_error(),
|
|
562
|
-
"Input cannot be empty",
|
|
563
|
-
));
|
|
510
|
+
return Err(Self::argument_error("Input cannot be empty"));
|
|
564
511
|
}
|
|
565
512
|
|
|
566
513
|
// For string input, just return cleaned text
|
|
@@ -576,12 +523,8 @@ impl Parser {
|
|
|
576
523
|
fn parse_file(&self, path: String) -> Result<String, Error> {
|
|
577
524
|
use std::fs;
|
|
578
525
|
|
|
579
|
-
let data = fs::read(&path)
|
|
580
|
-
|
|
581
|
-
Ruby::get().unwrap().exception_io_error(),
|
|
582
|
-
format!("Failed to read file: {}", e),
|
|
583
|
-
)
|
|
584
|
-
})?;
|
|
526
|
+
let data = fs::read(&path)
|
|
527
|
+
.map_err(|e| Self::io_error("Failed to read file", e))?;
|
|
585
528
|
|
|
586
529
|
self.parse_bytes_internal(data, Some(&path))
|
|
587
530
|
}
|
|
@@ -589,10 +532,7 @@ impl Parser {
|
|
|
589
532
|
/// Parse bytes from Ruby
|
|
590
533
|
fn parse_bytes(&self, data: Vec<u8>) -> Result<String, Error> {
|
|
591
534
|
if data.is_empty() {
|
|
592
|
-
return Err(
|
|
593
|
-
Ruby::get().unwrap().exception_arg_error(),
|
|
594
|
-
"Data cannot be empty",
|
|
595
|
-
));
|
|
535
|
+
return Err(Self::argument_error("Data cannot be empty"));
|
|
596
536
|
}
|
|
597
537
|
|
|
598
538
|
self.parse_bytes_internal(data, None)
|
|
@@ -616,25 +556,11 @@ impl Parser {
|
|
|
616
556
|
|
|
617
557
|
/// Check supported file types
|
|
618
558
|
fn supported_formats() -> Vec<String> {
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
"htm".to_string(), // HTML files (alternative extension)
|
|
625
|
-
"md".to_string(), // Markdown files
|
|
626
|
-
"docx".to_string(),
|
|
627
|
-
"pptx".to_string(),
|
|
628
|
-
"xlsx".to_string(),
|
|
629
|
-
"xls".to_string(),
|
|
630
|
-
"csv".to_string(),
|
|
631
|
-
"pdf".to_string(), // Text extraction via MuPDF
|
|
632
|
-
"png".to_string(), // OCR via Tesseract
|
|
633
|
-
"jpg".to_string(), // OCR via Tesseract
|
|
634
|
-
"jpeg".to_string(), // OCR via Tesseract
|
|
635
|
-
"tiff".to_string(), // OCR via Tesseract
|
|
636
|
-
"bmp".to_string(), // OCR via Tesseract
|
|
637
|
-
]
|
|
559
|
+
// Use the centralized list from FormatDetector
|
|
560
|
+
FormatDetector::supported_extensions()
|
|
561
|
+
.iter()
|
|
562
|
+
.map(|&s| s.to_string())
|
|
563
|
+
.collect()
|
|
638
564
|
}
|
|
639
565
|
|
|
640
566
|
/// Detect if file extension is supported
|
|
@@ -688,6 +614,10 @@ pub fn init(_ruby: &Ruby, module: RModule) -> Result<(), Error> {
|
|
|
688
614
|
class.define_method("parse_xml", method!(Parser::parse_xml, 1))?;
|
|
689
615
|
class.define_method("parse_text", method!(Parser::parse_text, 1))?;
|
|
690
616
|
class.define_method("ocr_image", method!(Parser::ocr_image, 1))?;
|
|
617
|
+
|
|
618
|
+
// Format detection methods
|
|
619
|
+
class.define_method("detect_format_from_bytes", method!(Parser::detect_format_from_bytes, 1))?;
|
|
620
|
+
class.define_method("detect_format_from_filename", method!(Parser::detect_format_from_filename, 1))?;
|
|
691
621
|
|
|
692
622
|
// Class methods
|
|
693
623
|
class.define_singleton_method("supported_formats", function!(Parser::supported_formats, 0))?;
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
# ParseKit Native API Documentation
|
|
2
|
+
|
|
3
|
+
This document describes the methods implemented in the Rust native extension for ParseKit::Parser.
|
|
4
|
+
|
|
5
|
+
## Instance Methods
|
|
6
|
+
|
|
7
|
+
### `initialize(options = {})`
|
|
8
|
+
Initialize a new Parser instance with optional configuration.
|
|
9
|
+
|
|
10
|
+
**Parameters:**
|
|
11
|
+
- `options` [Hash] Configuration options
|
|
12
|
+
- `:encoding` [String] Input encoding (default: UTF-8)
|
|
13
|
+
- `:strict_mode` [Boolean] Enable strict parsing mode (default: false)
|
|
14
|
+
- `:max_depth` [Integer] Maximum nesting depth (default: 100)
|
|
15
|
+
- `:max_size` [Integer] Maximum file size in bytes (default: 100MB)
|
|
16
|
+
|
|
17
|
+
### `parse(input)`
|
|
18
|
+
Parse an input string (for text content).
|
|
19
|
+
|
|
20
|
+
**Parameters:**
|
|
21
|
+
- `input` [String] The input to parse
|
|
22
|
+
|
|
23
|
+
**Returns:**
|
|
24
|
+
- [String] The parsed result
|
|
25
|
+
|
|
26
|
+
**Raises:**
|
|
27
|
+
- `ArgumentError` If input is empty
|
|
28
|
+
|
|
29
|
+
### `parse_file(path)`
|
|
30
|
+
Parse a file (supports PDF, Office documents, text files, images with OCR).
|
|
31
|
+
|
|
32
|
+
**Parameters:**
|
|
33
|
+
- `path` [String] Path to the file to parse
|
|
34
|
+
|
|
35
|
+
**Returns:**
|
|
36
|
+
- [String] The extracted text content
|
|
37
|
+
|
|
38
|
+
**Raises:**
|
|
39
|
+
- `IOError` If file cannot be read
|
|
40
|
+
- `RuntimeError` If parsing fails
|
|
41
|
+
|
|
42
|
+
### `parse_bytes(data)`
|
|
43
|
+
Parse binary data.
|
|
44
|
+
|
|
45
|
+
**Parameters:**
|
|
46
|
+
- `data` [Array<Integer>] Binary data as byte array
|
|
47
|
+
|
|
48
|
+
**Returns:**
|
|
49
|
+
- [String] The extracted text content
|
|
50
|
+
|
|
51
|
+
**Raises:**
|
|
52
|
+
- `ArgumentError` If data is empty
|
|
53
|
+
- `RuntimeError` If parsing fails
|
|
54
|
+
|
|
55
|
+
### `config`
|
|
56
|
+
Get the current parser configuration.
|
|
57
|
+
|
|
58
|
+
**Returns:**
|
|
59
|
+
- [Hash] The parser configuration including encoding, strict_mode, max_depth, and max_size
|
|
60
|
+
|
|
61
|
+
### `supports_file?(path)`
|
|
62
|
+
Check if a file format is supported.
|
|
63
|
+
|
|
64
|
+
**Parameters:**
|
|
65
|
+
- `path` [String] File path to check
|
|
66
|
+
|
|
67
|
+
**Returns:**
|
|
68
|
+
- [Boolean] True if the file format is supported
|
|
69
|
+
|
|
70
|
+
### `strict_mode?`
|
|
71
|
+
Check if strict mode is enabled.
|
|
72
|
+
|
|
73
|
+
**Returns:**
|
|
74
|
+
- [Boolean] True if strict mode is enabled
|
|
75
|
+
|
|
76
|
+
## Format-Specific Parsers
|
|
77
|
+
|
|
78
|
+
These methods are also available but typically called internally via `parse_file` or `parse_bytes`:
|
|
79
|
+
|
|
80
|
+
### `parse_pdf(data)`
|
|
81
|
+
Parse PDF files using MuPDF (statically linked).
|
|
82
|
+
|
|
83
|
+
### `parse_docx(data)`
|
|
84
|
+
Parse Microsoft Word documents.
|
|
85
|
+
|
|
86
|
+
### `parse_pptx(data)`
|
|
87
|
+
Parse Microsoft PowerPoint presentations.
|
|
88
|
+
|
|
89
|
+
### `parse_xlsx(data)`
|
|
90
|
+
Parse Microsoft Excel spreadsheets.
|
|
91
|
+
|
|
92
|
+
### `parse_json(data)`
|
|
93
|
+
Parse and pretty-print JSON data.
|
|
94
|
+
|
|
95
|
+
### `parse_xml(data)`
|
|
96
|
+
Parse XML/HTML files and extract text content.
|
|
97
|
+
|
|
98
|
+
### `parse_text(data)`
|
|
99
|
+
Parse plain text files.
|
|
100
|
+
|
|
101
|
+
### `ocr_image(data)`
|
|
102
|
+
Perform OCR on images (PNG, JPEG, TIFF, BMP) using Tesseract.
|
|
103
|
+
|
|
104
|
+
## Class Methods
|
|
105
|
+
|
|
106
|
+
### `Parser.supported_formats`
|
|
107
|
+
Get list of supported file formats.
|
|
108
|
+
|
|
109
|
+
**Returns:**
|
|
110
|
+
- [Array<String>] List of supported file extensions
|
|
111
|
+
|
|
112
|
+
**Example:**
|
|
113
|
+
```ruby
|
|
114
|
+
ParseKit::Parser.supported_formats
|
|
115
|
+
# => ["pdf", "docx", "xlsx", "xls", "pptx", "png", "jpg", "jpeg", "tiff", "tif", "bmp", "json", "xml", "html", "htm", "txt", "text", "md", "markdown", "csv"]
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## Implementation Notes
|
|
119
|
+
|
|
120
|
+
All these methods are implemented in Rust via the native extension. The Ruby layer (`lib/parsekit/parser.rb`) provides additional convenience methods and helpers that wrap these native methods.
|
|
121
|
+
|
|
122
|
+
The native extension uses:
|
|
123
|
+
- **MuPDF** for PDF parsing (statically linked)
|
|
124
|
+
- **Tesseract** for OCR functionality (bundled)
|
|
125
|
+
- **Various Rust crates** for Office document parsing (docx-rs, calamine, etc.)
|
|
Binary file
|
data/lib/parsekit/parser.rb
CHANGED
|
@@ -3,65 +3,24 @@
|
|
|
3
3
|
module ParseKit
|
|
4
4
|
# Ruby wrapper for the native Parser class
|
|
5
5
|
#
|
|
6
|
-
#
|
|
7
|
-
#
|
|
6
|
+
# This class provides document parsing capabilities through a native Rust extension.
|
|
7
|
+
# For documentation of native methods, see NATIVE_API.md
|
|
8
|
+
#
|
|
9
|
+
# The Ruby layer provides convenience methods and helpers while the Rust
|
|
10
|
+
# extension handles the actual parsing of PDF, Office documents, images (OCR), etc.
|
|
8
11
|
class Parser
|
|
9
|
-
#
|
|
10
|
-
#
|
|
11
|
-
|
|
12
|
-
#
|
|
13
|
-
#
|
|
14
|
-
#
|
|
15
|
-
#
|
|
16
|
-
#
|
|
17
|
-
#
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
#
|
|
21
|
-
# @return [String] The parsed result
|
|
22
|
-
# @raise [ArgumentError] If input is empty
|
|
23
|
-
# def parse(input)
|
|
24
|
-
# # Implemented in native extension
|
|
25
|
-
# end
|
|
26
|
-
|
|
27
|
-
# Parse a file (supports PDF, Office documents, text files)
|
|
28
|
-
# @param path [String] Path to the file to parse
|
|
29
|
-
# @return [String] The extracted text content
|
|
30
|
-
# @raise [IOError] If file cannot be read
|
|
31
|
-
# @raise [RuntimeError] If parsing fails
|
|
32
|
-
# def parse_file(path)
|
|
33
|
-
# # Implemented in native extension
|
|
34
|
-
# end
|
|
35
|
-
|
|
36
|
-
# Parse binary data
|
|
37
|
-
# @param data [Array<Integer>] Binary data as byte array
|
|
38
|
-
# @return [String] The extracted text content
|
|
39
|
-
# @raise [ArgumentError] If data is empty
|
|
40
|
-
# @raise [RuntimeError] If parsing fails
|
|
41
|
-
# def parse_bytes(data)
|
|
42
|
-
# # Implemented in native extension
|
|
43
|
-
# end
|
|
44
|
-
|
|
45
|
-
# Get the current configuration
|
|
46
|
-
# @return [Hash] The parser configuration
|
|
47
|
-
# def config
|
|
48
|
-
# # Implemented in native extension
|
|
49
|
-
# end
|
|
50
|
-
|
|
51
|
-
# Check if a file format is supported
|
|
52
|
-
# @param path [String] File path to check
|
|
53
|
-
# @return [Boolean] True if the file format is supported
|
|
54
|
-
# def supports_file?(path)
|
|
55
|
-
# # Implemented in native extension
|
|
56
|
-
# end
|
|
57
|
-
|
|
58
|
-
# Get list of supported file formats
|
|
59
|
-
# @return [Array<String>] List of supported file extensions
|
|
60
|
-
# def self.supported_formats
|
|
61
|
-
# # Implemented in native extension
|
|
62
|
-
# end
|
|
63
|
-
|
|
64
|
-
# Ruby-level helper methods
|
|
12
|
+
# Native methods implemented in Rust:
|
|
13
|
+
# - initialize(options = {})
|
|
14
|
+
# - parse(input)
|
|
15
|
+
# - parse_file(path)
|
|
16
|
+
# - parse_bytes(data)
|
|
17
|
+
# - config
|
|
18
|
+
# - supports_file?(path)
|
|
19
|
+
# - strict_mode?
|
|
20
|
+
# - parse_pdf, parse_docx, parse_xlsx, parse_pptx, parse_json, parse_xml, parse_text, ocr_image
|
|
21
|
+
# See NATIVE_API.md for detailed documentation
|
|
22
|
+
|
|
23
|
+
# Ruby convenience methods and helpers
|
|
65
24
|
|
|
66
25
|
# Create a parser with strict mode enabled
|
|
67
26
|
# @param options [Hash] Additional options
|
|
@@ -81,6 +40,7 @@ module ParseKit
|
|
|
81
40
|
end
|
|
82
41
|
|
|
83
42
|
# Detect format from file path
|
|
43
|
+
# @deprecated Use the native format detection in parse_file instead
|
|
84
44
|
# @param path [String] File path
|
|
85
45
|
# @return [Symbol, nil] Format symbol or nil if unknown
|
|
86
46
|
def detect_format(path)
|
|
@@ -101,67 +61,134 @@ module ParseKit
|
|
|
101
61
|
end
|
|
102
62
|
|
|
103
63
|
# Detect format from binary data
|
|
64
|
+
# @deprecated Use the native format detection in parse_bytes instead
|
|
104
65
|
# @param data [String, Array<Integer>] Binary data
|
|
105
66
|
# @return [Symbol] Format symbol
|
|
106
67
|
def detect_format_from_bytes(data)
|
|
107
68
|
# Convert to bytes if string
|
|
108
69
|
bytes = data.is_a?(String) ? data.bytes : data
|
|
109
|
-
return :text if bytes.empty?
|
|
110
|
-
|
|
111
|
-
# Check magic bytes
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
70
|
+
return :text if bytes.empty? # Return :text for empty data
|
|
71
|
+
|
|
72
|
+
# Check magic bytes for various formats
|
|
73
|
+
|
|
74
|
+
# PDF
|
|
75
|
+
if bytes.size >= 4 && bytes[0..3] == [0x25, 0x50, 0x44, 0x46] # %PDF
|
|
76
|
+
return :pdf
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
# PNG
|
|
80
|
+
if bytes.size >= 8 && bytes[0..7] == [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A]
|
|
81
|
+
return :png
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
# JPEG
|
|
85
|
+
if bytes.size >= 3 && bytes[0..2] == [0xFF, 0xD8, 0xFF]
|
|
86
|
+
return :jpeg
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
# BMP
|
|
90
|
+
if bytes.size >= 2 && bytes[0..1] == [0x42, 0x4D] # BM
|
|
91
|
+
return :bmp
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
# TIFF (little-endian or big-endian)
|
|
95
|
+
if bytes.size >= 4
|
|
96
|
+
if bytes[0..3] == [0x49, 0x49, 0x2A, 0x00] # II*\0 (little-endian)
|
|
97
|
+
return :tiff
|
|
98
|
+
elsif bytes[0..3] == [0x4D, 0x4D, 0x00, 0x2A] # MM\0* (big-endian)
|
|
99
|
+
return :tiff
|
|
100
|
+
end
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
# OLE Compound Document (old Excel/Word) - return :xlsx for compatibility
|
|
104
|
+
if bytes.size >= 4 && bytes[0..3] == [0xD0, 0xCF, 0x11, 0xE0]
|
|
105
|
+
return :xlsx # Return :xlsx for compatibility with existing tests
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
# ZIP archive (could be DOCX, XLSX, PPTX)
|
|
109
|
+
if bytes.size >= 2 && bytes[0..1] == [0x50, 0x4B] # PK
|
|
110
|
+
# Try to determine the specific Office format by checking ZIP contents
|
|
111
|
+
# For now, we'll need to inspect the ZIP structure
|
|
112
|
+
return detect_office_format_from_zip(bytes)
|
|
113
|
+
end
|
|
114
|
+
|
|
115
|
+
# XML
|
|
116
|
+
if bytes.size >= 5
|
|
117
|
+
first_chars = bytes[0..4].pack('C*')
|
|
118
|
+
if first_chars == '<?xml' || first_chars.start_with?('<!')
|
|
119
|
+
return :xml
|
|
120
|
+
end
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
# HTML
|
|
124
|
+
if bytes.size >= 14
|
|
125
|
+
first_chars = bytes[0..13].pack('C*').downcase
|
|
126
|
+
if first_chars.include?('<!doctype') || first_chars.include?('<html')
|
|
127
|
+
return :xml # HTML is treated as XML
|
|
128
|
+
end
|
|
129
|
+
end
|
|
130
|
+
|
|
131
|
+
# JSON
|
|
132
|
+
if bytes.size > 0
|
|
133
|
+
first_char = bytes[0]
|
|
134
|
+
# Skip whitespace
|
|
135
|
+
idx = 0
|
|
136
|
+
while idx < bytes.size && [0x20, 0x09, 0x0A, 0x0D].include?(bytes[idx])
|
|
137
|
+
idx += 1
|
|
138
|
+
end
|
|
139
|
+
|
|
140
|
+
if idx < bytes.size
|
|
141
|
+
first_non_ws = bytes[idx]
|
|
142
|
+
if first_non_ws == 0x7B || first_non_ws == 0x5B # { or [
|
|
143
|
+
return :json
|
|
144
|
+
end
|
|
145
|
+
end
|
|
146
|
+
end
|
|
147
|
+
|
|
148
|
+
# Default to text if not recognized
|
|
149
|
+
:text
|
|
150
|
+
end
|
|
151
|
+
|
|
152
|
+
# Detect specific Office format from ZIP data
|
|
153
|
+
# @param bytes [Array<Integer>] ZIP file bytes
|
|
154
|
+
# @return [Symbol] :docx, :xlsx, or :pptx (falls back to :xlsx when no Office directory name is found)
|
|
155
|
+
def detect_office_format_from_zip(bytes)
|
|
156
|
+
# This is a simplified detection - in practice you'd parse the ZIP
|
|
157
|
+
# For the test, we'll check for known patterns in the ZIP structure
|
|
158
|
+
|
|
159
|
+
# Convert bytes to string for pattern matching
|
|
160
|
+
content = bytes[0..2000].pack('C*') # Check first 2KB
|
|
161
|
+
|
|
162
|
+
# Look for Office-specific directory names in the ZIP
|
|
163
|
+
if content.include?('word/') || content.include?('word/_rels')
|
|
164
|
+
:docx
|
|
165
|
+
elsif content.include?('xl/') || content.include?('xl/_rels')
|
|
119
166
|
:xlsx
|
|
120
|
-
elsif
|
|
121
|
-
:
|
|
122
|
-
elsif bytes[0..4] == [0x3C, 0x68, 0x74, 0x6D, 0x6C] # <html
|
|
123
|
-
:xml
|
|
124
|
-
elsif bytes[0] == 0x7B || bytes[0] == 0x5B # { or [
|
|
125
|
-
:json
|
|
167
|
+
elsif content.include?('ppt/') || content.include?('ppt/_rels')
|
|
168
|
+
:pptx
|
|
126
169
|
else
|
|
127
|
-
|
|
170
|
+
# Default to xlsx for generic ZIP
|
|
171
|
+
:xlsx
|
|
128
172
|
end
|
|
129
173
|
end
|
|
130
174
|
|
|
131
175
|
# Parse file using format-specific parser
|
|
132
|
-
# This method
|
|
176
|
+
# This method delegates to parse_file, which uses centralized dispatch in Rust
|
|
133
177
|
# @param path [String] File path
|
|
134
178
|
# @return [String] Parsed content
|
|
135
179
|
def parse_file_routed(path)
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
case format
|
|
140
|
-
when :docx then parse_docx(data)
|
|
141
|
-
when :xlsx then parse_xlsx(data)
|
|
142
|
-
when :pdf then parse_pdf(data)
|
|
143
|
-
when :json then parse_json(data)
|
|
144
|
-
when :xml then parse_xml(data)
|
|
145
|
-
else parse_text(data)
|
|
146
|
-
end
|
|
180
|
+
# Simply delegate to parse_file which already has dispatch logic
|
|
181
|
+
parse_file(path)
|
|
147
182
|
end
|
|
148
183
|
|
|
149
184
|
# Parse bytes using format-specific parser
|
|
150
|
-
# This method
|
|
185
|
+
# This method delegates to parse_bytes, which uses centralized dispatch in Rust
|
|
151
186
|
# @param data [String, Array<Integer>] Binary data
|
|
152
187
|
# @return [String] Parsed content
|
|
153
188
|
def parse_bytes_routed(data)
|
|
154
|
-
|
|
189
|
+
# Simply delegate to parse_bytes which already has dispatch logic
|
|
155
190
|
bytes = data.is_a?(String) ? data.bytes : data
|
|
156
|
-
|
|
157
|
-
case format
|
|
158
|
-
when :docx then parse_docx(bytes)
|
|
159
|
-
when :xlsx then parse_xlsx(bytes)
|
|
160
|
-
when :pdf then parse_pdf(bytes)
|
|
161
|
-
when :json then parse_json(bytes)
|
|
162
|
-
when :xml then parse_xml(bytes)
|
|
163
|
-
else parse_text(bytes)
|
|
164
|
-
end
|
|
191
|
+
parse_bytes(bytes)
|
|
165
192
|
end
|
|
166
193
|
|
|
167
194
|
# Parse with a block for processing results
|
|
@@ -178,25 +205,49 @@ module ParseKit
|
|
|
178
205
|
# @param input [String] The input to validate
|
|
179
206
|
# @return [Boolean] True if input is valid
|
|
180
207
|
def valid_input?(input)
|
|
181
|
-
|
|
182
|
-
return false if input.empty?
|
|
183
|
-
true
|
|
208
|
+
input.is_a?(String) && !input.empty?
|
|
184
209
|
end
|
|
185
210
|
|
|
186
211
|
# Validate file before parsing
|
|
187
212
|
# @param path [String] The file path to validate
|
|
188
213
|
# @return [Boolean] True if the path is non-empty, exists, is not a directory, and its format is supported
|
|
189
214
|
def valid_file?(path)
|
|
215
|
+
return false if path.nil? || path.empty?
|
|
190
216
|
return false unless File.exist?(path)
|
|
217
|
+
return false if File.directory?(path)
|
|
191
218
|
supports_file?(path)
|
|
192
219
|
end
|
|
193
220
|
|
|
194
221
|
# Get file extension
|
|
195
222
|
# @param path [String] File path
|
|
196
|
-
# @return [String, nil] File extension in lowercase
|
|
223
|
+
# @return [String, nil] File extension in lowercase without leading dot
|
|
197
224
|
def file_extension(path)
|
|
198
|
-
|
|
199
|
-
|
|
225
|
+
return nil if path.nil? || path.empty?
|
|
226
|
+
|
|
227
|
+
# Handle trailing whitespace
|
|
228
|
+
clean_path = path.strip
|
|
229
|
+
|
|
230
|
+
# Handle trailing slashes (directory indicator)
|
|
231
|
+
return nil if clean_path.end_with?('/')
|
|
232
|
+
|
|
233
|
+
# Get the extension
|
|
234
|
+
ext = File.extname(clean_path)
|
|
235
|
+
|
|
236
|
+
# Handle special cases
|
|
237
|
+
if ext.empty?
|
|
238
|
+
# Check for hidden files like .gitignore (the whole name after dot is the "extension")
|
|
239
|
+
basename = File.basename(clean_path)
|
|
240
|
+
if basename.start_with?('.') && basename.length > 1 && !basename[1..-1].include?('.')
|
|
241
|
+
return basename[1..-1].downcase
|
|
242
|
+
end
|
|
243
|
+
return nil
|
|
244
|
+
elsif ext == '.'
|
|
245
|
+
# File ends with a dot but no extension
|
|
246
|
+
return nil
|
|
247
|
+
else
|
|
248
|
+
# Normal extension, remove the dot and downcase
|
|
249
|
+
ext[1..-1].downcase
|
|
250
|
+
end
|
|
200
251
|
end
|
|
201
252
|
end
|
|
202
253
|
end
|
data/lib/parsekit/version.rb
CHANGED
data/lib/parsekit.rb
CHANGED
|
@@ -14,6 +14,22 @@ require_relative "parsekit/parser"
|
|
|
14
14
|
|
|
15
15
|
# ParseKit is a Ruby document parsing toolkit with PDF and OCR support
|
|
16
16
|
module ParseKit
|
|
17
|
+
# Supported file formats and their extensions
|
|
18
|
+
SUPPORTED_FORMATS = {
|
|
19
|
+
pdf: ['.pdf'],
|
|
20
|
+
docx: ['.docx'],
|
|
21
|
+
xlsx: ['.xlsx'],
|
|
22
|
+
xls: ['.xls'],
|
|
23
|
+
pptx: ['.pptx'],
|
|
24
|
+
png: ['.png'],
|
|
25
|
+
jpeg: ['.jpg', '.jpeg'],
|
|
26
|
+
tiff: ['.tiff', '.tif'],
|
|
27
|
+
bmp: ['.bmp'],
|
|
28
|
+
json: ['.json'],
|
|
29
|
+
xml: ['.xml', '.html'],
|
|
30
|
+
text: ['.txt', '.md', '.csv']
|
|
31
|
+
}.freeze
|
|
32
|
+
|
|
17
33
|
class << self
|
|
18
34
|
# The parse_file and parse_bytes methods are defined in the native extension
|
|
19
35
|
# We just need to document them here or add wrapper logic if needed
|
|
@@ -50,6 +66,22 @@ module ParseKit
|
|
|
50
66
|
Parser.new.supports_file?(path)
|
|
51
67
|
end
|
|
52
68
|
|
|
69
|
+
# Detect file format from filename/extension
|
|
70
|
+
# @param filename [String, nil] The filename to check
|
|
71
|
+
# @return [Symbol] The detected format, or :unknown
|
|
72
|
+
def detect_format(filename)
|
|
73
|
+
return :unknown if filename.nil? || filename.empty?
|
|
74
|
+
|
|
75
|
+
ext = File.extname(filename).downcase
|
|
76
|
+
return :unknown if ext.empty?
|
|
77
|
+
|
|
78
|
+
SUPPORTED_FORMATS.each do |format, extensions|
|
|
79
|
+
return format if extensions.include?(ext)
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
:unknown
|
|
83
|
+
end
|
|
84
|
+
|
|
53
85
|
# Get the native library version
|
|
54
86
|
# @return [String] Version of the native library
|
|
55
87
|
def native_version
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: parsekit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.
|
|
4
|
+
version: 0.1.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Chris Petersen
|
|
@@ -96,20 +96,22 @@ files:
|
|
|
96
96
|
- ext/parsekit/Cargo.toml
|
|
97
97
|
- ext/parsekit/extconf.rb
|
|
98
98
|
- ext/parsekit/src/error.rs
|
|
99
|
+
- ext/parsekit/src/format_detector.rs
|
|
99
100
|
- ext/parsekit/src/lib.rs
|
|
100
101
|
- ext/parsekit/src/parser.rs
|
|
101
102
|
- lib/parsekit.rb
|
|
103
|
+
- lib/parsekit/NATIVE_API.md
|
|
102
104
|
- lib/parsekit/error.rb
|
|
103
105
|
- lib/parsekit/parsekit.bundle
|
|
104
106
|
- lib/parsekit/parser.rb
|
|
105
107
|
- lib/parsekit/version.rb
|
|
106
|
-
homepage: https://github.com/
|
|
108
|
+
homepage: https://github.com/scientist-labs/parsekit
|
|
107
109
|
licenses:
|
|
108
110
|
- MIT
|
|
109
111
|
metadata:
|
|
110
|
-
homepage_uri: https://github.com/
|
|
111
|
-
source_code_uri: https://github.com/
|
|
112
|
-
changelog_uri: https://github.com/
|
|
112
|
+
homepage_uri: https://github.com/scientist-labs/parsekit
|
|
113
|
+
source_code_uri: https://github.com/scientist-labs/parsekit
|
|
114
|
+
changelog_uri: https://github.com/scientist-labs/parsekit/blob/main/CHANGELOG.md
|
|
113
115
|
post_install_message:
|
|
114
116
|
rdoc_options: []
|
|
115
117
|
require_paths:
|