parsekit 0.1.0.pre.1 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +29 -17
- data/ext/parsekit/Cargo.toml +9 -7
- data/ext/parsekit/src/error.rs +7 -7
- data/ext/parsekit/src/format_detector.rs +233 -0
- data/ext/parsekit/src/lib.rs +1 -0
- data/ext/parsekit/src/parser.rs +357 -199
- data/lib/parsekit/NATIVE_API.md +125 -0
- data/lib/parsekit/parsekit.bundle +0 -0
- data/lib/parsekit/parser.rb +156 -104
- data/lib/parsekit/version.rb +1 -1
- data/lib/parsekit.rb +32 -0
- metadata +4 -2
data/ext/parsekit/src/parser.rs
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
use magnus::{
|
|
2
|
-
|
|
2
|
+
function, method, prelude::*, scan_args, Error, Module, RHash, RModule, Ruby, Value,
|
|
3
3
|
};
|
|
4
|
-
use
|
|
4
|
+
use crate::format_detector::{FileFormat, FormatDetector};
|
|
5
5
|
|
|
6
6
|
#[derive(Debug, Clone)]
|
|
7
7
|
#[magnus::wrap(class = "ParseKit::Parser", free_immediately, size)]
|
|
@@ -28,14 +28,41 @@ impl Default for ParserConfig {
|
|
|
28
28
|
}
|
|
29
29
|
}
|
|
30
30
|
|
|
31
|
+
// Error handling helpers
|
|
32
|
+
impl Parser {
|
|
33
|
+
/// Create a RuntimeError with formatted message
|
|
34
|
+
fn runtime_error<E: std::fmt::Display>(context: &str, err: E) -> Error {
|
|
35
|
+
Error::new(
|
|
36
|
+
Ruby::get().unwrap().exception_runtime_error(),
|
|
37
|
+
format!("{}: {}", context, err),
|
|
38
|
+
)
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
/// Create an ArgumentError with message
|
|
42
|
+
fn argument_error(msg: &str) -> Error {
|
|
43
|
+
Error::new(
|
|
44
|
+
Ruby::get().unwrap().exception_arg_error(),
|
|
45
|
+
msg.to_string(),
|
|
46
|
+
)
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
/// Create an IOError with formatted message
|
|
50
|
+
fn io_error<E: std::fmt::Display>(context: &str, err: E) -> Error {
|
|
51
|
+
Error::new(
|
|
52
|
+
Ruby::get().unwrap().exception_io_error(),
|
|
53
|
+
format!("{}: {}", context, err),
|
|
54
|
+
)
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
|
|
31
58
|
impl Parser {
|
|
32
59
|
/// Create a new Parser instance with optional configuration
|
|
33
60
|
fn new(ruby: &Ruby, args: &[Value]) -> Result<Self, Error> {
|
|
34
61
|
let args = scan_args::scan_args::<(), (Option<RHash>,), (), (), (), ()>(args)?;
|
|
35
62
|
let options = args.optional.0;
|
|
36
|
-
|
|
63
|
+
|
|
37
64
|
let mut config = ParserConfig::default();
|
|
38
|
-
|
|
65
|
+
|
|
39
66
|
if let Some(opts) = options {
|
|
40
67
|
if let Some(strict) = opts.get(ruby.to_symbol("strict_mode")) {
|
|
41
68
|
config.strict_mode = bool::try_convert(strict)?;
|
|
@@ -50,173 +77,193 @@ impl Parser {
|
|
|
50
77
|
config.max_size = usize::try_convert(max_size)?;
|
|
51
78
|
}
|
|
52
79
|
}
|
|
53
|
-
|
|
80
|
+
|
|
54
81
|
Ok(Self { config })
|
|
55
82
|
}
|
|
56
|
-
|
|
83
|
+
|
|
57
84
|
/// Parse input bytes based on file type (internal helper)
|
|
58
85
|
fn parse_bytes_internal(&self, data: Vec<u8>, filename: Option<&str>) -> Result<String, Error> {
|
|
59
86
|
// Check size limit
|
|
60
87
|
if data.len() > self.config.max_size {
|
|
61
|
-
return Err(
|
|
62
|
-
|
|
63
|
-
format!("
|
|
88
|
+
return Err(Self::runtime_error(
|
|
89
|
+
"File size exceeds limit",
|
|
90
|
+
format!("{} bytes exceeds maximum allowed size of {} bytes",
|
|
91
|
+
data.len(), self.config.max_size)
|
|
64
92
|
));
|
|
65
93
|
}
|
|
94
|
+
|
|
95
|
+
// Use centralized format detection
|
|
96
|
+
let format = FormatDetector::detect(filename, Some(&data));
|
|
66
97
|
|
|
67
|
-
//
|
|
68
|
-
|
|
69
|
-
Self::detect_type_from_filename(name)
|
|
70
|
-
} else {
|
|
71
|
-
Self::detect_type_from_content(&data)
|
|
72
|
-
};
|
|
73
|
-
|
|
74
|
-
match file_type.as_str() {
|
|
75
|
-
"pdf" => self.parse_pdf(data),
|
|
76
|
-
"docx" => self.parse_docx(data),
|
|
77
|
-
"xlsx" | "xls" => self.parse_xlsx(data),
|
|
78
|
-
"json" => self.parse_json(data),
|
|
79
|
-
"xml" | "html" => self.parse_xml(data),
|
|
80
|
-
"png" | "jpg" | "jpeg" | "tiff" | "bmp" => self.ocr_image(data),
|
|
81
|
-
"txt" | "text" => self.parse_text(data),
|
|
82
|
-
_ => self.parse_text(data), // Default to text parsing
|
|
83
|
-
}
|
|
98
|
+
// Use centralized dispatch
|
|
99
|
+
self.dispatch_to_parser(format, data)
|
|
84
100
|
}
|
|
85
101
|
|
|
86
|
-
///
|
|
87
|
-
fn
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
102
|
+
/// Centralized dispatch logic - routes format to appropriate parser
|
|
103
|
+
fn dispatch_to_parser(&self, format: FileFormat, data: Vec<u8>) -> Result<String, Error> {
|
|
104
|
+
match format {
|
|
105
|
+
FileFormat::Pdf => self.parse_pdf(data),
|
|
106
|
+
FileFormat::Docx => self.parse_docx(data),
|
|
107
|
+
FileFormat::Pptx => self.parse_pptx(data),
|
|
108
|
+
FileFormat::Xlsx | FileFormat::Xls => self.parse_xlsx(data),
|
|
109
|
+
FileFormat::Json => self.parse_json(data),
|
|
110
|
+
FileFormat::Xml | FileFormat::Html => self.parse_xml(data),
|
|
111
|
+
FileFormat::Png | FileFormat::Jpeg | FileFormat::Tiff | FileFormat::Bmp => self.ocr_image(data),
|
|
112
|
+
FileFormat::Text | FileFormat::Unknown => self.parse_text(data),
|
|
92
113
|
}
|
|
93
114
|
}
|
|
94
|
-
|
|
95
|
-
///
|
|
96
|
-
fn
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
// This is a simplified check - both DOCX and XLSX are ZIP files
|
|
103
|
-
// For now, default to xlsx as it's more commonly parsed
|
|
104
|
-
"xlsx".to_string() // Office Open XML format (could also be DOCX)
|
|
105
|
-
} else if data.starts_with(&[0xD0, 0xCF, 0x11, 0xE0]) {
|
|
106
|
-
"xls".to_string() // Old Excel format
|
|
107
|
-
} else if data.starts_with(&[0x89, 0x50, 0x4E, 0x47]) {
|
|
108
|
-
"png".to_string() // PNG signature
|
|
109
|
-
} else if data.starts_with(&[0xFF, 0xD8, 0xFF]) {
|
|
110
|
-
"jpg".to_string() // JPEG signature
|
|
111
|
-
} else if data.starts_with(b"BM") {
|
|
112
|
-
"bmp".to_string() // BMP signature
|
|
113
|
-
} else if data.starts_with(b"II\x2A\x00") || data.starts_with(b"MM\x00\x2A") {
|
|
114
|
-
"tiff".to_string() // TIFF signature (little-endian or big-endian)
|
|
115
|
-
} else if data.starts_with(b"<?xml") || data.starts_with(b"<html") {
|
|
116
|
-
"xml".to_string()
|
|
117
|
-
} else if data.starts_with(b"{") || data.starts_with(b"[") {
|
|
118
|
-
"json".to_string()
|
|
119
|
-
} else {
|
|
120
|
-
"txt".to_string()
|
|
115
|
+
|
|
116
|
+
/// Ruby-accessible method to detect format from bytes
|
|
117
|
+
fn detect_format_from_bytes(&self, data: Vec<u8>) -> String {
|
|
118
|
+
let format = FormatDetector::detect_from_content(&data);
|
|
119
|
+
// For compatibility with Ruby tests, return "xlsx" for old Excel
|
|
120
|
+
match format {
|
|
121
|
+
FileFormat::Xls => "xlsx".to_string(), // Compatibility with existing tests
|
|
122
|
+
_ => format.to_symbol().to_string(),
|
|
121
123
|
}
|
|
122
124
|
}
|
|
123
125
|
|
|
126
|
+
/// Ruby-accessible method to detect format from filename
|
|
127
|
+
fn detect_format_from_filename(&self, filename: String) -> String {
|
|
128
|
+
let format = FormatDetector::detect_from_extension(&filename);
|
|
129
|
+
format.to_symbol().to_string()
|
|
130
|
+
}
|
|
131
|
+
|
|
124
132
|
/// Perform OCR on image data using Tesseract
|
|
125
133
|
fn ocr_image(&self, data: Vec<u8>) -> Result<String, Error> {
|
|
126
|
-
use
|
|
134
|
+
use tesseract_rs::TesseractAPI;
|
|
127
135
|
|
|
128
|
-
//
|
|
129
|
-
let
|
|
130
|
-
Ok(img) => img,
|
|
131
|
-
Err(e) => return Err(Error::new(
|
|
132
|
-
magnus::exception::runtime_error(),
|
|
133
|
-
format!("Failed to load image: {}", e),
|
|
134
|
-
))
|
|
135
|
-
};
|
|
136
|
+
// Create tesseract instance
|
|
137
|
+
let tesseract = TesseractAPI::new();
|
|
136
138
|
|
|
137
|
-
//
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
139
|
+
// Try to initialize with appropriate tessdata path
|
|
140
|
+
// Even in bundled mode, we need to find tessdata files
|
|
141
|
+
#[cfg(feature = "bundled-tesseract")]
|
|
142
|
+
let init_result = {
|
|
143
|
+
// Build list of tessdata paths to try
|
|
144
|
+
let mut tessdata_paths = Vec::new();
|
|
145
|
+
|
|
146
|
+
// Check TESSDATA_PREFIX environment variable first (for CI)
|
|
147
|
+
if let Ok(env_path) = std::env::var("TESSDATA_PREFIX") {
|
|
148
|
+
tessdata_paths.push(env_path);
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
// Add common system paths
|
|
152
|
+
tessdata_paths.extend_from_slice(&[
|
|
153
|
+
"/usr/share/tessdata".to_string(),
|
|
154
|
+
"/usr/local/share/tessdata".to_string(),
|
|
155
|
+
"/opt/homebrew/share/tessdata".to_string(),
|
|
156
|
+
"/opt/local/share/tessdata".to_string(),
|
|
157
|
+
"tessdata".to_string(), // Local tessdata directory
|
|
158
|
+
".".to_string(), // Current directory as fallback
|
|
159
|
+
]);
|
|
160
|
+
|
|
161
|
+
let mut result = Err(tesseract_rs::TesseractError::InitError);
|
|
162
|
+
for path in &tessdata_paths {
|
|
163
|
+
// Check if path exists first to avoid noisy error messages
|
|
164
|
+
if std::path::Path::new(path).exists() {
|
|
165
|
+
if tesseract.init(path.as_str(), "eng").is_ok() {
|
|
166
|
+
result = Ok(());
|
|
167
|
+
break;
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
}
|
|
171
|
+
result
|
|
144
172
|
};
|
|
145
173
|
|
|
146
|
-
|
|
147
|
-
let
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
174
|
+
#[cfg(not(feature = "bundled-tesseract"))]
|
|
175
|
+
let init_result = {
|
|
176
|
+
// Try common system tessdata paths
|
|
177
|
+
let tessdata_paths = vec![
|
|
178
|
+
"/usr/share/tessdata",
|
|
179
|
+
"/usr/local/share/tessdata",
|
|
180
|
+
"/opt/homebrew/share/tessdata",
|
|
181
|
+
"/opt/local/share/tessdata",
|
|
182
|
+
];
|
|
183
|
+
|
|
184
|
+
let mut result = Err(tesseract_rs::TesseractError::InitError);
|
|
185
|
+
for path in &tessdata_paths {
|
|
186
|
+
if std::path::Path::new(path).exists() {
|
|
187
|
+
if tesseract.init(path, "eng").is_ok() {
|
|
188
|
+
result = Ok(());
|
|
189
|
+
break;
|
|
190
|
+
}
|
|
191
|
+
}
|
|
192
|
+
}
|
|
193
|
+
result
|
|
194
|
+
};
|
|
152
195
|
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
Ok(text) => Ok(text.trim().to_string()),
|
|
156
|
-
Err(e) => Err(Error::new(
|
|
157
|
-
magnus::exception::runtime_error(),
|
|
158
|
-
format!("Failed to perform OCR: {}", e),
|
|
159
|
-
))
|
|
196
|
+
if let Err(e) = init_result {
|
|
197
|
+
return Err(Self::runtime_error("Failed to initialize Tesseract", e));
|
|
160
198
|
}
|
|
199
|
+
|
|
200
|
+
// Load the image from bytes
|
|
201
|
+
let img = image::load_from_memory(&data)
|
|
202
|
+
.map_err(|e| Self::runtime_error("Failed to load image", e))?;
|
|
203
|
+
|
|
204
|
+
// Convert to RGBA8 format
|
|
205
|
+
let rgba_img = img.to_rgba8();
|
|
206
|
+
let (width, height) = rgba_img.dimensions();
|
|
207
|
+
let raw_data = rgba_img.into_raw();
|
|
208
|
+
|
|
209
|
+
// Set image data
|
|
210
|
+
tesseract.set_image(
|
|
211
|
+
&raw_data,
|
|
212
|
+
width as i32,
|
|
213
|
+
height as i32,
|
|
214
|
+
4, // bytes per pixel (RGBA)
|
|
215
|
+
(width * 4) as i32, // bytes per line
|
|
216
|
+
).map_err(|e| Self::runtime_error("Failed to set image", e))?;
|
|
217
|
+
|
|
218
|
+
// Extract text
|
|
219
|
+
tesseract.get_utf8_text()
|
|
220
|
+
.map(|text| text.trim().to_string())
|
|
221
|
+
.map_err(|e| Self::runtime_error("Failed to perform OCR", e))
|
|
161
222
|
}
|
|
162
223
|
|
|
224
|
+
|
|
163
225
|
/// Parse PDF files using MuPDF (statically linked) - exposed to Ruby
|
|
164
226
|
fn parse_pdf(&self, data: Vec<u8>) -> Result<String, Error> {
|
|
165
227
|
use mupdf::Document;
|
|
166
|
-
|
|
228
|
+
|
|
167
229
|
// Try to load the PDF from memory
|
|
168
230
|
// The magic parameter helps MuPDF identify the file type
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
//
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
// Extract text from the page
|
|
187
|
-
match page.to_text() {
|
|
188
|
-
Ok(text) => {
|
|
189
|
-
all_text.push_str(&text);
|
|
190
|
-
all_text.push('\n');
|
|
191
|
-
}
|
|
192
|
-
Err(_) => continue,
|
|
193
|
-
}
|
|
194
|
-
}
|
|
195
|
-
Err(_) => continue,
|
|
196
|
-
}
|
|
197
|
-
}
|
|
198
|
-
|
|
199
|
-
if all_text.is_empty() {
|
|
200
|
-
Ok("PDF contains no extractable text (might be scanned/image-based)".to_string())
|
|
201
|
-
} else {
|
|
202
|
-
Ok(all_text.trim().to_string())
|
|
231
|
+
let doc = Document::from_bytes(&data, "pdf")
|
|
232
|
+
.map_err(|e| Self::runtime_error("Failed to parse PDF", e))?;
|
|
233
|
+
|
|
234
|
+
let mut all_text = String::new();
|
|
235
|
+
|
|
236
|
+
// Get page count
|
|
237
|
+
let page_count = doc.page_count()
|
|
238
|
+
.map_err(|e| Self::runtime_error("Failed to get page count", e))?;
|
|
239
|
+
|
|
240
|
+
// Iterate through pages
|
|
241
|
+
for page_num in 0..page_count {
|
|
242
|
+
// Continue on page errors rather than failing entirely
|
|
243
|
+
if let Ok(page) = doc.load_page(page_num) {
|
|
244
|
+
// Extract text from the page
|
|
245
|
+
if let Ok(text) = page.to_text() {
|
|
246
|
+
all_text.push_str(&text);
|
|
247
|
+
all_text.push('\n');
|
|
203
248
|
}
|
|
204
249
|
}
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
))
|
|
250
|
+
}
|
|
251
|
+
|
|
252
|
+
if all_text.is_empty() {
|
|
253
|
+
Ok("PDF contains no extractable text (might be scanned/image-based)".to_string())
|
|
254
|
+
} else {
|
|
255
|
+
Ok(all_text.trim().to_string())
|
|
209
256
|
}
|
|
210
257
|
}
|
|
211
|
-
|
|
258
|
+
|
|
212
259
|
/// Parse DOCX (Word) files - exposed to Ruby
|
|
213
260
|
fn parse_docx(&self, data: Vec<u8>) -> Result<String, Error> {
|
|
214
261
|
use docx_rs::read_docx;
|
|
215
|
-
|
|
262
|
+
|
|
216
263
|
match read_docx(&data) {
|
|
217
264
|
Ok(docx) => {
|
|
218
265
|
let mut result = String::new();
|
|
219
|
-
|
|
266
|
+
|
|
220
267
|
// Extract text from all document children
|
|
221
268
|
// For simplicity, we'll focus on paragraphs only for now
|
|
222
269
|
// Tables require more complex handling with the current API
|
|
@@ -238,29 +285,156 @@ impl Parser {
|
|
|
238
285
|
// table.rows -> TableChild::TableRow -> row.cells -> TableRowChild
|
|
239
286
|
// which has a more complex structure in docx-rs
|
|
240
287
|
}
|
|
241
|
-
|
|
288
|
+
|
|
242
289
|
Ok(result.trim().to_string())
|
|
243
290
|
}
|
|
244
|
-
Err(e) => Err(
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
291
|
+
Err(e) => Err(Self::runtime_error("Failed to parse DOCX file", e)),
|
|
292
|
+
}
|
|
293
|
+
}
|
|
294
|
+
|
|
295
|
+
/// Parse PPTX (PowerPoint) files - exposed to Ruby
|
|
296
|
+
fn parse_pptx(&self, data: Vec<u8>) -> Result<String, Error> {
|
|
297
|
+
use std::io::{Cursor, Read};
|
|
298
|
+
use zip::ZipArchive;
|
|
299
|
+
|
|
300
|
+
let cursor = Cursor::new(data);
|
|
301
|
+
let mut archive = ZipArchive::new(cursor)
|
|
302
|
+
.map_err(|e| Self::runtime_error("Failed to open PPTX as ZIP", e))?;
|
|
303
|
+
|
|
304
|
+
let mut all_text = Vec::new();
|
|
305
|
+
let mut slide_numbers = Vec::new();
|
|
306
|
+
|
|
307
|
+
// First, collect slide numbers and sort them
|
|
308
|
+
for i in 0..archive.len() {
|
|
309
|
+
let file = match archive.by_index(i) {
|
|
310
|
+
Ok(file) => file,
|
|
311
|
+
Err(_) => continue,
|
|
312
|
+
};
|
|
313
|
+
|
|
314
|
+
let name = file.name();
|
|
315
|
+
// Match slide XML files (e.g., ppt/slides/slide1.xml)
|
|
316
|
+
if name.starts_with("ppt/slides/slide") && name.ends_with(".xml") && !name.contains("_rels") {
|
|
317
|
+
// Extract slide number from filename
|
|
318
|
+
if let Some(num_str) = name
|
|
319
|
+
.strip_prefix("ppt/slides/slide")
|
|
320
|
+
.and_then(|s| s.strip_suffix(".xml"))
|
|
321
|
+
{
|
|
322
|
+
if let Ok(num) = num_str.parse::<usize>() {
|
|
323
|
+
slide_numbers.push((num, i));
|
|
324
|
+
}
|
|
325
|
+
}
|
|
326
|
+
}
|
|
327
|
+
}
|
|
328
|
+
|
|
329
|
+
// Sort by slide number to maintain order
|
|
330
|
+
slide_numbers.sort_by_key(|&(num, _)| num);
|
|
331
|
+
|
|
332
|
+
// Now process slides in order
|
|
333
|
+
for (_, index) in slide_numbers {
|
|
334
|
+
let mut file = match archive.by_index(index) {
|
|
335
|
+
Ok(file) => file,
|
|
336
|
+
Err(_) => continue,
|
|
337
|
+
};
|
|
338
|
+
|
|
339
|
+
let mut contents = String::new();
|
|
340
|
+
if file.read_to_string(&mut contents).is_ok() {
|
|
341
|
+
// Extract text from slide XML
|
|
342
|
+
let text = self.extract_text_from_slide_xml(&contents);
|
|
343
|
+
if !text.is_empty() {
|
|
344
|
+
all_text.push(text);
|
|
345
|
+
}
|
|
346
|
+
}
|
|
347
|
+
}
|
|
348
|
+
|
|
349
|
+
// Also extract notes if present
|
|
350
|
+
for i in 0..archive.len() {
|
|
351
|
+
let mut file = match archive.by_index(i) {
|
|
352
|
+
Ok(file) => file,
|
|
353
|
+
Err(_) => continue,
|
|
354
|
+
};
|
|
355
|
+
|
|
356
|
+
let name = file.name();
|
|
357
|
+
// Match notes slide XML files
|
|
358
|
+
if name.starts_with("ppt/notesSlides/notesSlide") && name.ends_with(".xml") && !name.contains("_rels") {
|
|
359
|
+
let mut contents = String::new();
|
|
360
|
+
if file.read_to_string(&mut contents).is_ok() {
|
|
361
|
+
let text = self.extract_text_from_slide_xml(&contents);
|
|
362
|
+
if !text.is_empty() {
|
|
363
|
+
all_text.push(format!("[Notes: {}]", text));
|
|
364
|
+
}
|
|
365
|
+
}
|
|
366
|
+
}
|
|
367
|
+
}
|
|
368
|
+
|
|
369
|
+
if all_text.is_empty() {
|
|
370
|
+
Ok("".to_string())
|
|
371
|
+
} else {
|
|
372
|
+
Ok(all_text.join("\n\n"))
|
|
248
373
|
}
|
|
249
374
|
}
|
|
250
375
|
|
|
376
|
+
/// Helper method to extract text from slide XML
|
|
377
|
+
fn extract_text_from_slide_xml(&self, xml_content: &str) -> String {
|
|
378
|
+
use quick_xml::events::Event;
|
|
379
|
+
use quick_xml::Reader;
|
|
380
|
+
|
|
381
|
+
let mut reader = Reader::from_str(xml_content);
|
|
382
|
+
|
|
383
|
+
let mut text_parts = Vec::new();
|
|
384
|
+
let mut buf = Vec::new();
|
|
385
|
+
let mut in_text_element = false;
|
|
386
|
+
|
|
387
|
+
loop {
|
|
388
|
+
match reader.read_event_into(&mut buf) {
|
|
389
|
+
Ok(Event::Start(ref e)) => {
|
|
390
|
+
// Look for text elements (a:t or t)
|
|
391
|
+
let name = e.name();
|
|
392
|
+
let local_name_bytes = name.local_name();
|
|
393
|
+
let local_name = std::str::from_utf8(local_name_bytes.as_ref()).unwrap_or("");
|
|
394
|
+
if local_name == "t" {
|
|
395
|
+
in_text_element = true;
|
|
396
|
+
}
|
|
397
|
+
}
|
|
398
|
+
Ok(Event::Text(e)) => {
|
|
399
|
+
if in_text_element {
|
|
400
|
+
if let Ok(text) = e.decode() {
|
|
401
|
+
let text_str = text.trim();
|
|
402
|
+
if !text_str.is_empty() {
|
|
403
|
+
text_parts.push(text_str.to_string());
|
|
404
|
+
}
|
|
405
|
+
}
|
|
406
|
+
}
|
|
407
|
+
}
|
|
408
|
+
Ok(Event::End(ref e)) => {
|
|
409
|
+
let name = e.name();
|
|
410
|
+
let local_name_bytes = name.local_name();
|
|
411
|
+
let local_name = std::str::from_utf8(local_name_bytes.as_ref()).unwrap_or("");
|
|
412
|
+
if local_name == "t" {
|
|
413
|
+
in_text_element = false;
|
|
414
|
+
}
|
|
415
|
+
}
|
|
416
|
+
Ok(Event::Eof) => break,
|
|
417
|
+
_ => {}
|
|
418
|
+
}
|
|
419
|
+
buf.clear();
|
|
420
|
+
}
|
|
421
|
+
|
|
422
|
+
text_parts.join(" ")
|
|
423
|
+
}
|
|
424
|
+
|
|
251
425
|
/// Parse Excel files - exposed to Ruby
|
|
252
426
|
fn parse_xlsx(&self, data: Vec<u8>) -> Result<String, Error> {
|
|
253
427
|
use calamine::{Reader, Xlsx};
|
|
254
428
|
use std::io::Cursor;
|
|
255
|
-
|
|
429
|
+
|
|
256
430
|
let cursor = Cursor::new(data);
|
|
257
431
|
match Xlsx::new(cursor) {
|
|
258
432
|
Ok(mut workbook) => {
|
|
259
433
|
let mut result = String::new();
|
|
260
|
-
|
|
434
|
+
|
|
261
435
|
for sheet_name in workbook.sheet_names().to_owned() {
|
|
262
436
|
result.push_str(&format!("Sheet: {}\n", sheet_name));
|
|
263
|
-
|
|
437
|
+
|
|
264
438
|
if let Ok(range) = workbook.worksheet_range(&sheet_name) {
|
|
265
439
|
for row in range.rows() {
|
|
266
440
|
for cell in row {
|
|
@@ -271,60 +445,56 @@ impl Parser {
|
|
|
271
445
|
}
|
|
272
446
|
result.push('\n');
|
|
273
447
|
}
|
|
274
|
-
|
|
448
|
+
|
|
275
449
|
Ok(result)
|
|
276
450
|
}
|
|
277
|
-
Err(e) => Err(
|
|
278
|
-
magnus::exception::runtime_error(),
|
|
279
|
-
format!("Failed to parse Excel file: {}", e),
|
|
280
|
-
))
|
|
451
|
+
Err(e) => Err(Self::runtime_error("Failed to parse Excel file", e)),
|
|
281
452
|
}
|
|
282
453
|
}
|
|
283
|
-
|
|
454
|
+
|
|
284
455
|
/// Parse JSON files - exposed to Ruby
|
|
285
456
|
fn parse_json(&self, data: Vec<u8>) -> Result<String, Error> {
|
|
286
457
|
let text = String::from_utf8_lossy(&data);
|
|
287
458
|
match serde_json::from_str::<serde_json::Value>(&text) {
|
|
288
|
-
Ok(json) =>
|
|
459
|
+
Ok(json) => {
|
|
460
|
+
Ok(serde_json::to_string_pretty(&json).unwrap_or_else(|_| text.to_string()))
|
|
461
|
+
}
|
|
289
462
|
Err(_) => Ok(text.to_string()),
|
|
290
463
|
}
|
|
291
464
|
}
|
|
292
|
-
|
|
465
|
+
|
|
293
466
|
/// Parse XML/HTML files - exposed to Ruby
|
|
294
467
|
fn parse_xml(&self, data: Vec<u8>) -> Result<String, Error> {
|
|
295
468
|
use quick_xml::events::Event;
|
|
296
469
|
use quick_xml::Reader;
|
|
297
|
-
|
|
470
|
+
|
|
298
471
|
let mut reader = Reader::from_reader(&data[..]);
|
|
299
472
|
let mut txt = String::new();
|
|
300
473
|
let mut buf = Vec::new();
|
|
301
|
-
|
|
474
|
+
|
|
302
475
|
loop {
|
|
303
476
|
match reader.read_event_into(&mut buf) {
|
|
304
477
|
Ok(Event::Text(e)) => {
|
|
305
|
-
txt.push_str(&e.
|
|
478
|
+
txt.push_str(&e.decode().unwrap_or_default());
|
|
306
479
|
txt.push(' ');
|
|
307
480
|
}
|
|
308
481
|
Ok(Event::Eof) => break,
|
|
309
482
|
Err(e) => {
|
|
310
|
-
return Err(
|
|
311
|
-
magnus::exception::runtime_error(),
|
|
312
|
-
format!("XML parse error: {}", e),
|
|
313
|
-
))
|
|
483
|
+
return Err(Self::runtime_error("XML parse error", e))
|
|
314
484
|
}
|
|
315
485
|
_ => {}
|
|
316
486
|
}
|
|
317
487
|
buf.clear();
|
|
318
488
|
}
|
|
319
|
-
|
|
489
|
+
|
|
320
490
|
Ok(txt.trim().to_string())
|
|
321
491
|
}
|
|
322
|
-
|
|
492
|
+
|
|
323
493
|
/// Parse plain text with encoding detection - exposed to Ruby
|
|
324
494
|
fn parse_text(&self, data: Vec<u8>) -> Result<String, Error> {
|
|
325
495
|
// Detect encoding
|
|
326
496
|
let (decoded, _encoding, malformed) = encoding_rs::UTF_8.decode(&data);
|
|
327
|
-
|
|
497
|
+
|
|
328
498
|
if malformed {
|
|
329
499
|
// Try other encodings
|
|
330
500
|
let (decoded, _encoding, _malformed) = encoding_rs::WINDOWS_1252.decode(&data);
|
|
@@ -333,16 +503,13 @@ impl Parser {
|
|
|
333
503
|
Ok(decoded.to_string())
|
|
334
504
|
}
|
|
335
505
|
}
|
|
336
|
-
|
|
506
|
+
|
|
337
507
|
/// Parse input string (for text content)
|
|
338
508
|
fn parse(&self, input: String) -> Result<String, Error> {
|
|
339
509
|
if input.is_empty() {
|
|
340
|
-
return Err(
|
|
341
|
-
magnus::exception::arg_error(),
|
|
342
|
-
"Input cannot be empty",
|
|
343
|
-
));
|
|
510
|
+
return Err(Self::argument_error("Input cannot be empty"));
|
|
344
511
|
}
|
|
345
|
-
|
|
512
|
+
|
|
346
513
|
// For string input, just return cleaned text
|
|
347
514
|
// If strict mode is on, append indicator for testing
|
|
348
515
|
if self.config.strict_mode {
|
|
@@ -351,29 +518,26 @@ impl Parser {
|
|
|
351
518
|
Ok(input.trim().to_string())
|
|
352
519
|
}
|
|
353
520
|
}
|
|
354
|
-
|
|
521
|
+
|
|
355
522
|
/// Parse a file
|
|
356
523
|
fn parse_file(&self, path: String) -> Result<String, Error> {
|
|
357
524
|
use std::fs;
|
|
358
|
-
|
|
525
|
+
|
|
359
526
|
let data = fs::read(&path)
|
|
360
|
-
.map_err(|e|
|
|
361
|
-
|
|
527
|
+
.map_err(|e| Self::io_error("Failed to read file", e))?;
|
|
528
|
+
|
|
362
529
|
self.parse_bytes_internal(data, Some(&path))
|
|
363
530
|
}
|
|
364
|
-
|
|
531
|
+
|
|
365
532
|
/// Parse bytes from Ruby
|
|
366
533
|
fn parse_bytes(&self, data: Vec<u8>) -> Result<String, Error> {
|
|
367
534
|
if data.is_empty() {
|
|
368
|
-
return Err(
|
|
369
|
-
magnus::exception::arg_error(),
|
|
370
|
-
"Data cannot be empty",
|
|
371
|
-
));
|
|
535
|
+
return Err(Self::argument_error("Data cannot be empty"));
|
|
372
536
|
}
|
|
373
|
-
|
|
537
|
+
|
|
374
538
|
self.parse_bytes_internal(data, None)
|
|
375
539
|
}
|
|
376
|
-
|
|
540
|
+
|
|
377
541
|
/// Get parser configuration
|
|
378
542
|
fn config(&self) -> Result<RHash, Error> {
|
|
379
543
|
let ruby = Ruby::get().unwrap();
|
|
@@ -384,37 +548,26 @@ impl Parser {
|
|
|
384
548
|
hash.aset(ruby.to_symbol("max_size"), self.config.max_size)?;
|
|
385
549
|
Ok(hash)
|
|
386
550
|
}
|
|
387
|
-
|
|
551
|
+
|
|
388
552
|
/// Check if parser is in strict mode
|
|
389
553
|
fn strict_mode(&self) -> bool {
|
|
390
554
|
self.config.strict_mode
|
|
391
555
|
}
|
|
392
|
-
|
|
556
|
+
|
|
393
557
|
/// Check supported file types
|
|
394
558
|
fn supported_formats() -> Vec<String> {
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
"docx".to_string(),
|
|
401
|
-
"xlsx".to_string(),
|
|
402
|
-
"xls".to_string(),
|
|
403
|
-
"csv".to_string(),
|
|
404
|
-
"pdf".to_string(), // Text extraction via MuPDF
|
|
405
|
-
"png".to_string(), // OCR via Tesseract
|
|
406
|
-
"jpg".to_string(), // OCR via Tesseract
|
|
407
|
-
"jpeg".to_string(), // OCR via Tesseract
|
|
408
|
-
"tiff".to_string(), // OCR via Tesseract
|
|
409
|
-
"bmp".to_string(), // OCR via Tesseract
|
|
410
|
-
]
|
|
559
|
+
// Use the centralized list from FormatDetector
|
|
560
|
+
FormatDetector::supported_extensions()
|
|
561
|
+
.iter()
|
|
562
|
+
.map(|&s| s.to_string())
|
|
563
|
+
.collect()
|
|
411
564
|
}
|
|
412
|
-
|
|
565
|
+
|
|
413
566
|
/// Detect if file extension is supported
|
|
414
567
|
fn supports_file(&self, path: String) -> bool {
|
|
415
568
|
if let Some(ext) = std::path::Path::new(&path)
|
|
416
569
|
.extension()
|
|
417
|
-
.and_then(|s| s.to_str())
|
|
570
|
+
.and_then(|s| s.to_str())
|
|
418
571
|
{
|
|
419
572
|
Self::supported_formats().contains(&ext.to_lowercase())
|
|
420
573
|
} else {
|
|
@@ -441,8 +594,8 @@ fn parse_bytes_direct(data: Vec<u8>) -> Result<String, Error> {
|
|
|
441
594
|
|
|
442
595
|
/// Initialize the Parser class
|
|
443
596
|
pub fn init(_ruby: &Ruby, module: RModule) -> Result<(), Error> {
|
|
444
|
-
let class = module.define_class("Parser",
|
|
445
|
-
|
|
597
|
+
let class = module.define_class("Parser", Ruby::get().unwrap().class_object())?;
|
|
598
|
+
|
|
446
599
|
// Instance methods
|
|
447
600
|
class.define_singleton_method("new", function!(Parser::new, -1))?;
|
|
448
601
|
class.define_method("parse", method!(Parser::parse, 1))?;
|
|
@@ -451,22 +604,27 @@ pub fn init(_ruby: &Ruby, module: RModule) -> Result<(), Error> {
|
|
|
451
604
|
class.define_method("config", method!(Parser::config, 0))?;
|
|
452
605
|
class.define_method("strict_mode?", method!(Parser::strict_mode, 0))?;
|
|
453
606
|
class.define_method("supports_file?", method!(Parser::supports_file, 1))?;
|
|
454
|
-
|
|
607
|
+
|
|
455
608
|
// Individual parser methods exposed to Ruby
|
|
456
609
|
class.define_method("parse_pdf", method!(Parser::parse_pdf, 1))?;
|
|
457
610
|
class.define_method("parse_docx", method!(Parser::parse_docx, 1))?;
|
|
611
|
+
class.define_method("parse_pptx", method!(Parser::parse_pptx, 1))?;
|
|
458
612
|
class.define_method("parse_xlsx", method!(Parser::parse_xlsx, 1))?;
|
|
459
613
|
class.define_method("parse_json", method!(Parser::parse_json, 1))?;
|
|
460
614
|
class.define_method("parse_xml", method!(Parser::parse_xml, 1))?;
|
|
461
615
|
class.define_method("parse_text", method!(Parser::parse_text, 1))?;
|
|
462
616
|
class.define_method("ocr_image", method!(Parser::ocr_image, 1))?;
|
|
463
617
|
|
|
618
|
+
// Format detection methods
|
|
619
|
+
class.define_method("detect_format_from_bytes", method!(Parser::detect_format_from_bytes, 1))?;
|
|
620
|
+
class.define_method("detect_format_from_filename", method!(Parser::detect_format_from_filename, 1))?;
|
|
621
|
+
|
|
464
622
|
// Class methods
|
|
465
623
|
class.define_singleton_method("supported_formats", function!(Parser::supported_formats, 0))?;
|
|
466
|
-
|
|
624
|
+
|
|
467
625
|
// Module-level convenience methods
|
|
468
626
|
module.define_singleton_method("parse_file", function!(parse_file_direct, 1))?;
|
|
469
627
|
module.define_singleton_method("parse_bytes", function!(parse_bytes_direct, 1))?;
|
|
470
|
-
|
|
628
|
+
|
|
471
629
|
Ok(())
|
|
472
|
-
}
|
|
630
|
+
}
|