parsekit 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 35c0708c088075c883b3b35c7d76f1573f29a19bf65ac0b89b636a5b76cee662
4
- data.tar.gz: b1ddf9260329239c3a1e791f3ed3249b3577cb210f4c898677316fe55cc951f4
3
+ metadata.gz: 6ad6eb42fb7e96fa944f30245b2c7be51bf4ce1a0f7766749309676b225b17df
4
+ data.tar.gz: deb56ea394ac3fedc840e890e4d27de14585661233f19eeaae06baf7be1b1e90
5
5
  SHA512:
6
- metadata.gz: 2fe76f5b28927e3989502b0ea5f084f5bfc265aae9a65aaba47349e3e540e8150612d75f8f4ddcdc38be7edd9ae7edbf42220ba95b42a535dbc200503759c419
7
- data.tar.gz: e5b9e8eff90f8583f8289bea5100ac43434978ebba814bf9198fb92cc622a9b4fa6e99e28fe2ed31ffa0040c3ac48a38c8361bc1994200059a23d040440a64cc
6
+ metadata.gz: dc88b902dd12008a6936f4d62f5d4651544a3f463b725a15d385b919141e93873bd809436e6b9b008baa7b310d149becb2106a29ca103736f6525e09bef871d6
7
+ data.tar.gz: 9cbc5464a5cbe06a241d2253cde81da82c7eb75742654b7753c91a922acc87125f81a33c3e77d0d107a1435e8946a860e12388e44fa84dc887d9bb4bf9d2d3a2
data/CHANGELOG.md CHANGED
@@ -49,5 +49,5 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
49
49
  - Rust edition 2021
50
50
  - Cross-compilation support for multiple platforms
51
51
 
52
- [Unreleased]: https://github.com/cpetersen/parsekit/compare/v0.1.0...HEAD
53
- [0.1.0]: https://github.com/cpetersen/parsekit/releases/tag/v0.1.0
52
+ [Unreleased]: https://github.com/scientist-labs/parsekit/compare/v0.1.0...HEAD
53
+ [0.1.0]: https://github.com/scientist-labs/parsekit/releases/tag/v0.1.0
data/README.md CHANGED
@@ -186,7 +186,7 @@ ParseKit uses a hybrid Ruby/Rust architecture:
186
186
 
187
187
  ## Contributing
188
188
 
189
- Bug reports and pull requests are welcome on GitHub at https://github.com/cpetersen/parsekit.
189
+ Bug reports and pull requests are welcome on GitHub at https://github.com/scientist-labs/parsekit.
190
190
 
191
191
  ## License
192
192
 
@@ -21,7 +21,7 @@ image = "0.25" # Image processing library (match rusty-tesseract's version)
21
21
  calamine = "0.30" # Excel parsing
22
22
  docx-rs = "0.4" # Word document parsing
23
23
  quick-xml = "0.38" # XML parsing
24
- zip = "2.1" # ZIP archive handling for PPTX
24
+ zip = "5.0" # ZIP archive handling for PPTX
25
25
  serde_json = "1.0" # JSON parsing
26
26
  regex = "1.10" # Text parsing
27
27
  encoding_rs = "0.8" # Encoding detection
@@ -0,0 +1,233 @@
1
+ use std::path::Path;
2
+
3
+ /// Represents a detected file format
4
+ #[derive(Debug, Clone, PartialEq)]
5
+ pub enum FileFormat {
6
+ Pdf,
7
+ Docx,
8
+ Xlsx,
9
+ Xls,
10
+ Pptx,
11
+ Png,
12
+ Jpeg,
13
+ Tiff,
14
+ Bmp,
15
+ Json,
16
+ Xml,
17
+ Html,
18
+ Text,
19
+ Unknown,
20
+ }
21
+
22
+ impl FileFormat {
23
+ /// Convert to Ruby symbol representation
24
+ pub fn to_symbol(&self) -> &'static str {
25
+ match self {
26
+ FileFormat::Pdf => "pdf",
27
+ FileFormat::Docx => "docx",
28
+ FileFormat::Xlsx => "xlsx",
29
+ FileFormat::Xls => "xls",
30
+ FileFormat::Pptx => "pptx",
31
+ FileFormat::Png => "png",
32
+ FileFormat::Jpeg => "jpeg",
33
+ FileFormat::Tiff => "tiff",
34
+ FileFormat::Bmp => "bmp",
35
+ FileFormat::Json => "json",
36
+ FileFormat::Xml => "xml",
37
+ FileFormat::Html => "xml", // HTML is treated as XML in Ruby
38
+ FileFormat::Text => "text",
39
+ FileFormat::Unknown => "unknown",
40
+ }
41
+ }
42
+ }
43
+
44
+ /// Central format detection logic
45
+ pub struct FormatDetector;
46
+
47
+ impl FormatDetector {
48
+ /// Detect format from filename and content
49
+ /// Prioritizes content detection over extension when both are available
50
+ pub fn detect(filename: Option<&str>, content: Option<&[u8]>) -> FileFormat {
51
+ // First try content-based detection if content is provided
52
+ if let Some(data) = content {
53
+ let format = Self::detect_from_content(data);
54
+ // If we got a definitive format from content, use it
55
+ if !matches!(format, FileFormat::Text | FileFormat::Unknown) {
56
+ return format;
57
+ }
58
+ }
59
+
60
+ // Fall back to extension-based detection
61
+ if let Some(name) = filename {
62
+ let ext_format = Self::detect_from_extension(name);
63
+ if ext_format != FileFormat::Unknown {
64
+ return ext_format;
65
+ }
66
+ }
67
+
68
+ // If content detection returned Text and no extension match, return Text
69
+ if let Some(data) = content {
70
+ let format = Self::detect_from_content(data);
71
+ if format == FileFormat::Text {
72
+ return FileFormat::Text;
73
+ }
74
+ }
75
+
76
+ FileFormat::Unknown
77
+ }
78
+
79
+ /// Detect format from file extension
80
+ pub fn detect_from_extension(filename: &str) -> FileFormat {
81
+ let path = Path::new(filename);
82
+ let ext = match path.extension().and_then(|s| s.to_str()) {
83
+ Some(e) => e.to_lowercase(),
84
+ None => return FileFormat::Unknown,
85
+ };
86
+
87
+ match ext.as_str() {
88
+ "pdf" => FileFormat::Pdf,
89
+ "docx" => FileFormat::Docx,
90
+ "xlsx" => FileFormat::Xlsx,
91
+ "xls" => FileFormat::Xls,
92
+ "pptx" => FileFormat::Pptx,
93
+ "png" => FileFormat::Png,
94
+ "jpg" | "jpeg" => FileFormat::Jpeg,
95
+ "tiff" | "tif" => FileFormat::Tiff,
96
+ "bmp" => FileFormat::Bmp,
97
+ "json" => FileFormat::Json,
98
+ "xml" => FileFormat::Xml,
99
+ "html" | "htm" => FileFormat::Html,
100
+ "txt" | "text" | "md" | "markdown" | "csv" => FileFormat::Text,
101
+ _ => FileFormat::Unknown,
102
+ }
103
+ }
104
+
105
+ /// Detect format from file content (magic bytes)
106
+ pub fn detect_from_content(data: &[u8]) -> FileFormat {
107
+ if data.is_empty() {
108
+ return FileFormat::Text; // Empty files are treated as text
109
+ }
110
+
111
+ // PDF
112
+ if data.len() >= 4 && data.starts_with(b"%PDF") {
113
+ return FileFormat::Pdf;
114
+ }
115
+
116
+ // PNG
117
+ if data.len() >= 8 && data.starts_with(&[0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A]) {
118
+ return FileFormat::Png;
119
+ }
120
+
121
+ // JPEG
122
+ if data.len() >= 3 && data.starts_with(&[0xFF, 0xD8, 0xFF]) {
123
+ return FileFormat::Jpeg;
124
+ }
125
+
126
+ // BMP
127
+ if data.len() >= 2 && data.starts_with(b"BM") {
128
+ return FileFormat::Bmp;
129
+ }
130
+
131
+ // TIFF (little-endian or big-endian)
132
+ if data.len() >= 4 {
133
+ if data.starts_with(b"II\x2A\x00") || data.starts_with(b"MM\x00\x2A") {
134
+ return FileFormat::Tiff;
135
+ }
136
+ }
137
+
138
+ // OLE Compound Document (old Excel/Word)
139
+ if data.len() >= 4 && data.starts_with(&[0xD0, 0xCF, 0x11, 0xE0]) {
140
+ return FileFormat::Xls; // Old Office format, usually Excel
141
+ }
142
+
143
+ // ZIP archive (could be DOCX, XLSX, PPTX)
144
+ if data.len() >= 2 && data.starts_with(b"PK") {
145
+ return Self::detect_office_format(data);
146
+ }
147
+
148
+ // XML
149
+ if data.len() >= 5 {
150
+ let start = String::from_utf8_lossy(&data[0..5.min(data.len())]);
151
+ if start.starts_with("<?xml") || start.starts_with("<!") {
152
+ return FileFormat::Xml;
153
+ }
154
+ }
155
+
156
+ // HTML
157
+ if data.len() >= 14 {
158
+ let start = String::from_utf8_lossy(&data[0..14.min(data.len())]).to_lowercase();
159
+ if start.contains("<!doctype") || start.contains("<html") {
160
+ return FileFormat::Html;
161
+ }
162
+ }
163
+
164
+ // JSON
165
+ if let Some(&first_non_ws) = data.iter().find(|&&b| !b" \t\n\r".contains(&b)) {
166
+ if first_non_ws == b'{' || first_non_ws == b'[' {
167
+ return FileFormat::Json;
168
+ }
169
+ }
170
+
171
+ // Default to text for unrecognized formats
172
+ FileFormat::Text
173
+ }
174
+
175
+ /// Detect specific Office format from ZIP data
176
+ fn detect_office_format(data: &[u8]) -> FileFormat {
177
+ // Look for Office-specific directory names in first 2KB of ZIP
178
+ let check_len = 2000.min(data.len());
179
+ let content = String::from_utf8_lossy(&data[0..check_len]);
180
+
181
+ // Check for format-specific markers
182
+ if content.contains("word/") || content.contains("word/_rels") {
183
+ FileFormat::Docx
184
+ } else if content.contains("xl/") || content.contains("xl/_rels") {
185
+ FileFormat::Xlsx
186
+ } else if content.contains("ppt/") || content.contains("ppt/_rels") {
187
+ FileFormat::Pptx
188
+ } else {
189
+ // Default to XLSX for generic ZIP (most common Office format)
190
+ FileFormat::Xlsx
191
+ }
192
+ }
193
+
194
+
195
+ /// Get all supported extensions
196
+ pub fn supported_extensions() -> Vec<&'static str> {
197
+ vec![
198
+ "pdf", "docx", "xlsx", "xls", "pptx",
199
+ "png", "jpg", "jpeg", "tiff", "tif", "bmp",
200
+ "json", "xml", "html", "htm",
201
+ "txt", "text", "md", "markdown", "csv"
202
+ ]
203
+ }
204
+ }
205
+
206
+ #[cfg(test)]
207
+ mod tests {
208
+ use super::*;
209
+
210
+ #[test]
211
+ fn test_detect_pdf() {
212
+ let pdf_data = b"%PDF-1.5\n";
213
+ assert_eq!(FormatDetector::detect_from_content(pdf_data), FileFormat::Pdf);
214
+ }
215
+
216
+ #[test]
217
+ fn test_detect_png() {
218
+ let png_data = &[0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A];
219
+ assert_eq!(FormatDetector::detect_from_content(png_data), FileFormat::Png);
220
+ }
221
+
222
+ #[test]
223
+ fn test_detect_from_extension() {
224
+ assert_eq!(FormatDetector::detect_from_extension("document.pdf"), FileFormat::Pdf);
225
+ assert_eq!(FormatDetector::detect_from_extension("Document.PDF"), FileFormat::Pdf);
226
+ assert_eq!(FormatDetector::detect_from_extension("data.xlsx"), FileFormat::Xlsx);
227
+ }
228
+
229
+ #[test]
230
+ fn test_empty_data() {
231
+ assert_eq!(FormatDetector::detect_from_content(&[]), FileFormat::Text);
232
+ }
233
+ }
@@ -2,6 +2,7 @@ use magnus::{function, prelude::*, Error, Ruby};
2
2
 
3
3
  mod parser;
4
4
  mod error;
5
+ mod format_detector;
5
6
 
6
7
  /// Initialize the ParseKit module and its submodules
7
8
  #[magnus::init]
@@ -1,7 +1,7 @@
1
1
  use magnus::{
2
2
  function, method, prelude::*, scan_args, Error, Module, RHash, RModule, Ruby, Value,
3
3
  };
4
- use std::path::Path;
4
+ use crate::format_detector::{FileFormat, FormatDetector};
5
5
 
6
6
  #[derive(Debug, Clone)]
7
7
  #[magnus::wrap(class = "ParseKit::Parser", free_immediately, size)]
@@ -28,6 +28,33 @@ impl Default for ParserConfig {
28
28
  }
29
29
  }
30
30
 
31
+ // Error handling helpers
32
+ impl Parser {
33
+ /// Create a RuntimeError with formatted message
34
+ fn runtime_error<E: std::fmt::Display>(context: &str, err: E) -> Error {
35
+ Error::new(
36
+ Ruby::get().unwrap().exception_runtime_error(),
37
+ format!("{}: {}", context, err),
38
+ )
39
+ }
40
+
41
+ /// Create an ArgumentError with message
42
+ fn argument_error(msg: &str) -> Error {
43
+ Error::new(
44
+ Ruby::get().unwrap().exception_arg_error(),
45
+ msg.to_string(),
46
+ )
47
+ }
48
+
49
+ /// Create an IOError with formatted message
50
+ fn io_error<E: std::fmt::Display>(context: &str, err: E) -> Error {
51
+ Error::new(
52
+ Ruby::get().unwrap().exception_io_error(),
53
+ format!("{}: {}", context, err),
54
+ )
55
+ }
56
+ }
57
+
31
58
  impl Parser {
32
59
  /// Create a new Parser instance with optional configuration
33
60
  fn new(ruby: &Ruby, args: &[Value]) -> Result<Self, Error> {
@@ -58,73 +85,49 @@ impl Parser {
58
85
  fn parse_bytes_internal(&self, data: Vec<u8>, filename: Option<&str>) -> Result<String, Error> {
59
86
  // Check size limit
60
87
  if data.len() > self.config.max_size {
61
- return Err(Error::new(
62
- Ruby::get().unwrap().exception_runtime_error(),
63
- format!(
64
- "File size {} exceeds maximum allowed size {}",
65
- data.len(),
66
- self.config.max_size
67
- ),
88
+ return Err(Self::runtime_error(
89
+ "File size exceeds limit",
90
+ format!("{} bytes exceeds maximum allowed size of {} bytes",
91
+ data.len(), self.config.max_size)
68
92
  ));
69
93
  }
70
94
 
71
- // Detect file type from extension or content
72
- let file_type = if let Some(name) = filename {
73
- Self::detect_type_from_filename(name)
74
- } else {
75
- Self::detect_type_from_content(&data)
76
- };
77
-
78
- match file_type.as_str() {
79
- "pdf" => self.parse_pdf(data),
80
- "docx" => self.parse_docx(data),
81
- "pptx" => self.parse_pptx(data),
82
- "xlsx" | "xls" => self.parse_xlsx(data),
83
- "json" => self.parse_json(data),
84
- "xml" | "html" => self.parse_xml(data),
85
- "png" | "jpg" | "jpeg" | "tiff" | "bmp" => self.ocr_image(data),
86
- "txt" | "text" => self.parse_text(data),
87
- _ => self.parse_text(data), // Default to text parsing
88
- }
95
+ // Use centralized format detection
96
+ let format = FormatDetector::detect(filename, Some(&data));
97
+
98
+ // Use centralized dispatch
99
+ self.dispatch_to_parser(format, data)
89
100
  }
90
-
91
- /// Detect file type from filename extension
92
- fn detect_type_from_filename(filename: &str) -> String {
93
- let path = Path::new(filename);
94
- match path.extension().and_then(|s| s.to_str()) {
95
- Some(ext) => ext.to_lowercase(),
96
- None => "txt".to_string(),
101
+
102
+ /// Centralized dispatch logic - routes format to appropriate parser
103
+ fn dispatch_to_parser(&self, format: FileFormat, data: Vec<u8>) -> Result<String, Error> {
104
+ match format {
105
+ FileFormat::Pdf => self.parse_pdf(data),
106
+ FileFormat::Docx => self.parse_docx(data),
107
+ FileFormat::Pptx => self.parse_pptx(data),
108
+ FileFormat::Xlsx | FileFormat::Xls => self.parse_xlsx(data),
109
+ FileFormat::Json => self.parse_json(data),
110
+ FileFormat::Xml | FileFormat::Html => self.parse_xml(data),
111
+ FileFormat::Png | FileFormat::Jpeg | FileFormat::Tiff | FileFormat::Bmp => self.ocr_image(data),
112
+ FileFormat::Text | FileFormat::Unknown => self.parse_text(data),
97
113
  }
98
114
  }
99
115
 
100
- /// Detect file type from content (basic detection)
101
- fn detect_type_from_content(data: &[u8]) -> String {
102
- if data.starts_with(b"%PDF") {
103
- "pdf".to_string()
104
- } else if data.starts_with(b"PK") {
105
- // PK is the ZIP signature - could be DOCX or XLSX
106
- // Try to differentiate by looking for common patterns
107
- // This is a simplified check - both DOCX and XLSX are ZIP files
108
- // For now, default to xlsx as it's more commonly parsed
109
- "xlsx".to_string() // Office Open XML format (could also be DOCX)
110
- } else if data.starts_with(&[0xD0, 0xCF, 0x11, 0xE0]) {
111
- "xls".to_string() // Old Excel format
112
- } else if data.starts_with(&[0x89, 0x50, 0x4E, 0x47]) {
113
- "png".to_string() // PNG signature
114
- } else if data.starts_with(&[0xFF, 0xD8, 0xFF]) {
115
- "jpg".to_string() // JPEG signature
116
- } else if data.starts_with(b"BM") {
117
- "bmp".to_string() // BMP signature
118
- } else if data.starts_with(b"II\x2A\x00") || data.starts_with(b"MM\x00\x2A") {
119
- "tiff".to_string() // TIFF signature (little-endian or big-endian)
120
- } else if data.starts_with(b"<?xml") || data.starts_with(b"<html") {
121
- "xml".to_string()
122
- } else if data.starts_with(b"{") || data.starts_with(b"[") {
123
- "json".to_string()
124
- } else {
125
- "txt".to_string()
116
+ /// Ruby-accessible method to detect format from bytes
117
+ fn detect_format_from_bytes(&self, data: Vec<u8>) -> String {
118
+ let format = FormatDetector::detect_from_content(&data);
119
+ // For compatibility with Ruby tests, return "xlsx" for old Excel
120
+ match format {
121
+ FileFormat::Xls => "xlsx".to_string(), // Compatibility with existing tests
122
+ _ => format.to_symbol().to_string(),
126
123
  }
127
124
  }
125
+
126
+ /// Ruby-accessible method to detect format from filename
127
+ fn detect_format_from_filename(&self, filename: String) -> String {
128
+ let format = FormatDetector::detect_from_extension(&filename);
129
+ format.to_symbol().to_string()
130
+ }
128
131
 
129
132
  /// Perform OCR on image data using Tesseract
130
133
  fn ocr_image(&self, data: Vec<u8>) -> Result<String, Error> {
@@ -191,20 +194,12 @@ impl Parser {
191
194
  };
192
195
 
193
196
  if let Err(e) = init_result {
194
- return Err(Error::new(
195
- Ruby::get().unwrap().exception_runtime_error(),
196
- format!("Failed to initialize Tesseract: {:?}", e),
197
- ))
197
+ return Err(Self::runtime_error("Failed to initialize Tesseract", e));
198
198
  }
199
199
 
200
200
  // Load the image from bytes
201
- let img = match image::load_from_memory(&data) {
202
- Ok(img) => img,
203
- Err(e) => return Err(Error::new(
204
- Ruby::get().unwrap().exception_runtime_error(),
205
- format!("Failed to load image: {}", e),
206
- ))
207
- };
201
+ let img = image::load_from_memory(&data)
202
+ .map_err(|e| Self::runtime_error("Failed to load image", e))?;
208
203
 
209
204
  // Convert to RGBA8 format
210
205
  let rgba_img = img.to_rgba8();
@@ -212,27 +207,18 @@ impl Parser {
212
207
  let raw_data = rgba_img.into_raw();
213
208
 
214
209
  // Set image data
215
- if let Err(e) = tesseract.set_image(
210
+ tesseract.set_image(
216
211
  &raw_data,
217
212
  width as i32,
218
213
  height as i32,
219
214
  4, // bytes per pixel (RGBA)
220
215
  (width * 4) as i32, // bytes per line
221
- ) {
222
- return Err(Error::new(
223
- Ruby::get().unwrap().exception_runtime_error(),
224
- format!("Failed to set image: {}", e),
225
- ))
226
- }
216
+ ).map_err(|e| Self::runtime_error("Failed to set image", e))?;
227
217
 
228
218
  // Extract text
229
- match tesseract.get_utf8_text() {
230
- Ok(text) => Ok(text.trim().to_string()),
231
- Err(e) => Err(Error::new(
232
- Ruby::get().unwrap().exception_runtime_error(),
233
- format!("Failed to perform OCR: {}", e),
234
- )),
235
- }
219
+ tesseract.get_utf8_text()
220
+ .map(|text| text.trim().to_string())
221
+ .map_err(|e| Self::runtime_error("Failed to perform OCR", e))
236
222
  }
237
223
 
238
224
 
@@ -242,51 +228,31 @@ impl Parser {
242
228
 
243
229
  // Try to load the PDF from memory
244
230
  // The magic parameter helps MuPDF identify the file type
245
- match Document::from_bytes(&data, "pdf") {
246
- Ok(doc) => {
247
- let mut all_text = String::new();
248
-
249
- // Get page count - this returns a Result
250
- let page_count = match doc.page_count() {
251
- Ok(count) => count,
252
- Err(e) => {
253
- return Err(Error::new(
254
- Ruby::get().unwrap().exception_runtime_error(),
255
- format!("Failed to get page count: {}", e),
256
- ))
257
- }
258
- };
259
-
260
- // Iterate through pages
261
- for page_num in 0..page_count {
262
- match doc.load_page(page_num) {
263
- Ok(page) => {
264
- // Extract text from the page
265
- match page.to_text() {
266
- Ok(text) => {
267
- all_text.push_str(&text);
268
- all_text.push('\n');
269
- }
270
- Err(_) => continue,
271
- }
272
- }
273
- Err(_) => continue,
274
- }
275
- }
276
-
277
- if all_text.is_empty() {
278
- Ok(
279
- "PDF contains no extractable text (might be scanned/image-based)"
280
- .to_string(),
281
- )
282
- } else {
283
- Ok(all_text.trim().to_string())
231
+ let doc = Document::from_bytes(&data, "pdf")
232
+ .map_err(|e| Self::runtime_error("Failed to parse PDF", e))?;
233
+
234
+ let mut all_text = String::new();
235
+
236
+ // Get page count
237
+ let page_count = doc.page_count()
238
+ .map_err(|e| Self::runtime_error("Failed to get page count", e))?;
239
+
240
+ // Iterate through pages
241
+ for page_num in 0..page_count {
242
+ // Continue on page errors rather than failing entirely
243
+ if let Ok(page) = doc.load_page(page_num) {
244
+ // Extract text from the page
245
+ if let Ok(text) = page.to_text() {
246
+ all_text.push_str(&text);
247
+ all_text.push('\n');
284
248
  }
285
249
  }
286
- Err(e) => Err(Error::new(
287
- Ruby::get().unwrap().exception_runtime_error(),
288
- format!("Failed to parse PDF: {}", e),
289
- )),
250
+ }
251
+
252
+ if all_text.is_empty() {
253
+ Ok("PDF contains no extractable text (might be scanned/image-based)".to_string())
254
+ } else {
255
+ Ok(all_text.trim().to_string())
290
256
  }
291
257
  }
292
258
 
@@ -322,10 +288,7 @@ impl Parser {
322
288
 
323
289
  Ok(result.trim().to_string())
324
290
  }
325
- Err(e) => Err(Error::new(
326
- Ruby::get().unwrap().exception_runtime_error(),
327
- format!("Failed to parse DOCX file: {}", e),
328
- )),
291
+ Err(e) => Err(Self::runtime_error("Failed to parse DOCX file", e)),
329
292
  }
330
293
  }
331
294
 
@@ -335,15 +298,8 @@ impl Parser {
335
298
  use zip::ZipArchive;
336
299
 
337
300
  let cursor = Cursor::new(data);
338
- let mut archive = match ZipArchive::new(cursor) {
339
- Ok(archive) => archive,
340
- Err(e) => {
341
- return Err(Error::new(
342
- Ruby::get().unwrap().exception_runtime_error(),
343
- format!("Failed to open PPTX as ZIP: {}", e),
344
- ))
345
- }
346
- };
301
+ let mut archive = ZipArchive::new(cursor)
302
+ .map_err(|e| Self::runtime_error("Failed to open PPTX as ZIP", e))?;
347
303
 
348
304
  let mut all_text = Vec::new();
349
305
  let mut slide_numbers = Vec::new();
@@ -492,10 +448,7 @@ impl Parser {
492
448
 
493
449
  Ok(result)
494
450
  }
495
- Err(e) => Err(Error::new(
496
- Ruby::get().unwrap().exception_runtime_error(),
497
- format!("Failed to parse Excel file: {}", e),
498
- )),
451
+ Err(e) => Err(Self::runtime_error("Failed to parse Excel file", e)),
499
452
  }
500
453
  }
501
454
 
@@ -527,10 +480,7 @@ impl Parser {
527
480
  }
528
481
  Ok(Event::Eof) => break,
529
482
  Err(e) => {
530
- return Err(Error::new(
531
- Ruby::get().unwrap().exception_runtime_error(),
532
- format!("XML parse error: {}", e),
533
- ))
483
+ return Err(Self::runtime_error("XML parse error", e))
534
484
  }
535
485
  _ => {}
536
486
  }
@@ -557,10 +507,7 @@ impl Parser {
557
507
  /// Parse input string (for text content)
558
508
  fn parse(&self, input: String) -> Result<String, Error> {
559
509
  if input.is_empty() {
560
- return Err(Error::new(
561
- Ruby::get().unwrap().exception_arg_error(),
562
- "Input cannot be empty",
563
- ));
510
+ return Err(Self::argument_error("Input cannot be empty"));
564
511
  }
565
512
 
566
513
  // For string input, just return cleaned text
@@ -576,12 +523,8 @@ impl Parser {
576
523
  fn parse_file(&self, path: String) -> Result<String, Error> {
577
524
  use std::fs;
578
525
 
579
- let data = fs::read(&path).map_err(|e| {
580
- Error::new(
581
- Ruby::get().unwrap().exception_io_error(),
582
- format!("Failed to read file: {}", e),
583
- )
584
- })?;
526
+ let data = fs::read(&path)
527
+ .map_err(|e| Self::io_error("Failed to read file", e))?;
585
528
 
586
529
  self.parse_bytes_internal(data, Some(&path))
587
530
  }
@@ -589,10 +532,7 @@ impl Parser {
589
532
  /// Parse bytes from Ruby
590
533
  fn parse_bytes(&self, data: Vec<u8>) -> Result<String, Error> {
591
534
  if data.is_empty() {
592
- return Err(Error::new(
593
- Ruby::get().unwrap().exception_arg_error(),
594
- "Data cannot be empty",
595
- ));
535
+ return Err(Self::argument_error("Data cannot be empty"));
596
536
  }
597
537
 
598
538
  self.parse_bytes_internal(data, None)
@@ -616,25 +556,11 @@ impl Parser {
616
556
 
617
557
  /// Check supported file types
618
558
  fn supported_formats() -> Vec<String> {
619
- vec![
620
- "txt".to_string(),
621
- "json".to_string(),
622
- "xml".to_string(),
623
- "html".to_string(),
624
- "htm".to_string(), // HTML files (alternative extension)
625
- "md".to_string(), // Markdown files
626
- "docx".to_string(),
627
- "pptx".to_string(),
628
- "xlsx".to_string(),
629
- "xls".to_string(),
630
- "csv".to_string(),
631
- "pdf".to_string(), // Text extraction via MuPDF
632
- "png".to_string(), // OCR via Tesseract
633
- "jpg".to_string(), // OCR via Tesseract
634
- "jpeg".to_string(), // OCR via Tesseract
635
- "tiff".to_string(), // OCR via Tesseract
636
- "bmp".to_string(), // OCR via Tesseract
637
- ]
559
+ // Use the centralized list from FormatDetector
560
+ FormatDetector::supported_extensions()
561
+ .iter()
562
+ .map(|&s| s.to_string())
563
+ .collect()
638
564
  }
639
565
 
640
566
  /// Detect if file extension is supported
@@ -688,6 +614,10 @@ pub fn init(_ruby: &Ruby, module: RModule) -> Result<(), Error> {
688
614
  class.define_method("parse_xml", method!(Parser::parse_xml, 1))?;
689
615
  class.define_method("parse_text", method!(Parser::parse_text, 1))?;
690
616
  class.define_method("ocr_image", method!(Parser::ocr_image, 1))?;
617
+
618
+ // Format detection methods
619
+ class.define_method("detect_format_from_bytes", method!(Parser::detect_format_from_bytes, 1))?;
620
+ class.define_method("detect_format_from_filename", method!(Parser::detect_format_from_filename, 1))?;
691
621
 
692
622
  // Class methods
693
623
  class.define_singleton_method("supported_formats", function!(Parser::supported_formats, 0))?;
@@ -0,0 +1,125 @@
1
+ # ParseKit Native API Documentation
2
+
3
+ This document describes the methods implemented in the Rust native extension for ParseKit::Parser.
4
+
5
+ ## Instance Methods
6
+
7
+ ### `initialize(options = {})`
8
+ Initialize a new Parser instance with optional configuration.
9
+
10
+ **Parameters:**
11
+ - `options` [Hash] Configuration options
12
+ - `:encoding` [String] Input encoding (default: UTF-8)
13
+ - `:strict_mode` [Boolean] Enable strict parsing mode (default: false)
14
+ - `:max_depth` [Integer] Maximum nesting depth (default: 100)
15
+ - `:max_size` [Integer] Maximum file size in bytes (default: 100MB)
16
+
17
+ ### `parse(input)`
18
+ Parse an input string (for text content).
19
+
20
+ **Parameters:**
21
+ - `input` [String] The input to parse
22
+
23
+ **Returns:**
24
+ - [String] The parsed result
25
+
26
+ **Raises:**
27
+ - `ArgumentError` If input is empty
28
+
29
+ ### `parse_file(path)`
30
+ Parse a file (supports PDF, Office documents, text files, images with OCR).
31
+
32
+ **Parameters:**
33
+ - `path` [String] Path to the file to parse
34
+
35
+ **Returns:**
36
+ - [String] The extracted text content
37
+
38
+ **Raises:**
39
+ - `IOError` If file cannot be read
40
+ - `RuntimeError` If parsing fails
41
+
42
+ ### `parse_bytes(data)`
43
+ Parse binary data.
44
+
45
+ **Parameters:**
46
+ - `data` [Array<Integer>] Binary data as byte array
47
+
48
+ **Returns:**
49
+ - [String] The extracted text content
50
+
51
+ **Raises:**
52
+ - `ArgumentError` If data is empty
53
+ - `RuntimeError` If parsing fails
54
+
55
+ ### `config`
56
+ Get the current parser configuration.
57
+
58
+ **Returns:**
59
+ - [Hash] The parser configuration including encoding, strict_mode, max_depth, and max_size
60
+
61
+ ### `supports_file?(path)`
62
+ Check if a file format is supported.
63
+
64
+ **Parameters:**
65
+ - `path` [String] File path to check
66
+
67
+ **Returns:**
68
+ - [Boolean] True if the file format is supported
69
+
70
+ ### `strict_mode?`
71
+ Check if strict mode is enabled.
72
+
73
+ **Returns:**
74
+ - [Boolean] True if strict mode is enabled
75
+
76
+ ## Format-Specific Parsers
77
+
78
+ These methods are also available but typically called internally via `parse_file` or `parse_bytes`:
79
+
80
+ ### `parse_pdf(data)`
81
+ Parse PDF files using MuPDF (statically linked).
82
+
83
+ ### `parse_docx(data)`
84
+ Parse Microsoft Word documents.
85
+
86
+ ### `parse_pptx(data)`
87
+ Parse Microsoft PowerPoint presentations.
88
+
89
+ ### `parse_xlsx(data)`
90
+ Parse Microsoft Excel spreadsheets.
91
+
92
+ ### `parse_json(data)`
93
+ Parse and pretty-print JSON data.
94
+
95
+ ### `parse_xml(data)`
96
+ Parse XML/HTML files and extract text content.
97
+
98
+ ### `parse_text(data)`
99
+ Parse plain text files.
100
+
101
+ ### `ocr_image(data)`
102
+ Perform OCR on images (PNG, JPEG, TIFF, BMP) using Tesseract.
103
+
104
+ ## Class Methods
105
+
106
+ ### `Parser.supported_formats`
107
+ Get list of supported file formats.
108
+
109
+ **Returns:**
110
+ - [Array<String>] List of supported file extensions
111
+
112
+ **Example:**
113
+ ```ruby
114
+ ParseKit::Parser.supported_formats
115
+ # => ["txt", "json", "xml", "html", "docx", "xlsx", "xls", "csv", "pdf", "png", "jpg", "jpeg", "tiff", "bmp", ...]
116
+ ```
117
+
118
+ ## Implementation Notes
119
+
120
+ All these methods are implemented in Rust via the native extension. The Ruby layer (`lib/parsekit/parser.rb`) provides additional convenience methods and helpers that wrap these native methods.
121
+
122
+ The native extension uses:
123
+ - **MuPDF** for PDF parsing (statically linked)
124
+ - **Tesseract** for OCR functionality (bundled)
125
+ - **Various Rust crates** for Office document parsing (docx-rs, calamine, etc.)
Binary file
@@ -3,65 +3,24 @@
3
3
  module ParseKit
4
4
  # Ruby wrapper for the native Parser class
5
5
  #
6
- # The Ruby layer now handles format detection and routing to specific parsers,
7
- # while Rust provides the actual parsing implementations.
6
+ # This class provides document parsing capabilities through a native Rust extension.
7
+ # For documentation of native methods, see NATIVE_API.md
8
+ #
9
+ # The Ruby layer provides convenience methods and helpers while the Rust
10
+ # extension handles the actual parsing of PDF, Office documents, images (OCR), etc.
8
11
  class Parser
9
- # These methods are implemented in the native extension
10
- # and are documented here for YARD
11
-
12
- # Initialize a new Parser instance
13
- # @param options [Hash] Configuration options
14
- # @option options [String] :encoding Input encoding (default: UTF-8)
15
- # def initialize(options = {})
16
- # # Implemented in native extension
17
- # end
18
-
19
- # Parse an input string (for text content)
20
- # @param input [String] The input to parse
21
- # @return [String] The parsed result
22
- # @raise [ArgumentError] If input is empty
23
- # def parse(input)
24
- # # Implemented in native extension
25
- # end
26
-
27
- # Parse a file (supports PDF, Office documents, text files)
28
- # @param path [String] Path to the file to parse
29
- # @return [String] The extracted text content
30
- # @raise [IOError] If file cannot be read
31
- # @raise [RuntimeError] If parsing fails
32
- # def parse_file(path)
33
- # # Implemented in native extension
34
- # end
35
-
36
- # Parse binary data
37
- # @param data [Array<Integer>] Binary data as byte array
38
- # @return [String] The extracted text content
39
- # @raise [ArgumentError] If data is empty
40
- # @raise [RuntimeError] If parsing fails
41
- # def parse_bytes(data)
42
- # # Implemented in native extension
43
- # end
44
-
45
- # Get the current configuration
46
- # @return [Hash] The parser configuration
47
- # def config
48
- # # Implemented in native extension
49
- # end
50
-
51
- # Check if a file format is supported
52
- # @param path [String] File path to check
53
- # @return [Boolean] True if the file format is supported
54
- # def supports_file?(path)
55
- # # Implemented in native extension
56
- # end
57
-
58
- # Get list of supported file formats
59
- # @return [Array<String>] List of supported file extensions
60
- # def self.supported_formats
61
- # # Implemented in native extension
62
- # end
63
-
64
- # Ruby-level helper methods
12
+ # Native methods implemented in Rust:
13
+ # - initialize(options = {})
14
+ # - parse(input)
15
+ # - parse_file(path)
16
+ # - parse_bytes(data)
17
+ # - config
18
+ # - supports_file?(path)
19
+ # - strict_mode?
20
+ # - parse_pdf, parse_docx, parse_xlsx, parse_pptx, parse_json, parse_xml, parse_text, ocr_image
21
+ # See NATIVE_API.md for detailed documentation
22
+
23
+ # Ruby convenience methods and helpers
65
24
 
66
25
  # Create a parser with strict mode enabled
67
26
  # @param options [Hash] Additional options
@@ -81,6 +40,7 @@ module ParseKit
81
40
  end
82
41
 
83
42
  # Detect format from file path
43
+ # @deprecated Use the native format detection in parse_file instead
84
44
  # @param path [String] File path
85
45
  # @return [Symbol, nil] Format symbol or nil if unknown
86
46
  def detect_format(path)
@@ -101,67 +61,134 @@ module ParseKit
101
61
  end
102
62
 
103
63
  # Detect format from binary data
64
+ # @deprecated Use the native format detection in parse_bytes instead
104
65
  # @param data [String, Array<Integer>] Binary data
105
66
  # @return [Symbol] Format symbol
106
67
  def detect_format_from_bytes(data)
107
68
  # Convert to bytes if string
108
69
  bytes = data.is_a?(String) ? data.bytes : data
109
- return :text if bytes.empty?
110
-
111
- # Check magic bytes
112
- if bytes[0..3] == [0x25, 0x50, 0x44, 0x46] # %PDF
113
- :pdf
114
- elsif bytes[0..1] == [0x50, 0x4B] # PK (ZIP archive)
115
- # Could be DOCX or XLSX, default to xlsx for now
116
- # In the future, could inspect ZIP contents to determine
117
- :xlsx
118
- elsif bytes[0..3] == [0xD0, 0xCF, 0x11, 0xE0] # Old Excel
70
+ return :text if bytes.empty? # Return :text for empty data
71
+
72
+ # Check magic bytes for various formats
73
+
74
+ # PDF
75
+ if bytes.size >= 4 && bytes[0..3] == [0x25, 0x50, 0x44, 0x46] # %PDF
76
+ return :pdf
77
+ end
78
+
79
+ # PNG
80
+ if bytes.size >= 8 && bytes[0..7] == [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A]
81
+ return :png
82
+ end
83
+
84
+ # JPEG
85
+ if bytes.size >= 3 && bytes[0..2] == [0xFF, 0xD8, 0xFF]
86
+ return :jpeg
87
+ end
88
+
89
+ # BMP
90
+ if bytes.size >= 2 && bytes[0..1] == [0x42, 0x4D] # BM
91
+ return :bmp
92
+ end
93
+
94
+ # TIFF (little-endian or big-endian)
95
+ if bytes.size >= 4
96
+ if bytes[0..3] == [0x49, 0x49, 0x2A, 0x00] # II*\0 (little-endian)
97
+ return :tiff
98
+ elsif bytes[0..3] == [0x4D, 0x4D, 0x00, 0x2A] # MM\0* (big-endian)
99
+ return :tiff
100
+ end
101
+ end
102
+
103
+ # OLE Compound Document (old Excel/Word) - return :xlsx for compatibility
104
+ if bytes.size >= 4 && bytes[0..3] == [0xD0, 0xCF, 0x11, 0xE0]
105
+ return :xlsx # Return :xlsx for compatibility with existing tests
106
+ end
107
+
108
+ # ZIP archive (could be DOCX, XLSX, PPTX)
109
+ if bytes.size >= 2 && bytes[0..1] == [0x50, 0x4B] # PK
110
+ # Try to determine the specific Office format by checking ZIP contents
111
+ # For now, we'll need to inspect the ZIP structure
112
+ return detect_office_format_from_zip(bytes)
113
+ end
114
+
115
+ # XML
116
+ if bytes.size >= 5
117
+ first_chars = bytes[0..4].pack('C*')
118
+ if first_chars == '<?xml' || first_chars.start_with?('<!')
119
+ return :xml
120
+ end
121
+ end
122
+
123
+ # HTML
124
+ if bytes.size >= 14
125
+ first_chars = bytes[0..13].pack('C*').downcase
126
+ if first_chars.include?('<!doctype') || first_chars.include?('<html')
127
+ return :xml # HTML is treated as XML
128
+ end
129
+ end
130
+
131
+ # JSON
132
+ if bytes.size > 0
133
+ first_char = bytes[0]
134
+ # Skip whitespace
135
+ idx = 0
136
+ while idx < bytes.size && [0x20, 0x09, 0x0A, 0x0D].include?(bytes[idx])
137
+ idx += 1
138
+ end
139
+
140
+ if idx < bytes.size
141
+ first_non_ws = bytes[idx]
142
+ if first_non_ws == 0x7B || first_non_ws == 0x5B # { or [
143
+ return :json
144
+ end
145
+ end
146
+ end
147
+
148
+ # Default to text if not recognized
149
+ :text
150
+ end
151
+
152
+ # Detect specific Office format from ZIP data
153
+ # @param bytes [Array<Integer>] ZIP file bytes
154
+ # @return [Symbol] :docx, :xlsx, :pptx, or :unknown
155
+ def detect_office_format_from_zip(bytes)
156
+ # This is a simplified detection - in practice you'd parse the ZIP
157
+ # For the test, we'll check for known patterns in the ZIP structure
158
+
159
+ # Convert bytes to string for pattern matching
160
+ content = bytes[0..2000].pack('C*') # Check first 2KB
161
+
162
+ # Look for Office-specific directory names in the ZIP
163
+ if content.include?('word/') || content.include?('word/_rels')
164
+ :docx
165
+ elsif content.include?('xl/') || content.include?('xl/_rels')
119
166
  :xlsx
120
- elsif bytes[0..4] == [0x3C, 0x3F, 0x78, 0x6D, 0x6C] # <?xml
121
- :xml
122
- elsif bytes[0..4] == [0x3C, 0x68, 0x74, 0x6D, 0x6C] # <html
123
- :xml
124
- elsif bytes[0] == 0x7B || bytes[0] == 0x5B # { or [
125
- :json
167
+ elsif content.include?('ppt/') || content.include?('ppt/_rels')
168
+ :pptx
126
169
  else
127
- :text
170
+ # Default to xlsx for generic ZIP
171
+ :xlsx
128
172
  end
129
173
  end
130
174
 
131
175
  # Parse file using format-specific parser
132
- # This method now detects format and routes to the appropriate parser
176
+ # This method delegates to parse_file which uses centralized dispatch in Rust
133
177
  # @param path [String] File path
134
178
  # @return [String] Parsed content
135
179
  def parse_file_routed(path)
136
- format = detect_format(path)
137
- data = File.read(path, mode: 'rb').bytes
138
-
139
- case format
140
- when :docx then parse_docx(data)
141
- when :xlsx then parse_xlsx(data)
142
- when :pdf then parse_pdf(data)
143
- when :json then parse_json(data)
144
- when :xml then parse_xml(data)
145
- else parse_text(data)
146
- end
180
+ # Simply delegate to parse_file which already has dispatch logic
181
+ parse_file(path)
147
182
  end
148
183
 
149
184
  # Parse bytes using format-specific parser
150
- # This method detects format and routes to the appropriate parser
185
+ # This method delegates to parse_bytes which uses centralized dispatch in Rust
151
186
  # @param data [String, Array<Integer>] Binary data
152
187
  # @return [String] Parsed content
153
188
  def parse_bytes_routed(data)
154
- format = detect_format_from_bytes(data)
189
+ # Simply delegate to parse_bytes which already has dispatch logic
155
190
  bytes = data.is_a?(String) ? data.bytes : data
156
-
157
- case format
158
- when :docx then parse_docx(bytes)
159
- when :xlsx then parse_xlsx(bytes)
160
- when :pdf then parse_pdf(bytes)
161
- when :json then parse_json(bytes)
162
- when :xml then parse_xml(bytes)
163
- else parse_text(bytes)
164
- end
191
+ parse_bytes(bytes)
165
192
  end
166
193
 
167
194
  # Parse with a block for processing results
@@ -178,25 +205,49 @@ module ParseKit
178
205
  # @param input [String] The input to validate
179
206
  # @return [Boolean] True if input is valid
180
207
  def valid_input?(input)
181
- return false unless input.is_a?(String)
182
- return false if input.empty?
183
- true
208
+ input.is_a?(String) && !input.empty?
184
209
  end
185
210
 
186
211
  # Validate file before parsing
187
212
  # @param path [String] The file path to validate
188
213
  # @return [Boolean] True if file exists and format is supported
189
214
  def valid_file?(path)
215
+ return false if path.nil? || path.empty?
190
216
  return false unless File.exist?(path)
217
+ return false if File.directory?(path)
191
218
  supports_file?(path)
192
219
  end
193
220
 
194
221
  # Get file extension
195
222
  # @param path [String] File path
196
- # @return [String, nil] File extension in lowercase
223
+ # @return [String, nil] File extension in lowercase without leading dot
197
224
  def file_extension(path)
198
- ext = File.extname(path)
199
- ext.empty? ? nil : ext[1..].downcase
225
+ return nil if path.nil? || path.empty?
226
+
227
+ # Handle trailing whitespace
228
+ clean_path = path.strip
229
+
230
+ # Handle trailing slashes (directory indicator)
231
+ return nil if clean_path.end_with?('/')
232
+
233
+ # Get the extension
234
+ ext = File.extname(clean_path)
235
+
236
+ # Handle special cases
237
+ if ext.empty?
238
+ # Check for hidden files like .gitignore (the whole name after dot is the "extension")
239
+ basename = File.basename(clean_path)
240
+ if basename.start_with?('.') && basename.length > 1 && !basename[1..-1].include?('.')
241
+ return basename[1..-1].downcase
242
+ end
243
+ return nil
244
+ elsif ext == '.'
245
+ # File ends with a dot but no extension
246
+ return nil
247
+ else
248
+ # Normal extension, remove the dot and downcase
249
+ ext[1..-1].downcase
250
+ end
200
251
  end
201
252
  end
202
253
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module ParseKit
4
- VERSION = "0.1.0"
4
+ VERSION = "0.1.2"
5
5
  end
data/lib/parsekit.rb CHANGED
@@ -14,6 +14,22 @@ require_relative "parsekit/parser"
14
14
 
15
15
  # ParseKit is a Ruby document parsing toolkit with PDF and OCR support
16
16
  module ParseKit
17
+ # Supported file formats and their extensions
18
+ SUPPORTED_FORMATS = {
19
+ pdf: ['.pdf'],
20
+ docx: ['.docx'],
21
+ xlsx: ['.xlsx'],
22
+ xls: ['.xls'],
23
+ pptx: ['.pptx'],
24
+ png: ['.png'],
25
+ jpeg: ['.jpg', '.jpeg'],
26
+ tiff: ['.tiff', '.tif'],
27
+ bmp: ['.bmp'],
28
+ json: ['.json'],
29
+ xml: ['.xml', '.html'],
30
+ text: ['.txt', '.md', '.csv']
31
+ }.freeze
32
+
17
33
  class << self
18
34
  # The parse_file and parse_bytes methods are defined in the native extension
19
35
  # We just need to document them here or add wrapper logic if needed
@@ -50,6 +66,22 @@ module ParseKit
50
66
  Parser.new.supports_file?(path)
51
67
  end
52
68
 
69
+ # Detect file format from filename/extension
70
+ # @param filename [String, nil] The filename to check
71
+ # @return [Symbol] The detected format, or :unknown
72
+ def detect_format(filename)
73
+ return :unknown if filename.nil? || filename.empty?
74
+
75
+ ext = File.extname(filename).downcase
76
+ return :unknown if ext.empty?
77
+
78
+ SUPPORTED_FORMATS.each do |format, extensions|
79
+ return format if extensions.include?(ext)
80
+ end
81
+
82
+ :unknown
83
+ end
84
+
53
85
  # Get the native library version
54
86
  # @return [String] Version of the native library
55
87
  def native_version
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parsekit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Petersen
@@ -96,20 +96,22 @@ files:
96
96
  - ext/parsekit/Cargo.toml
97
97
  - ext/parsekit/extconf.rb
98
98
  - ext/parsekit/src/error.rs
99
+ - ext/parsekit/src/format_detector.rs
99
100
  - ext/parsekit/src/lib.rs
100
101
  - ext/parsekit/src/parser.rs
101
102
  - lib/parsekit.rb
103
+ - lib/parsekit/NATIVE_API.md
102
104
  - lib/parsekit/error.rb
103
105
  - lib/parsekit/parsekit.bundle
104
106
  - lib/parsekit/parser.rb
105
107
  - lib/parsekit/version.rb
106
- homepage: https://github.com/cpetersen/parsekit
108
+ homepage: https://github.com/scientist-labs/parsekit
107
109
  licenses:
108
110
  - MIT
109
111
  metadata:
110
- homepage_uri: https://github.com/cpetersen/parsekit
111
- source_code_uri: https://github.com/cpetersen/parsekit
112
- changelog_uri: https://github.com/cpetersen/parsekit/blob/main/CHANGELOG.md
112
+ homepage_uri: https://github.com/scientist-labs/parsekit
113
+ source_code_uri: https://github.com/scientist-labs/parsekit
114
+ changelog_uri: https://github.com/scientist-labs/parsekit/blob/main/CHANGELOG.md
113
115
  post_install_message:
114
116
  rdoc_options: []
115
117
  require_paths: