parsekit-bin 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +53 -0
- data/LICENSE.txt +21 -0
- data/README.md +195 -0
- data/ext/parsekit/Cargo.toml +36 -0
- data/ext/parsekit/extconf.rb +6 -0
- data/ext/parsekit/src/error.rs +45 -0
- data/ext/parsekit/src/format_detector.rs +233 -0
- data/ext/parsekit/src/lib.rs +25 -0
- data/ext/parsekit/src/parser.rs +630 -0
- data/lib/parsekit/error.rb +15 -0
- data/lib/parsekit/parser.rb +253 -0
- data/lib/parsekit/version.rb +5 -0
- data/lib/parsekit.rb +93 -0
- metadata +130 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 0a6447155b1ecdb5426c2051e680a12a6b0091a43539d46c711443a5bc98f4bd
|
|
4
|
+
data.tar.gz: 1dc5570e92dbc491b50669d8dcc68aec57ca9249dd6f31fd82c5767c1109beea
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 681e2892319f1f8ccf51982836224fae0b38946c54dd44eaf184fca8deefc1b21875ce9359523663d09610eb2cfeec1efe6425d897584d6d8bbc9fc3ca51196b
|
|
7
|
+
data.tar.gz: 891180dcb47736714c2f33c42bb13032bf8f6fddcf09e663915edcb525c8715d137ac9309b1821702c7c8b8918f3aa552b291e9a340bf169e69da6e5b257ac5d
|
data/CHANGELOG.md
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
- Nothing yet
|
|
12
|
+
|
|
13
|
+
### Changed
|
|
14
|
+
- Nothing yet
|
|
15
|
+
|
|
16
|
+
### Deprecated
|
|
17
|
+
- Nothing yet
|
|
18
|
+
|
|
19
|
+
### Removed
|
|
20
|
+
- Nothing yet
|
|
21
|
+
|
|
22
|
+
### Fixed
|
|
23
|
+
- Nothing yet
|
|
24
|
+
|
|
25
|
+
### Security
|
|
26
|
+
- Nothing yet
|
|
27
|
+
|
|
28
|
+
## [0.1.0] - 2024-08-09
|
|
29
|
+
|
|
30
|
+
### Added
|
|
31
|
+
- Initial release of parsekit
|
|
32
|
+
- Basic parser functionality with Ruby bindings via Magnus
|
|
33
|
+
- Support for parsing strings and files
|
|
34
|
+
- Configurable parser with options (strict_mode, max_depth, encoding)
|
|
35
|
+
- Parser class with instance methods
|
|
36
|
+
- Module-level convenience methods
|
|
37
|
+
- Error handling with custom error classes
|
|
38
|
+
- Thread-safe parsing operations
|
|
39
|
+
- Cross-platform support (Linux, macOS, Windows)
|
|
40
|
+
- Ruby 3.0+ support
|
|
41
|
+
- Comprehensive test suite with RSpec
|
|
42
|
+
- CI/CD with GitHub Actions
|
|
43
|
+
- Documentation and examples
|
|
44
|
+
- Integration with ruby-nlp ecosystem
|
|
45
|
+
|
|
46
|
+
### Technical Details
|
|
47
|
+
- Built with Magnus 0.7 for Ruby-Rust bindings
|
|
48
|
+
- Uses rb_sys 0.9 for build system integration
|
|
49
|
+
- Rust edition 2021
|
|
50
|
+
- Cross-compilation support for multiple platforms
|
|
51
|
+
|
|
52
|
+
[Unreleased]: https://github.com/scientist-labs/parsekit/compare/v0.1.0...HEAD
|
|
53
|
+
[0.1.0]: https://github.com/scientist-labs/parsekit/releases/tag/v0.1.0
|
data/LICENSE.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Your Name
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
|
13
|
+
all copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
<img src="/docs/assets/parsekit-wide.png" alt="parsekit" height="80px">
|
|
2
|
+
|
|
3
|
+
[![Gem Version](https://badge.fury.io/rb/parsekit-bin.svg)](https://badge.fury.io/rb/parsekit-bin)
|
|
4
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
|
5
|
+
|
|
6
|
+
Native Ruby bindings for the [parser-core](https://crates.io/crates/parser-core) Rust crate, providing high-performance document parsing and text extraction capabilities through Magnus. This gem wraps parser-core to extract text from PDFs, Office documents (DOCX, XLSX), images (with OCR), and more. Part of the ruby-nlp ecosystem.
|
|
7
|
+
|
|
8
|
+
## Features
|
|
9
|
+
|
|
10
|
+
- 📄 **Document Parsing**: Extract text from PDFs, Office documents (DOCX, XLSX)
|
|
11
|
+
- 🖼️ **OCR Support**: Extract text from images using Tesseract OCR
|
|
12
|
+
- 🚀 **High Performance**: Native Rust performance with Ruby convenience
|
|
13
|
+
- 🔧 **Unified API**: Single interface for multiple document formats
|
|
14
|
+
- 📦 **Cross-Platform**: Works on Linux, macOS, and Windows
|
|
15
|
+
- 🧪 **Well Tested**: Comprehensive test suite with RSpec
|
|
16
|
+
|
|
17
|
+
## Installation
|
|
18
|
+
|
|
19
|
+
Add this line to your application's Gemfile:
|
|
20
|
+
|
|
21
|
+
```ruby
|
|
22
|
+
gem 'parsekit-bin'
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
And then execute:
|
|
26
|
+
|
|
27
|
+
$ bundle install
|
|
28
|
+
|
|
29
|
+
Or install it yourself as:
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
gem install parsekit-bin
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
### Requirements
|
|
36
|
+
|
|
37
|
+
- Ruby >= 3.0.0
|
|
38
|
+
- Rust toolchain (stable)
|
|
39
|
+
- C compiler (for linking)
|
|
40
|
+
|
|
41
|
+
That's it! ParseKit bundles all necessary libraries including Tesseract for OCR, so you don't need to install any system dependencies.
|
|
42
|
+
|
|
43
|
+
## Usage
|
|
44
|
+
|
|
45
|
+
### Basic Usage
|
|
46
|
+
|
|
47
|
+
```ruby
|
|
48
|
+
require 'parsekit'
|
|
49
|
+
|
|
50
|
+
# Parse a PDF file
|
|
51
|
+
text = ParseKit.parse_file("document.pdf")
|
|
52
|
+
puts text # Extracted text from the PDF
|
|
53
|
+
|
|
54
|
+
# Parse an Excel file
|
|
55
|
+
text = ParseKit.parse_file("spreadsheet.xlsx")
|
|
56
|
+
puts text # Extracted text from all sheets
|
|
57
|
+
|
|
58
|
+
# Parse binary data directly
|
|
59
|
+
file_data = File.binread("document.pdf")
|
|
60
|
+
text = ParseKit.parse_bytes(file_data)
|
|
61
|
+
puts text
|
|
62
|
+
|
|
63
|
+
# Parse with a Parser instance
|
|
64
|
+
parser = ParseKit::Parser.new
|
|
65
|
+
text = parser.parse_file("report.docx")
|
|
66
|
+
puts text
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### Module-Level Convenience Methods
|
|
70
|
+
|
|
71
|
+
```ruby
|
|
72
|
+
# Parse files directly
|
|
73
|
+
content = ParseKit.parse_file('document.pdf')
|
|
74
|
+
|
|
75
|
+
# Parse bytes
|
|
76
|
+
data = File.read('document.pdf', mode: 'rb')
|
|
77
|
+
content = ParseKit.parse_bytes(data.bytes)
|
|
78
|
+
|
|
79
|
+
# Check supported formats
|
|
80
|
+
formats = ParseKit.supported_formats
|
|
81
|
+
# => ["txt", "json", "xml", "html", "docx", "xlsx", "xls", "csv", "pdf", "png", "jpg", "jpeg", "tiff", "bmp"]
|
|
82
|
+
|
|
83
|
+
# Check if a file is supported
|
|
84
|
+
ParseKit.supports_file?('document.pdf') # => true
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### Configuration Options
|
|
88
|
+
|
|
89
|
+
```ruby
|
|
90
|
+
# Create parser with options
|
|
91
|
+
parser = ParseKit::Parser.new(
|
|
92
|
+
strict_mode: true,
|
|
93
|
+
max_size: 50 * 1024 * 1024, # 50MB limit
|
|
94
|
+
encoding: 'UTF-8'
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
# Or use the strict convenience method
|
|
98
|
+
parser = ParseKit::Parser.strict
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
### Format-Specific Parsing
|
|
102
|
+
|
|
103
|
+
```ruby
|
|
104
|
+
parser = ParseKit::Parser.new
|
|
105
|
+
|
|
106
|
+
# Direct access to format-specific parsers
|
|
107
|
+
pdf_data = File.read('document.pdf', mode: 'rb').bytes
|
|
108
|
+
pdf_text = parser.parse_pdf(pdf_data)
|
|
109
|
+
|
|
110
|
+
image_data = File.read('image.png', mode: 'rb').bytes
|
|
111
|
+
ocr_text = parser.ocr_image(image_data)
|
|
112
|
+
|
|
113
|
+
excel_data = File.read('data.xlsx', mode: 'rb').bytes
|
|
114
|
+
excel_text = parser.parse_xlsx(excel_data)
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
## Supported Formats
|
|
118
|
+
|
|
119
|
+
| Format | Extensions | Method | Notes |
|
|
120
|
+
|--------|------------|--------|-------|
|
|
121
|
+
| PDF | .pdf | `parse_pdf` | Text extraction via MuPDF |
|
|
122
|
+
| Word | .docx | `parse_docx` | Office Open XML format |
|
|
123
|
+
| Excel | .xlsx, .xls | `parse_xlsx` | Both modern and legacy formats |
|
|
124
|
+
| PowerPoint | .pptx | `parse_pptx` | Text extraction from slides and notes |
|
|
125
|
+
| Images | .png, .jpg, .jpeg, .tiff, .bmp | `ocr_image` | OCR via bundled Tesseract |
|
|
126
|
+
| JSON | .json | `parse_json` | Pretty-printed output |
|
|
127
|
+
| XML/HTML | .xml, .html | `parse_xml` | Extracts text content |
|
|
128
|
+
| Text | .txt, .csv, .md | `parse_text` | With encoding detection |
|
|
129
|
+
|
|
130
|
+
## Performance
|
|
131
|
+
|
|
132
|
+
ParseKit is built with performance in mind:
|
|
133
|
+
|
|
134
|
+
- Native Rust implementation for speed
|
|
135
|
+
- Statically linked C libraries (MuPDF, Tesseract) compiled with optimizations
|
|
136
|
+
- Efficient memory usage with streaming where possible
|
|
137
|
+
- Configurable size limits to prevent memory issues
|
|
138
|
+
|
|
139
|
+
## Development
|
|
140
|
+
|
|
141
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests.
|
|
142
|
+
|
|
143
|
+
To compile the Rust extension:
|
|
144
|
+
|
|
145
|
+
```bash
|
|
146
|
+
rake compile
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
To run tests with coverage:
|
|
150
|
+
|
|
151
|
+
```bash
|
|
152
|
+
rake dev:coverage
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### OCR Mode Configuration
|
|
156
|
+
|
|
157
|
+
By default, ParseKit bundles Tesseract for zero-dependency OCR support. Advanced users who already have Tesseract installed system-wide and want faster gem installation can use system mode:
|
|
158
|
+
|
|
159
|
+
**Using system Tesseract during installation:**
|
|
160
|
+
```bash
|
|
161
|
+
gem install parsekit-bin -- --no-default-features
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
**For development with system Tesseract:**
|
|
165
|
+
```bash
|
|
166
|
+
rake compile CARGO_FEATURES="" # Disables bundled-tesseract feature
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
**System Tesseract requirements:**
|
|
170
|
+
- **macOS**: `brew install tesseract`
|
|
171
|
+
- **Ubuntu/Debian**: `sudo apt-get install libtesseract-dev`
|
|
172
|
+
- **Fedora/RHEL**: `sudo dnf install tesseract-devel`
|
|
173
|
+
|
|
174
|
+
The bundled mode adds ~1-3 minutes to initial gem installation but provides a completely self-contained experience with no external dependencies.
|
|
175
|
+
|
|
176
|
+
## Architecture
|
|
177
|
+
|
|
178
|
+
ParseKit uses a hybrid Ruby/Rust architecture:
|
|
179
|
+
|
|
180
|
+
- **Ruby Layer**: Provides convenient API and format detection
|
|
181
|
+
- **Rust Layer**: Implements high-performance parsing using:
|
|
182
|
+
- MuPDF for PDF text extraction (statically linked)
|
|
183
|
+
- tesseract-rs for OCR (with bundled Tesseract by default)
|
|
184
|
+
- Pure Rust libraries for DOCX/XLSX parsing
|
|
185
|
+
- Magnus for Ruby-Rust FFI bindings
|
|
186
|
+
|
|
187
|
+
## Contributing
|
|
188
|
+
|
|
189
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/scientist-labs/parsekit.
|
|
190
|
+
|
|
191
|
+
## License
|
|
192
|
+
|
|
193
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
|
194
|
+
|
|
195
|
+
Note: This gem includes statically linked versions of MuPDF (AGPL/Commercial) and Tesseract (Apache 2.0). Please review their respective licenses for compliance with your use case.
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
[package]
|
|
2
|
+
name = "parsekit"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
edition = "2021"
|
|
5
|
+
authors = ["Your Name <your.email@example.com>"]
|
|
6
|
+
license = "MIT"
|
|
7
|
+
publish = false
|
|
8
|
+
|
|
9
|
+
[lib]
|
|
10
|
+
crate-type = ["cdylib"]
|
|
11
|
+
name = "parsekit"
|
|
12
|
+
|
|
13
|
+
[dependencies]
|
|
14
|
+
magnus = { version = "0.8", features = ["rb-sys"] }
|
|
15
|
+
# Document parsing - testing embedded C libraries
|
|
16
|
+
# MuPDF builds from source and statically links
|
|
17
|
+
mupdf = { version = "0.5", default-features = false, features = [] }
|
|
18
|
+
# OCR - Using tesseract-rs for both system and bundled modes
|
|
19
|
+
tesseract-rs = "0.1" # Tesseract with optional bundling
|
|
20
|
+
image = "0.25" # Image processing library (match rusty-tesseract's version)
|
|
21
|
+
calamine = "0.31" # Excel parsing
|
|
22
|
+
docx-rs = "0.4" # Word document parsing
|
|
23
|
+
quick-xml = "0.38" # XML parsing
|
|
24
|
+
zip = "5.0" # ZIP archive handling for PPTX
|
|
25
|
+
serde_json = "1.0" # JSON parsing
|
|
26
|
+
regex = "1.10" # Text parsing
|
|
27
|
+
encoding_rs = "0.8" # Encoding detection
|
|
28
|
+
|
|
29
|
+
[features]
|
|
30
|
+
default = ["bundled-tesseract"]
|
|
31
|
+
bundled-tesseract = []
|
|
32
|
+
|
|
33
|
+
[profile.release]
|
|
34
|
+
opt-level = 3
|
|
35
|
+
lto = true
|
|
36
|
+
codegen-units = 1
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
use magnus::{Error, RModule, Ruby, Module};
|
|
2
|
+
|
|
3
|
+
/// Custom error types for ParseKit
|
|
4
|
+
#[derive(Debug)]
|
|
5
|
+
#[allow(dead_code)]
|
|
6
|
+
pub enum ParserError {
|
|
7
|
+
ParseError(String),
|
|
8
|
+
ConfigError(String),
|
|
9
|
+
IoError(String),
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
impl ParserError {
|
|
13
|
+
/// Convert to Magnus Error
|
|
14
|
+
#[allow(dead_code)]
|
|
15
|
+
pub fn to_error(&self) -> Error {
|
|
16
|
+
match self {
|
|
17
|
+
ParserError::ParseError(msg) => {
|
|
18
|
+
Error::new(Ruby::get().unwrap().exception_runtime_error(), msg.clone())
|
|
19
|
+
}
|
|
20
|
+
ParserError::ConfigError(msg) => {
|
|
21
|
+
Error::new(Ruby::get().unwrap().exception_arg_error(), msg.clone())
|
|
22
|
+
}
|
|
23
|
+
ParserError::IoError(msg) => {
|
|
24
|
+
Error::new(Ruby::get().unwrap().exception_io_error(), msg.clone())
|
|
25
|
+
}
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
/// Initialize error classes
|
|
31
|
+
/// For simplicity, we'll just create Ruby classes that inherit from Object,
|
|
32
|
+
/// and document that they should be treated as exceptions
|
|
33
|
+
pub fn init(_ruby: &Ruby, module: RModule) -> Result<(), Error> {
|
|
34
|
+
// For now, just create placeholder classes
|
|
35
|
+
// In a real implementation, you'd want to properly set up exception classes
|
|
36
|
+
// but Magnus 0.7's API for this is complex
|
|
37
|
+
|
|
38
|
+
// Define error classes as regular Ruby classes
|
|
39
|
+
// Users can still rescue them by name in Ruby code
|
|
40
|
+
let _error = module.define_class("Error", Ruby::get().unwrap().class_object())?;
|
|
41
|
+
let _parse_error = module.define_class("ParseError", Ruby::get().unwrap().class_object())?;
|
|
42
|
+
let _config_error = module.define_class("ConfigError", Ruby::get().unwrap().class_object())?;
|
|
43
|
+
|
|
44
|
+
Ok(())
|
|
45
|
+
}
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
use std::path::Path;
|
|
2
|
+
|
|
3
|
+
/// Represents a detected file format
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum FileFormat {
    Pdf,
    Docx,
    Xlsx,
    Xls,
    Pptx,
    Png,
    Jpeg,
    Tiff,
    Bmp,
    Json,
    Xml,
    Html,
    Text,
    Unknown,
}

impl FileFormat {
    /// Name of the format as exposed to the Ruby layer.
    ///
    /// `Html` deliberately maps to `"xml"`: HTML is handled as XML on the
    /// Ruby side.
    pub fn to_symbol(&self) -> &'static str {
        match self {
            FileFormat::Pdf => "pdf",
            FileFormat::Docx => "docx",
            FileFormat::Xlsx => "xlsx",
            FileFormat::Xls => "xls",
            FileFormat::Pptx => "pptx",
            FileFormat::Png => "png",
            FileFormat::Jpeg => "jpeg",
            FileFormat::Tiff => "tiff",
            FileFormat::Bmp => "bmp",
            FileFormat::Json => "json",
            FileFormat::Xml => "xml",
            FileFormat::Html => "xml", // HTML is treated as XML in Ruby
            FileFormat::Text => "text",
            FileFormat::Unknown => "unknown",
        }
    }
}

/// Central format detection logic
pub struct FormatDetector;

impl FormatDetector {
    /// Number of leading bytes inspected for markup signatures; exactly the
    /// length of `<!doctype html`.
    const MARKUP_SNIFF_LEN: usize = 14;

    /// Number of leading bytes of a ZIP container scanned for Office
    /// directory names.
    const OFFICE_SNIFF_LEN: usize = 2000;

    /// Detect a format from an optional filename and optional content.
    ///
    /// Precedence:
    /// 1. Without content, the extension decides (`Unknown` if unrecognised).
    /// 2. A ZIP container in which no Office marker was found is only a
    ///    guess, so a `.docx` / `.xlsx` / `.pptx` extension wins over it.
    /// 3. Any other non-text content verdict wins over the extension.
    /// 4. A `Text` content verdict is a fallback: a recognised extension
    ///    overrides it, otherwise `Text` is returned.
    pub fn detect(filename: Option<&str>, content: Option<&[u8]>) -> FileFormat {
        let by_extension = filename
            .map(Self::detect_from_extension)
            .unwrap_or(FileFormat::Unknown);

        let data = match content {
            Some(data) => data,
            None => return by_extension,
        };

        // Office files are ZIP archives; when the sniffed prefix carries no
        // Office directory name the content verdict would be a blind default,
        // so trust an Office extension instead of overriding it.
        let names_office_format = matches!(
            by_extension,
            FileFormat::Docx | FileFormat::Xlsx | FileFormat::Pptx
        );
        if names_office_format && data.starts_with(b"PK") && Self::office_marker(data).is_none() {
            return by_extension;
        }

        // Sniff once and reuse the verdict for both the "definitive" and the
        // "fallback" decision.
        let sniffed = Self::detect_from_content(data);
        let inconclusive = matches!(sniffed, FileFormat::Text | FileFormat::Unknown);

        if inconclusive && by_extension != FileFormat::Unknown {
            by_extension
        } else {
            sniffed
        }
    }

    /// Detect format from the file extension (case-insensitive).
    ///
    /// Returns `FileFormat::Unknown` when there is no extension, when it is
    /// not valid UTF-8, or when it is not a recognised one.
    pub fn detect_from_extension(filename: &str) -> FileFormat {
        let ext = match Path::new(filename).extension().and_then(|s| s.to_str()) {
            Some(e) => e.to_lowercase(),
            None => return FileFormat::Unknown,
        };

        match ext.as_str() {
            "pdf" => FileFormat::Pdf,
            "docx" => FileFormat::Docx,
            "xlsx" => FileFormat::Xlsx,
            "xls" => FileFormat::Xls,
            "pptx" => FileFormat::Pptx,
            "png" => FileFormat::Png,
            "jpg" | "jpeg" => FileFormat::Jpeg,
            "tiff" | "tif" => FileFormat::Tiff,
            "bmp" => FileFormat::Bmp,
            "json" => FileFormat::Json,
            "xml" => FileFormat::Xml,
            "html" | "htm" => FileFormat::Html,
            "txt" | "text" | "md" | "markdown" | "csv" => FileFormat::Text,
            _ => FileFormat::Unknown,
        }
    }

    /// Detect format from file content (magic bytes and leading markup).
    ///
    /// Never returns `FileFormat::Unknown`: empty or unrecognised input is
    /// reported as `FileFormat::Text`.
    pub fn detect_from_content(data: &[u8]) -> FileFormat {
        if data.is_empty() {
            return FileFormat::Text; // Empty files are treated as text
        }

        // Binary signatures. `starts_with` already fails on inputs shorter
        // than the signature, so no separate length checks are needed.
        if data.starts_with(b"%PDF") {
            return FileFormat::Pdf;
        }
        if data.starts_with(&[0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A]) {
            return FileFormat::Png;
        }
        if data.starts_with(&[0xFF, 0xD8, 0xFF]) {
            return FileFormat::Jpeg;
        }
        if data.starts_with(b"BM") {
            return FileFormat::Bmp;
        }
        // TIFF (little-endian or big-endian)
        if data.starts_with(b"II\x2A\x00") || data.starts_with(b"MM\x00\x2A") {
            return FileFormat::Tiff;
        }
        // OLE Compound Document (old Excel/Word)
        if data.starts_with(&[0xD0, 0xCF, 0x11, 0xE0]) {
            return FileFormat::Xls; // Old Office format, usually Excel
        }
        // ZIP archive (could be DOCX, XLSX, PPTX)
        if data.starts_with(b"PK") {
            return Self::detect_office_format(data);
        }

        // Markup sniffing on the first few bytes.
        let head = String::from_utf8_lossy(&data[..data.len().min(Self::MARKUP_SNIFF_LEN)]);
        let head_lower = head.to_lowercase();

        // An HTML doctype must be tested before the generic `<!` XML rule,
        // otherwise every `<!DOCTYPE html>` document is reported as XML.
        if head_lower.starts_with("<!doctype html") {
            return FileFormat::Html;
        }

        // XML declaration, or another `<!…` construct (DOCTYPE, comment).
        if data.len() >= 5 && (head.starts_with("<?xml") || head.starts_with("<!")) {
            return FileFormat::Xml;
        }

        // HTML without a leading doctype (e.g. preceded by whitespace, or a
        // bare `<html>`); short documents are sniffed as well.
        if head_lower.contains("<!doctype") || head_lower.contains("<html") {
            return FileFormat::Html;
        }

        // JSON: first non-whitespace byte opens an object or an array.
        let first_visible = data
            .iter()
            .copied()
            .find(|&b| !matches!(b, b' ' | b'\t' | b'\n' | b'\r'));
        match first_visible {
            Some(b'{') | Some(b'[') => FileFormat::Json,
            // Default to text for unrecognized formats
            _ => FileFormat::Text,
        }
    }

    /// Look for an Office directory name (`word/`, `xl/`, `ppt/`) in the
    /// leading bytes of a ZIP container.
    ///
    /// Returns `None` when no marker is present in the scanned prefix.
    fn office_marker(data: &[u8]) -> Option<FileFormat> {
        let prefix = &data[..data.len().min(Self::OFFICE_SNIFF_LEN)];
        let text = String::from_utf8_lossy(prefix);

        if text.contains("word/") {
            Some(FileFormat::Docx)
        } else if text.contains("xl/") {
            Some(FileFormat::Xlsx)
        } else if text.contains("ppt/") {
            Some(FileFormat::Pptx)
        } else {
            None
        }
    }

    /// Detect specific Office format from ZIP data.
    ///
    /// Falls back to XLSX for a ZIP without any Office marker.
    fn detect_office_format(data: &[u8]) -> FileFormat {
        Self::office_marker(data).unwrap_or(FileFormat::Xlsx)
    }

    /// Get all supported extensions
    pub fn supported_extensions() -> Vec<&'static str> {
        vec![
            "pdf", "docx", "xlsx", "xls", "pptx",
            "png", "jpg", "jpeg", "tiff", "tif", "bmp",
            "json", "xml", "html", "htm",
            "txt", "text", "md", "markdown", "csv",
        ]
    }
}
|
|
205
|
+
|
|
206
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// A `%PDF` header is recognised from content alone.
    #[test]
    fn test_detect_pdf() {
        let detected = FormatDetector::detect_from_content(b"%PDF-1.5\n");
        assert_eq!(detected, FileFormat::Pdf);
    }

    /// The full eight-byte PNG signature is recognised from content alone.
    #[test]
    fn test_detect_png() {
        let signature: [u8; 8] = [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A];
        let detected = FormatDetector::detect_from_content(&signature);
        assert_eq!(detected, FileFormat::Png);
    }

    /// Extension lookup ignores ASCII case.
    #[test]
    fn test_detect_from_extension() {
        let cases = [
            ("document.pdf", FileFormat::Pdf),
            ("Document.PDF", FileFormat::Pdf),
            ("data.xlsx", FileFormat::Xlsx),
        ];

        for (filename, expected) in cases {
            assert_eq!(FormatDetector::detect_from_extension(filename), expected);
        }
    }

    /// Empty input is reported as text rather than unknown.
    #[test]
    fn test_empty_data() {
        let nothing: &[u8] = &[];
        assert_eq!(FormatDetector::detect_from_content(nothing), FileFormat::Text);
    }
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
use magnus::{function, prelude::*, Error, Ruby};
|
|
2
|
+
|
|
3
|
+
mod parser;
|
|
4
|
+
mod error;
|
|
5
|
+
mod format_detector;
|
|
6
|
+
|
|
7
|
+
/// Initialize the ParseKit module and its submodules
|
|
8
|
+
#[magnus::init]
|
|
9
|
+
fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
10
|
+
let module = ruby.define_module("ParseKit")?;
|
|
11
|
+
|
|
12
|
+
// Initialize submodules
|
|
13
|
+
parser::init(ruby, module)?;
|
|
14
|
+
error::init(ruby, module)?;
|
|
15
|
+
|
|
16
|
+
// Add module-level methods
|
|
17
|
+
module.define_singleton_method("version", function!(version, 0))?;
|
|
18
|
+
|
|
19
|
+
Ok(())
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
/// Return the version of the parsekit gem
|
|
23
|
+
fn version() -> String {
|
|
24
|
+
env!("CARGO_PKG_VERSION").to_string()
|
|
25
|
+
}
|