parsekit 0.1.0.pre.1 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +29 -17
- data/ext/parsekit/Cargo.toml +9 -7
- data/ext/parsekit/src/error.rs +7 -7
- data/ext/parsekit/src/format_detector.rs +233 -0
- data/ext/parsekit/src/lib.rs +1 -0
- data/ext/parsekit/src/parser.rs +357 -199
- data/lib/parsekit/NATIVE_API.md +125 -0
- data/lib/parsekit/parsekit.bundle +0 -0
- data/lib/parsekit/parser.rb +156 -104
- data/lib/parsekit/version.rb +1 -1
- data/lib/parsekit.rb +32 -0
- metadata +4 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e77e605d938d5b0b89c7814d1360f4c505415c54efbf8ffe9f2f7d4c564d917e
|
|
4
|
+
data.tar.gz: 6b86f57b2dce1231cae704b4d35c7562807ab77b001860b6fa5bbcdc9844781f
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: a3f7089e8bd3e84cb2e14614cb78c3b3132d4d93a3c95d5cdcfa6c63723fe2dfce3a01bf0ee27255be7ff036bd0e438492434ded72853772e57b65faf7bded9b
|
|
7
|
+
data.tar.gz: c84b03d65471f50d6ec72eaa21269b5fc1c5e40e0cefa923cc71d50b802734d64c8296e5e5e6a76ca2f5d388a568119ed09474ae622534aaef03e0a96109dee3
|
data/README.md
CHANGED
|
@@ -1,14 +1,13 @@
|
|
|
1
|
-
|
|
1
|
+
<img src="/docs/assets/parsekit-wide.png" alt="parsekit" height="80px">
|
|
2
2
|
|
|
3
|
-
[](https://github.com/cpetersen/parsekit/actions/workflows/ci.yml)
|
|
4
3
|
[](https://badge.fury.io/rb/parsekit)
|
|
5
4
|
[](https://opensource.org/licenses/MIT)
|
|
6
5
|
|
|
7
|
-
Native Ruby bindings for the [parser-core](https://crates.io/crates/parser-core) Rust crate, providing high-performance document parsing and text extraction capabilities through Magnus. This gem wraps parser-core to extract text from PDFs, Office documents (DOCX, XLSX
|
|
6
|
+
Native Ruby bindings for the [parser-core](https://crates.io/crates/parser-core) Rust crate, providing high-performance document parsing and text extraction capabilities through Magnus. This gem wraps parser-core to extract text from PDFs, Office documents (DOCX, XLSX), images (with OCR), and more. Part of the ruby-nlp ecosystem.
|
|
8
7
|
|
|
9
8
|
## Features
|
|
10
9
|
|
|
11
|
-
- 📄 **Document Parsing**: Extract text from PDFs, Office documents (DOCX, XLSX
|
|
10
|
+
- 📄 **Document Parsing**: Extract text from PDFs, Office documents (DOCX, XLSX)
|
|
12
11
|
- 🖼️ **OCR Support**: Extract text from images using Tesseract OCR
|
|
13
12
|
- 🚀 **High Performance**: Native Rust performance with Ruby convenience
|
|
14
13
|
- 🔧 **Unified API**: Single interface for multiple document formats
|
|
@@ -38,13 +37,8 @@ gem install parsekit
|
|
|
38
37
|
- Ruby >= 3.0.0
|
|
39
38
|
- Rust toolchain (stable)
|
|
40
39
|
- C compiler (for linking)
|
|
41
|
-
- System libraries for document parsing:
|
|
42
|
-
- **macOS**: `brew install leptonica tesseract poppler`
|
|
43
|
-
- **Ubuntu/Debian**: `sudo apt-get install libleptonica-dev libtesseract-dev libpoppler-cpp-dev`
|
|
44
|
-
- **Fedora/RHEL**: `sudo dnf install leptonica-devel tesseract-devel poppler-cpp-devel`
|
|
45
|
-
- **Windows**: See [DEPENDENCIES.md](DEPENDENCIES.md) for MSYS2 instructions
|
|
46
40
|
|
|
47
|
-
|
|
41
|
+
That's it! ParseKit bundles all necessary libraries including Tesseract for OCR, so you don't need to install any system dependencies.
|
|
48
42
|
|
|
49
43
|
## Usage
|
|
50
44
|
|
|
@@ -57,10 +51,6 @@ require 'parsekit'
|
|
|
57
51
|
text = ParseKit.parse_file("document.pdf")
|
|
58
52
|
puts text # Extracted text from the PDF
|
|
59
53
|
|
|
60
|
-
# Parse an Office document
|
|
61
|
-
text = ParseKit.parse_file("presentation.pptx")
|
|
62
|
-
puts text # Extracted text from all slides
|
|
63
|
-
|
|
64
54
|
# Parse an Excel file
|
|
65
55
|
text = ParseKit.parse_file("spreadsheet.xlsx")
|
|
66
56
|
puts text # Extracted text from all sheets
|
|
@@ -131,7 +121,8 @@ excel_text = parser.parse_xlsx(excel_data)
|
|
|
131
121
|
| PDF | .pdf | `parse_pdf` | Text extraction via MuPDF |
|
|
132
122
|
| Word | .docx | `parse_docx` | Office Open XML format |
|
|
133
123
|
| Excel | .xlsx, .xls | `parse_xlsx` | Both modern and legacy formats |
|
|
134
|
-
|
|
|
124
|
+
| PowerPoint | .pptx | `parse_pptx` | Text extraction from slides and notes |
|
|
125
|
+
| Images | .png, .jpg, .jpeg, .tiff, .bmp | `ocr_image` | OCR via bundled Tesseract |
|
|
135
126
|
| JSON | .json | `parse_json` | Pretty-printed output |
|
|
136
127
|
| XML/HTML | .xml, .html | `parse_xml` | Extracts text content |
|
|
137
128
|
| Text | .txt, .csv, .md | `parse_text` | With encoding detection |
|
|
@@ -161,6 +152,27 @@ To run tests with coverage:
|
|
|
161
152
|
rake dev:coverage
|
|
162
153
|
```
|
|
163
154
|
|
|
155
|
+
### OCR Mode Configuration
|
|
156
|
+
|
|
157
|
+
By default, ParseKit bundles Tesseract for zero-dependency OCR support. Advanced users who already have Tesseract installed system-wide and want faster gem installation can use system mode:
|
|
158
|
+
|
|
159
|
+
**Using system Tesseract during installation:**
|
|
160
|
+
```bash
|
|
161
|
+
gem install parsekit -- --no-default-features
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
**For development with system Tesseract:**
|
|
165
|
+
```bash
|
|
166
|
+
rake compile CARGO_FEATURES="" # Disables bundled-tesseract feature
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
**System Tesseract requirements:**
|
|
170
|
+
- **macOS**: `brew install tesseract`
|
|
171
|
+
- **Ubuntu/Debian**: `sudo apt-get install libtesseract-dev`
|
|
172
|
+
- **Fedora/RHEL**: `sudo dnf install tesseract-devel`
|
|
173
|
+
|
|
174
|
+
The bundled mode adds ~1-3 minutes to initial gem installation but provides a completely self-contained experience with no external dependencies.
|
|
175
|
+
|
|
164
176
|
## Architecture
|
|
165
177
|
|
|
166
178
|
ParseKit uses a hybrid Ruby/Rust architecture:
|
|
@@ -168,7 +180,7 @@ ParseKit uses a hybrid Ruby/Rust architecture:
|
|
|
168
180
|
- **Ruby Layer**: Provides convenient API and format detection
|
|
169
181
|
- **Rust Layer**: Implements high-performance parsing using:
|
|
170
182
|
- MuPDF for PDF text extraction (statically linked)
|
|
171
|
-
-
|
|
183
|
+
- tesseract-rs for OCR (with bundled Tesseract by default)
|
|
172
184
|
- Pure Rust libraries for DOCX/XLSX parsing
|
|
173
185
|
- Magnus for Ruby-Rust FFI bindings
|
|
174
186
|
|
|
@@ -180,4 +192,4 @@ Bug reports and pull requests are welcome on GitHub at https://github.com/cpeter
|
|
|
180
192
|
|
|
181
193
|
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
|
182
194
|
|
|
183
|
-
Note: This gem includes statically linked versions of MuPDF (AGPL/Commercial) and Tesseract (Apache 2.0). Please review their respective licenses for compliance with your use case.
|
|
195
|
+
Note: This gem includes statically linked versions of MuPDF (AGPL/Commercial) and Tesseract (Apache 2.0). Please review their respective licenses for compliance with your use case.
|
data/ext/parsekit/Cargo.toml
CHANGED
|
@@ -11,24 +11,26 @@ crate-type = ["cdylib"]
|
|
|
11
11
|
name = "parsekit"
|
|
12
12
|
|
|
13
13
|
[dependencies]
|
|
14
|
-
magnus = { version = "0.
|
|
14
|
+
magnus = { version = "0.8", features = ["rb-sys"] }
|
|
15
15
|
# Document parsing - testing embedded C libraries
|
|
16
16
|
# MuPDF builds from source and statically links
|
|
17
17
|
mupdf = { version = "0.5", default-features = false, features = [] }
|
|
18
|
-
# OCR -
|
|
19
|
-
|
|
18
|
+
# OCR - Using tesseract-rs for both system and bundled modes
|
|
19
|
+
tesseract-rs = "0.1" # Tesseract with optional bundling
|
|
20
20
|
image = "0.25" # Image processing library (match rusty-tesseract's version)
|
|
21
|
-
calamine = "0.
|
|
21
|
+
calamine = "0.30" # Excel parsing
|
|
22
22
|
docx-rs = "0.4" # Word document parsing
|
|
23
|
-
quick-xml = "0.
|
|
23
|
+
quick-xml = "0.38" # XML parsing
|
|
24
|
+
zip = "2.1" # ZIP archive handling for PPTX
|
|
24
25
|
serde_json = "1.0" # JSON parsing
|
|
25
26
|
regex = "1.10" # Text parsing
|
|
26
27
|
encoding_rs = "0.8" # Encoding detection
|
|
27
28
|
|
|
28
29
|
[features]
|
|
29
|
-
default = []
|
|
30
|
+
default = ["bundled-tesseract"]
|
|
31
|
+
bundled-tesseract = []
|
|
30
32
|
|
|
31
33
|
[profile.release]
|
|
32
34
|
opt-level = 3
|
|
33
35
|
lto = true
|
|
34
|
-
codegen-units = 1
|
|
36
|
+
codegen-units = 1
|
data/ext/parsekit/src/error.rs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
use magnus::{
|
|
1
|
+
use magnus::{Error, RModule, Ruby, Module};
|
|
2
2
|
|
|
3
3
|
/// Custom error types for ParseKit
|
|
4
4
|
#[derive(Debug)]
|
|
@@ -15,13 +15,13 @@ impl ParserError {
|
|
|
15
15
|
pub fn to_error(&self) -> Error {
|
|
16
16
|
match self {
|
|
17
17
|
ParserError::ParseError(msg) => {
|
|
18
|
-
Error::new(
|
|
18
|
+
Error::new(Ruby::get().unwrap().exception_runtime_error(), msg.clone())
|
|
19
19
|
}
|
|
20
20
|
ParserError::ConfigError(msg) => {
|
|
21
|
-
Error::new(
|
|
21
|
+
Error::new(Ruby::get().unwrap().exception_arg_error(), msg.clone())
|
|
22
22
|
}
|
|
23
23
|
ParserError::IoError(msg) => {
|
|
24
|
-
Error::new(
|
|
24
|
+
Error::new(Ruby::get().unwrap().exception_io_error(), msg.clone())
|
|
25
25
|
}
|
|
26
26
|
}
|
|
27
27
|
}
|
|
@@ -37,9 +37,9 @@ pub fn init(_ruby: &Ruby, module: RModule) -> Result<(), Error> {
|
|
|
37
37
|
|
|
38
38
|
// Define error classes as regular Ruby classes
|
|
39
39
|
// Users can still rescue them by name in Ruby code
|
|
40
|
-
let _error = module.define_class("Error",
|
|
41
|
-
let _parse_error = module.define_class("ParseError",
|
|
42
|
-
let _config_error = module.define_class("ConfigError",
|
|
40
|
+
let _error = module.define_class("Error", Ruby::get().unwrap().class_object())?;
|
|
41
|
+
let _parse_error = module.define_class("ParseError", Ruby::get().unwrap().class_object())?;
|
|
42
|
+
let _config_error = module.define_class("ConfigError", Ruby::get().unwrap().class_object())?;
|
|
43
43
|
|
|
44
44
|
Ok(())
|
|
45
45
|
}
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
use std::path::Path;
|
|
2
|
+
|
|
3
|
+
/// A file format that [`FormatDetector`] can report.
///
/// `Unknown` means neither the content nor the file name identified a format.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FileFormat {
    Pdf,
    Docx,
    Xlsx,
    Xls,
    Pptx,
    Png,
    Jpeg,
    Tiff,
    Bmp,
    Json,
    Xml,
    Html,
    Text,
    Unknown,
}

impl FileFormat {
    /// Convert to the Ruby symbol representation (returned here as a string).
    ///
    /// `Html` deliberately maps to `"xml"`: HTML is treated as XML in Ruby.
    pub fn to_symbol(&self) -> &'static str {
        match self {
            FileFormat::Pdf => "pdf",
            FileFormat::Docx => "docx",
            FileFormat::Xlsx => "xlsx",
            FileFormat::Xls => "xls",
            FileFormat::Pptx => "pptx",
            FileFormat::Png => "png",
            FileFormat::Jpeg => "jpeg",
            FileFormat::Tiff => "tiff",
            FileFormat::Bmp => "bmp",
            FileFormat::Json => "json",
            FileFormat::Xml => "xml",
            FileFormat::Html => "xml", // HTML is treated as XML in Ruby
            FileFormat::Text => "text",
            FileFormat::Unknown => "unknown",
        }
    }
}

/// Central format detection logic.
///
/// All functions are associated functions; the type carries no state.
pub struct FormatDetector;

impl FormatDetector {
    /// UTF-8 byte-order mark that may precede textual markup.
    const UTF8_BOM: &'static [u8] = &[0xEF, 0xBB, 0xBF];

    /// Number of leading ZIP bytes scanned for Office directory names.
    const OFFICE_SNIFF_LEN: usize = 2000;

    /// Detect the format from an optional file name and optional content.
    ///
    /// Resolution order:
    /// 1. If `content` sniffs as anything other than `Text`/`Unknown`, that
    ///    result wins, even over a contradicting extension.
    /// 2. Otherwise a recognised extension of `filename` is used.
    /// 3. Otherwise `Text` is returned when content was supplied and sniffed
    ///    as text, and `Unknown` in every remaining case.
    pub fn detect(filename: Option<&str>, content: Option<&[u8]>) -> FileFormat {
        // Sniff the content once and reuse the answer for steps 1 and 3.
        let sniffed = content.map(Self::detect_from_content);

        if let Some(format) = sniffed {
            if !matches!(format, FileFormat::Text | FileFormat::Unknown) {
                return format;
            }
        }

        if let Some(name) = filename {
            let ext_format = Self::detect_from_extension(name);
            if ext_format != FileFormat::Unknown {
                return ext_format;
            }
        }

        match sniffed {
            Some(FileFormat::Text) => FileFormat::Text,
            _ => FileFormat::Unknown,
        }
    }

    /// Detect the format from the extension of `filename`.
    ///
    /// The comparison is case-insensitive. Returns `Unknown` when there is no
    /// extension, when it is not valid UTF-8, or when it is not recognised.
    pub fn detect_from_extension(filename: &str) -> FileFormat {
        let ext = match Path::new(filename).extension().and_then(|s| s.to_str()) {
            Some(e) => e.to_lowercase(),
            None => return FileFormat::Unknown,
        };

        match ext.as_str() {
            "pdf" => FileFormat::Pdf,
            "docx" => FileFormat::Docx,
            "xlsx" => FileFormat::Xlsx,
            "xls" => FileFormat::Xls,
            "pptx" => FileFormat::Pptx,
            "png" => FileFormat::Png,
            "jpg" | "jpeg" => FileFormat::Jpeg,
            "tiff" | "tif" => FileFormat::Tiff,
            "bmp" => FileFormat::Bmp,
            "json" => FileFormat::Json,
            "xml" => FileFormat::Xml,
            "html" | "htm" => FileFormat::Html,
            "txt" | "text" | "md" | "markdown" | "csv" => FileFormat::Text,
            _ => FileFormat::Unknown,
        }
    }

    /// Detect the format from leading bytes of the content (magic numbers and
    /// simple textual prefixes).
    ///
    /// Empty input and anything unrecognised are reported as `Text`; this
    /// function never returns `Unknown`.
    ///
    /// The binary checks are plain prefix tests, so text that happens to start
    /// with `BM` or `PK` is reported as BMP or an Office document.
    pub fn detect_from_content(data: &[u8]) -> FileFormat {
        if data.is_empty() {
            return FileFormat::Text; // Empty files are treated as text
        }

        // `starts_with` is already false for inputs shorter than the prefix,
        // so no separate length checks are needed.

        // PDF
        if data.starts_with(b"%PDF") {
            return FileFormat::Pdf;
        }

        // PNG
        if data.starts_with(&[0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A]) {
            return FileFormat::Png;
        }

        // JPEG
        if data.starts_with(&[0xFF, 0xD8, 0xFF]) {
            return FileFormat::Jpeg;
        }

        // BMP
        if data.starts_with(b"BM") {
            return FileFormat::Bmp;
        }

        // TIFF (little-endian or big-endian)
        if data.starts_with(b"II\x2A\x00") || data.starts_with(b"MM\x00\x2A") {
            return FileFormat::Tiff;
        }

        // OLE Compound Document (old Excel/Word)
        if data.starts_with(&[0xD0, 0xCF, 0x11, 0xE0]) {
            return FileFormat::Xls; // Old Office format, usually Excel
        }

        // ZIP archive (could be DOCX, XLSX, PPTX)
        if data.starts_with(b"PK") {
            return Self::detect_office_format(data);
        }

        // Markup. A BOM and leading whitespace are skipped before matching.
        let markup = Self::skip_bom_and_whitespace(data);

        // An XML declaration is decisive (this also covers XHTML).
        if markup.starts_with(b"<?xml") {
            return FileFormat::Xml;
        }

        // HTML has to be tested before the generic "<!" XML check below:
        // "<!DOCTYPE html>" also begins with "<!" and would otherwise never
        // be classified as HTML.
        if Self::starts_with_ignore_ascii_case(markup, b"<!doctype html")
            || Self::starts_with_ignore_ascii_case(markup, b"<html")
        {
            return FileFormat::Html;
        }

        // Any other "<!..." construct (non-HTML doctype, comment) counts as XML.
        if markup.starts_with(b"<!") {
            return FileFormat::Xml;
        }

        // JSON: first byte that is not space, tab, LF or CR opens an object or array.
        let first_significant = data.iter().copied().find(|b| !b" \t\n\r".contains(b));
        if matches!(first_significant, Some(b'{') | Some(b'[')) {
            return FileFormat::Json;
        }

        // Default to text for unrecognized formats
        FileFormat::Text
    }

    /// Return `data` without a leading UTF-8 BOM and leading ASCII whitespace.
    fn skip_bom_and_whitespace(data: &[u8]) -> &[u8] {
        let data = data.strip_prefix(Self::UTF8_BOM).unwrap_or(data);
        let start = data
            .iter()
            .position(|b| !b.is_ascii_whitespace())
            .unwrap_or(data.len());
        &data[start..]
    }

    /// Whether `data` begins with `prefix`, ignoring ASCII case.
    fn starts_with_ignore_ascii_case(data: &[u8], prefix: &[u8]) -> bool {
        data.get(..prefix.len())
            .map_or(false, |head| head.eq_ignore_ascii_case(prefix))
    }

    /// Decide which Office format a ZIP archive holds.
    ///
    /// Only the first `OFFICE_SNIFF_LEN` bytes are searched for the directory
    /// names `word/`, `xl/` and `ppt/`, in that order. An archive with none of
    /// them in that window is reported as `Xlsx`.
    fn detect_office_format(data: &[u8]) -> FileFormat {
        let check_len = data.len().min(Self::OFFICE_SNIFF_LEN);
        let head = String::from_utf8_lossy(&data[..check_len]);

        // Matching the directory prefix also matches its `_rels` subdirectory.
        if head.contains("word/") {
            FileFormat::Docx
        } else if head.contains("xl/") {
            FileFormat::Xlsx
        } else if head.contains("ppt/") {
            FileFormat::Pptx
        } else {
            // Default to XLSX for generic ZIP (most common Office format)
            FileFormat::Xlsx
        }
    }

    /// Get all supported extensions (lowercase, without the leading dot).
    pub fn supported_extensions() -> Vec<&'static str> {
        vec![
            "pdf", "docx", "xlsx", "xls", "pptx",
            "png", "jpg", "jpeg", "tiff", "tif", "bmp",
            "json", "xml", "html", "htm",
            "txt", "text", "md", "markdown", "csv",
        ]
    }
}
|
|
205
|
+
|
|
206
|
+
#[cfg(test)]
mod tests {
    //! Unit tests for content sniffing, extension lookup and the combined
    //! `detect` entry point.

    use super::*;

    #[test]
    fn test_detect_pdf() {
        let pdf_data = b"%PDF-1.5\n";
        assert_eq!(FormatDetector::detect_from_content(pdf_data), FileFormat::Pdf);
    }

    #[test]
    fn test_detect_png() {
        let png_data = &[0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A];
        assert_eq!(FormatDetector::detect_from_content(png_data), FileFormat::Png);
    }

    #[test]
    fn test_detect_jpeg() {
        let jpeg_data = &[0xFF, 0xD8, 0xFF, 0xE0];
        assert_eq!(FormatDetector::detect_from_content(jpeg_data), FileFormat::Jpeg);
    }

    #[test]
    fn test_detect_office_from_zip() {
        // A ZIP signature followed by an Office directory name selects the format.
        let docx_like = b"PK\x03\x04....word/document.xml";
        assert_eq!(FormatDetector::detect_from_content(docx_like), FileFormat::Docx);

        // A ZIP with no Office directory name falls back to XLSX.
        assert_eq!(FormatDetector::detect_from_content(b"PK\x03\x04"), FileFormat::Xlsx);
    }

    #[test]
    fn test_detect_from_extension() {
        assert_eq!(FormatDetector::detect_from_extension("document.pdf"), FileFormat::Pdf);
        assert_eq!(FormatDetector::detect_from_extension("Document.PDF"), FileFormat::Pdf);
        assert_eq!(FormatDetector::detect_from_extension("data.xlsx"), FileFormat::Xlsx);
    }

    #[test]
    fn test_unknown_extension() {
        assert_eq!(FormatDetector::detect_from_extension("archive.zip"), FileFormat::Unknown);
        assert_eq!(FormatDetector::detect_from_extension("README"), FileFormat::Unknown);
    }

    #[test]
    fn test_empty_data() {
        assert_eq!(FormatDetector::detect_from_content(&[]), FileFormat::Text);
    }

    #[test]
    fn test_detect_prefers_definitive_content() {
        // Magic bytes win over a contradicting extension.
        assert_eq!(
            FormatDetector::detect(Some("report.txt"), Some(&b"%PDF-1.7"[..])),
            FileFormat::Pdf
        );
    }

    #[test]
    fn test_detect_falls_back_to_extension_then_text() {
        // Text-like content defers to a recognised extension...
        assert_eq!(
            FormatDetector::detect(Some("page.html"), Some(&b"just text"[..])),
            FileFormat::Html
        );
        // ...and is reported as text when no file name is available.
        assert_eq!(FormatDetector::detect(None, Some(&b"just text"[..])), FileFormat::Text);
        // With neither input there is nothing to go on.
        assert_eq!(FormatDetector::detect(None, None), FileFormat::Unknown);
    }

    #[test]
    fn test_html_symbol_is_xml() {
        assert_eq!(FileFormat::Html.to_symbol(), "xml");
        assert_eq!(FileFormat::Xml.to_symbol(), "xml");
    }
}
|