parsekit 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/parsekit/Cargo.toml +4 -4
- data/ext/parsekit/src/parser.rs +1 -1
- data/lib/parsekit/version.rb +1 -1
- metadata +3 -5
- data/lib/parsekit/NATIVE_API.md +0 -125
- data/lib/parsekit/parsekit.bundle +0 -0
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ee11d59d78b4a2d0d837233b464f3bc84c934659826fe879953b2c1caa56521a
|
|
4
|
+
data.tar.gz: b3829363b1821c19d51d86beb494b83a30e0f875e847a7b439a6ed27a993cb26
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: dddee73ead9421e822a25d97b8fc32ac852c6cc4b8729ae77308b304544738f53e3f6313f124eaf01e4a0bee0e28f2066d411dc00fe324ec47133a0ab9dc4445
|
|
7
|
+
data.tar.gz: 724666245f2fe62df854b034deb6db3b1db0fcd55f58a257e6929d0946fa1c88ec3e8644b59a0c3930dbc82608ff113d5a2d5a782b6e30f0e40678f1ed5edb43
|
data/ext/parsekit/Cargo.toml
CHANGED
|
@@ -14,14 +14,14 @@ name = "parsekit"
|
|
|
14
14
|
magnus = { version = "0.8", features = ["rb-sys"] }
|
|
15
15
|
# Document parsing - testing embedded C libraries
|
|
16
16
|
# MuPDF builds from source and statically links
|
|
17
|
-
mupdf = { version = "0.
|
|
17
|
+
mupdf = { version = "0.6", default-features = false, features = [] }
|
|
18
18
|
# OCR - Using tesseract-rs for both system and bundled modes
|
|
19
19
|
tesseract-rs = "0.1" # Tesseract with optional bundling
|
|
20
20
|
image = "0.25" # Image processing library (match rusty-tesseract's version)
|
|
21
|
-
calamine = "0.
|
|
21
|
+
calamine = "0.34" # Excel parsing
|
|
22
22
|
docx-rs = "0.4" # Word document parsing
|
|
23
|
-
quick-xml = "0.
|
|
24
|
-
zip = "
|
|
23
|
+
quick-xml = "0.39" # XML parsing
|
|
24
|
+
zip = "8.2" # ZIP archive handling for PPTX
|
|
25
25
|
serde_json = "1.0" # JSON parsing
|
|
26
26
|
regex = "1.10" # Text parsing
|
|
27
27
|
encoding_rs = "0.8" # Encoding detection
|
data/ext/parsekit/src/parser.rs
CHANGED
|
@@ -242,7 +242,7 @@ impl Parser {
|
|
|
242
242
|
// Continue on page errors rather than failing entirely
|
|
243
243
|
if let Ok(page) = doc.load_page(page_num) {
|
|
244
244
|
// Extract text from the page
|
|
245
|
-
if let Ok(text) = page.to_text() {
|
|
245
|
+
if let Ok(text) = page.to_text_page(mupdf::TextPageFlags::empty()).and_then(|tp| tp.to_text()) {
|
|
246
246
|
all_text.push_str(&text);
|
|
247
247
|
all_text.push('\n');
|
|
248
248
|
}
|
data/lib/parsekit/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: parsekit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.2
|
|
4
|
+
version: 0.1.3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Chris Petersen
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2026-03-24 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|
|
@@ -100,9 +100,7 @@ files:
|
|
|
100
100
|
- ext/parsekit/src/lib.rs
|
|
101
101
|
- ext/parsekit/src/parser.rs
|
|
102
102
|
- lib/parsekit.rb
|
|
103
|
-
- lib/parsekit/NATIVE_API.md
|
|
104
103
|
- lib/parsekit/error.rb
|
|
105
|
-
- lib/parsekit/parsekit.bundle
|
|
106
104
|
- lib/parsekit/parser.rb
|
|
107
105
|
- lib/parsekit/version.rb
|
|
108
106
|
homepage: https://github.com/scientist-labs/parsekit
|
|
@@ -127,7 +125,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
127
125
|
- !ruby/object:Gem::Version
|
|
128
126
|
version: '0'
|
|
129
127
|
requirements: []
|
|
130
|
-
rubygems_version: 3.5.
|
|
128
|
+
rubygems_version: 3.5.22
|
|
131
129
|
signing_key:
|
|
132
130
|
specification_version: 4
|
|
133
131
|
summary: Ruby document parsing toolkit with PDF and OCR support
|
data/lib/parsekit/NATIVE_API.md
DELETED
|
@@ -1,125 +0,0 @@
|
|
|
1
|
-
# ParseKit Native API Documentation
|
|
2
|
-
|
|
3
|
-
This document describes the methods implemented in the Rust native extension for ParseKit::Parser.
|
|
4
|
-
|
|
5
|
-
## Instance Methods
|
|
6
|
-
|
|
7
|
-
### `initialize(options = {})`
|
|
8
|
-
Initialize a new Parser instance with optional configuration.
|
|
9
|
-
|
|
10
|
-
**Parameters:**
|
|
11
|
-
- `options` [Hash] Configuration options
|
|
12
|
-
- `:encoding` [String] Input encoding (default: UTF-8)
|
|
13
|
-
- `:strict_mode` [Boolean] Enable strict parsing mode (default: false)
|
|
14
|
-
- `:max_depth` [Integer] Maximum nesting depth (default: 100)
|
|
15
|
-
- `:max_size` [Integer] Maximum file size in bytes (default: 100MB)
|
|
16
|
-
|
|
17
|
-
### `parse(input)`
|
|
18
|
-
Parse an input string (for text content).
|
|
19
|
-
|
|
20
|
-
**Parameters:**
|
|
21
|
-
- `input` [String] The input to parse
|
|
22
|
-
|
|
23
|
-
**Returns:**
|
|
24
|
-
- [String] The parsed result
|
|
25
|
-
|
|
26
|
-
**Raises:**
|
|
27
|
-
- `ArgumentError` If input is empty
|
|
28
|
-
|
|
29
|
-
### `parse_file(path)`
|
|
30
|
-
Parse a file (supports PDF, Office documents, text files, images with OCR).
|
|
31
|
-
|
|
32
|
-
**Parameters:**
|
|
33
|
-
- `path` [String] Path to the file to parse
|
|
34
|
-
|
|
35
|
-
**Returns:**
|
|
36
|
-
- [String] The extracted text content
|
|
37
|
-
|
|
38
|
-
**Raises:**
|
|
39
|
-
- `IOError` If file cannot be read
|
|
40
|
-
- `RuntimeError` If parsing fails
|
|
41
|
-
|
|
42
|
-
### `parse_bytes(data)`
|
|
43
|
-
Parse binary data.
|
|
44
|
-
|
|
45
|
-
**Parameters:**
|
|
46
|
-
- `data` [Array<Integer>] Binary data as byte array
|
|
47
|
-
|
|
48
|
-
**Returns:**
|
|
49
|
-
- [String] The extracted text content
|
|
50
|
-
|
|
51
|
-
**Raises:**
|
|
52
|
-
- `ArgumentError` If data is empty
|
|
53
|
-
- `RuntimeError` If parsing fails
|
|
54
|
-
|
|
55
|
-
### `config`
|
|
56
|
-
Get the current parser configuration.
|
|
57
|
-
|
|
58
|
-
**Returns:**
|
|
59
|
-
- [Hash] The parser configuration including encoding, strict_mode, max_depth, and max_size
|
|
60
|
-
|
|
61
|
-
### `supports_file?(path)`
|
|
62
|
-
Check if a file format is supported.
|
|
63
|
-
|
|
64
|
-
**Parameters:**
|
|
65
|
-
- `path` [String] File path to check
|
|
66
|
-
|
|
67
|
-
**Returns:**
|
|
68
|
-
- [Boolean] True if the file format is supported
|
|
69
|
-
|
|
70
|
-
### `strict_mode?`
|
|
71
|
-
Check if strict mode is enabled.
|
|
72
|
-
|
|
73
|
-
**Returns:**
|
|
74
|
-
- [Boolean] True if strict mode is enabled
|
|
75
|
-
|
|
76
|
-
## Format-Specific Parsers
|
|
77
|
-
|
|
78
|
-
These methods are also available but typically called internally via `parse_file` or `parse_bytes`:
|
|
79
|
-
|
|
80
|
-
### `parse_pdf(data)`
|
|
81
|
-
Parse PDF files using MuPDF (statically linked).
|
|
82
|
-
|
|
83
|
-
### `parse_docx(data)`
|
|
84
|
-
Parse Microsoft Word documents.
|
|
85
|
-
|
|
86
|
-
### `parse_pptx(data)`
|
|
87
|
-
Parse Microsoft PowerPoint presentations.
|
|
88
|
-
|
|
89
|
-
### `parse_xlsx(data)`
|
|
90
|
-
Parse Microsoft Excel spreadsheets.
|
|
91
|
-
|
|
92
|
-
### `parse_json(data)`
|
|
93
|
-
Parse and pretty-print JSON data.
|
|
94
|
-
|
|
95
|
-
### `parse_xml(data)`
|
|
96
|
-
Parse XML/HTML files and extract text content.
|
|
97
|
-
|
|
98
|
-
### `parse_text(data)`
|
|
99
|
-
Parse plain text files.
|
|
100
|
-
|
|
101
|
-
### `ocr_image(data)`
|
|
102
|
-
Perform OCR on images (PNG, JPEG, TIFF, BMP) using Tesseract.
|
|
103
|
-
|
|
104
|
-
## Class Methods
|
|
105
|
-
|
|
106
|
-
### `Parser.supported_formats`
|
|
107
|
-
Get list of supported file formats.
|
|
108
|
-
|
|
109
|
-
**Returns:**
|
|
110
|
-
- [Array<String>] List of supported file extensions
|
|
111
|
-
|
|
112
|
-
**Example:**
|
|
113
|
-
```ruby
|
|
114
|
-
ParseKit::Parser.supported_formats
|
|
115
|
-
# => ["txt", "json", "xml", "html", "docx", "xlsx", "xls", "csv", "pdf", "png", "jpg", "jpeg", "tiff", "bmp", ...]
|
|
116
|
-
```
|
|
117
|
-
|
|
118
|
-
## Implementation Notes
|
|
119
|
-
|
|
120
|
-
All these methods are implemented in Rust via the native extension. The Ruby layer (`lib/parsekit/parser.rb`) provides additional convenience methods and helpers that wrap these native methods.
|
|
121
|
-
|
|
122
|
-
The native extension uses:
|
|
123
|
-
- **MuPDF** for PDF parsing (statically linked)
|
|
124
|
-
- **Tesseract** for OCR functionality (bundled)
|
|
125
|
-
- **Various Rust crates** for Office document parsing (docx-rs, calamine, etc.)
|
|
Binary file
|