parsekit 0.1.2 → 0.2.0

This diff shows the changes between two publicly released versions of this package, as they appear in the public registry to which they were published. It is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6ad6eb42fb7e96fa944f30245b2c7be51bf4ce1a0f7766749309676b225b17df
4
- data.tar.gz: deb56ea394ac3fedc840e890e4d27de14585661233f19eeaae06baf7be1b1e90
3
+ metadata.gz: b32f09ec6af6545f7db84b9c6c6f10a27998d95b2305ec5f5a5bef4a80a2a717
4
+ data.tar.gz: 7b36ef18a14bd708ae885c5b101f822cf6cb088c1ba729b0398bd4d5522ab0fb
5
5
  SHA512:
6
- metadata.gz: dc88b902dd12008a6936f4d62f5d4651544a3f463b725a15d385b919141e93873bd809436e6b9b008baa7b310d149becb2106a29ca103736f6525e09bef871d6
7
- data.tar.gz: 9cbc5464a5cbe06a241d2253cde81da82c7eb75742654b7753c91a922acc87125f81a33c3e77d0d107a1435e8946a860e12388e44fa84dc887d9bb4bf9d2d3a2
6
+ metadata.gz: 2f5b479a90c550ea25c4a0a6f19afbb10aee5a51b14f7a27868857249605e6b72e045288eb2019805c8281d499d31b8d0597c96bea3c1cee87753430541116a1
7
+ data.tar.gz: a1ab174853194a4806e1c88606912005bc074b072893681c17e3e271339baadd401e7849ebc026305809507c7ca0b6d30584941c4a94f5619d339000df3e06dd
@@ -14,14 +14,14 @@ name = "parsekit"
14
14
  magnus = { version = "0.8", features = ["rb-sys"] }
15
15
  # Document parsing - testing embedded C libraries
16
16
  # MuPDF builds from source and statically links
17
- mupdf = { version = "0.5", default-features = false, features = [] }
17
+ mupdf = { version = "0.7", default-features = false, features = [] }
18
18
  # OCR - Using tesseract-rs for both system and bundled modes
19
- tesseract-rs = "0.1" # Tesseract with optional bundling
19
+ tesseract-rs = "0.2" # Tesseract with optional bundling
20
20
  image = "0.25" # Image processing library (match rusty-tesseract's version)
21
- calamine = "0.30" # Excel parsing
21
+ calamine = "0.35" # Excel parsing
22
22
  docx-rs = "0.4" # Word document parsing
23
- quick-xml = "0.38" # XML parsing
24
- zip = "5.0" # ZIP archive handling for PPTX
23
+ quick-xml = "0.40" # XML parsing
24
+ zip = "8.2" # ZIP archive handling for PPTX
25
25
  serde_json = "1.0" # JSON parsing
26
26
  regex = "1.10" # Text parsing
27
27
  encoding_rs = "0.8" # Encoding detection
@@ -242,7 +242,7 @@ impl Parser {
242
242
  // Continue on page errors rather than failing entirely
243
243
  if let Ok(page) = doc.load_page(page_num) {
244
244
  // Extract text from the page
245
- if let Ok(text) = page.to_text() {
245
+ if let Ok(text) = page.to_text_page(mupdf::TextPageFlags::empty()).and_then(|tp| tp.to_text()) {
246
246
  all_text.push_str(&text);
247
247
  all_text.push('\n');
248
248
  }
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module ParseKit
4
- VERSION = "0.1.2"
4
+ VERSION = "0.2.0"
5
5
  end
data/lib/parsekit.rb CHANGED
@@ -2,9 +2,15 @@
2
2
 
3
3
  require_relative "parsekit/version"
4
4
 
5
- # Load the native extension
5
+ # Load the compiled Rust extension. Precompiled (platform) gems install it into a
6
+ # Ruby-ABI-versioned subdir (lib/parsekit/<major.minor>/parsekit.{so,bundle}) so a
7
+ # single fat gem can carry a binary per Ruby version; source/dev builds place it flat
8
+ # at lib/parsekit/parsekit.{so,bundle}. Try the versioned path first, fall back to the
9
+ # flat one. Resolution goes through $LOAD_PATH (`require`, never `require_relative`)
10
+ # because RubyGems installs native extensions outside the gem's lib/ dir.
6
11
  begin
7
- require_relative "parsekit/parsekit"
12
+ RUBY_VERSION =~ /(\d+\.\d+)/
13
+ require "parsekit/#{Regexp.last_match(1)}/parsekit"
8
14
  rescue LoadError
9
15
  require "parsekit/parsekit"
10
16
  end
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parsekit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Petersen
8
- autorequire:
9
8
  bindir: exe
10
9
  cert_chain: []
11
- date: 2025-09-06 00:00:00.000000000 Z
10
+ date: 1980-01-02 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: rb_sys
@@ -100,9 +99,7 @@ files:
100
99
  - ext/parsekit/src/lib.rs
101
100
  - ext/parsekit/src/parser.rs
102
101
  - lib/parsekit.rb
103
- - lib/parsekit/NATIVE_API.md
104
102
  - lib/parsekit/error.rb
105
- - lib/parsekit/parsekit.bundle
106
103
  - lib/parsekit/parser.rb
107
104
  - lib/parsekit/version.rb
108
105
  homepage: https://github.com/scientist-labs/parsekit
@@ -112,7 +109,6 @@ metadata:
112
109
  homepage_uri: https://github.com/scientist-labs/parsekit
113
110
  source_code_uri: https://github.com/scientist-labs/parsekit
114
111
  changelog_uri: https://github.com/scientist-labs/parsekit/blob/main/CHANGELOG.md
115
- post_install_message:
116
112
  rdoc_options: []
117
113
  require_paths:
118
114
  - lib
@@ -127,8 +123,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
127
123
  - !ruby/object:Gem::Version
128
124
  version: '0'
129
125
  requirements: []
130
- rubygems_version: 3.5.3
131
- signing_key:
126
+ rubygems_version: 3.6.9
132
127
  specification_version: 4
133
128
  summary: Ruby document parsing toolkit with PDF and OCR support
134
129
  test_files: []
@@ -1,125 +0,0 @@
1
- # ParseKit Native API Documentation
2
-
3
- This document describes the methods implemented in the Rust native extension for ParseKit::Parser.
4
-
5
- ## Instance Methods
6
-
7
- ### `initialize(options = {})`
8
- Initialize a new Parser instance with optional configuration.
9
-
10
- **Parameters:**
11
- - `options` [Hash] Configuration options
12
- - `:encoding` [String] Input encoding (default: UTF-8)
13
- - `:strict_mode` [Boolean] Enable strict parsing mode (default: false)
14
- - `:max_depth` [Integer] Maximum nesting depth (default: 100)
15
- - `:max_size` [Integer] Maximum file size in bytes (default: 100MB)
16
-
17
- ### `parse(input)`
18
- Parse an input string (for text content).
19
-
20
- **Parameters:**
21
- - `input` [String] The input to parse
22
-
23
- **Returns:**
24
- - [String] The parsed result
25
-
26
- **Raises:**
27
- - `ArgumentError` If input is empty
28
-
29
- ### `parse_file(path)`
30
- Parse a file (supports PDF, Office documents, text files, images with OCR).
31
-
32
- **Parameters:**
33
- - `path` [String] Path to the file to parse
34
-
35
- **Returns:**
36
- - [String] The extracted text content
37
-
38
- **Raises:**
39
- - `IOError` If file cannot be read
40
- - `RuntimeError` If parsing fails
41
-
42
- ### `parse_bytes(data)`
43
- Parse binary data.
44
-
45
- **Parameters:**
46
- - `data` [Array<Integer>] Binary data as byte array
47
-
48
- **Returns:**
49
- - [String] The extracted text content
50
-
51
- **Raises:**
52
- - `ArgumentError` If data is empty
53
- - `RuntimeError` If parsing fails
54
-
55
- ### `config`
56
- Get the current parser configuration.
57
-
58
- **Returns:**
59
- - [Hash] The parser configuration including encoding, strict_mode, max_depth, and max_size
60
-
61
- ### `supports_file?(path)`
62
- Check if a file format is supported.
63
-
64
- **Parameters:**
65
- - `path` [String] File path to check
66
-
67
- **Returns:**
68
- - [Boolean] True if the file format is supported
69
-
70
- ### `strict_mode?`
71
- Check if strict mode is enabled.
72
-
73
- **Returns:**
74
- - [Boolean] True if strict mode is enabled
75
-
76
- ## Format-Specific Parsers
77
-
78
- These methods are also available but typically called internally via `parse_file` or `parse_bytes`:
79
-
80
- ### `parse_pdf(data)`
81
- Parse PDF files using MuPDF (statically linked).
82
-
83
- ### `parse_docx(data)`
84
- Parse Microsoft Word documents.
85
-
86
- ### `parse_pptx(data)`
87
- Parse Microsoft PowerPoint presentations.
88
-
89
- ### `parse_xlsx(data)`
90
- Parse Microsoft Excel spreadsheets.
91
-
92
- ### `parse_json(data)`
93
- Parse and pretty-print JSON data.
94
-
95
- ### `parse_xml(data)`
96
- Parse XML/HTML files and extract text content.
97
-
98
- ### `parse_text(data)`
99
- Parse plain text files.
100
-
101
- ### `ocr_image(data)`
102
- Perform OCR on images (PNG, JPEG, TIFF, BMP) using Tesseract.
103
-
104
- ## Class Methods
105
-
106
- ### `Parser.supported_formats`
107
- Get list of supported file formats.
108
-
109
- **Returns:**
110
- - [Array<String>] List of supported file extensions
111
-
112
- **Example:**
113
- ```ruby
114
- ParseKit::Parser.supported_formats
115
- # => ["txt", "json", "xml", "html", "docx", "xlsx", "xls", "csv", "pdf", "png", "jpg", "jpeg", "tiff", "bmp", ...]
116
- ```
117
-
118
- ## Implementation Notes
119
-
120
- All these methods are implemented in Rust via the native extension. The Ruby layer (`lib/parsekit/parser.rb`) provides additional convenience methods and helpers that wrap these native methods.
121
-
122
- The native extension uses:
123
- - **MuPDF** for PDF parsing (statically linked)
124
- - **Tesseract** for OCR functionality (bundled)
125
- - **Various Rust crates** for Office document parsing (docx-rs, calamine, etc.)
Binary file