liteparse-rb 0.1.13 → 0.1.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 53e2fa220141957960e80e01cadf273e8aa3e1f757f8acc8712bdc158264efcc
4
- data.tar.gz: f24904a27aa45533c63a7e1d216a68229abcfa212d6df5b181af7dab97eb847e
3
+ metadata.gz: 840e257ec0149b7d40c9385ba934409200d047be40cd80be06dd4bef49b93bd6
4
+ data.tar.gz: 2e51b19e50155b48a126a746eeb233b633ca97cbaa80eec087e107be3914d27c
5
5
  SHA512:
6
- metadata.gz: 74b57d39d1644196a2b3e5734908b148593a3c08b583dfaa362f00ba00570b70650b7f7710b9bb82b8f97b643941798a75b4010ce48739e8a08c046ff17cb9ff
7
- data.tar.gz: 1cc2d8881244d759239d94bce1aa0260254d0b65ef1559a97bfdbd14cb84bad0b764f02226c14dc2e78c488bd6dd63388dd7ec0124c75a028fc2af441060d02d
6
+ metadata.gz: ef80a7427299914b237c053fbb365564ae56d76e847d6dbd5d23b3ec61af37562f073f99f54582e89d743647505b19c996b146ce4ef08e57ded9303210be9e6b
7
+ data.tar.gz: 483f25fb7098464be3594b5e762790a912147dd468f23cb0477d1c04e365c816784f6dde6de9aab3ae4bd3cc8cad5bab3e0644950126c387fe11bec6082e8fb8
data/Cargo.lock CHANGED
@@ -16,7 +16,7 @@ checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0"
16
16
  dependencies = [
17
17
  "cfg-if",
18
18
  "cipher",
19
- "cpufeatures",
19
+ "cpufeatures 0.2.17",
20
20
  ]
21
21
 
22
22
  [[package]]
@@ -80,9 +80,9 @@ dependencies = [
80
80
 
81
81
  [[package]]
82
82
  name = "anyhow"
83
- version = "1.0.102"
83
+ version = "1.0.103"
84
84
  source = "registry+https://github.com/rust-lang/crates.io-index"
85
- checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
85
+ checksum = "2a4385e2e34eb35d6b3efe798b9eb88096925d87726c0798709bf56d9ed84af3"
86
86
 
87
87
  [[package]]
88
88
  name = "arbitrary"
@@ -93,6 +93,18 @@ dependencies = [
93
93
  "derive_arbitrary",
94
94
  ]
95
95
 
96
+ [[package]]
97
+ name = "arrayref"
98
+ version = "0.3.9"
99
+ source = "registry+https://github.com/rust-lang/crates.io-index"
100
+ checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb"
101
+
102
+ [[package]]
103
+ name = "arrayvec"
104
+ version = "0.7.7"
105
+ source = "registry+https://github.com/rust-lang/crates.io-index"
106
+ checksum = "f02882884d3e1bc524fb12c79f107f6ad0e1cfd498c536ffb494301740995dfe"
107
+
96
108
  [[package]]
97
109
  name = "atomic-waker"
98
110
  version = "1.1.2"
@@ -157,6 +169,20 @@ version = "2.13.0"
157
169
  source = "registry+https://github.com/rust-lang/crates.io-index"
158
170
  checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8"
159
171
 
172
+ [[package]]
173
+ name = "blake3"
174
+ version = "1.8.5"
175
+ source = "registry+https://github.com/rust-lang/crates.io-index"
176
+ checksum = "0aa83c34e62843d924f905e0f5c866eb1dd6545fc4d719e803d9ba6030371fce"
177
+ dependencies = [
178
+ "arrayref",
179
+ "arrayvec",
180
+ "cc",
181
+ "cfg-if",
182
+ "constant_time_eq 0.4.2",
183
+ "cpufeatures 0.3.0",
184
+ ]
185
+
160
186
  [[package]]
161
187
  name = "block-buffer"
162
188
  version = "0.10.4"
@@ -351,6 +377,12 @@ version = "0.3.1"
351
377
  source = "registry+https://github.com/rust-lang/crates.io-index"
352
378
  checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6"
353
379
 
380
+ [[package]]
381
+ name = "constant_time_eq"
382
+ version = "0.4.2"
383
+ source = "registry+https://github.com/rust-lang/crates.io-index"
384
+ checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b"
385
+
354
386
  [[package]]
355
387
  name = "core-foundation"
356
388
  version = "0.9.4"
@@ -386,6 +418,15 @@ dependencies = [
386
418
  "libc",
387
419
  ]
388
420
 
421
+ [[package]]
422
+ name = "cpufeatures"
423
+ version = "0.3.0"
424
+ source = "registry+https://github.com/rust-lang/crates.io-index"
425
+ checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201"
426
+ dependencies = [
427
+ "libc",
428
+ ]
429
+
389
430
  [[package]]
390
431
  name = "crc"
391
432
  version = "3.4.0"
@@ -1121,9 +1162,9 @@ checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0"
1121
1162
 
1122
1163
  [[package]]
1123
1164
  name = "liteparse"
1124
- version = "0.1.13"
1165
+ version = "0.1.14"
1125
1166
  dependencies = [
1126
- "liteparse 2.2.0",
1167
+ "liteparse 2.2.1",
1127
1168
  "liteparse-ruby",
1128
1169
  "magnus",
1129
1170
  "rb-sys",
@@ -1131,10 +1172,11 @@ dependencies = [
1131
1172
 
1132
1173
  [[package]]
1133
1174
  name = "liteparse"
1134
- version = "2.2.0"
1175
+ version = "2.2.1"
1135
1176
  source = "registry+https://github.com/rust-lang/crates.io-index"
1136
- checksum = "95c884d0aacf6f51bf93b5f870f18ed5f8188e8474b2d110a8acaf57821382ef"
1177
+ checksum = "292eab7b9f87ed43813072777fe318b2146c05ca8920a7afe047ca6f1900930b"
1137
1178
  dependencies = [
1179
+ "blake3",
1138
1180
  "clap",
1139
1181
  "image",
1140
1182
  "infer",
@@ -1142,6 +1184,7 @@ dependencies = [
1142
1184
  "liteparse-pdfium-sys",
1143
1185
  "ordered-float",
1144
1186
  "reqwest 0.13.4",
1187
+ "rmp",
1145
1188
  "serde",
1146
1189
  "serde_json",
1147
1190
  "tempfile",
@@ -1154,18 +1197,18 @@ dependencies = [
1154
1197
 
1155
1198
  [[package]]
1156
1199
  name = "liteparse-pdfium"
1157
- version = "1.2.0"
1200
+ version = "1.3.0"
1158
1201
  source = "registry+https://github.com/rust-lang/crates.io-index"
1159
- checksum = "da32ff4cfed8ad099a7f4539b2765d5d74309f54deb0ca001b6331b530e2a9e8"
1202
+ checksum = "dbb04fb271230f7c13d97ce7f0261ecbcfd218272228c2d97b339482e60f9694"
1160
1203
  dependencies = [
1161
1204
  "liteparse-pdfium-sys",
1162
1205
  ]
1163
1206
 
1164
1207
  [[package]]
1165
1208
  name = "liteparse-pdfium-sys"
1166
- version = "1.2.0"
1209
+ version = "1.3.0"
1167
1210
  source = "registry+https://github.com/rust-lang/crates.io-index"
1168
- checksum = "be18655d558598f1caec00ec9534e2798db9176a9549924a7508689e5ce15f24"
1211
+ checksum = "a920d4b7bd4c75f0d2f47a93ef2b2b71348ab7bc14d6773f9c348549fa24d6d8"
1169
1212
  dependencies = [
1170
1213
  "flate2",
1171
1214
  "libloading",
@@ -1175,11 +1218,11 @@ dependencies = [
1175
1218
 
1176
1219
  [[package]]
1177
1220
  name = "liteparse-ruby"
1178
- version = "0.1.13"
1221
+ version = "0.1.14"
1179
1222
  dependencies = [
1180
1223
  "anyhow",
1181
1224
  "image",
1182
- "liteparse 2.2.0",
1225
+ "liteparse 2.2.1",
1183
1226
  "liteparse-pdfium",
1184
1227
  "liteparse-pdfium-sys",
1185
1228
  "magnus",
@@ -1750,6 +1793,15 @@ dependencies = [
1750
1793
  "windows-sys 0.52.0",
1751
1794
  ]
1752
1795
 
1796
+ [[package]]
1797
+ name = "rmp"
1798
+ version = "0.8.15"
1799
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1800
+ checksum = "4ba8be72d372b2c9b35542551678538b562e7cf86c3315773cae48dfbfe7790c"
1801
+ dependencies = [
1802
+ "num-traits",
1803
+ ]
1804
+
1753
1805
  [[package]]
1754
1806
  name = "rustc-hash"
1755
1807
  version = "2.1.2"
@@ -1982,7 +2034,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
1982
2034
  checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba"
1983
2035
  dependencies = [
1984
2036
  "cfg-if",
1985
- "cpufeatures",
2037
+ "cpufeatures 0.2.17",
1986
2038
  "digest",
1987
2039
  ]
1988
2040
 
@@ -2970,7 +3022,7 @@ dependencies = [
2970
3022
  "aes",
2971
3023
  "arbitrary",
2972
3024
  "bzip2",
2973
- "constant_time_eq",
3025
+ "constant_time_eq 0.3.1",
2974
3026
  "crc32fast",
2975
3027
  "crossbeam-utils",
2976
3028
  "deflate64",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "liteparse-ruby"
3
- version = "0.1.13"
3
+ version = "0.1.14"
4
4
  edition.workspace = true
5
5
  license.workspace = true
6
6
  repository.workspace = true
@@ -10,12 +10,12 @@ use liteparse::config::{ImageMode, LiteParseConfig, OutputFormat};
10
10
  use liteparse::types::PdfInput;
11
11
 
12
12
  fn kwarg<T: magnus::TryConvert>(kwargs: &RHash, key: &str) -> Option<T> {
13
- let sym = magnus::Symbol::new(key);
13
+ let sym = Ruby::get().unwrap().to_symbol(key);
14
14
  kwargs.get::<magnus::Symbol>(sym).and_then(|v| <T as magnus::TryConvert>::try_convert(v).ok())
15
15
  }
16
16
 
17
17
  fn kwarg_bool(kwargs: &RHash, key: &str) -> Option<bool> {
18
- let sym = magnus::Symbol::new(key);
18
+ let sym = Ruby::get().unwrap().to_symbol(key);
19
19
  kwargs.get::<magnus::Symbol>(sym).and_then(|v| {
20
20
  use magnus::value::ReprValue;
21
21
  if v.is_nil() { None } else { Some(v.to_bool()) }
data/lib/liteparse/cli.rb CHANGED
@@ -3,9 +3,18 @@ require_relative "../liteparse"
3
3
  module LiteParse
4
4
  # CLI support is handled by the native `lit` binary.
5
5
  # Run `lit --help` from the command line for usage.
6
+ #
7
+ # This module provides a programmatic entry point that delegates
8
+ # to the native CLI runner.
9
+ #
10
+ # @example
11
+ # LiteParse::CLI.run(["parse", "document.pdf"])
6
12
  module CLI
7
13
  module_function
8
14
 
15
+ # Run the CLI with the given arguments.
16
+ # @param args [Array<String>] Command-line arguments (defaults to ARGV)
17
+ # @return [void]
9
18
  def run(args = ARGV)
10
19
  LiteParse.run_cli(args)
11
20
  rescue => e
@@ -5,9 +5,31 @@ module LiteParse
5
5
  # This file exists to mirror the Python wrapper structure and provide
6
6
  # a convenient require path.
7
7
  #
8
- # Usage:
8
+ # @example Basic usage
9
9
  # require "liteparse"
10
10
  # parser = LiteParse::LiteParse.new(ocr_enabled: true)
11
11
  # result = parser.parse("document.pdf")
12
12
  # puts result.text
13
+
14
+ # @!method parse(input)
15
+ # Parse a document from a file path.
16
+ # @param input [String] Path to the document file (.pdf, .docx, .pptx, .xlsx, .html, image, etc.)
17
+ # @return [LiteParse::ParseResult] Parsed document with pages, text, and images
18
+ # @raise [RuntimeError] If parsing fails
19
+ # @example
20
+ # result = parser.parse("report.pdf")
21
+ # result.pages.each { |page| puts page.text }
22
+
23
+ # @!method parse_bytes(data)
24
+ # Parse a document from raw bytes.
25
+ # @param data [String] Raw document bytes (binary string)
26
+ # @return [LiteParse::ParseResult] Parsed document with pages, text, and images
27
+ # @raise [RuntimeError] If parsing fails
28
+ # @example
29
+ # data = File.binread("report.pdf")
30
+ # result = parser.parse_bytes(data)
31
+
32
+ # @!method config
33
+ # Get the current parser configuration.
34
+ # @return [LiteParse::Config] The active configuration
13
35
  end
@@ -4,6 +4,104 @@ require_relative "liteparse/liteparse"
4
4
  # in the Rust extension. This file re-exports them for convenience.
5
5
 
6
6
  module LiteParse
7
+ # A single item of text with spatial position information.
8
+ # @!method text
9
+ # @return [String] The text content of this item
10
+ # @!method x
11
+ # @return [Float] Left edge X coordinate of the text bounding box
12
+ # @!method y
13
+ # @return [Float] Top edge Y coordinate of the text bounding box
14
+ # @!method width
15
+ # @return [Float] Width of the text bounding box
16
+ # @!method height
17
+ # @return [Float] Height of the text bounding box
18
+ # @!method font_name
19
+ # @return [String, nil] Name of the font used, if available
20
+ # @!method font_size
21
+ # @return [Float, nil] Font size in points, if available
22
+ # @!method confidence
23
+ # @return [Float, nil] OCR/text extraction confidence (0.0–1.0), if available
24
+ class TextItem; end
25
+
26
+ # A single parsed page from a document.
27
+ # @!method page_num
28
+ # @return [Integer] 1-indexed page number
29
+ # @!method width
30
+ # @return [Float] Page width in points
31
+ # @!method height
32
+ # @return [Float] Page height in points
33
+ # @!method text
34
+ # @return [String] Full concatenated text of this page
35
+ # @!method text_items
36
+ # @return [Array<LiteParse::TextItem>] Individual text items with spatial information
37
+ class ParsedPage; end
38
+
39
+ # The complete result of parsing a document.
40
+ # @!method pages
41
+ # @return [Array<LiteParse::ParsedPage>] All parsed pages
42
+ # @!method text
43
+ # @return [String] Full concatenated text across all pages
44
+ # @!method images
45
+ # @return [Array<LiteParse::ExtractedImage>] Images extracted from the document
46
+ # @!method num_pages
47
+ # @return [Integer] Total number of pages parsed
48
+ # @!method get_page(page_num)
49
+ # Retrieve a specific page by its 1-indexed page number.
50
+ # @param page_num [Integer] 1-indexed page number
51
+ # @return [LiteParse::ParsedPage, nil] The page, or nil if not found
52
+ class ParseResult; end
53
+
54
+ # An image extracted from a parsed document.
55
+ # @!method id
56
+ # @return [String] Image identifier
57
+ # @!method page
58
+ # @return [Integer] Page number where the image was found
59
+ # @!method format
60
+ # @return [String] Image format (e.g. "png", "jpeg")
61
+ # @!method bytes
62
+ # @return [String] Raw image bytes (binary string)
63
+ class ExtractedImage; end
64
+
65
+ # A screenshot of a document page rendered as an image.
66
+ # @!method page_num
67
+ # @return [Integer] 1-indexed page number of the screenshot
68
+ # @!method width
69
+ # @return [Integer] Width of the screenshot in pixels
70
+ # @!method height
71
+ # @return [Integer] Height of the screenshot in pixels
72
+ # @!method image_bytes
73
+ # @return [String] PNG image bytes (binary string)
74
+ class ScreenshotResult; end
75
+
76
+ # The current configuration of a LiteParse parser instance.
77
+ # @!method ocr_language
78
+ # @return [String] Language used for OCR (e.g. "eng")
79
+ # @!method ocr_enabled
80
+ # @return [Boolean] Whether OCR is enabled
81
+ # @!method ocr_server_url
82
+ # @return [String, nil] External OCR server URL, if configured
83
+ # @!method ocr_server_headers
84
+ # @return [Hash<String, String>, nil] Headers for OCR server requests
85
+ # @!method tessdata_path
86
+ # @return [String, nil] Path to Tesseract tessdata directory
87
+ # @!method max_pages
88
+ # @return [Integer] Maximum pages to parse
89
+ # @!method target_pages
90
+ # @return [String, nil] Page range expression, if configured
91
+ # @!method dpi
92
+ # @return [Float] Rendering DPI
93
+ # @!method output_format
94
+ # @return [String] Output format: "json", "text", or "markdown"
95
+ # @!method preserve_very_small_text
96
+ # @return [Boolean] Whether very small text is preserved
97
+ # @!method password
98
+ # @return [String, nil] Document password, if set
99
+ # @!method quiet
100
+ # @return [Boolean] Whether non-error output is suppressed
101
+ # @!method num_workers
102
+ # @return [Integer] Number of worker threads
103
+ class Config; end
104
+
7
105
  # No additional Ruby wrapping needed — the native classes are registered
8
106
  # directly on the LiteParse module by the Rust init function.
9
107
  end
@@ -1,3 +1,4 @@
1
1
  module LiteParse
2
- VERSION = "0.1.13"
2
+ # Current version of liteparse-rb.
3
+ VERSION = "0.1.14"
3
4
  end
data/lib/liteparse.rb CHANGED
@@ -2,9 +2,46 @@ require_relative "liteparse/version"
2
2
  require_relative "liteparse/liteparse"
3
3
 
4
4
  module LiteParse
5
+ # Generic error raised by LiteParse operations.
5
6
  class Error < StandardError; end
6
7
  end
7
8
 
9
+ # YARD declarations for methods defined in the Rust extension.
10
+ # These are invisible to YARD, so we declare them with @!method.
11
+ class LiteParse::LiteParse
12
+ # @!method self.new(**kwargs)
13
+ # Create a new LiteParse parser instance.
14
+ # @param kwargs [Hash] Keyword arguments for parser configuration
15
+ # @option kwargs [String] :ocr_language ("eng") Language for OCR
16
+ # @option kwargs [Boolean] :ocr_enabled (true) Enable OCR
17
+ # @option kwargs [String, nil] :ocr_server_url URL of an external OCR server
18
+ # @option kwargs [Hash<String, String>, nil] :ocr_server_headers Headers for OCR server requests
19
+ # @option kwargs [String, nil] :tessdata_path Path to Tesseract tessdata directory
20
+ # @option kwargs [Integer] :max_pages (1000) Maximum pages to parse
21
+ # @option kwargs [String, nil] :target_pages Page range expression (e.g. "1-5,7")
22
+ # @option kwargs [Float] :dpi (150.0) Rendering DPI
23
+ # @option kwargs [String] :output_format ("json") Output format: "json", "text", or "markdown"
24
+ # @option kwargs [Boolean] :preserve_very_small_text (false) Preserve tiny text
25
+ # @option kwargs [String, nil] :password Password for encrypted documents
26
+ # @option kwargs [Boolean] :quiet (false) Suppress non-error output
27
+ # @option kwargs [Integer] :num_workers Number of worker threads (auto-detected)
28
+ # @option kwargs [String] :image_mode ("placeholder") Image mode: "placeholder", "embed", or "off"
29
+ # @option kwargs [Boolean] :extract_links (false) Extract hyperlinks
30
+ # @return [LiteParse::LiteParse]
31
+ # @example
32
+ # parser = LiteParse::LiteParse.new(ocr_enabled: true, dpi: 200)
33
+
34
+ # @!method screenshot(input, page_numbers: nil)
35
+ # Take screenshots of document pages.
36
+ # @param input [String] Path to the document file
37
+ # @param page_numbers [Array<Integer>, nil] Specific page numbers (1-indexed) to screenshot. nil = all pages.
38
+ # @return [Array<LiteParse::ScreenshotResult>] Screenshot results with image bytes
39
+ # @example
40
+ # parser = LiteParse::LiteParse.new
41
+ # screenshots = parser.screenshot("document.pdf", page_numbers: [1, 3])
42
+ # screenshots.each { |s| File.write("page_#{s.page_num}.png", s.image_bytes) }
43
+ end
44
+
8
45
  # Wrap native new to accept 0 args (the Rust constructor expects 1 positional arg).
9
46
  LiteParse::LiteParse.singleton_class.alias_method :native_new, :new
10
47
  LiteParse::LiteParse.define_singleton_method(:new) do |**kwargs|
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: liteparse-rb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.13
4
+ version: 0.1.14
5
5
  platform: ruby
6
6
  authors:
7
7
  - Evan Mattiza
@@ -23,6 +23,20 @@ dependencies:
23
23
  - - "~>"
24
24
  - !ruby/object:Gem::Version
25
25
  version: '0.9'
26
+ - !ruby/object:Gem::Dependency
27
+ name: yard
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - "~>"
31
+ - !ruby/object:Gem::Version
32
+ version: '0.9'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '0.9'
26
40
  description: Ruby bindings for LiteParse — an open-source document parser that extracts
27
41
  text with spatial layout information, bounding boxes, OCR support, and more.
28
42
  executables: []
@@ -49,6 +63,7 @@ licenses:
49
63
  metadata:
50
64
  homepage_uri: https://github.com/emattiza/liteparse-rb
51
65
  source_code_uri: https://github.com/emattiza/liteparse-rb
66
+ documentation_uri: https://rubydoc.info/gems/liteparse-rb
52
67
  rdoc_options: []
53
68
  require_paths:
54
69
  - lib