liteparse-rb 0.1.13 → 0.1.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +67 -15
- data/crates/liteparse-ruby/Cargo.toml +1 -1
- data/crates/liteparse-ruby/src/lib.rs +2 -2
- data/lib/liteparse/cli.rb +9 -0
- data/lib/liteparse/parser.rb +23 -1
- data/lib/liteparse/types.rb +98 -0
- data/lib/liteparse/version.rb +2 -1
- data/lib/liteparse.rb +37 -0
- metadata +16 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 840e257ec0149b7d40c9385ba934409200d047be40cd80be06dd4bef49b93bd6
|
|
4
|
+
data.tar.gz: 2e51b19e50155b48a126a746eeb233b633ca97cbaa80eec087e107be3914d27c
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: ef80a7427299914b237c053fbb365564ae56d76e847d6dbd5d23b3ec61af37562f073f99f54582e89d743647505b19c996b146ce4ef08e57ded9303210be9e6b
|
|
7
|
+
data.tar.gz: 483f25fb7098464be3594b5e762790a912147dd468f23cb0477d1c04e365c816784f6dde6de9aab3ae4bd3cc8cad5bab3e0644950126c387fe11bec6082e8fb8
|
data/Cargo.lock
CHANGED
|
@@ -16,7 +16,7 @@ checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0"
|
|
|
16
16
|
dependencies = [
|
|
17
17
|
"cfg-if",
|
|
18
18
|
"cipher",
|
|
19
|
-
"cpufeatures",
|
|
19
|
+
"cpufeatures 0.2.17",
|
|
20
20
|
]
|
|
21
21
|
|
|
22
22
|
[[package]]
|
|
@@ -80,9 +80,9 @@ dependencies = [
|
|
|
80
80
|
|
|
81
81
|
[[package]]
|
|
82
82
|
name = "anyhow"
|
|
83
|
-
version = "1.0.
|
|
83
|
+
version = "1.0.103"
|
|
84
84
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
85
|
-
checksum = "
|
|
85
|
+
checksum = "2a4385e2e34eb35d6b3efe798b9eb88096925d87726c0798709bf56d9ed84af3"
|
|
86
86
|
|
|
87
87
|
[[package]]
|
|
88
88
|
name = "arbitrary"
|
|
@@ -93,6 +93,18 @@ dependencies = [
|
|
|
93
93
|
"derive_arbitrary",
|
|
94
94
|
]
|
|
95
95
|
|
|
96
|
+
[[package]]
|
|
97
|
+
name = "arrayref"
|
|
98
|
+
version = "0.3.9"
|
|
99
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
100
|
+
checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb"
|
|
101
|
+
|
|
102
|
+
[[package]]
|
|
103
|
+
name = "arrayvec"
|
|
104
|
+
version = "0.7.7"
|
|
105
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
106
|
+
checksum = "f02882884d3e1bc524fb12c79f107f6ad0e1cfd498c536ffb494301740995dfe"
|
|
107
|
+
|
|
96
108
|
[[package]]
|
|
97
109
|
name = "atomic-waker"
|
|
98
110
|
version = "1.1.2"
|
|
@@ -157,6 +169,20 @@ version = "2.13.0"
|
|
|
157
169
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
158
170
|
checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8"
|
|
159
171
|
|
|
172
|
+
[[package]]
|
|
173
|
+
name = "blake3"
|
|
174
|
+
version = "1.8.5"
|
|
175
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
176
|
+
checksum = "0aa83c34e62843d924f905e0f5c866eb1dd6545fc4d719e803d9ba6030371fce"
|
|
177
|
+
dependencies = [
|
|
178
|
+
"arrayref",
|
|
179
|
+
"arrayvec",
|
|
180
|
+
"cc",
|
|
181
|
+
"cfg-if",
|
|
182
|
+
"constant_time_eq 0.4.2",
|
|
183
|
+
"cpufeatures 0.3.0",
|
|
184
|
+
]
|
|
185
|
+
|
|
160
186
|
[[package]]
|
|
161
187
|
name = "block-buffer"
|
|
162
188
|
version = "0.10.4"
|
|
@@ -351,6 +377,12 @@ version = "0.3.1"
|
|
|
351
377
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
352
378
|
checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6"
|
|
353
379
|
|
|
380
|
+
[[package]]
|
|
381
|
+
name = "constant_time_eq"
|
|
382
|
+
version = "0.4.2"
|
|
383
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
384
|
+
checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b"
|
|
385
|
+
|
|
354
386
|
[[package]]
|
|
355
387
|
name = "core-foundation"
|
|
356
388
|
version = "0.9.4"
|
|
@@ -386,6 +418,15 @@ dependencies = [
|
|
|
386
418
|
"libc",
|
|
387
419
|
]
|
|
388
420
|
|
|
421
|
+
[[package]]
|
|
422
|
+
name = "cpufeatures"
|
|
423
|
+
version = "0.3.0"
|
|
424
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
425
|
+
checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201"
|
|
426
|
+
dependencies = [
|
|
427
|
+
"libc",
|
|
428
|
+
]
|
|
429
|
+
|
|
389
430
|
[[package]]
|
|
390
431
|
name = "crc"
|
|
391
432
|
version = "3.4.0"
|
|
@@ -1121,9 +1162,9 @@ checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0"
|
|
|
1121
1162
|
|
|
1122
1163
|
[[package]]
|
|
1123
1164
|
name = "liteparse"
|
|
1124
|
-
version = "0.1.
|
|
1165
|
+
version = "0.1.14"
|
|
1125
1166
|
dependencies = [
|
|
1126
|
-
"liteparse 2.2.
|
|
1167
|
+
"liteparse 2.2.1",
|
|
1127
1168
|
"liteparse-ruby",
|
|
1128
1169
|
"magnus",
|
|
1129
1170
|
"rb-sys",
|
|
@@ -1131,10 +1172,11 @@ dependencies = [
|
|
|
1131
1172
|
|
|
1132
1173
|
[[package]]
|
|
1133
1174
|
name = "liteparse"
|
|
1134
|
-
version = "2.2.
|
|
1175
|
+
version = "2.2.1"
|
|
1135
1176
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1136
|
-
checksum = "
|
|
1177
|
+
checksum = "292eab7b9f87ed43813072777fe318b2146c05ca8920a7afe047ca6f1900930b"
|
|
1137
1178
|
dependencies = [
|
|
1179
|
+
"blake3",
|
|
1138
1180
|
"clap",
|
|
1139
1181
|
"image",
|
|
1140
1182
|
"infer",
|
|
@@ -1142,6 +1184,7 @@ dependencies = [
|
|
|
1142
1184
|
"liteparse-pdfium-sys",
|
|
1143
1185
|
"ordered-float",
|
|
1144
1186
|
"reqwest 0.13.4",
|
|
1187
|
+
"rmp",
|
|
1145
1188
|
"serde",
|
|
1146
1189
|
"serde_json",
|
|
1147
1190
|
"tempfile",
|
|
@@ -1154,18 +1197,18 @@ dependencies = [
|
|
|
1154
1197
|
|
|
1155
1198
|
[[package]]
|
|
1156
1199
|
name = "liteparse-pdfium"
|
|
1157
|
-
version = "1.
|
|
1200
|
+
version = "1.3.0"
|
|
1158
1201
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1159
|
-
checksum = "
|
|
1202
|
+
checksum = "dbb04fb271230f7c13d97ce7f0261ecbcfd218272228c2d97b339482e60f9694"
|
|
1160
1203
|
dependencies = [
|
|
1161
1204
|
"liteparse-pdfium-sys",
|
|
1162
1205
|
]
|
|
1163
1206
|
|
|
1164
1207
|
[[package]]
|
|
1165
1208
|
name = "liteparse-pdfium-sys"
|
|
1166
|
-
version = "1.
|
|
1209
|
+
version = "1.3.0"
|
|
1167
1210
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1168
|
-
checksum = "
|
|
1211
|
+
checksum = "a920d4b7bd4c75f0d2f47a93ef2b2b71348ab7bc14d6773f9c348549fa24d6d8"
|
|
1169
1212
|
dependencies = [
|
|
1170
1213
|
"flate2",
|
|
1171
1214
|
"libloading",
|
|
@@ -1175,11 +1218,11 @@ dependencies = [
|
|
|
1175
1218
|
|
|
1176
1219
|
[[package]]
|
|
1177
1220
|
name = "liteparse-ruby"
|
|
1178
|
-
version = "0.1.
|
|
1221
|
+
version = "0.1.14"
|
|
1179
1222
|
dependencies = [
|
|
1180
1223
|
"anyhow",
|
|
1181
1224
|
"image",
|
|
1182
|
-
"liteparse 2.2.
|
|
1225
|
+
"liteparse 2.2.1",
|
|
1183
1226
|
"liteparse-pdfium",
|
|
1184
1227
|
"liteparse-pdfium-sys",
|
|
1185
1228
|
"magnus",
|
|
@@ -1750,6 +1793,15 @@ dependencies = [
|
|
|
1750
1793
|
"windows-sys 0.52.0",
|
|
1751
1794
|
]
|
|
1752
1795
|
|
|
1796
|
+
[[package]]
|
|
1797
|
+
name = "rmp"
|
|
1798
|
+
version = "0.8.15"
|
|
1799
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1800
|
+
checksum = "4ba8be72d372b2c9b35542551678538b562e7cf86c3315773cae48dfbfe7790c"
|
|
1801
|
+
dependencies = [
|
|
1802
|
+
"num-traits",
|
|
1803
|
+
]
|
|
1804
|
+
|
|
1753
1805
|
[[package]]
|
|
1754
1806
|
name = "rustc-hash"
|
|
1755
1807
|
version = "2.1.2"
|
|
@@ -1982,7 +2034,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
|
1982
2034
|
checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba"
|
|
1983
2035
|
dependencies = [
|
|
1984
2036
|
"cfg-if",
|
|
1985
|
-
"cpufeatures",
|
|
2037
|
+
"cpufeatures 0.2.17",
|
|
1986
2038
|
"digest",
|
|
1987
2039
|
]
|
|
1988
2040
|
|
|
@@ -2970,7 +3022,7 @@ dependencies = [
|
|
|
2970
3022
|
"aes",
|
|
2971
3023
|
"arbitrary",
|
|
2972
3024
|
"bzip2",
|
|
2973
|
-
"constant_time_eq",
|
|
3025
|
+
"constant_time_eq 0.3.1",
|
|
2974
3026
|
"crc32fast",
|
|
2975
3027
|
"crossbeam-utils",
|
|
2976
3028
|
"deflate64",
|
|
data/crates/liteparse-ruby/src/lib.rs
CHANGED
|
@@ -10,12 +10,12 @@ use liteparse::config::{ImageMode, LiteParseConfig, OutputFormat};
|
|
|
10
10
|
use liteparse::types::PdfInput;
|
|
11
11
|
|
|
12
12
|
fn kwarg<T: magnus::TryConvert>(kwargs: &RHash, key: &str) -> Option<T> {
|
|
13
|
-
let sym =
|
|
13
|
+
let sym = Ruby::get().unwrap().to_symbol(key);
|
|
14
14
|
kwargs.get::<magnus::Symbol>(sym).and_then(|v| <T as magnus::TryConvert>::try_convert(v).ok())
|
|
15
15
|
}
|
|
16
16
|
|
|
17
17
|
fn kwarg_bool(kwargs: &RHash, key: &str) -> Option<bool> {
|
|
18
|
-
let sym =
|
|
18
|
+
let sym = Ruby::get().unwrap().to_symbol(key);
|
|
19
19
|
kwargs.get::<magnus::Symbol>(sym).and_then(|v| {
|
|
20
20
|
use magnus::value::ReprValue;
|
|
21
21
|
if v.is_nil() { None } else { Some(v.to_bool()) }
|
data/lib/liteparse/cli.rb
CHANGED
|
@@ -3,9 +3,18 @@ require_relative "../liteparse"
|
|
|
3
3
|
module LiteParse
|
|
4
4
|
# CLI support is handled by the native `lit` binary.
|
|
5
5
|
# Run `lit --help` from the command line for usage.
|
|
6
|
+
#
|
|
7
|
+
# This module provides a programmatic entry point that delegates
|
|
8
|
+
# to the native CLI runner.
|
|
9
|
+
#
|
|
10
|
+
# @example
|
|
11
|
+
# LiteParse::CLI.run(["parse", "document.pdf"])
|
|
6
12
|
module CLI
|
|
7
13
|
module_function
|
|
8
14
|
|
|
15
|
+
# Run the CLI with the given arguments.
|
|
16
|
+
# @param args [Array<String>] Command-line arguments (defaults to ARGV)
|
|
17
|
+
# @return [void]
|
|
9
18
|
def run(args = ARGV)
|
|
10
19
|
LiteParse.run_cli(args)
|
|
11
20
|
rescue => e
|
data/lib/liteparse/parser.rb
CHANGED
|
@@ -5,9 +5,31 @@ module LiteParse
|
|
|
5
5
|
# This file exists to mirror the Python wrapper structure and provide
|
|
6
6
|
# a convenient require path.
|
|
7
7
|
#
|
|
8
|
-
#
|
|
8
|
+
# @example Basic usage
|
|
9
9
|
# require "liteparse"
|
|
10
10
|
# parser = LiteParse::LiteParse.new(ocr_enabled: true)
|
|
11
11
|
# result = parser.parse("document.pdf")
|
|
12
12
|
# puts result.text
|
|
13
|
+
|
|
14
|
+
# @!method parse(input)
|
|
15
|
+
# Parse a document from a file path.
|
|
16
|
+
# @param input [String] Path to the document file (.pdf, .docx, .pptx, .xlsx, .html, image, etc.)
|
|
17
|
+
# @return [LiteParse::ParseResult] Parsed document with pages, text, and images
|
|
18
|
+
# @raise [RuntimeError] If parsing fails
|
|
19
|
+
# @example
|
|
20
|
+
# result = parser.parse("report.pdf")
|
|
21
|
+
# result.pages.each { |page| puts page.text }
|
|
22
|
+
|
|
23
|
+
# @!method parse_bytes(data)
|
|
24
|
+
# Parse a document from raw bytes.
|
|
25
|
+
# @param data [String] Raw document bytes (binary string)
|
|
26
|
+
# @return [LiteParse::ParseResult] Parsed document with pages, text, and images
|
|
27
|
+
# @raise [RuntimeError] If parsing fails
|
|
28
|
+
# @example
|
|
29
|
+
# data = File.binread("report.pdf")
|
|
30
|
+
# result = parser.parse_bytes(data)
|
|
31
|
+
|
|
32
|
+
# @!method config
|
|
33
|
+
# Get the current parser configuration.
|
|
34
|
+
# @return [LiteParse::Config] The active configuration
|
|
13
35
|
end
|
data/lib/liteparse/types.rb
CHANGED
|
@@ -4,6 +4,104 @@ require_relative "liteparse/liteparse"
|
|
|
4
4
|
# in the Rust extension. This file re-exports them for convenience.
|
|
5
5
|
|
|
6
6
|
module LiteParse
|
|
7
|
+
# A single item of text with spatial position information.
|
|
8
|
+
# @!method text
|
|
9
|
+
# @return [String] The text content of this item
|
|
10
|
+
# @!method x
|
|
11
|
+
# @return [Float] Left edge X coordinate of the text bounding box
|
|
12
|
+
# @!method y
|
|
13
|
+
# @return [Float] Top edge Y coordinate of the text bounding box
|
|
14
|
+
# @!method width
|
|
15
|
+
# @return [Float] Width of the text bounding box
|
|
16
|
+
# @!method height
|
|
17
|
+
# @return [Float] Height of the text bounding box
|
|
18
|
+
# @!method font_name
|
|
19
|
+
# @return [String, nil] Name of the font used, if available
|
|
20
|
+
# @!method font_size
|
|
21
|
+
# @return [Float, nil] Font size in points, if available
|
|
22
|
+
# @!method confidence
|
|
23
|
+
# @return [Float, nil] OCR/text extraction confidence (0.0–1.0), if available
|
|
24
|
+
class TextItem; end
|
|
25
|
+
|
|
26
|
+
# A single parsed page from a document.
|
|
27
|
+
# @!method page_num
|
|
28
|
+
# @return [Integer] 1-indexed page number
|
|
29
|
+
# @!method width
|
|
30
|
+
# @return [Float] Page width in points
|
|
31
|
+
# @!method height
|
|
32
|
+
# @return [Float] Page height in points
|
|
33
|
+
# @!method text
|
|
34
|
+
# @return [String] Full concatenated text of this page
|
|
35
|
+
# @!method text_items
|
|
36
|
+
# @return [Array<LiteParse::TextItem>] Individual text items with spatial information
|
|
37
|
+
class ParsedPage; end
|
|
38
|
+
|
|
39
|
+
# The complete result of parsing a document.
|
|
40
|
+
# @!method pages
|
|
41
|
+
# @return [Array<LiteParse::ParsedPage>] All parsed pages
|
|
42
|
+
# @!method text
|
|
43
|
+
# @return [String] Full concatenated text across all pages
|
|
44
|
+
# @!method images
|
|
45
|
+
# @return [Array<LiteParse::ExtractedImage>] Images extracted from the document
|
|
46
|
+
# @!method num_pages
|
|
47
|
+
# @return [Integer] Total number of pages parsed
|
|
48
|
+
# @!method get_page(page_num)
|
|
49
|
+
# Retrieve a specific page by its 1-indexed page number.
|
|
50
|
+
# @param page_num [Integer] 1-indexed page number
|
|
51
|
+
# @return [LiteParse::ParsedPage, nil] The page, or nil if not found
|
|
52
|
+
class ParseResult; end
|
|
53
|
+
|
|
54
|
+
# An image extracted from a parsed document.
|
|
55
|
+
# @!method id
|
|
56
|
+
# @return [String] Image identifier
|
|
57
|
+
# @!method page
|
|
58
|
+
# @return [Integer] Page number where the image was found
|
|
59
|
+
# @!method format
|
|
60
|
+
# @return [String] Image format (e.g. "png", "jpeg")
|
|
61
|
+
# @!method bytes
|
|
62
|
+
# @return [String] Raw image bytes (binary string)
|
|
63
|
+
class ExtractedImage; end
|
|
64
|
+
|
|
65
|
+
# A screenshot of a document page rendered as an image.
|
|
66
|
+
# @!method page_num
|
|
67
|
+
# @return [Integer] 1-indexed page number of the screenshot
|
|
68
|
+
# @!method width
|
|
69
|
+
# @return [Integer] Width of the screenshot in pixels
|
|
70
|
+
# @!method height
|
|
71
|
+
# @return [Integer] Height of the screenshot in pixels
|
|
72
|
+
# @!method image_bytes
|
|
73
|
+
# @return [String] PNG image bytes (binary string)
|
|
74
|
+
class ScreenshotResult; end
|
|
75
|
+
|
|
76
|
+
# The current configuration of a LiteParse parser instance.
|
|
77
|
+
# @!method ocr_language
|
|
78
|
+
# @return [String] Language used for OCR (e.g. "eng")
|
|
79
|
+
# @!method ocr_enabled
|
|
80
|
+
# @return [Boolean] Whether OCR is enabled
|
|
81
|
+
# @!method ocr_server_url
|
|
82
|
+
# @return [String, nil] External OCR server URL, if configured
|
|
83
|
+
# @!method ocr_server_headers
|
|
84
|
+
# @return [Hash<String, String>, nil] Headers for OCR server requests
|
|
85
|
+
# @!method tessdata_path
|
|
86
|
+
# @return [String, nil] Path to Tesseract tessdata directory
|
|
87
|
+
# @!method max_pages
|
|
88
|
+
# @return [Integer] Maximum pages to parse
|
|
89
|
+
# @!method target_pages
|
|
90
|
+
# @return [String, nil] Page range expression, if configured
|
|
91
|
+
# @!method dpi
|
|
92
|
+
# @return [Float] Rendering DPI
|
|
93
|
+
# @!method output_format
|
|
94
|
+
# @return [String] Output format: "json", "text", or "markdown"
|
|
95
|
+
# @!method preserve_very_small_text
|
|
96
|
+
# @return [Boolean] Whether very small text is preserved
|
|
97
|
+
# @!method password
|
|
98
|
+
# @return [String, nil] Document password, if set
|
|
99
|
+
# @!method quiet
|
|
100
|
+
# @return [Boolean] Whether non-error output is suppressed
|
|
101
|
+
# @!method num_workers
|
|
102
|
+
# @return [Integer] Number of worker threads
|
|
103
|
+
class Config; end
|
|
104
|
+
|
|
7
105
|
# No additional Ruby wrapping needed — the native classes are registered
|
|
8
106
|
# directly on the LiteParse module by the Rust init function.
|
|
9
107
|
end
|
data/lib/liteparse/version.rb
CHANGED
data/lib/liteparse.rb
CHANGED
|
@@ -2,9 +2,46 @@ require_relative "liteparse/version"
|
|
|
2
2
|
require_relative "liteparse/liteparse"
|
|
3
3
|
|
|
4
4
|
module LiteParse
|
|
5
|
+
# Generic error raised by LiteParse operations.
|
|
5
6
|
class Error < StandardError; end
|
|
6
7
|
end
|
|
7
8
|
|
|
9
|
+
# YARD declarations for methods defined in the Rust extension.
|
|
10
|
+
# These are invisible to YARD, so we declare them with @!method.
|
|
11
|
+
class LiteParse::LiteParse
|
|
12
|
+
# @!method self.new(**kwargs)
|
|
13
|
+
# Create a new LiteParse parser instance.
|
|
14
|
+
# @param kwargs [Hash] Keyword arguments for parser configuration
|
|
15
|
+
# @option kwargs [String] :ocr_language ("eng") Language for OCR
|
|
16
|
+
# @option kwargs [Boolean] :ocr_enabled (true) Enable OCR
|
|
17
|
+
# @option kwargs [String, nil] :ocr_server_url URL of an external OCR server
|
|
18
|
+
# @option kwargs [Hash<String, String>, nil] :ocr_server_headers Headers for OCR server requests
|
|
19
|
+
# @option kwargs [String, nil] :tessdata_path Path to Tesseract tessdata directory
|
|
20
|
+
# @option kwargs [Integer] :max_pages (1000) Maximum pages to parse
|
|
21
|
+
# @option kwargs [String, nil] :target_pages Page range expression (e.g. "1-5,7")
|
|
22
|
+
# @option kwargs [Float] :dpi (150.0) Rendering DPI
|
|
23
|
+
# @option kwargs [String] :output_format ("json") Output format: "json", "text", or "markdown"
|
|
24
|
+
# @option kwargs [Boolean] :preserve_very_small_text (false) Preserve tiny text
|
|
25
|
+
# @option kwargs [String, nil] :password Password for encrypted documents
|
|
26
|
+
# @option kwargs [Boolean] :quiet (false) Suppress non-error output
|
|
27
|
+
# @option kwargs [Integer] :num_workers Number of worker threads (auto-detected)
|
|
28
|
+
# @option kwargs [String] :image_mode ("placeholder") Image mode: "placeholder", "embed", or "off"
|
|
29
|
+
# @option kwargs [Boolean] :extract_links (false) Extract hyperlinks
|
|
30
|
+
# @return [LiteParse::LiteParse]
|
|
31
|
+
# @example
|
|
32
|
+
# parser = LiteParse::LiteParse.new(ocr_enabled: true, dpi: 200)
|
|
33
|
+
|
|
34
|
+
# @!method screenshot(input, page_numbers: nil)
|
|
35
|
+
# Take screenshots of document pages.
|
|
36
|
+
# @param input [String] Path to the document file
|
|
37
|
+
# @param page_numbers [Array<Integer>, nil] Specific page numbers (1-indexed) to screenshot. nil = all pages.
|
|
38
|
+
# @return [Array<LiteParse::ScreenshotResult>] Screenshot results with image bytes
|
|
39
|
+
# @example
|
|
40
|
+
# parser = LiteParse::LiteParse.new
|
|
41
|
+
# screenshots = parser.screenshot("document.pdf", page_numbers: [1, 3])
|
|
42
|
+
# screenshots.each { |s| File.write("page_#{s.page_num}.png", s.image_bytes) }
|
|
43
|
+
end
|
|
44
|
+
|
|
8
45
|
# Wrap native new to accept 0 args (the Rust constructor expects 1 positional arg).
|
|
9
46
|
LiteParse::LiteParse.singleton_class.alias_method :native_new, :new
|
|
10
47
|
LiteParse::LiteParse.define_singleton_method(:new) do |**kwargs|
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: liteparse-rb
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.
|
|
4
|
+
version: 0.1.14
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Evan Mattiza
|
|
@@ -23,6 +23,20 @@ dependencies:
|
|
|
23
23
|
- - "~>"
|
|
24
24
|
- !ruby/object:Gem::Version
|
|
25
25
|
version: '0.9'
|
|
26
|
+
- !ruby/object:Gem::Dependency
|
|
27
|
+
name: yard
|
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
|
29
|
+
requirements:
|
|
30
|
+
- - "~>"
|
|
31
|
+
- !ruby/object:Gem::Version
|
|
32
|
+
version: '0.9'
|
|
33
|
+
type: :development
|
|
34
|
+
prerelease: false
|
|
35
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
36
|
+
requirements:
|
|
37
|
+
- - "~>"
|
|
38
|
+
- !ruby/object:Gem::Version
|
|
39
|
+
version: '0.9'
|
|
26
40
|
description: Ruby bindings for LiteParse — an open-source document parser that extracts
|
|
27
41
|
text with spatial layout information, bounding boxes, OCR support, and more.
|
|
28
42
|
executables: []
|
|
@@ -49,6 +63,7 @@ licenses:
|
|
|
49
63
|
metadata:
|
|
50
64
|
homepage_uri: https://github.com/emattiza/liteparse-rb
|
|
51
65
|
source_code_uri: https://github.com/emattiza/liteparse-rb
|
|
66
|
+
documentation_uri: https://rubydoc.info/gems/liteparse-rb
|
|
52
67
|
rdoc_options: []
|
|
53
68
|
require_paths:
|
|
54
69
|
- lib
|