liteparse-rb 0.1.7-x86_64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Cargo.lock +3038 -0
- data/Cargo.toml +23 -0
- data/crates/liteparse-ruby/Cargo.toml +26 -0
- data/crates/liteparse-ruby/src/lib.rs +529 -0
- data/ext/liteparse/extconf.rb +4 -0
- data/ext/liteparse/src/lib.rs +8 -0
- data/lib/liteparse/cli.rb +16 -0
- data/lib/liteparse/liteparse.so +0 -0
- data/lib/liteparse/parser.rb +13 -0
- data/lib/liteparse/types.rb +9 -0
- data/lib/liteparse/version.rb +3 -0
- data/lib/liteparse.rb +18 -0
- metadata +68 -0
data/Cargo.toml
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
[workspace]
|
|
2
|
+
resolver = "2"
|
|
3
|
+
members = [
|
|
4
|
+
"crates/liteparse-ruby",
|
|
5
|
+
"ext/liteparse",
|
|
6
|
+
]
|
|
7
|
+
|
|
8
|
+
[workspace.package]
|
|
9
|
+
edition = "2024"
|
|
10
|
+
license = "Apache-2.0"
|
|
11
|
+
repository = "https://github.com/emattiza/liteparse-rb"
|
|
12
|
+
|
|
13
|
+
[workspace.dependencies]
|
|
14
|
+
liteparse = { version = "2.1.1", default-features = false }
|
|
15
|
+
liteparse-pdfium = { version = "1.2.0" }
|
|
16
|
+
liteparse-pdfium-sys = { version = "1.2.0" }
|
|
17
|
+
magnus = { version = "0.8", features = ["embed"] }
|
|
18
|
+
rb-sys = { version = "0.9.128", default-features = false, features = ["stable-api-compiled-fallback"] }
|
|
19
|
+
serde = { version = "1", features = ["derive"] }
|
|
20
|
+
serde_json = "1"
|
|
21
|
+
tokio = { version = "1", features = ["rt-multi-thread", "macros", "sync", "time", "io-util"] }
|
|
22
|
+
anyhow = "1"
|
|
23
|
+
image = { version = "0.25", default-features = false, features = ["png"] }
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
[package]
|
|
2
|
+
name = "liteparse-ruby"
|
|
3
|
+
version = "0.1.7"
|
|
4
|
+
edition.workspace = true
|
|
5
|
+
license.workspace = true
|
|
6
|
+
repository.workspace = true
|
|
7
|
+
description = "Ruby bindings for LiteParse"
|
|
8
|
+
|
|
9
|
+
[lib]
|
|
10
|
+
name = "liteparse_ruby"
|
|
11
|
+
crate-type = ["lib"]
|
|
12
|
+
|
|
13
|
+
[features]
|
|
14
|
+
default = []
|
|
15
|
+
tesseract = ["liteparse/tesseract"]
|
|
16
|
+
|
|
17
|
+
[dependencies]
|
|
18
|
+
liteparse = { workspace = true, features = [] }
|
|
19
|
+
liteparse-pdfium = { workspace = true }
|
|
20
|
+
liteparse-pdfium-sys = { workspace = true }
|
|
21
|
+
magnus = { workspace = true }
|
|
22
|
+
serde = { workspace = true }
|
|
23
|
+
serde_json = { workspace = true }
|
|
24
|
+
tokio = { workspace = true }
|
|
25
|
+
anyhow = { workspace = true }
|
|
26
|
+
image = { workspace = true }
|
|
@@ -0,0 +1,529 @@
|
|
|
1
|
+
use std::collections::HashMap;
|
|
2
|
+
use std::sync::Arc;
|
|
3
|
+
|
|
4
|
+
use magnus::typed_data::Obj;
|
|
5
|
+
use magnus::{function, method, Error, RArray, RHash, RString, Ruby, TryConvert};
|
|
6
|
+
use magnus::prelude::*;
|
|
7
|
+
use tokio::sync::Mutex;
|
|
8
|
+
|
|
9
|
+
use liteparse::config::{ImageMode, LiteParseConfig, OutputFormat};
|
|
10
|
+
use liteparse::types::PdfInput;
|
|
11
|
+
|
|
12
|
+
/// Looks up `key` (as a Ruby Symbol) in `kwargs` and converts the value to `T`.
///
/// Returns `None` when the key is absent, and also when the stored value
/// cannot be converted to `T` — the conversion error is discarded, not raised.
fn kwarg<T: magnus::TryConvert>(kwargs: &RHash, key: &str) -> Option<T> {
    let raw = kwargs.get::<magnus::Symbol>(magnus::Symbol::new(key))?;
    T::try_convert(raw).ok()
}
|
|
16
|
+
|
|
17
|
+
fn kwarg_bool(kwargs: &RHash, key: &str) -> Option<bool> {
|
|
18
|
+
let sym = magnus::Symbol::new(key);
|
|
19
|
+
kwargs.get::<magnus::Symbol>(sym).and_then(|v| {
|
|
20
|
+
use magnus::value::ReprValue;
|
|
21
|
+
if v.is_nil() { None } else { Some(v.to_bool()) }
|
|
22
|
+
})
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
// ---------------------------------------------------------------------------
|
|
26
|
+
// Conversion helpers
|
|
27
|
+
// ---------------------------------------------------------------------------
|
|
28
|
+
|
|
29
|
+
fn screenshots_to_ruby(results: Vec<liteparse::parser::ScreenshotResult>) -> RArray {
|
|
30
|
+
let ruby = Ruby::get().unwrap();
|
|
31
|
+
let ary = ruby.ary_new();
|
|
32
|
+
for r in results {
|
|
33
|
+
let sr = ScreenshotResult {
|
|
34
|
+
page_num: r.page_num,
|
|
35
|
+
width: r.width,
|
|
36
|
+
height: r.height,
|
|
37
|
+
image_bytes: r.image_bytes,
|
|
38
|
+
};
|
|
39
|
+
let _ = ary.push(sr);
|
|
40
|
+
}
|
|
41
|
+
ary
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
// ---------------------------------------------------------------------------
|
|
45
|
+
// Ruby type wrappers
|
|
46
|
+
// ---------------------------------------------------------------------------
|
|
47
|
+
|
|
48
|
+
#[magnus::wrap(class = "LiteParse::TextItem")]
|
|
49
|
+
#[derive(Clone)]
|
|
50
|
+
struct TextItem {
|
|
51
|
+
text: String,
|
|
52
|
+
x: f64,
|
|
53
|
+
y: f64,
|
|
54
|
+
width: f64,
|
|
55
|
+
height: f64,
|
|
56
|
+
font_name: Option<String>,
|
|
57
|
+
font_size: Option<f64>,
|
|
58
|
+
confidence: Option<f64>,
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
impl TextItem {
|
|
62
|
+
fn from_rust(item: liteparse::types::TextItem) -> Self {
|
|
63
|
+
Self {
|
|
64
|
+
text: item.text,
|
|
65
|
+
x: item.x as f64,
|
|
66
|
+
y: item.y as f64,
|
|
67
|
+
width: item.width as f64,
|
|
68
|
+
height: item.height as f64,
|
|
69
|
+
font_name: item.font_name,
|
|
70
|
+
font_size: item.font_size.map(|v| v as f64),
|
|
71
|
+
confidence: item.confidence.map(|v| v as f64).or(Some(1.0)),
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
fn to_rust(&self) -> liteparse::types::TextItem {
|
|
76
|
+
liteparse::types::TextItem {
|
|
77
|
+
text: self.text.clone(),
|
|
78
|
+
x: self.x as f32,
|
|
79
|
+
y: self.y as f32,
|
|
80
|
+
width: self.width as f32,
|
|
81
|
+
height: self.height as f32,
|
|
82
|
+
font_name: self.font_name.clone(),
|
|
83
|
+
font_size: self.font_size.map(|v| v as f32),
|
|
84
|
+
confidence: self.confidence.map(|v| v as f32),
|
|
85
|
+
..Default::default()
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
fn text(&self) -> &str { &self.text }
|
|
90
|
+
fn x(&self) -> f64 { self.x }
|
|
91
|
+
fn y(&self) -> f64 { self.y }
|
|
92
|
+
fn width(&self) -> f64 { self.width }
|
|
93
|
+
fn height(&self) -> f64 { self.height }
|
|
94
|
+
fn font_name(&self) -> Option<&str> { self.font_name.as_deref() }
|
|
95
|
+
fn font_size(&self) -> Option<f64> { self.font_size }
|
|
96
|
+
fn confidence(&self) -> Option<f64> { self.confidence }
|
|
97
|
+
fn inspect(&self) -> String {
|
|
98
|
+
format!("#<LiteParse::TextItem text={:?} x={} y={} width={} height={}>",
|
|
99
|
+
self.text, self.x, self.y, self.width, self.height)
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
#[magnus::wrap(class = "LiteParse::ParsedPage")]
|
|
104
|
+
#[derive(Clone)]
|
|
105
|
+
struct ParsedPage {
|
|
106
|
+
page_num: u32,
|
|
107
|
+
width: f64,
|
|
108
|
+
height: f64,
|
|
109
|
+
text: String,
|
|
110
|
+
text_items: Vec<TextItem>,
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
impl ParsedPage {
|
|
114
|
+
fn from_rust(page: liteparse::types::ParsedPage) -> Self {
|
|
115
|
+
Self {
|
|
116
|
+
page_num: page.page_number as u32,
|
|
117
|
+
width: page.page_width as f64,
|
|
118
|
+
height: page.page_height as f64,
|
|
119
|
+
text: page.text,
|
|
120
|
+
text_items: page.text_items.into_iter().map(TextItem::from_rust).collect(),
|
|
121
|
+
}
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
fn page_num(&self) -> u32 { self.page_num }
|
|
125
|
+
fn width(&self) -> f64 { self.width }
|
|
126
|
+
fn height(&self) -> f64 { self.height }
|
|
127
|
+
fn text(&self) -> &str { &self.text }
|
|
128
|
+
fn text_items(&self) -> RArray {
|
|
129
|
+
let ruby = Ruby::get().unwrap();
|
|
130
|
+
let ary = ruby.ary_new();
|
|
131
|
+
for item in &self.text_items {
|
|
132
|
+
let _ = ary.push(item.clone());
|
|
133
|
+
}
|
|
134
|
+
ary
|
|
135
|
+
}
|
|
136
|
+
fn inspect(&self) -> String {
|
|
137
|
+
format!("#<LiteParse::ParsedPage page_num={} width={} height={} text_items_len={}>",
|
|
138
|
+
self.page_num, self.width, self.height, self.text_items.len())
|
|
139
|
+
}
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
#[magnus::wrap(class = "LiteParse::ParseResult")]
|
|
143
|
+
#[derive(Clone)]
|
|
144
|
+
struct ParseResult {
|
|
145
|
+
pages: Vec<ParsedPage>,
|
|
146
|
+
text: String,
|
|
147
|
+
images: Vec<ExtractedImage>,
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
impl ParseResult {
|
|
151
|
+
fn from_rust(result: liteparse::parser::ParseResult) -> Self {
|
|
152
|
+
Self {
|
|
153
|
+
pages: result.pages.into_iter().map(ParsedPage::from_rust).collect(),
|
|
154
|
+
text: result.text,
|
|
155
|
+
images: result.images.into_iter().map(ExtractedImage::from_rust).collect(),
|
|
156
|
+
}
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
fn pages(&self) -> RArray {
|
|
160
|
+
let ruby = Ruby::get().unwrap();
|
|
161
|
+
let ary = ruby.ary_new();
|
|
162
|
+
for page in &self.pages {
|
|
163
|
+
let _ = ary.push(page.clone());
|
|
164
|
+
}
|
|
165
|
+
ary
|
|
166
|
+
}
|
|
167
|
+
fn text(&self) -> &str { &self.text }
|
|
168
|
+
fn images(&self) -> RArray {
|
|
169
|
+
let ruby = Ruby::get().unwrap();
|
|
170
|
+
let ary = ruby.ary_new();
|
|
171
|
+
for img in &self.images {
|
|
172
|
+
let _ = ary.push(img.clone());
|
|
173
|
+
}
|
|
174
|
+
ary
|
|
175
|
+
}
|
|
176
|
+
fn num_pages(&self) -> usize { self.pages.len() }
|
|
177
|
+
fn get_page(&self, page_num: u32) -> Option<ParsedPage> {
|
|
178
|
+
self.pages.iter().find(|p| p.page_num == page_num).cloned()
|
|
179
|
+
}
|
|
180
|
+
fn inspect(&self) -> String {
|
|
181
|
+
format!("#<LiteParse::ParseResult pages={} text_len={} images={}>",
|
|
182
|
+
self.pages.len(), self.text.len(), self.images.len())
|
|
183
|
+
}
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
#[magnus::wrap(class = "LiteParse::ExtractedImage")]
|
|
187
|
+
#[derive(Clone)]
|
|
188
|
+
struct ExtractedImage {
|
|
189
|
+
id: String,
|
|
190
|
+
page: u32,
|
|
191
|
+
format: String,
|
|
192
|
+
bytes: Vec<u8>,
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
impl ExtractedImage {
|
|
196
|
+
fn from_rust(img: liteparse::types::ExtractedImage) -> Self {
|
|
197
|
+
Self { id: img.id, page: img.page, format: img.format, bytes: img.bytes }
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
fn id(&self) -> &str { &self.id }
|
|
201
|
+
fn page(&self) -> u32 { self.page }
|
|
202
|
+
fn format(&self) -> &str { &self.format }
|
|
203
|
+
fn inspect(&self) -> String {
|
|
204
|
+
format!("#<LiteParse::ExtractedImage id={:?} page={} format={:?} bytes_len={}>",
|
|
205
|
+
self.id, self.page, self.format, self.bytes.len())
|
|
206
|
+
}
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
fn extracted_image_bytes(rb_self: &ExtractedImage) -> RString {
|
|
210
|
+
Ruby::get().expect("Ruby not available").str_from_slice(&rb_self.bytes)
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
#[magnus::wrap(class = "LiteParse::ScreenshotResult")]
|
|
214
|
+
#[derive(Clone)]
|
|
215
|
+
struct ScreenshotResult {
|
|
216
|
+
page_num: u32,
|
|
217
|
+
width: u32,
|
|
218
|
+
height: u32,
|
|
219
|
+
image_bytes: Vec<u8>,
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
impl ScreenshotResult {
|
|
223
|
+
fn page_num(&self) -> u32 { self.page_num }
|
|
224
|
+
fn width(&self) -> u32 { self.width }
|
|
225
|
+
fn height(&self) -> u32 { self.height }
|
|
226
|
+
fn inspect(&self) -> String {
|
|
227
|
+
format!("#<LiteParse::ScreenshotResult page_num={} width={} height={}>",
|
|
228
|
+
self.page_num, self.width, self.height)
|
|
229
|
+
}
|
|
230
|
+
}
|
|
231
|
+
|
|
232
|
+
fn screenshot_result_image_bytes(rb_self: &ScreenshotResult) -> RString {
|
|
233
|
+
Ruby::get().expect("Ruby not available").str_from_slice(&rb_self.image_bytes)
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
#[magnus::wrap(class = "LiteParse::Config")]
|
|
237
|
+
#[derive(Clone)]
|
|
238
|
+
struct Config {
|
|
239
|
+
ocr_language: String,
|
|
240
|
+
ocr_enabled: bool,
|
|
241
|
+
ocr_server_url: Option<String>,
|
|
242
|
+
ocr_server_headers: Option<HashMap<String, String>>,
|
|
243
|
+
tessdata_path: Option<String>,
|
|
244
|
+
max_pages: usize,
|
|
245
|
+
target_pages: Option<String>,
|
|
246
|
+
dpi: f64,
|
|
247
|
+
output_format: String,
|
|
248
|
+
preserve_very_small_text: bool,
|
|
249
|
+
password: Option<String>,
|
|
250
|
+
quiet: bool,
|
|
251
|
+
num_workers: usize,
|
|
252
|
+
}
|
|
253
|
+
|
|
254
|
+
impl Config {
|
|
255
|
+
fn from_rust(cfg: &LiteParseConfig) -> Self {
|
|
256
|
+
Self {
|
|
257
|
+
ocr_language: cfg.ocr_language.clone(),
|
|
258
|
+
ocr_enabled: cfg.ocr_enabled,
|
|
259
|
+
ocr_server_url: cfg.ocr_server_url.clone(),
|
|
260
|
+
ocr_server_headers: if cfg.ocr_server_headers.is_empty() { None } else {
|
|
261
|
+
Some(cfg.ocr_server_headers.iter().cloned().collect()) },
|
|
262
|
+
tessdata_path: cfg.tessdata_path.clone(),
|
|
263
|
+
max_pages: cfg.max_pages,
|
|
264
|
+
target_pages: cfg.target_pages.clone(),
|
|
265
|
+
dpi: cfg.dpi as f64,
|
|
266
|
+
output_format: match cfg.output_format {
|
|
267
|
+
OutputFormat::Json => "json".to_string(),
|
|
268
|
+
OutputFormat::Text => "text".to_string(),
|
|
269
|
+
OutputFormat::Markdown => "markdown".to_string(),
|
|
270
|
+
},
|
|
271
|
+
preserve_very_small_text: cfg.preserve_very_small_text,
|
|
272
|
+
password: cfg.password.clone(),
|
|
273
|
+
quiet: cfg.quiet,
|
|
274
|
+
num_workers: cfg.num_workers,
|
|
275
|
+
}
|
|
276
|
+
}
|
|
277
|
+
|
|
278
|
+
fn ocr_language(&self) -> &str { &self.ocr_language }
|
|
279
|
+
fn ocr_enabled(&self) -> bool { self.ocr_enabled }
|
|
280
|
+
fn ocr_server_url(&self) -> Option<&str> { self.ocr_server_url.as_deref() }
|
|
281
|
+
fn ocr_server_headers(&self) -> Option<RHash> {
|
|
282
|
+
self.ocr_server_headers.clone().map(|h| {
|
|
283
|
+
let hash = Ruby::get().unwrap().hash_new();
|
|
284
|
+
for (k, v) in &h {
|
|
285
|
+
let _ = hash.aset(k.as_str(), v.as_str());
|
|
286
|
+
}
|
|
287
|
+
hash
|
|
288
|
+
})
|
|
289
|
+
}
|
|
290
|
+
fn tessdata_path(&self) -> Option<&str> { self.tessdata_path.as_deref() }
|
|
291
|
+
fn max_pages(&self) -> usize { self.max_pages }
|
|
292
|
+
fn target_pages(&self) -> Option<&str> { self.target_pages.as_deref() }
|
|
293
|
+
fn dpi(&self) -> f64 { self.dpi }
|
|
294
|
+
fn output_format(&self) -> &str { &self.output_format }
|
|
295
|
+
fn preserve_very_small_text(&self) -> bool { self.preserve_very_small_text }
|
|
296
|
+
fn password(&self) -> Option<&str> { self.password.as_deref() }
|
|
297
|
+
fn quiet(&self) -> bool { self.quiet }
|
|
298
|
+
fn num_workers(&self) -> usize { self.num_workers }
|
|
299
|
+
fn inspect(&self) -> String {
|
|
300
|
+
format!("#<LiteParse::Config ocr_enabled={} dpi={} max_pages={}>",
|
|
301
|
+
self.ocr_enabled, self.dpi, self.max_pages)
|
|
302
|
+
}
|
|
303
|
+
}
|
|
304
|
+
|
|
305
|
+
// ---------------------------------------------------------------------------
|
|
306
|
+
// Main LiteParse class
|
|
307
|
+
// ---------------------------------------------------------------------------
|
|
308
|
+
|
|
309
|
+
struct Inner {
|
|
310
|
+
parser: liteparse::parser::LiteParse,
|
|
311
|
+
config: LiteParseConfig,
|
|
312
|
+
runtime: tokio::runtime::Runtime,
|
|
313
|
+
}
|
|
314
|
+
|
|
315
|
+
#[magnus::wrap(class = "LiteParse::LiteParse")]
|
|
316
|
+
struct LiteParse {
|
|
317
|
+
inner: Arc<Mutex<Inner>>,
|
|
318
|
+
}
|
|
319
|
+
|
|
320
|
+
impl LiteParse {
|
|
321
|
+
fn new(kwargs: Option<RHash>) -> Result<Self, Error> {
|
|
322
|
+
// Start with upstream defaults then apply kwargs
|
|
323
|
+
let mut cfg = LiteParseConfig {
|
|
324
|
+
ocr_enabled: true,
|
|
325
|
+
dpi: 150.0,
|
|
326
|
+
max_pages: 1000,
|
|
327
|
+
..Default::default()
|
|
328
|
+
};
|
|
329
|
+
|
|
330
|
+
if let Some(ref kwargs) = kwargs {
|
|
331
|
+
if let Some(v) = kwarg::<String>(kwargs, "ocr_language") { cfg.ocr_language = v; }
|
|
332
|
+
if let Some(v) = kwarg_bool(kwargs, "ocr_enabled") { cfg.ocr_enabled = v; }
|
|
333
|
+
if let Some(v) = kwarg::<String>(kwargs, "ocr_server_url") { cfg.ocr_server_url = Some(v); }
|
|
334
|
+
if let Some(v) = kwarg::<HashMap<String, String>>(kwargs, "ocr_server_headers") {
|
|
335
|
+
cfg.ocr_server_headers = v.into_iter().collect();
|
|
336
|
+
}
|
|
337
|
+
if let Some(v) = kwarg::<String>(kwargs, "tessdata_path") { cfg.tessdata_path = Some(v); }
|
|
338
|
+
if let Some(v) = kwarg::<usize>(kwargs, "max_pages") { cfg.max_pages = v; }
|
|
339
|
+
if let Some(v) = kwarg::<String>(kwargs, "target_pages") { cfg.target_pages = Some(v); }
|
|
340
|
+
if let Some(v) = kwarg::<f64>(kwargs, "dpi") { cfg.dpi = v as f32; }
|
|
341
|
+
if let Some(v) = kwarg::<String>(kwargs, "output_format") {
|
|
342
|
+
cfg.output_format = match v.as_str() {
|
|
343
|
+
"text" => OutputFormat::Text,
|
|
344
|
+
"markdown" | "md" => OutputFormat::Markdown,
|
|
345
|
+
_ => OutputFormat::Json,
|
|
346
|
+
};
|
|
347
|
+
}
|
|
348
|
+
if let Some(v) = kwarg_bool(kwargs, "preserve_very_small_text") { cfg.preserve_very_small_text = v; }
|
|
349
|
+
if let Some(v) = kwarg::<String>(kwargs, "password") { cfg.password = Some(v); }
|
|
350
|
+
if let Some(v) = kwarg_bool(kwargs, "quiet") { cfg.quiet = v; }
|
|
351
|
+
if let Some(v) = kwarg::<usize>(kwargs, "num_workers") { cfg.num_workers = v; }
|
|
352
|
+
if let Some(v) = kwarg::<String>(kwargs, "image_mode") {
|
|
353
|
+
cfg.image_mode = match v.as_str() {
|
|
354
|
+
"off" | "none" => ImageMode::Off,
|
|
355
|
+
"embed" => ImageMode::Embed,
|
|
356
|
+
_ => ImageMode::Placeholder,
|
|
357
|
+
};
|
|
358
|
+
}
|
|
359
|
+
if let Some(v) = kwarg_bool(kwargs, "extract_links") { cfg.extract_links = v; }
|
|
360
|
+
}
|
|
361
|
+
|
|
362
|
+
let parser = liteparse::parser::LiteParse::new(cfg.clone());
|
|
363
|
+
let runtime = tokio::runtime::Runtime::new()
|
|
364
|
+
.map_err(|e| Error::new(Ruby::get().unwrap().exception_runtime_error(), e.to_string()))?;
|
|
365
|
+
|
|
366
|
+
Ok(Self { inner: Arc::new(Mutex::new(Inner { parser, config: cfg, runtime })) })
|
|
367
|
+
}
|
|
368
|
+
|
|
369
|
+
fn parse(&self, input: String) -> Result<ParseResult, Error> {
|
|
370
|
+
let pdf_input = PdfInput::Path(input);
|
|
371
|
+
let locked = self.inner.blocking_lock();
|
|
372
|
+
let result = locked.runtime.block_on(locked.parser.parse_input(pdf_input))
|
|
373
|
+
.map_err(|e| runtime_err(e.to_string()))?;
|
|
374
|
+
Ok(ParseResult::from_rust(result))
|
|
375
|
+
}
|
|
376
|
+
|
|
377
|
+
fn parse_bytes(&self, data: Vec<u8>) -> Result<ParseResult, Error> {
|
|
378
|
+
let pdf_input = PdfInput::Bytes(data);
|
|
379
|
+
let locked = self.inner.blocking_lock();
|
|
380
|
+
let result = locked.runtime.block_on(locked.parser.parse_input(pdf_input))
|
|
381
|
+
.map_err(|e| runtime_err(e.to_string()))?;
|
|
382
|
+
Ok(ParseResult::from_rust(result))
|
|
383
|
+
}
|
|
384
|
+
|
|
385
|
+
fn screenshot(&self, input: String, page_numbers: Option<Vec<u32>>) -> Result<RArray, Error> {
|
|
386
|
+
let locked = self.inner.blocking_lock();
|
|
387
|
+
let results = locked.runtime.block_on(locked.parser.screenshot(&input, page_numbers))
|
|
388
|
+
.map_err(|e| runtime_err(e.to_string()))?;
|
|
389
|
+
Ok(screenshots_to_ruby(results))
|
|
390
|
+
}
|
|
391
|
+
|
|
392
|
+
fn config(&self) -> Config {
|
|
393
|
+
Config::from_rust(&self.inner.blocking_lock().config)
|
|
394
|
+
}
|
|
395
|
+
|
|
396
|
+
fn inspect(&self) -> String {
|
|
397
|
+
let inner = self.inner.blocking_lock();
|
|
398
|
+
format!("#<LiteParse::LiteParse ocr_enabled={} dpi={} max_pages={}>",
|
|
399
|
+
inner.config.ocr_enabled, inner.config.dpi, inner.config.max_pages)
|
|
400
|
+
}
|
|
401
|
+
}
|
|
402
|
+
|
|
403
|
+
fn search_items(items: RArray, phrase: String, case_sensitive: Option<bool>) -> RArray {
|
|
404
|
+
let case_sensitive = case_sensitive.unwrap_or(false);
|
|
405
|
+
let ruby = Ruby::get().unwrap();
|
|
406
|
+
let ary = ruby.ary_new();
|
|
407
|
+
|
|
408
|
+
let mut rust_items: Vec<liteparse::types::TextItem> = Vec::new();
|
|
409
|
+
for item_value in items.into_iter() {
|
|
410
|
+
let obj: Obj<TextItem> = match Obj::<TextItem>::try_convert(item_value) {
|
|
411
|
+
Ok(v) => v,
|
|
412
|
+
_ => continue,
|
|
413
|
+
};
|
|
414
|
+
rust_items.push(obj.to_rust());
|
|
415
|
+
}
|
|
416
|
+
|
|
417
|
+
let options = liteparse::search::SearchOptions { phrase, case_sensitive };
|
|
418
|
+
let matches = liteparse::search::search_items(&rust_items, &options);
|
|
419
|
+
for m in matches {
|
|
420
|
+
let _ = ary.push(TextItem::from_rust(m));
|
|
421
|
+
}
|
|
422
|
+
ary
|
|
423
|
+
}
|
|
424
|
+
|
|
425
|
+
fn run_cli(_args: Vec<String>) -> Result<(), Error> {
|
|
426
|
+
Err(runtime_err("CLI not available in Ruby gem."))
|
|
427
|
+
}
|
|
428
|
+
|
|
429
|
+
fn runtime_err(msg: impl ToString) -> Error {
|
|
430
|
+
Error::new(Ruby::get().unwrap().exception_runtime_error(), msg.to_string())
|
|
431
|
+
}
|
|
432
|
+
|
|
433
|
+
// ---------------------------------------------------------------------------
|
|
434
|
+
// Init
|
|
435
|
+
// ---------------------------------------------------------------------------
|
|
436
|
+
|
|
437
|
+
/// Exposed so the ext crate's `#[magnus::init]` can call it.
|
|
438
|
+
/// All classes / methods are registered inside here.
|
|
439
|
+
pub fn define_liteparse_module(ruby: &Ruby) -> Result<(), Error> {
|
|
440
|
+
let module = ruby.define_module("LiteParse")?;
|
|
441
|
+
|
|
442
|
+
// TextItem
|
|
443
|
+
let text_item = module.define_class("TextItem", ruby.class_object())?;
|
|
444
|
+
text_item.define_method("text", method!(TextItem::text, 0))?;
|
|
445
|
+
text_item.define_method("x", method!(TextItem::x, 0))?;
|
|
446
|
+
text_item.define_method("y", method!(TextItem::y, 0))?;
|
|
447
|
+
text_item.define_method("width", method!(TextItem::width, 0))?;
|
|
448
|
+
text_item.define_method("height", method!(TextItem::height, 0))?;
|
|
449
|
+
text_item.define_method("font_name", method!(TextItem::font_name, 0))?;
|
|
450
|
+
text_item.define_method("font_size", method!(TextItem::font_size, 0))?;
|
|
451
|
+
text_item.define_method("confidence", method!(TextItem::confidence, 0))?;
|
|
452
|
+
text_item.define_method("inspect", method!(TextItem::inspect, 0))?;
|
|
453
|
+
text_item.define_method("to_s", method!(TextItem::inspect, 0))?;
|
|
454
|
+
|
|
455
|
+
// ParsedPage
|
|
456
|
+
let parsed_page = module.define_class("ParsedPage", ruby.class_object())?;
|
|
457
|
+
parsed_page.define_method("page_num", method!(ParsedPage::page_num, 0))?;
|
|
458
|
+
parsed_page.define_method("width", method!(ParsedPage::width, 0))?;
|
|
459
|
+
parsed_page.define_method("height", method!(ParsedPage::height, 0))?;
|
|
460
|
+
parsed_page.define_method("text", method!(ParsedPage::text, 0))?;
|
|
461
|
+
parsed_page.define_method("text_items", method!(ParsedPage::text_items, 0))?;
|
|
462
|
+
parsed_page.define_method("inspect", method!(ParsedPage::inspect, 0))?;
|
|
463
|
+
parsed_page.define_method("to_s", method!(ParsedPage::inspect, 0))?;
|
|
464
|
+
|
|
465
|
+
// ParseResult
|
|
466
|
+
let parse_result = module.define_class("ParseResult", ruby.class_object())?;
|
|
467
|
+
parse_result.define_method("pages", method!(ParseResult::pages, 0))?;
|
|
468
|
+
parse_result.define_method("text", method!(ParseResult::text, 0))?;
|
|
469
|
+
parse_result.define_method("images", method!(ParseResult::images, 0))?;
|
|
470
|
+
parse_result.define_method("num_pages", method!(ParseResult::num_pages, 0))?;
|
|
471
|
+
parse_result.define_method("get_page", method!(ParseResult::get_page, 1))?;
|
|
472
|
+
parse_result.define_method("inspect", method!(ParseResult::inspect, 0))?;
|
|
473
|
+
parse_result.define_method("to_s", method!(ParseResult::inspect, 0))?;
|
|
474
|
+
|
|
475
|
+
// ExtractedImage
|
|
476
|
+
let extracted_image = module.define_class("ExtractedImage", ruby.class_object())?;
|
|
477
|
+
extracted_image.define_method("id", method!(ExtractedImage::id, 0))?;
|
|
478
|
+
extracted_image.define_method("page", method!(ExtractedImage::page, 0))?;
|
|
479
|
+
extracted_image.define_method("format", method!(ExtractedImage::format, 0))?;
|
|
480
|
+
extracted_image.define_method("bytes", method!(extracted_image_bytes, 0))?;
|
|
481
|
+
extracted_image.define_method("inspect", method!(ExtractedImage::inspect, 0))?;
|
|
482
|
+
extracted_image.define_method("to_s", method!(ExtractedImage::inspect, 0))?;
|
|
483
|
+
|
|
484
|
+
// ScreenshotResult
|
|
485
|
+
let screenshot_result = module.define_class("ScreenshotResult", ruby.class_object())?;
|
|
486
|
+
screenshot_result.define_method("page_num", method!(ScreenshotResult::page_num, 0))?;
|
|
487
|
+
screenshot_result.define_method("width", method!(ScreenshotResult::width, 0))?;
|
|
488
|
+
screenshot_result.define_method("height", method!(ScreenshotResult::height, 0))?;
|
|
489
|
+
screenshot_result.define_method("image_bytes", method!(screenshot_result_image_bytes, 0))?;
|
|
490
|
+
screenshot_result.define_method("inspect", method!(ScreenshotResult::inspect, 0))?;
|
|
491
|
+
screenshot_result.define_method("to_s", method!(ScreenshotResult::inspect, 0))?;
|
|
492
|
+
|
|
493
|
+
// Config
|
|
494
|
+
let config_class = module.define_class("Config", ruby.class_object())?;
|
|
495
|
+
config_class.define_method("ocr_language", method!(Config::ocr_language, 0))?;
|
|
496
|
+
config_class.define_method("ocr_enabled", method!(Config::ocr_enabled, 0))?;
|
|
497
|
+
config_class.define_method("ocr_server_url", method!(Config::ocr_server_url, 0))?;
|
|
498
|
+
config_class.define_method("ocr_server_headers", method!(Config::ocr_server_headers, 0))?;
|
|
499
|
+
config_class.define_method("tessdata_path", method!(Config::tessdata_path, 0))?;
|
|
500
|
+
config_class.define_method("max_pages", method!(Config::max_pages, 0))?;
|
|
501
|
+
config_class.define_method("target_pages", method!(Config::target_pages, 0))?;
|
|
502
|
+
config_class.define_method("dpi", method!(Config::dpi, 0))?;
|
|
503
|
+
config_class.define_method("output_format", method!(Config::output_format, 0))?;
|
|
504
|
+
config_class.define_method("preserve_very_small_text", method!(Config::preserve_very_small_text, 0))?;
|
|
505
|
+
config_class.define_method("password", method!(Config::password, 0))?;
|
|
506
|
+
config_class.define_method("quiet", method!(Config::quiet, 0))?;
|
|
507
|
+
config_class.define_method("num_workers", method!(Config::num_workers, 0))?;
|
|
508
|
+
config_class.define_method("inspect", method!(Config::inspect, 0))?;
|
|
509
|
+
config_class.define_method("to_s", method!(Config::inspect, 0))?;
|
|
510
|
+
|
|
511
|
+
// Main LiteParse class
|
|
512
|
+
let liteparse_class = module.define_class("LiteParse", ruby.class_object())?;
|
|
513
|
+
liteparse_class.define_singleton_method("new", function!(LiteParse::new, 1))?;
|
|
514
|
+
liteparse_class.define_method("parse", method!(LiteParse::parse, 1))?;
|
|
515
|
+
liteparse_class.define_method("parse_bytes", method!(LiteParse::parse_bytes, 1))?;
|
|
516
|
+
liteparse_class.define_method("screenshot", method!(LiteParse::screenshot, 2))?;
|
|
517
|
+
liteparse_class.define_method("config", method!(LiteParse::config, 0))?;
|
|
518
|
+
liteparse_class.define_method("inspect", method!(LiteParse::inspect, 0))?;
|
|
519
|
+
liteparse_class.define_method("to_s", method!(LiteParse::inspect, 0))?;
|
|
520
|
+
|
|
521
|
+
// Module functions
|
|
522
|
+
module.define_module_function("search_items", function!(search_items, 3))?;
|
|
523
|
+
module.define_module_function("run_cli", function!(run_cli, 1))?;
|
|
524
|
+
|
|
525
|
+
// ParseError exception
|
|
526
|
+
module.define_class("ParseError", ruby.exception_exception().as_r_class())?;
|
|
527
|
+
|
|
528
|
+
Ok(())
|
|
529
|
+
}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
// Re-export all types and functions from the workspace crate.
|
|
2
|
+
pub use liteparse_ruby::*;
|
|
3
|
+
|
|
4
|
+
/// Registers all LiteParse classes/modules with Ruby.
|
|
5
|
+
#[magnus::init]
|
|
6
|
+
fn init(ruby: &magnus::Ruby) -> Result<(), magnus::Error> {
|
|
7
|
+
liteparse_ruby::define_liteparse_module(ruby)
|
|
8
|
+
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
require_relative "../liteparse"
|
|
2
|
+
|
|
3
|
+
module LiteParse
  # Thin command-line shim around the native +LiteParse.run_cli+ function.
  # Any StandardError it raises is reported on standard error and the
  # process exits with status 1.
  module CLI
    module_function

    # Runs the CLI with +args+ (defaults to the process arguments).
    def run(args = ARGV)
      begin
        LiteParse.run_cli(args)
      rescue StandardError => error
        $stderr.puts "Error: #{error.message}"
        exit 1
      end
    end
  end
end
|
|
Binary file
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
require_relative "types"
|
|
2
|
+
|
|
3
|
+
module LiteParse
|
|
4
|
+
# LiteParse is defined natively in the Rust extension.
|
|
5
|
+
# This file exists to mirror the Python wrapper structure and provide
|
|
6
|
+
# a convenient require path.
|
|
7
|
+
#
|
|
8
|
+
# Usage:
|
|
9
|
+
# require "liteparse"
|
|
10
|
+
# parser = LiteParse::LiteParse.new(ocr_enabled: true)
|
|
11
|
+
# result = parser.parse("document.pdf")
|
|
12
|
+
# puts result.text
|
|
13
|
+
end
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# Load the native extension that sits next to this file
# (lib/liteparse/liteparse.so). The path is relative to this file's own
# directory, so it must not repeat the "liteparse/" prefix.
require_relative "liteparse"

# All types (TextItem, ParsedPage, ParseResult, etc.) are defined natively
# in the Rust extension. This file re-exports them for convenience.

module LiteParse
  # No additional Ruby wrapping needed — the native classes are registered
  # directly on the LiteParse module by the Rust init function.
end
|
data/lib/liteparse.rb
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
require_relative "liteparse/version"
|
|
2
|
+
require_relative "liteparse/liteparse"
|
|
3
|
+
|
|
4
|
+
module LiteParse
  class Error < StandardError; end

  # Reopen the natively defined parser class to add Ruby-friendly signatures.
  class LiteParse
    class << self
      # The native constructor expects exactly one positional argument,
      # so keep it reachable under another name and wrap it.
      alias_method :native_new, :new

      # Accepts keyword options; passes nil to the native constructor when
      # none are given.
      def new(**options)
        native_new(options.empty? ? nil : options)
      end
    end

    # Keep the native two-argument screenshot reachable, then expose
    # page_numbers as a keyword argument (matching the Python API).
    alias_method :native_screenshot, :screenshot

    def screenshot(input, page_numbers: nil)
      native_screenshot(input, page_numbers)
    end
  end
end
|