liteparse-rb 0.1.7-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Cargo.toml ADDED
@@ -0,0 +1,23 @@
1
+ [workspace]
2
+ resolver = "2"
3
+ members = [
4
+ "crates/liteparse-ruby",
5
+ "ext/liteparse",
6
+ ]
7
+
8
+ [workspace.package]
9
+ edition = "2024"
10
+ license = "Apache-2.0"
11
+ repository = "https://github.com/emattiza/liteparse-rb"
12
+
13
+ [workspace.dependencies]
14
+ liteparse = { version = "2.1.1", default-features = false }
15
+ liteparse-pdfium = { version = "1.2.0" }
16
+ liteparse-pdfium-sys = { version = "1.2.0" }
17
+ magnus = { version = "0.8", features = ["embed"] }
18
+ rb-sys = { version = "0.9.128", default-features = false, features = ["stable-api-compiled-fallback"] }
19
+ serde = { version = "1", features = ["derive"] }
20
+ serde_json = "1"
21
+ tokio = { version = "1", features = ["rt-multi-thread", "macros", "sync", "time", "io-util"] }
22
+ anyhow = "1"
23
+ image = { version = "0.25", default-features = false, features = ["png"] }
@@ -0,0 +1,26 @@
1
+ [package]
2
+ name = "liteparse-ruby"
3
+ version = "0.1.7"
4
+ edition.workspace = true
5
+ license.workspace = true
6
+ repository.workspace = true
7
+ description = "Ruby bindings for LiteParse"
8
+
9
+ [lib]
10
+ name = "liteparse_ruby"
11
+ crate-type = ["lib"]
12
+
13
+ [features]
14
+ default = []
15
+ tesseract = ["liteparse/tesseract"]
16
+
17
+ [dependencies]
18
+ liteparse = { workspace = true, features = [] }
19
+ liteparse-pdfium = { workspace = true }
20
+ liteparse-pdfium-sys = { workspace = true }
21
+ magnus = { workspace = true }
22
+ serde = { workspace = true }
23
+ serde_json = { workspace = true }
24
+ tokio = { workspace = true }
25
+ anyhow = { workspace = true }
26
+ image = { workspace = true }
@@ -0,0 +1,529 @@
1
+ use std::collections::HashMap;
2
+ use std::sync::Arc;
3
+
4
+ use magnus::typed_data::Obj;
5
+ use magnus::{function, method, Error, RArray, RHash, RString, Ruby, TryConvert};
6
+ use magnus::prelude::*;
7
+ use tokio::sync::Mutex;
8
+
9
+ use liteparse::config::{ImageMode, LiteParseConfig, OutputFormat};
10
+ use liteparse::types::PdfInput;
11
+
12
+ fn kwarg<T: magnus::TryConvert>(kwargs: &RHash, key: &str) -> Option<T> {
13
+ let sym = magnus::Symbol::new(key);
14
+ kwargs.get::<magnus::Symbol>(sym).and_then(|v| <T as magnus::TryConvert>::try_convert(v).ok())
15
+ }
16
+
17
+ fn kwarg_bool(kwargs: &RHash, key: &str) -> Option<bool> {
18
+ let sym = magnus::Symbol::new(key);
19
+ kwargs.get::<magnus::Symbol>(sym).and_then(|v| {
20
+ use magnus::value::ReprValue;
21
+ if v.is_nil() { None } else { Some(v.to_bool()) }
22
+ })
23
+ }
24
+
25
+ // ---------------------------------------------------------------------------
26
+ // Conversion helpers
27
+ // ---------------------------------------------------------------------------
28
+
29
+ fn screenshots_to_ruby(results: Vec<liteparse::parser::ScreenshotResult>) -> RArray {
30
+ let ruby = Ruby::get().unwrap();
31
+ let ary = ruby.ary_new();
32
+ for r in results {
33
+ let sr = ScreenshotResult {
34
+ page_num: r.page_num,
35
+ width: r.width,
36
+ height: r.height,
37
+ image_bytes: r.image_bytes,
38
+ };
39
+ let _ = ary.push(sr);
40
+ }
41
+ ary
42
+ }
43
+
44
+ // ---------------------------------------------------------------------------
45
+ // Ruby type wrappers
46
+ // ---------------------------------------------------------------------------
47
+
48
+ #[magnus::wrap(class = "LiteParse::TextItem")]
49
+ #[derive(Clone)]
50
+ struct TextItem {
51
+ text: String,
52
+ x: f64,
53
+ y: f64,
54
+ width: f64,
55
+ height: f64,
56
+ font_name: Option<String>,
57
+ font_size: Option<f64>,
58
+ confidence: Option<f64>,
59
+ }
60
+
61
+ impl TextItem {
62
+ fn from_rust(item: liteparse::types::TextItem) -> Self {
63
+ Self {
64
+ text: item.text,
65
+ x: item.x as f64,
66
+ y: item.y as f64,
67
+ width: item.width as f64,
68
+ height: item.height as f64,
69
+ font_name: item.font_name,
70
+ font_size: item.font_size.map(|v| v as f64),
71
+ confidence: item.confidence.map(|v| v as f64).or(Some(1.0)),
72
+ }
73
+ }
74
+
75
+ fn to_rust(&self) -> liteparse::types::TextItem {
76
+ liteparse::types::TextItem {
77
+ text: self.text.clone(),
78
+ x: self.x as f32,
79
+ y: self.y as f32,
80
+ width: self.width as f32,
81
+ height: self.height as f32,
82
+ font_name: self.font_name.clone(),
83
+ font_size: self.font_size.map(|v| v as f32),
84
+ confidence: self.confidence.map(|v| v as f32),
85
+ ..Default::default()
86
+ }
87
+ }
88
+
89
+ fn text(&self) -> &str { &self.text }
90
+ fn x(&self) -> f64 { self.x }
91
+ fn y(&self) -> f64 { self.y }
92
+ fn width(&self) -> f64 { self.width }
93
+ fn height(&self) -> f64 { self.height }
94
+ fn font_name(&self) -> Option<&str> { self.font_name.as_deref() }
95
+ fn font_size(&self) -> Option<f64> { self.font_size }
96
+ fn confidence(&self) -> Option<f64> { self.confidence }
97
+ fn inspect(&self) -> String {
98
+ format!("#<LiteParse::TextItem text={:?} x={} y={} width={} height={}>",
99
+ self.text, self.x, self.y, self.width, self.height)
100
+ }
101
+ }
102
+
103
+ #[magnus::wrap(class = "LiteParse::ParsedPage")]
104
+ #[derive(Clone)]
105
+ struct ParsedPage {
106
+ page_num: u32,
107
+ width: f64,
108
+ height: f64,
109
+ text: String,
110
+ text_items: Vec<TextItem>,
111
+ }
112
+
113
+ impl ParsedPage {
114
+ fn from_rust(page: liteparse::types::ParsedPage) -> Self {
115
+ Self {
116
+ page_num: page.page_number as u32,
117
+ width: page.page_width as f64,
118
+ height: page.page_height as f64,
119
+ text: page.text,
120
+ text_items: page.text_items.into_iter().map(TextItem::from_rust).collect(),
121
+ }
122
+ }
123
+
124
+ fn page_num(&self) -> u32 { self.page_num }
125
+ fn width(&self) -> f64 { self.width }
126
+ fn height(&self) -> f64 { self.height }
127
+ fn text(&self) -> &str { &self.text }
128
+ fn text_items(&self) -> RArray {
129
+ let ruby = Ruby::get().unwrap();
130
+ let ary = ruby.ary_new();
131
+ for item in &self.text_items {
132
+ let _ = ary.push(item.clone());
133
+ }
134
+ ary
135
+ }
136
+ fn inspect(&self) -> String {
137
+ format!("#<LiteParse::ParsedPage page_num={} width={} height={} text_items_len={}>",
138
+ self.page_num, self.width, self.height, self.text_items.len())
139
+ }
140
+ }
141
+
142
+ #[magnus::wrap(class = "LiteParse::ParseResult")]
143
+ #[derive(Clone)]
144
+ struct ParseResult {
145
+ pages: Vec<ParsedPage>,
146
+ text: String,
147
+ images: Vec<ExtractedImage>,
148
+ }
149
+
150
+ impl ParseResult {
151
+ fn from_rust(result: liteparse::parser::ParseResult) -> Self {
152
+ Self {
153
+ pages: result.pages.into_iter().map(ParsedPage::from_rust).collect(),
154
+ text: result.text,
155
+ images: result.images.into_iter().map(ExtractedImage::from_rust).collect(),
156
+ }
157
+ }
158
+
159
+ fn pages(&self) -> RArray {
160
+ let ruby = Ruby::get().unwrap();
161
+ let ary = ruby.ary_new();
162
+ for page in &self.pages {
163
+ let _ = ary.push(page.clone());
164
+ }
165
+ ary
166
+ }
167
+ fn text(&self) -> &str { &self.text }
168
+ fn images(&self) -> RArray {
169
+ let ruby = Ruby::get().unwrap();
170
+ let ary = ruby.ary_new();
171
+ for img in &self.images {
172
+ let _ = ary.push(img.clone());
173
+ }
174
+ ary
175
+ }
176
+ fn num_pages(&self) -> usize { self.pages.len() }
177
+ fn get_page(&self, page_num: u32) -> Option<ParsedPage> {
178
+ self.pages.iter().find(|p| p.page_num == page_num).cloned()
179
+ }
180
+ fn inspect(&self) -> String {
181
+ format!("#<LiteParse::ParseResult pages={} text_len={} images={}>",
182
+ self.pages.len(), self.text.len(), self.images.len())
183
+ }
184
+ }
185
+
186
+ #[magnus::wrap(class = "LiteParse::ExtractedImage")]
187
+ #[derive(Clone)]
188
+ struct ExtractedImage {
189
+ id: String,
190
+ page: u32,
191
+ format: String,
192
+ bytes: Vec<u8>,
193
+ }
194
+
195
+ impl ExtractedImage {
196
+ fn from_rust(img: liteparse::types::ExtractedImage) -> Self {
197
+ Self { id: img.id, page: img.page, format: img.format, bytes: img.bytes }
198
+ }
199
+
200
+ fn id(&self) -> &str { &self.id }
201
+ fn page(&self) -> u32 { self.page }
202
+ fn format(&self) -> &str { &self.format }
203
+ fn inspect(&self) -> String {
204
+ format!("#<LiteParse::ExtractedImage id={:?} page={} format={:?} bytes_len={}>",
205
+ self.id, self.page, self.format, self.bytes.len())
206
+ }
207
+ }
208
+
209
+ fn extracted_image_bytes(rb_self: &ExtractedImage) -> RString {
210
+ Ruby::get().expect("Ruby not available").str_from_slice(&rb_self.bytes)
211
+ }
212
+
213
+ #[magnus::wrap(class = "LiteParse::ScreenshotResult")]
214
+ #[derive(Clone)]
215
+ struct ScreenshotResult {
216
+ page_num: u32,
217
+ width: u32,
218
+ height: u32,
219
+ image_bytes: Vec<u8>,
220
+ }
221
+
222
+ impl ScreenshotResult {
223
+ fn page_num(&self) -> u32 { self.page_num }
224
+ fn width(&self) -> u32 { self.width }
225
+ fn height(&self) -> u32 { self.height }
226
+ fn inspect(&self) -> String {
227
+ format!("#<LiteParse::ScreenshotResult page_num={} width={} height={}>",
228
+ self.page_num, self.width, self.height)
229
+ }
230
+ }
231
+
232
+ fn screenshot_result_image_bytes(rb_self: &ScreenshotResult) -> RString {
233
+ Ruby::get().expect("Ruby not available").str_from_slice(&rb_self.image_bytes)
234
+ }
235
+
236
+ #[magnus::wrap(class = "LiteParse::Config")]
237
+ #[derive(Clone)]
238
+ struct Config {
239
+ ocr_language: String,
240
+ ocr_enabled: bool,
241
+ ocr_server_url: Option<String>,
242
+ ocr_server_headers: Option<HashMap<String, String>>,
243
+ tessdata_path: Option<String>,
244
+ max_pages: usize,
245
+ target_pages: Option<String>,
246
+ dpi: f64,
247
+ output_format: String,
248
+ preserve_very_small_text: bool,
249
+ password: Option<String>,
250
+ quiet: bool,
251
+ num_workers: usize,
252
+ }
253
+
254
+ impl Config {
255
+ fn from_rust(cfg: &LiteParseConfig) -> Self {
256
+ Self {
257
+ ocr_language: cfg.ocr_language.clone(),
258
+ ocr_enabled: cfg.ocr_enabled,
259
+ ocr_server_url: cfg.ocr_server_url.clone(),
260
+ ocr_server_headers: if cfg.ocr_server_headers.is_empty() { None } else {
261
+ Some(cfg.ocr_server_headers.iter().cloned().collect()) },
262
+ tessdata_path: cfg.tessdata_path.clone(),
263
+ max_pages: cfg.max_pages,
264
+ target_pages: cfg.target_pages.clone(),
265
+ dpi: cfg.dpi as f64,
266
+ output_format: match cfg.output_format {
267
+ OutputFormat::Json => "json".to_string(),
268
+ OutputFormat::Text => "text".to_string(),
269
+ OutputFormat::Markdown => "markdown".to_string(),
270
+ },
271
+ preserve_very_small_text: cfg.preserve_very_small_text,
272
+ password: cfg.password.clone(),
273
+ quiet: cfg.quiet,
274
+ num_workers: cfg.num_workers,
275
+ }
276
+ }
277
+
278
+ fn ocr_language(&self) -> &str { &self.ocr_language }
279
+ fn ocr_enabled(&self) -> bool { self.ocr_enabled }
280
+ fn ocr_server_url(&self) -> Option<&str> { self.ocr_server_url.as_deref() }
281
+ fn ocr_server_headers(&self) -> Option<RHash> {
282
+ self.ocr_server_headers.clone().map(|h| {
283
+ let hash = Ruby::get().unwrap().hash_new();
284
+ for (k, v) in &h {
285
+ let _ = hash.aset(k.as_str(), v.as_str());
286
+ }
287
+ hash
288
+ })
289
+ }
290
+ fn tessdata_path(&self) -> Option<&str> { self.tessdata_path.as_deref() }
291
+ fn max_pages(&self) -> usize { self.max_pages }
292
+ fn target_pages(&self) -> Option<&str> { self.target_pages.as_deref() }
293
+ fn dpi(&self) -> f64 { self.dpi }
294
+ fn output_format(&self) -> &str { &self.output_format }
295
+ fn preserve_very_small_text(&self) -> bool { self.preserve_very_small_text }
296
+ fn password(&self) -> Option<&str> { self.password.as_deref() }
297
+ fn quiet(&self) -> bool { self.quiet }
298
+ fn num_workers(&self) -> usize { self.num_workers }
299
+ fn inspect(&self) -> String {
300
+ format!("#<LiteParse::Config ocr_enabled={} dpi={} max_pages={}>",
301
+ self.ocr_enabled, self.dpi, self.max_pages)
302
+ }
303
+ }
304
+
305
+ // ---------------------------------------------------------------------------
306
+ // Main LiteParse class
307
+ // ---------------------------------------------------------------------------
308
+
309
+ struct Inner {
310
+ parser: liteparse::parser::LiteParse,
311
+ config: LiteParseConfig,
312
+ runtime: tokio::runtime::Runtime,
313
+ }
314
+
315
+ #[magnus::wrap(class = "LiteParse::LiteParse")]
316
+ struct LiteParse {
317
+ inner: Arc<Mutex<Inner>>,
318
+ }
319
+
320
+ impl LiteParse {
321
+ fn new(kwargs: Option<RHash>) -> Result<Self, Error> {
322
+ // Start with upstream defaults then apply kwargs
323
+ let mut cfg = LiteParseConfig {
324
+ ocr_enabled: true,
325
+ dpi: 150.0,
326
+ max_pages: 1000,
327
+ ..Default::default()
328
+ };
329
+
330
+ if let Some(ref kwargs) = kwargs {
331
+ if let Some(v) = kwarg::<String>(kwargs, "ocr_language") { cfg.ocr_language = v; }
332
+ if let Some(v) = kwarg_bool(kwargs, "ocr_enabled") { cfg.ocr_enabled = v; }
333
+ if let Some(v) = kwarg::<String>(kwargs, "ocr_server_url") { cfg.ocr_server_url = Some(v); }
334
+ if let Some(v) = kwarg::<HashMap<String, String>>(kwargs, "ocr_server_headers") {
335
+ cfg.ocr_server_headers = v.into_iter().collect();
336
+ }
337
+ if let Some(v) = kwarg::<String>(kwargs, "tessdata_path") { cfg.tessdata_path = Some(v); }
338
+ if let Some(v) = kwarg::<usize>(kwargs, "max_pages") { cfg.max_pages = v; }
339
+ if let Some(v) = kwarg::<String>(kwargs, "target_pages") { cfg.target_pages = Some(v); }
340
+ if let Some(v) = kwarg::<f64>(kwargs, "dpi") { cfg.dpi = v as f32; }
341
+ if let Some(v) = kwarg::<String>(kwargs, "output_format") {
342
+ cfg.output_format = match v.as_str() {
343
+ "text" => OutputFormat::Text,
344
+ "markdown" | "md" => OutputFormat::Markdown,
345
+ _ => OutputFormat::Json,
346
+ };
347
+ }
348
+ if let Some(v) = kwarg_bool(kwargs, "preserve_very_small_text") { cfg.preserve_very_small_text = v; }
349
+ if let Some(v) = kwarg::<String>(kwargs, "password") { cfg.password = Some(v); }
350
+ if let Some(v) = kwarg_bool(kwargs, "quiet") { cfg.quiet = v; }
351
+ if let Some(v) = kwarg::<usize>(kwargs, "num_workers") { cfg.num_workers = v; }
352
+ if let Some(v) = kwarg::<String>(kwargs, "image_mode") {
353
+ cfg.image_mode = match v.as_str() {
354
+ "off" | "none" => ImageMode::Off,
355
+ "embed" => ImageMode::Embed,
356
+ _ => ImageMode::Placeholder,
357
+ };
358
+ }
359
+ if let Some(v) = kwarg_bool(kwargs, "extract_links") { cfg.extract_links = v; }
360
+ }
361
+
362
+ let parser = liteparse::parser::LiteParse::new(cfg.clone());
363
+ let runtime = tokio::runtime::Runtime::new()
364
+ .map_err(|e| Error::new(Ruby::get().unwrap().exception_runtime_error(), e.to_string()))?;
365
+
366
+ Ok(Self { inner: Arc::new(Mutex::new(Inner { parser, config: cfg, runtime })) })
367
+ }
368
+
369
+ fn parse(&self, input: String) -> Result<ParseResult, Error> {
370
+ let pdf_input = PdfInput::Path(input);
371
+ let locked = self.inner.blocking_lock();
372
+ let result = locked.runtime.block_on(locked.parser.parse_input(pdf_input))
373
+ .map_err(|e| runtime_err(e.to_string()))?;
374
+ Ok(ParseResult::from_rust(result))
375
+ }
376
+
377
+ fn parse_bytes(&self, data: Vec<u8>) -> Result<ParseResult, Error> {
378
+ let pdf_input = PdfInput::Bytes(data);
379
+ let locked = self.inner.blocking_lock();
380
+ let result = locked.runtime.block_on(locked.parser.parse_input(pdf_input))
381
+ .map_err(|e| runtime_err(e.to_string()))?;
382
+ Ok(ParseResult::from_rust(result))
383
+ }
384
+
385
+ fn screenshot(&self, input: String, page_numbers: Option<Vec<u32>>) -> Result<RArray, Error> {
386
+ let locked = self.inner.blocking_lock();
387
+ let results = locked.runtime.block_on(locked.parser.screenshot(&input, page_numbers))
388
+ .map_err(|e| runtime_err(e.to_string()))?;
389
+ Ok(screenshots_to_ruby(results))
390
+ }
391
+
392
+ fn config(&self) -> Config {
393
+ Config::from_rust(&self.inner.blocking_lock().config)
394
+ }
395
+
396
+ fn inspect(&self) -> String {
397
+ let inner = self.inner.blocking_lock();
398
+ format!("#<LiteParse::LiteParse ocr_enabled={} dpi={} max_pages={}>",
399
+ inner.config.ocr_enabled, inner.config.dpi, inner.config.max_pages)
400
+ }
401
+ }
402
+
403
+ fn search_items(items: RArray, phrase: String, case_sensitive: Option<bool>) -> RArray {
404
+ let case_sensitive = case_sensitive.unwrap_or(false);
405
+ let ruby = Ruby::get().unwrap();
406
+ let ary = ruby.ary_new();
407
+
408
+ let mut rust_items: Vec<liteparse::types::TextItem> = Vec::new();
409
+ for item_value in items.into_iter() {
410
+ let obj: Obj<TextItem> = match Obj::<TextItem>::try_convert(item_value) {
411
+ Ok(v) => v,
412
+ _ => continue,
413
+ };
414
+ rust_items.push(obj.to_rust());
415
+ }
416
+
417
+ let options = liteparse::search::SearchOptions { phrase, case_sensitive };
418
+ let matches = liteparse::search::search_items(&rust_items, &options);
419
+ for m in matches {
420
+ let _ = ary.push(TextItem::from_rust(m));
421
+ }
422
+ ary
423
+ }
424
+
425
+ fn run_cli(_args: Vec<String>) -> Result<(), Error> {
426
+ Err(runtime_err("CLI not available in Ruby gem."))
427
+ }
428
+
429
+ fn runtime_err(msg: impl ToString) -> Error {
430
+ Error::new(Ruby::get().unwrap().exception_runtime_error(), msg.to_string())
431
+ }
432
+
433
+ // ---------------------------------------------------------------------------
434
+ // Init
435
+ // ---------------------------------------------------------------------------
436
+
437
+ /// Exposed so the ext crate's `#[magnus::init]` can call it.
438
+ /// All classes / methods are registered inside here.
439
+ pub fn define_liteparse_module(ruby: &Ruby) -> Result<(), Error> {
440
+ let module = ruby.define_module("LiteParse")?;
441
+
442
+ // TextItem
443
+ let text_item = module.define_class("TextItem", ruby.class_object())?;
444
+ text_item.define_method("text", method!(TextItem::text, 0))?;
445
+ text_item.define_method("x", method!(TextItem::x, 0))?;
446
+ text_item.define_method("y", method!(TextItem::y, 0))?;
447
+ text_item.define_method("width", method!(TextItem::width, 0))?;
448
+ text_item.define_method("height", method!(TextItem::height, 0))?;
449
+ text_item.define_method("font_name", method!(TextItem::font_name, 0))?;
450
+ text_item.define_method("font_size", method!(TextItem::font_size, 0))?;
451
+ text_item.define_method("confidence", method!(TextItem::confidence, 0))?;
452
+ text_item.define_method("inspect", method!(TextItem::inspect, 0))?;
453
+ text_item.define_method("to_s", method!(TextItem::inspect, 0))?;
454
+
455
+ // ParsedPage
456
+ let parsed_page = module.define_class("ParsedPage", ruby.class_object())?;
457
+ parsed_page.define_method("page_num", method!(ParsedPage::page_num, 0))?;
458
+ parsed_page.define_method("width", method!(ParsedPage::width, 0))?;
459
+ parsed_page.define_method("height", method!(ParsedPage::height, 0))?;
460
+ parsed_page.define_method("text", method!(ParsedPage::text, 0))?;
461
+ parsed_page.define_method("text_items", method!(ParsedPage::text_items, 0))?;
462
+ parsed_page.define_method("inspect", method!(ParsedPage::inspect, 0))?;
463
+ parsed_page.define_method("to_s", method!(ParsedPage::inspect, 0))?;
464
+
465
+ // ParseResult
466
+ let parse_result = module.define_class("ParseResult", ruby.class_object())?;
467
+ parse_result.define_method("pages", method!(ParseResult::pages, 0))?;
468
+ parse_result.define_method("text", method!(ParseResult::text, 0))?;
469
+ parse_result.define_method("images", method!(ParseResult::images, 0))?;
470
+ parse_result.define_method("num_pages", method!(ParseResult::num_pages, 0))?;
471
+ parse_result.define_method("get_page", method!(ParseResult::get_page, 1))?;
472
+ parse_result.define_method("inspect", method!(ParseResult::inspect, 0))?;
473
+ parse_result.define_method("to_s", method!(ParseResult::inspect, 0))?;
474
+
475
+ // ExtractedImage
476
+ let extracted_image = module.define_class("ExtractedImage", ruby.class_object())?;
477
+ extracted_image.define_method("id", method!(ExtractedImage::id, 0))?;
478
+ extracted_image.define_method("page", method!(ExtractedImage::page, 0))?;
479
+ extracted_image.define_method("format", method!(ExtractedImage::format, 0))?;
480
+ extracted_image.define_method("bytes", method!(extracted_image_bytes, 0))?;
481
+ extracted_image.define_method("inspect", method!(ExtractedImage::inspect, 0))?;
482
+ extracted_image.define_method("to_s", method!(ExtractedImage::inspect, 0))?;
483
+
484
+ // ScreenshotResult
485
+ let screenshot_result = module.define_class("ScreenshotResult", ruby.class_object())?;
486
+ screenshot_result.define_method("page_num", method!(ScreenshotResult::page_num, 0))?;
487
+ screenshot_result.define_method("width", method!(ScreenshotResult::width, 0))?;
488
+ screenshot_result.define_method("height", method!(ScreenshotResult::height, 0))?;
489
+ screenshot_result.define_method("image_bytes", method!(screenshot_result_image_bytes, 0))?;
490
+ screenshot_result.define_method("inspect", method!(ScreenshotResult::inspect, 0))?;
491
+ screenshot_result.define_method("to_s", method!(ScreenshotResult::inspect, 0))?;
492
+
493
+ // Config
494
+ let config_class = module.define_class("Config", ruby.class_object())?;
495
+ config_class.define_method("ocr_language", method!(Config::ocr_language, 0))?;
496
+ config_class.define_method("ocr_enabled", method!(Config::ocr_enabled, 0))?;
497
+ config_class.define_method("ocr_server_url", method!(Config::ocr_server_url, 0))?;
498
+ config_class.define_method("ocr_server_headers", method!(Config::ocr_server_headers, 0))?;
499
+ config_class.define_method("tessdata_path", method!(Config::tessdata_path, 0))?;
500
+ config_class.define_method("max_pages", method!(Config::max_pages, 0))?;
501
+ config_class.define_method("target_pages", method!(Config::target_pages, 0))?;
502
+ config_class.define_method("dpi", method!(Config::dpi, 0))?;
503
+ config_class.define_method("output_format", method!(Config::output_format, 0))?;
504
+ config_class.define_method("preserve_very_small_text", method!(Config::preserve_very_small_text, 0))?;
505
+ config_class.define_method("password", method!(Config::password, 0))?;
506
+ config_class.define_method("quiet", method!(Config::quiet, 0))?;
507
+ config_class.define_method("num_workers", method!(Config::num_workers, 0))?;
508
+ config_class.define_method("inspect", method!(Config::inspect, 0))?;
509
+ config_class.define_method("to_s", method!(Config::inspect, 0))?;
510
+
511
+ // Main LiteParse class
512
+ let liteparse_class = module.define_class("LiteParse", ruby.class_object())?;
513
+ liteparse_class.define_singleton_method("new", function!(LiteParse::new, 1))?;
514
+ liteparse_class.define_method("parse", method!(LiteParse::parse, 1))?;
515
+ liteparse_class.define_method("parse_bytes", method!(LiteParse::parse_bytes, 1))?;
516
+ liteparse_class.define_method("screenshot", method!(LiteParse::screenshot, 2))?;
517
+ liteparse_class.define_method("config", method!(LiteParse::config, 0))?;
518
+ liteparse_class.define_method("inspect", method!(LiteParse::inspect, 0))?;
519
+ liteparse_class.define_method("to_s", method!(LiteParse::inspect, 0))?;
520
+
521
+ // Module functions
522
+ module.define_module_function("search_items", function!(search_items, 3))?;
523
+ module.define_module_function("run_cli", function!(run_cli, 1))?;
524
+
525
+ // ParseError exception
526
+ module.define_class("ParseError", ruby.exception_exception().as_r_class())?;
527
+
528
+ Ok(())
529
+ }
@@ -0,0 +1,4 @@
1
+ require "mkmf"
2
+ require "rb_sys/mkmf"
3
+
4
+ create_rust_makefile("liteparse/liteparse")
@@ -0,0 +1,8 @@
1
+ // Re-export all types and functions from the workspace crate.
2
+ pub use liteparse_ruby::*;
3
+
4
+ /// Registers all LiteParse classes/modules with Ruby.
5
+ #[magnus::init]
6
+ fn init(ruby: &magnus::Ruby) -> Result<(), magnus::Error> {
7
+ liteparse_ruby::define_liteparse_module(ruby)
8
+ }
@@ -0,0 +1,16 @@
1
+ require_relative "../liteparse"
2
+
3
module LiteParse
  # CLI support is handled by the native `lit` binary.
  # Run `lit --help` from the command line for usage.
  module CLI
    module_function

    # Forwards +args+ (default: ARGV) to LiteParse.run_cli and returns its
    # result. If a StandardError is raised, prints "Error: <message>" to
    # stderr and exits the process with status 1.
    def run(args = ARGV)
      begin
        LiteParse.run_cli(args)
      rescue StandardError => e
        $stderr.puts "Error: #{e.message}"
        exit(1)
      end
    end
  end
end
Binary file
@@ -0,0 +1,13 @@
1
+ require_relative "types"
2
+
3
+ module LiteParse
4
+ # LiteParse is defined natively in the Rust extension.
5
+ # This file exists to mirror the Python wrapper structure and provide
6
+ # a convenient require path.
7
+ #
8
+ # Usage:
9
+ # require "liteparse"
10
+ # parser = LiteParse::LiteParse.new(ocr_enabled: true)
11
+ # result = parser.parse("document.pdf")
12
+ # puts result.text
13
+ end
@@ -0,0 +1,9 @@
1
+ require_relative "liteparse/liteparse"
2
+
3
+ # All types (TextItem, ParsedPage, ParseResult, etc.) are defined natively
4
+ # in the Rust extension. This file re-exports them for convenience.
5
+
6
+ module LiteParse
7
+ # No additional Ruby wrapping needed — the native classes are registered
8
+ # directly on the LiteParse module by the Rust init function.
9
+ end
@@ -0,0 +1,3 @@
1
+ module LiteParse
2
+ VERSION = "0.1.7"
3
+ end
data/lib/liteparse.rb ADDED
@@ -0,0 +1,18 @@
1
+ require_relative "liteparse/version"
2
+ require_relative "liteparse/liteparse"
3
+
4
module LiteParse
  # Base error class for the gem's Ruby layer.
  class Error < StandardError
  end
end
7
+
8
# Wrap native new to accept 0 args (the Rust constructor expects 1 positional arg).
# Keyword arguments are collected into a Hash; when none are given, nil is
# passed to the native constructor instead of an empty Hash.
class << LiteParse::LiteParse
  alias_method :native_new, :new

  def new(**kwargs)
    options = kwargs.empty? ? nil : kwargs
    native_new(options)
  end
end
13
+
14
# Wrap screenshot to accept page_numbers as keyword arg (matching Python API).
# The native method takes two positional arguments; page_numbers defaults to nil.
class LiteParse::LiteParse
  alias_method :native_screenshot, :screenshot

  def screenshot(input, page_numbers: nil)
    native_screenshot(input, page_numbers)
  end
end