doctra 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. doctra/__init__.py +19 -0
  2. doctra/cli/__init__.py +27 -0
  3. doctra/cli/main.py +856 -0
  4. doctra/cli/utils.py +340 -0
  5. doctra/engines/__init__.py +0 -0
  6. doctra/engines/layout/__init__.py +0 -0
  7. doctra/engines/layout/layout_models.py +90 -0
  8. doctra/engines/layout/paddle_layout.py +225 -0
  9. doctra/engines/ocr/__init__.py +4 -0
  10. doctra/engines/ocr/api.py +36 -0
  11. doctra/engines/ocr/path_resolver.py +48 -0
  12. doctra/engines/ocr/pytesseract_engine.py +76 -0
  13. doctra/engines/vlm/__init__.py +0 -0
  14. doctra/engines/vlm/outlines_types.py +31 -0
  15. doctra/engines/vlm/provider.py +58 -0
  16. doctra/engines/vlm/service.py +117 -0
  17. doctra/exporters/__init__.py +0 -0
  18. doctra/exporters/excel_writer.py +197 -0
  19. doctra/exporters/image_saver.py +42 -0
  20. doctra/exporters/markdown_table.py +56 -0
  21. doctra/exporters/markdown_writer.py +29 -0
  22. doctra/parsers/__init__.py +6 -0
  23. doctra/parsers/layout_order.py +16 -0
  24. doctra/parsers/structured_pdf_parser.py +434 -0
  25. doctra/parsers/table_chart_extractor.py +283 -0
  26. doctra/utils/__init__.py +0 -0
  27. doctra/utils/bbox.py +18 -0
  28. doctra/utils/constants.py +8 -0
  29. doctra/utils/file_ops.py +26 -0
  30. doctra/utils/io_utils.py +10 -0
  31. doctra/utils/ocr_utils.py +20 -0
  32. doctra/utils/pdf_io.py +19 -0
  33. doctra/utils/quiet.py +13 -0
  34. doctra/utils/structured_utils.py +49 -0
  35. doctra/version.py +2 -0
  36. doctra-0.1.0.dist-info/METADATA +626 -0
  37. doctra-0.1.0.dist-info/RECORD +40 -0
  38. doctra-0.1.0.dist-info/WHEEL +5 -0
  39. doctra-0.1.0.dist-info/licenses/LICENSE +201 -0
  40. doctra-0.1.0.dist-info/top_level.txt +1 -0
doctra/cli/main.py ADDED
@@ -0,0 +1,856 @@
1
+ """
2
+ Doctra CLI - Command line interface for document processing
3
+
4
+ This module provides a comprehensive CLI for the Doctra library, enabling
5
+ users to process PDF documents, extract charts/tables, visualize layout
6
+ detection results, and analyze document structure from the command line.
7
+ """
8
+
9
+ import click
10
+ import os
11
+ import sys
12
+ from pathlib import Path
13
+ from typing import Optional
14
+
15
# Import parsers.
# NOTE(review): the published 0.1.0 wheel appears to ship only
# doctra/parsers/structured_pdf_parser.py and doctra/parsers/table_chart_extractor.py,
# so this first attempt presumably always raises ImportError in an installed
# package and the fallback branch does the real work -- confirm, and consider
# importing the shipped modules first.
try:
    from doctra.parsers.structured_pdf_parser_enhancer import StructuredPDFParser
    from doctra.parsers.chart_table_pdf_parser import ChartTablePDFParser
except ImportError:
    # Fallback for development/testing: put the directory three levels above
    # this file (the one that contains the ``doctra`` package) at the front of
    # sys.path, then import the parsers from their alternative module names.
    project_root = Path(__file__).parent.parent.parent
    sys.path.insert(0, str(project_root))
    from doctra.parsers.structured_pdf_parser import StructuredPDFParser
    from doctra.parsers.table_chart_extractor import ChartTablePDFParser
25
+
26
+
27
def _doctra_version() -> str:
    """
    Return the version string reported by ``doctra --version``.

    Reads the installed distribution metadata so the CLI cannot drift from the
    released package version; falls back to "0.1.0" when no metadata is
    available (e.g. running from a source checkout).

    :return: Version string of the doctra distribution
    """
    fallback = "0.1.0"
    try:
        from importlib import metadata
    except ImportError:
        # importlib.metadata only exists on Python 3.8+.
        return fallback
    try:
        return metadata.version("doctra")
    except metadata.PackageNotFoundError:
        return fallback


@click.group(invoke_without_command=True)
@click.pass_context
@click.version_option(version=_doctra_version(), prog_name="doctra")
def cli(ctx):
    """
    šŸ”¬ Doctra - Advanced Document Processing Library

    Extract text, tables, charts, and figures from PDF documents using
    layout detection, OCR, and optional VLM (Vision Language Model) enhancement.

    \b
    Commands:
    parse Full document parsing with text, tables, charts, and figures
    extract Extract only charts and/or tables from documents
    visualize Visualize layout detection results
    analyze Quick document analysis without processing
    info Show system information and dependencies

    \b
    Examples:
    doctra parse document.pdf # Full document parsing
    doctra extract charts document.pdf # Extract only charts
    doctra extract both document.pdf --use-vlm # Extract charts & tables with VLM
    doctra visualize document.pdf # Visualize layout detection
    doctra analyze document.pdf # Quick document analysis
    doctra info # System information

    For more help on any command, use: doctra COMMAND --help
    \f

    :param ctx: Click context object containing command information
    :return: None
    """
    # Everything after the form feed above is developer documentation; Click
    # truncates the --help text at that marker.
    if ctx.invoked_subcommand is None:
        # Bare ``doctra`` invocation: show the help screen instead of doing nothing.
        click.echo(ctx.get_help())
61
+
62
+
63
+ # Common options for VLM configuration
64
def vlm_options(func):
    """
    Attach the shared VLM command-line options to a Click command.

    The options are applied in this order:
    - --use-vlm/--no-vlm: toggle VLM processing (off by default)
    - --vlm-provider: 'gemini' or 'openai' (default 'gemini')
    - --vlm-model: model name, no default
    - --vlm-api-key: API key, also read from the VLM_API_KEY env var

    :param func: The Click command function to decorate
    :return: The same function with the VLM options attached
    """
    option_stack = (
        click.option('--use-vlm/--no-vlm', default=False,
                     help='Use Vision Language Model for table/chart extraction'),
        click.option('--vlm-provider', type=click.Choice(['gemini', 'openai']), default='gemini',
                     help='VLM provider to use (default: gemini)'),
        click.option('--vlm-model', type=str, default=None,
                     help='Model name to use (defaults to provider-specific defaults)'),
        click.option('--vlm-api-key', type=str, envvar='VLM_API_KEY',
                     help='API key for VLM provider (or set VLM_API_KEY env var)'),
    )
    decorated = func
    for attach in option_stack:
        decorated = attach(decorated)
    return decorated
86
+
87
+
88
+ # Common options for layout detection
89
def layout_options(func):
    """
    Attach the shared layout-detection options to a Click command.

    The options are applied in this order:
    - --layout-model: layout detection model name (default PP-DocLayout_plus-L)
    - --dpi: integer DPI for PDF rendering (default 200)
    - --min-score: float minimum confidence score (default 0.0)

    :param func: The Click command function to decorate
    :return: The same function with the layout options attached
    """
    option_stack = (
        click.option('--layout-model', default='PP-DocLayout_plus-L',
                     help='Layout detection model name (default: PP-DocLayout_plus-L)'),
        click.option('--dpi', type=int, default=200,
                     help='DPI for PDF rendering (default: 200)'),
        click.option('--min-score', type=float, default=0.0,
                     help='Minimum confidence score for layout detection (default: 0.0)'),
    )
    decorated = func
    for attach in option_stack:
        decorated = attach(decorated)
    return decorated
108
+
109
+
110
+ # Common options for OCR
111
def ocr_options(func):
    """
    Attach the shared OCR options to a Click command.

    The options are applied in this order:
    - --ocr-lang: OCR language code (default 'eng')
    - --ocr-psm: Tesseract page segmentation mode (default 4)
    - --ocr-oem: Tesseract OCR engine mode (default 3)
    - --ocr-config: additional Tesseract configuration string (default '')

    :param func: The Click command function to decorate
    :return: The same function with the OCR options attached
    """
    option_stack = (
        click.option('--ocr-lang', default='eng',
                     help='OCR language code (default: eng)'),
        click.option('--ocr-psm', type=int, default=4,
                     help='Tesseract page segmentation mode (default: 4)'),
        click.option('--ocr-oem', type=int, default=3,
                     help='Tesseract OCR engine mode (default: 3)'),
        click.option('--ocr-config', default='',
                     help='Additional Tesseract configuration string'),
    )
    decorated = func
    for attach in option_stack:
        decorated = attach(decorated)
    return decorated
133
+
134
+
135
def validate_vlm_config(use_vlm: bool, vlm_api_key: Optional[str]) -> None:
    """
    Abort the command when VLM is requested without an API key.

    Returns silently when VLM is disabled or a non-empty key is present.
    Otherwise prints a three-line explanation to stderr and exits with
    status 1.

    :param use_vlm: Whether VLM processing is enabled
    :param vlm_api_key: The VLM API key (may be None or empty)
    :return: None
    :raises SystemExit: If VLM is enabled but no API key is provided
    """
    if not use_vlm or vlm_api_key:
        return

    complaint = (
        "āŒ Error: VLM API key is required when using --use-vlm",
        " Set the VLM_API_KEY environment variable or use --vlm-api-key",
        " Example: export VLM_API_KEY=your_api_key",
    )
    for line in complaint:
        click.echo(line, err=True)
    sys.exit(1)
152
+
153
+
154
@cli.command()
@click.argument('pdf_path', type=click.Path(exists=True, path_type=Path))
@click.option('--output-dir', '-o', type=click.Path(path_type=Path),
              help='Output directory (default: outputs/{pdf_filename})')
@vlm_options
@layout_options
@ocr_options
@click.option('--box-separator', default='\n',
              help='Separator between text boxes in output (default: newline)')
@click.option('--verbose', '-v', is_flag=True,
              help='Enable verbose output')
def parse(pdf_path: Path, output_dir: Optional[Path], use_vlm: bool,
          vlm_provider: str, vlm_model: Optional[str], vlm_api_key: Optional[str],
          layout_model: str, dpi: int, min_score: float,
          ocr_lang: str, ocr_psm: int, ocr_oem: int, ocr_config: str,
          box_separator: str, verbose: bool):
    """
    Parse a PDF document and extract all structured content.

    Performs comprehensive document processing including text extraction,
    layout detection, OCR, and optional VLM-based table/chart extraction.
    Outputs markdown file and optionally Excel file with structured data.

    \b
    Examples:
    doctra parse document.pdf
    doctra parse document.pdf --use-vlm --vlm-api-key your_key
    doctra parse document.pdf -o ./results --dpi 300
    doctra parse document.pdf --vlm-provider openai --use-vlm

    \b
    VLM Setup:
    Set environment variable: export VLM_API_KEY=your_api_key
    Or use: --vlm-api-key your_api_key
    \f

    :param pdf_path: Path to the input PDF file
    :param output_dir: Output directory for results (optional)
    :param use_vlm: Whether to use VLM for enhanced extraction
    :param vlm_provider: VLM provider ('gemini' or 'openai')
    :param vlm_model: Model name to use (defaults to provider-specific defaults)
    :param vlm_api_key: API key for VLM provider
    :param layout_model: Layout detection model name
    :param dpi: DPI for PDF rendering
    :param min_score: Minimum confidence score for layout detection
    :param ocr_lang: OCR language code
    :param ocr_psm: Tesseract page segmentation mode
    :param ocr_oem: Tesseract OCR engine mode
    :param ocr_config: Additional Tesseract configuration
    :param box_separator: Separator between text boxes in output
    :param verbose: Whether to enable verbose output
    :return: None
    :raises SystemExit: Status 1 on a configuration, initialization or parsing
        error; status 130 when interrupted with Ctrl-C
    """
    validate_vlm_config(use_vlm, vlm_api_key)

    # Resolve both paths BEFORE any chdir. Path.absolute() is evaluated against
    # the current working directory, so resolving a relative path after
    # os.chdir(output_dir) would point inside the output directory instead of
    # where the user invoked the command.
    pdf_file = pdf_path.absolute()
    target_dir = output_dir.absolute() if output_dir is not None else None

    if verbose:
        click.echo("šŸ” Starting full PDF parsing...")
        click.echo(f" Input: {pdf_path}")
        if output_dir:
            click.echo(f" Output: {output_dir}")

    # Create parser instance
    try:
        if verbose:
            click.echo("šŸ”§ Initializing full parser...")
            if use_vlm:
                click.echo(f" VLM Provider: {vlm_provider}")
                click.echo(f" VLM Model: {vlm_model or 'default'}")
            click.echo(f" Layout Model: {layout_model}")
            click.echo(f" DPI: {dpi}")
            click.echo(f" OCR Language: {ocr_lang}")
        else:
            click.echo("šŸ” Initializing full document parser...")
            if use_vlm:
                click.echo(f" Using VLM: {vlm_provider}")

        parser = StructuredPDFParser(
            use_vlm=use_vlm,
            vlm_provider=vlm_provider,
            vlm_model=vlm_model,
            vlm_api_key=vlm_api_key,
            layout_model_name=layout_model,
            dpi=dpi,
            min_score=min_score,
            ocr_lang=ocr_lang,
            ocr_psm=ocr_psm,
            ocr_oem=ocr_oem,
            ocr_extra_config=ocr_config,
            box_separator=box_separator,
        )
    except Exception as e:
        click.echo(f"āŒ Error initializing parser: {e}", err=True)
        if verbose:
            import traceback
            click.echo(traceback.format_exc(), err=True)
        sys.exit(1)

    original_cwd = os.getcwd()
    try:
        # Change to the output directory if one was specified. Creating and
        # entering it inside the try block means a failure here is reported
        # like any other error and the working directory is always restored.
        # NOTE(review): presumably the parser writes its results relative to
        # the current working directory -- confirm against StructuredPDFParser.
        if target_dir is not None:
            target_dir.mkdir(parents=True, exist_ok=True)
            os.chdir(target_dir)
            click.echo(f"šŸ“ Output directory: {target_dir}")

        # Parse the document
        click.echo(f"šŸ“„ Processing: {pdf_path.name}")
        parser.parse(str(pdf_file))
        click.echo("āœ… Full document processing completed successfully!")

    except KeyboardInterrupt:
        click.echo("\nāš ļø Processing interrupted by user", err=True)
        sys.exit(130)
    except Exception as e:
        click.echo(f"āŒ Error during parsing: {e}", err=True)
        if verbose:
            import traceback
            click.echo(traceback.format_exc(), err=True)
        sys.exit(1)
    finally:
        # Restore original working directory (runs even on sys.exit above).
        os.chdir(original_cwd)
275
+
276
+
277
@cli.group(invoke_without_command=True)
@click.pass_context
def extract(ctx):
    """
    Extract charts and/or tables from PDF documents.

    This command focuses specifically on chart and table extraction,
    providing faster processing when you only need these elements.

    \b
    Subcommands:
    charts Extract only charts from the document
    tables Extract only tables from the document
    both Extract both charts and tables

    \b
    Examples:
    doctra extract charts document.pdf
    doctra extract tables document.pdf --use-vlm
    doctra extract both document.pdf --output-dir ./results
    \f

    :param ctx: Click context object containing command information
    :return: None
    """
    # The form feed above stops Click from printing the Sphinx fields in --help.
    if ctx.invoked_subcommand is None:
        # ``doctra extract`` without a subcommand: show the group help.
        click.echo(ctx.get_help())
303
+
304
+
305
@extract.command()
@click.argument('pdf_path', type=click.Path(exists=True, path_type=Path))
@click.option('--output-dir', '-o', type=click.Path(path_type=Path), default=Path("outputs"),
              help='Output base directory (default: outputs)')
@vlm_options
@layout_options
@click.option('--verbose', '-v', is_flag=True, help='Enable verbose output')
def charts(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
           vlm_model: Optional[str], vlm_api_key: Optional[str],
           layout_model: str, dpi: int, min_score: float, verbose: bool):
    """
    Extract only charts from a PDF document.

    Saves chart images and optionally converts them to structured data using VLM.

    \b
    Examples:
    doctra extract charts document.pdf
    doctra extract charts document.pdf --use-vlm --vlm-api-key your_key
    doctra extract charts document.pdf -o ./my_outputs --dpi 300
    \f

    :param pdf_path: Path to the input PDF file
    :param output_dir: Output base directory for results
    :param use_vlm: Whether to use VLM for enhanced chart extraction
    :param vlm_provider: VLM provider ('gemini' or 'openai')
    :param vlm_model: Model name to use (defaults to provider-specific defaults)
    :param vlm_api_key: API key for VLM provider
    :param layout_model: Layout detection model name
    :param dpi: DPI for PDF rendering
    :param min_score: Minimum confidence score for layout detection
    :param verbose: Whether to enable verbose output
    :return: None
    :raises SystemExit: Status 1 on a configuration or extraction error;
        status 130 when interrupted with Ctrl-C
    """
    validate_vlm_config(use_vlm, vlm_api_key)

    if verbose:
        click.echo("šŸ“Š Starting chart extraction...")
        click.echo(f" Input: {pdf_path}")
        click.echo(f" Output base: {output_dir}")

    try:
        if verbose:
            click.echo("šŸ”§ Initializing chart extractor...")
            if use_vlm:
                click.echo(f" VLM Provider: {vlm_provider}")
        else:
            click.echo("šŸ“Š Initializing chart extractor...")
            if use_vlm:
                click.echo(f" Using VLM: {vlm_provider}")

        # Charts only: table extraction is switched off explicitly.
        parser = ChartTablePDFParser(
            extract_charts=True,
            extract_tables=False,
            use_vlm=use_vlm,
            vlm_provider=vlm_provider,
            vlm_model=vlm_model,
            vlm_api_key=vlm_api_key,
            layout_model_name=layout_model,
            dpi=dpi,
            min_score=min_score,
        )

        click.echo(f"šŸ“„ Processing: {pdf_path.name}")
        parser.parse(str(pdf_path), str(output_dir))
        click.echo("āœ… Chart extraction completed successfully!")

    except KeyboardInterrupt:
        click.echo("\nāš ļø Extraction interrupted by user", err=True)
        sys.exit(130)
    except Exception as e:
        click.echo(f"āŒ Error during chart extraction: {e}", err=True)
        if verbose:
            import traceback
            click.echo(traceback.format_exc(), err=True)
        sys.exit(1)
380
+
381
+
382
@extract.command()
@click.argument('pdf_path', type=click.Path(exists=True, path_type=Path))
@click.option('--output-dir', '-o', type=click.Path(path_type=Path), default=Path("outputs"),
              help='Output base directory (default: outputs)')
@vlm_options
@layout_options
@click.option('--verbose', '-v', is_flag=True, help='Enable verbose output')
def tables(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
           vlm_model: Optional[str], vlm_api_key: Optional[str],
           layout_model: str, dpi: int, min_score: float, verbose: bool):
    """
    Extract only tables from a PDF document.

    Saves table images and optionally converts them to structured data using VLM.

    \b
    Examples:
    doctra extract tables document.pdf
    doctra extract tables document.pdf --use-vlm --vlm-api-key your_key
    doctra extract tables document.pdf -o ./my_outputs --min-score 0.5
    \f

    :param pdf_path: Path to the input PDF file
    :param output_dir: Output base directory for results
    :param use_vlm: Whether to use VLM for enhanced table extraction
    :param vlm_provider: VLM provider ('gemini' or 'openai')
    :param vlm_model: Model name to use (defaults to provider-specific defaults)
    :param vlm_api_key: API key for VLM provider
    :param layout_model: Layout detection model name
    :param dpi: DPI for PDF rendering
    :param min_score: Minimum confidence score for layout detection
    :param verbose: Whether to enable verbose output
    :return: None
    :raises SystemExit: Status 1 on a configuration or extraction error;
        status 130 when interrupted with Ctrl-C
    """
    validate_vlm_config(use_vlm, vlm_api_key)

    if verbose:
        click.echo("šŸ“‹ Starting table extraction...")
        click.echo(f" Input: {pdf_path}")
        click.echo(f" Output base: {output_dir}")

    try:
        if verbose:
            click.echo("šŸ”§ Initializing table extractor...")
            if use_vlm:
                click.echo(f" VLM Provider: {vlm_provider}")
        else:
            click.echo("šŸ“‹ Initializing table extractor...")
            if use_vlm:
                click.echo(f" Using VLM: {vlm_provider}")

        # Tables only: chart extraction is switched off explicitly.
        parser = ChartTablePDFParser(
            extract_charts=False,
            extract_tables=True,
            use_vlm=use_vlm,
            vlm_provider=vlm_provider,
            vlm_model=vlm_model,
            vlm_api_key=vlm_api_key,
            layout_model_name=layout_model,
            dpi=dpi,
            min_score=min_score,
        )

        click.echo(f"šŸ“„ Processing: {pdf_path.name}")
        parser.parse(str(pdf_path), str(output_dir))
        click.echo("āœ… Table extraction completed successfully!")

    except KeyboardInterrupt:
        click.echo("\nāš ļø Extraction interrupted by user", err=True)
        sys.exit(130)
    except Exception as e:
        click.echo(f"āŒ Error during table extraction: {e}", err=True)
        if verbose:
            import traceback
            click.echo(traceback.format_exc(), err=True)
        sys.exit(1)
457
+
458
+
459
@extract.command()
@click.argument('pdf_path', type=click.Path(exists=True, path_type=Path))
@click.option('--output-dir', '-o', type=click.Path(path_type=Path), default=Path("outputs"),
              help='Output base directory (default: outputs)')
@vlm_options
@layout_options
@click.option('--verbose', '-v', is_flag=True, help='Enable verbose output')
def both(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
         vlm_model: Optional[str], vlm_api_key: Optional[str],
         layout_model: str, dpi: int, min_score: float, verbose: bool):
    """
    Extract both charts and tables from a PDF document.

    Saves both chart and table images, and optionally converts them
    to structured data using VLM.

    \b
    Examples:
    doctra extract both document.pdf
    doctra extract both document.pdf --use-vlm --vlm-api-key your_key
    doctra extract both document.pdf -o ./my_outputs --dpi 300
    \f

    :param pdf_path: Path to the input PDF file
    :param output_dir: Output base directory for results
    :param use_vlm: Whether to use VLM for enhanced extraction
    :param vlm_provider: VLM provider ('gemini' or 'openai')
    :param vlm_model: Model name to use (defaults to provider-specific defaults)
    :param vlm_api_key: API key for VLM provider
    :param layout_model: Layout detection model name
    :param dpi: DPI for PDF rendering
    :param min_score: Minimum confidence score for layout detection
    :param verbose: Whether to enable verbose output
    :return: None
    :raises SystemExit: Status 1 on a configuration or extraction error;
        status 130 when interrupted with Ctrl-C
    """
    validate_vlm_config(use_vlm, vlm_api_key)

    if verbose:
        click.echo("šŸ“ŠšŸ“‹ Starting chart and table extraction...")
        click.echo(f" Input: {pdf_path}")
        click.echo(f" Output base: {output_dir}")

    try:
        if verbose:
            click.echo("šŸ”§ Initializing chart/table extractor...")
            if use_vlm:
                click.echo(f" VLM Provider: {vlm_provider}")
        else:
            click.echo("šŸ“ŠšŸ“‹ Initializing chart and table extractor...")
            if use_vlm:
                click.echo(f" Using VLM: {vlm_provider}")

        # Both element kinds are enabled for this subcommand.
        parser = ChartTablePDFParser(
            extract_charts=True,
            extract_tables=True,
            use_vlm=use_vlm,
            vlm_provider=vlm_provider,
            vlm_model=vlm_model,
            vlm_api_key=vlm_api_key,
            layout_model_name=layout_model,
            dpi=dpi,
            min_score=min_score,
        )

        click.echo(f"šŸ“„ Processing: {pdf_path.name}")
        parser.parse(str(pdf_path), str(output_dir))
        click.echo("āœ… Chart and table extraction completed successfully!")

    except KeyboardInterrupt:
        click.echo("\nāš ļø Extraction interrupted by user", err=True)
        sys.exit(130)
    except Exception as e:
        click.echo(f"āŒ Error during extraction: {e}", err=True)
        if verbose:
            import traceback
            click.echo(traceback.format_exc(), err=True)
        sys.exit(1)
535
+
536
+
537
@cli.command()
@click.argument('pdf_path', type=click.Path(exists=True, path_type=Path))
@click.option('--pages', '-p', type=int, default=3,
              help='Number of pages to visualize (default: 3)')
@click.option('--columns', '-c', type=int, default=2,
              help='Number of columns in grid layout (default: 2)')
@click.option('--width', '-w', type=int, default=800,
              help='Width of each page in pixels (default: 800)')
@click.option('--spacing', '-s', type=int, default=40,
              help='Spacing between pages in pixels (default: 40)')
@click.option('--output', '-o', type=click.Path(path_type=Path),
              help='Save visualization to file (PNG/JPG)')
@layout_options
@click.option('--verbose', '-v', is_flag=True, help='Enable verbose output')
def visualize(pdf_path: Path, pages: int, columns: int, width: int,
              spacing: int, output: Optional[Path], dpi: int, min_score: float,
              layout_model: str, verbose: bool):
    """
    Visualize layout detection results for a PDF.

    Shows detected layout elements (text, tables, figures, charts)
    with bounding boxes and confidence scores in a modern grid layout.

    \b
    Examples:
    doctra visualize document.pdf
    doctra visualize document.pdf --pages 5 --output layout.png
    doctra visualize document.pdf --columns 3 --width 600
    \f

    :param pdf_path: Path to the input PDF file
    :param pages: Number of pages to visualize
    :param columns: Number of columns in the grid layout
    :param width: Width of each page in pixels
    :param spacing: Spacing between pages in pixels
    :param output: Optional path to save visualization as image file
    :param dpi: DPI for PDF rendering
    :param min_score: Minimum confidence score for layout detection
    :param layout_model: Layout detection model name
    :param verbose: Whether to enable verbose output
    :return: None
    :raises SystemExit: Status 1 on error; status 130 when interrupted with Ctrl-C
    """
    try:
        click.echo("šŸŽØ Creating layout visualization...")
        if verbose:
            click.echo(f" Input: {pdf_path}")
            click.echo(f" Pages: {pages}, Columns: {columns}")
            click.echo(f" Page width: {width}px, Spacing: {spacing}px")
            click.echo(f" DPI: {dpi}, Min score: {min_score}")

        # Create parser instance (no VLM needed for visualization)
        parser = StructuredPDFParser(
            layout_model_name=layout_model,
            dpi=dpi,
            min_score=min_score,
        )

        click.echo(f"šŸ“„ Processing: {pdf_path.name}")
        if output:
            click.echo(f"šŸ’¾ Saving to: {output}")
        else:
            click.echo("šŸ‘ļø Will display visualization window")
            # Printed before the display call: once that call has returned the
            # hint can no longer help the user.
            # NOTE(review): assumes display_pages_with_boxes blocks while the
            # window is open -- confirm against the parser implementation.
            click.echo(" Close the window to continue...")

        parser.display_pages_with_boxes(
            pdf_path=str(pdf_path),
            num_pages=pages,
            cols=columns,
            page_width=width,
            spacing=spacing,
            save_path=str(output) if output else None,
        )

        if output:
            click.echo("āœ… Visualization saved successfully!")

    except KeyboardInterrupt:
        click.echo("\nāš ļø Visualization interrupted by user", err=True)
        sys.exit(130)
    except Exception as e:
        click.echo(f"āŒ Error creating visualization: {e}", err=True)
        if verbose:
            import traceback
            click.echo(traceback.format_exc(), err=True)
        sys.exit(1)
624
+
625
+
626
@cli.command()
@click.argument('pdf_path', type=click.Path(exists=True, path_type=Path))
@layout_options
@click.option('--verbose', '-v', is_flag=True, help='Show detailed per-page breakdown')
def analyze(pdf_path: Path, dpi: int, min_score: float, layout_model: str, verbose: bool):
    """
    Analyze a PDF and show statistics without processing.

    Quick analysis to understand document structure before full processing.
    Shows total pages, element counts, and distribution statistics.

    \b
    Examples:
    doctra analyze document.pdf
    doctra analyze document.pdf --verbose
    doctra analyze document.pdf --min-score 0.5
    \f

    :param pdf_path: Path to the input PDF file
    :param dpi: DPI for PDF rendering
    :param min_score: Minimum confidence score for layout detection
    :param layout_model: Layout detection model name
    :param verbose: Whether to show detailed per-page breakdown
    :return: None
    :raises SystemExit: Status 1 on error; status 130 when interrupted with Ctrl-C
    """
    try:
        click.echo(f"šŸ” Analyzing: {pdf_path.name}")

        # Imported inside the try block so that an import failure is reported
        # by the error handler at the bottom of this command.
        from collections import Counter
        from doctra.engines.layout.paddle_layout import PaddleLayoutEngine

        if verbose:
            click.echo(f" Using model: {layout_model}")
            click.echo(f" DPI: {dpi}, Min score: {min_score}")

        # Create layout engine for analysis only
        layout_engine = PaddleLayoutEngine(model_name=layout_model)
        pages = layout_engine.predict_pdf(str(pdf_path), dpi=dpi, min_score=min_score)

        click.echo("\nšŸ“Š Document Analysis Results:")
        click.echo(f" Total pages: {len(pages)}")

        # Collect statistics: element count per page and per label.
        page_elements = [len(page.boxes) for page in pages]
        element_counts = Counter(box.label for page in pages for box in page.boxes)
        total_elements = sum(page_elements)

        click.echo(f" Total elements: {total_elements}")

        if total_elements == 0:
            click.echo(" āš ļø No elements detected (try lowering --min-score)")
            return

        # Average elements per page (pages is non-empty here, since at least
        # one element was found).
        avg_elements = total_elements / len(pages)
        click.echo(f" Average per page: {avg_elements:.1f}")

        click.echo("\n šŸ“‹ Elements by type:")
        # most_common() sorts by descending count and keeps first-seen order for ties.
        for element_type, count in element_counts.most_common():
            percentage = (count / total_elements) * 100
            click.echo(f" • {element_type.ljust(10)}: {str(count).rjust(3)} ({percentage:4.1f}%)")

        # Chart and table specific analysis. Counter returns 0 for missing labels.
        chart_count = element_counts['chart']
        table_count = element_counts['table']
        extractable = chart_count + table_count

        if extractable > 0:
            click.echo("\n šŸŽÆ Extraction recommendations:")
            if chart_count > 0 and table_count > 0:
                click.echo(" • Use: doctra extract both document.pdf")
                click.echo(f" • Charts: {chart_count}, Tables: {table_count}")
            elif chart_count > 0:
                click.echo(" • Use: doctra extract charts document.pdf")
                click.echo(f" • Charts found: {chart_count}")
            else:
                click.echo(" • Use: doctra extract tables document.pdf")
                click.echo(f" • Tables found: {table_count}")

        # Page-by-page breakdown
        if verbose:
            click.echo("\n šŸ“„ Page-by-page breakdown:")
            for page in pages[:20]:  # Show first 20 pages in verbose mode
                page_stats = Counter(box.label for box in page.boxes)
                stats_str = ", ".join(f"{k}: {v}" for k, v in sorted(page_stats.items()))
                click.echo(f" Page {page.page_index:3d}: {len(page.boxes):2d} elements ({stats_str})")

            if len(pages) > 20:
                click.echo(f" ... and {len(pages) - 20} more pages")
        else:
            click.echo("\n šŸ“„ Page summary:")
            min_elements = min(page_elements)
            max_elements = max(page_elements)
            click.echo(f" Range: {min_elements} - {max_elements} elements per page")

            # Show pages with most/least elements (first occurrence wins).
            # NOTE(review): these numbers are list position + 1, while the
            # verbose branch prints page.page_index -- confirm both use the
            # same numbering.
            max_page = page_elements.index(max_elements) + 1
            min_page = page_elements.index(min_elements) + 1
            click.echo(f" Most elements: Page {max_page} ({max_elements} elements)")
            click.echo(f" Least elements: Page {min_page} ({min_elements} elements)")

        # Processing time estimate: rough figure of 2 seconds per page plus
        # 5 seconds per table/chart.
        estimated_time = len(pages) * 2 + extractable * 5

        click.echo(f"\n ā±ļø Estimated processing time: ~{estimated_time} seconds")
        if extractable > 0:
            vlm_time = extractable * 3
            click.echo(
                f" (Add ~{vlm_time}s more with VLM for {extractable} tables/charts)")

    except KeyboardInterrupt:
        click.echo("\nāš ļø Analysis interrupted by user", err=True)
        sys.exit(130)
    except Exception as e:
        click.echo(f"āŒ Error analyzing PDF: {e}", err=True)
        if verbose:
            import traceback
            click.echo(traceback.format_exc(), err=True)
        sys.exit(1)
755
+
756
+
757
def _echo_dependency_status(dependencies, missing_marker):
    """
    Print one status line per dependency, with its version when importable.

    Uses importlib.import_module so that dotted names such as
    'google.generativeai' resolve to the submodule itself; __import__ would
    return the top-level package, whose __version__ is not the one wanted.

    :param dependencies: Iterable of (module_name, package_name, description)
    :param missing_marker: Prefix shown for a dependency that cannot be imported
    :return: None
    """
    import importlib

    for module_name, package_name, description in dependencies:
        try:
            module = importlib.import_module(module_name)
        except ImportError:
            click.echo(f" {missing_marker} {package_name} - {description} (not installed)")
            continue
        version = getattr(module, '__version__', 'unknown')
        click.echo(f" āœ… {package_name} ({version}) - {description}")


@cli.command()
def info():
    """
    Show system information and available models.

    Displays Python version, dependency status, available VLM providers,
    layout models, and OCR language information.
    \f

    :return: None
    """
    click.echo("šŸ”¬ Doctra System Information")
    click.echo("=" * 50)

    # Check Python version
    python_version = f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}"
    click.echo(f"Python version: {python_version}")

    # Check key dependencies
    click.echo("\nCore Dependencies:")
    _echo_dependency_status(
        [
            ('PIL', 'Pillow', 'Image processing'),
            ('paddle', 'PaddlePaddle', 'Layout detection engine'),
            ('pytesseract', 'pytesseract', 'OCR engine'),
            ('tqdm', 'tqdm', 'Progress bars'),
            ('click', 'click', 'CLI framework'),
        ],
        "āŒ",
    )

    # Optional VLM dependencies: a missing one is a warning, not an error.
    click.echo("\nVLM Dependencies (Optional):")
    _echo_dependency_status(
        [
            ('google.generativeai', 'google-generativeai', 'Gemini VLM support'),
            ('openai', 'openai', 'OpenAI VLM support'),
        ],
        "āš ļø",
    )

    # Available commands
    click.echo("\nAvailable Commands:")
    click.echo(" šŸ“„ parse - Full document processing (text, tables, charts, figures)")
    click.echo(" šŸ“Š extract - Chart/table extraction only")
    click.echo(" ā”œā”€ charts - Extract only charts")
    click.echo(" ā”œā”€ tables - Extract only tables")
    click.echo(" └─ both - Extract charts and tables")
    click.echo(" šŸŽØ visualize - Layout detection visualization")
    click.echo(" šŸ” analyze - Document structure analysis")
    click.echo(" ā„¹ļø info - System information (this command)")

    # VLM providers
    click.echo("\nVLM Providers:")
    click.echo(" • Gemini (Google) - gemini-1.5-flash-latest, gemini-1.5-pro")
    click.echo(" • OpenAI - gpt-4o, gpt-4o-mini, gpt-4-vision-preview")

    # Available layout models
    click.echo("\nLayout Detection Models:")
    click.echo(" • PP-DocLayout_plus-L (default) - High accuracy layout detection")
    click.echo(" • PP-DocLayout_plus-M - Balanced speed and accuracy")
    click.echo(" • PP-DocLayout_plus-S - Fast inference")

    # OCR information
    click.echo("\nOCR Configuration:")
    click.echo(" Engine: Tesseract OCR")
    click.echo(" Common languages: eng, fra, deu, spa, ita, por, rus, ara, chi_sim, jpn")
    click.echo(" Use 'tesseract --list-langs' for complete language list")

    # Environment variables
    click.echo("\nEnvironment Variables:")
    vlm_key = os.environ.get('VLM_API_KEY')
    if vlm_key:
        # Keys longer than 12 characters keep their first 8 and last 4
        # characters; shorter keys are masked completely.
        # NOTE(review): a 13-character key would show 12 of its characters --
        # consider revealing less.
        if len(vlm_key) > 12:
            masked_key = vlm_key[:8] + '*' * (len(vlm_key) - 12) + vlm_key[-4:]
        else:
            masked_key = '*' * len(vlm_key)
        click.echo(f" VLM_API_KEY: {masked_key}")
    else:
        click.echo(" VLM_API_KEY: (not set)")

    # Usage examples
    click.echo("\nšŸ’” Quick Start Examples:")
    click.echo(" doctra parse document.pdf # Full document parsing")
    click.echo(" doctra extract both document.pdf --use-vlm # Charts & tables with VLM")
    click.echo(" doctra extract charts document.pdf # Only charts")
    click.echo(" doctra extract tables document.pdf # Only tables")
    click.echo(" doctra visualize document.pdf # Visualize layout")
    click.echo(" doctra analyze document.pdf # Quick analysis")
853
+
854
+
855
# Script entry point: run the top-level Click group when this file is
# executed directly rather than imported.
if __name__ == '__main__':
    cli()