doctra 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctra/__init__.py +19 -0
- doctra/cli/__init__.py +27 -0
- doctra/cli/main.py +856 -0
- doctra/cli/utils.py +340 -0
- doctra/engines/__init__.py +0 -0
- doctra/engines/layout/__init__.py +0 -0
- doctra/engines/layout/layout_models.py +90 -0
- doctra/engines/layout/paddle_layout.py +225 -0
- doctra/engines/ocr/__init__.py +4 -0
- doctra/engines/ocr/api.py +36 -0
- doctra/engines/ocr/path_resolver.py +48 -0
- doctra/engines/ocr/pytesseract_engine.py +76 -0
- doctra/engines/vlm/__init__.py +0 -0
- doctra/engines/vlm/outlines_types.py +31 -0
- doctra/engines/vlm/provider.py +58 -0
- doctra/engines/vlm/service.py +117 -0
- doctra/exporters/__init__.py +0 -0
- doctra/exporters/excel_writer.py +197 -0
- doctra/exporters/image_saver.py +42 -0
- doctra/exporters/markdown_table.py +56 -0
- doctra/exporters/markdown_writer.py +29 -0
- doctra/parsers/__init__.py +6 -0
- doctra/parsers/layout_order.py +16 -0
- doctra/parsers/structured_pdf_parser.py +434 -0
- doctra/parsers/table_chart_extractor.py +283 -0
- doctra/utils/__init__.py +0 -0
- doctra/utils/bbox.py +18 -0
- doctra/utils/constants.py +8 -0
- doctra/utils/file_ops.py +26 -0
- doctra/utils/io_utils.py +10 -0
- doctra/utils/ocr_utils.py +20 -0
- doctra/utils/pdf_io.py +19 -0
- doctra/utils/quiet.py +13 -0
- doctra/utils/structured_utils.py +49 -0
- doctra/version.py +2 -0
- doctra-0.1.0.dist-info/METADATA +626 -0
- doctra-0.1.0.dist-info/RECORD +40 -0
- doctra-0.1.0.dist-info/WHEEL +5 -0
- doctra-0.1.0.dist-info/licenses/LICENSE +201 -0
- doctra-0.1.0.dist-info/top_level.txt +1 -0
doctra/cli/main.py
ADDED
@@ -0,0 +1,856 @@
|
|
1
|
+
"""
|
2
|
+
Doctra CLI - Command line interface for document processing
|
3
|
+
|
4
|
+
This module provides a comprehensive CLI for the Doctra library, enabling
|
5
|
+
users to process PDF documents, extract charts/tables, visualize layout
|
6
|
+
detection results, and analyze document structure from the command line.
|
7
|
+
"""
|
8
|
+
|
9
|
+
import click
|
10
|
+
import os
|
11
|
+
import sys
|
12
|
+
from pathlib import Path
|
13
|
+
from typing import Optional
|
14
|
+
|
15
|
+
# Import parsers.
# The parser classes live in ``structured_pdf_parser`` and
# ``table_chart_extractor``; try those modules first so that an installed
# package never needs to touch ``sys.path``.
try:
    from doctra.parsers.structured_pdf_parser import StructuredPDFParser
    from doctra.parsers.table_chart_extractor import ChartTablePDFParser
except ImportError:
    # Fallback for development/testing: make the directory three levels above
    # this file importable, then retry the same imports.
    project_root = Path(__file__).parent.parent.parent
    sys.path.insert(0, str(project_root))
    from doctra.parsers.structured_pdf_parser import StructuredPDFParser
    from doctra.parsers.table_chart_extractor import ChartTablePDFParser
|
25
|
+
|
26
|
+
|
27
|
+
@click.group(invoke_without_command=True)
@click.pass_context
# Let Click read the version from the installed "doctra" distribution metadata
# instead of hard-coding a number that can drift from the released package.
@click.version_option(package_name="doctra", prog_name="doctra")
def cli(ctx):
    """
    š¬ Doctra - Advanced Document Processing Library

    Extract text, tables, charts, and figures from PDF documents using
    layout detection, OCR, and optional VLM (Vision Language Model) enhancement.

    \b
    Commands:
      parse      Full document parsing with text, tables, charts, and figures
      extract    Extract only charts and/or tables from documents
      visualize  Visualize layout detection results
      analyze    Quick document analysis without processing
      info       Show system information and dependencies

    \b
    Examples:
      doctra parse document.pdf                   # Full document parsing
      doctra extract charts document.pdf          # Extract only charts
      doctra extract both document.pdf --use-vlm  # Extract charts & tables with VLM
      doctra visualize document.pdf               # Visualize layout detection
      doctra analyze document.pdf                 # Quick document analysis
      doctra info                                 # System information

    For more help on any command, use: doctra COMMAND --help
    \f

    Everything after the form feed above is developer documentation and is
    not shown in ``--help`` output.

    :param ctx: Click context object containing command information
    :return: None
    """
    # The group is invokable without a subcommand; in that case show the help.
    if ctx.invoked_subcommand is None:
        click.echo(ctx.get_help())
|
61
|
+
|
62
|
+
|
63
|
+
# Common options for VLM configuration
|
64
|
+
def vlm_options(func):
    """
    Attach the shared VLM command-line options to a Click command.

    Options added:
    - --use-vlm/--no-vlm: toggle VLM processing (off by default)
    - --vlm-provider: 'gemini' or 'openai' (default 'gemini')
    - --vlm-model: model name, ``None`` when not given
    - --vlm-api-key: API key, also read from the VLM_API_KEY env var

    :param func: The Click command function to decorate
    :return: The same function with the VLM options attached
    """
    # Applied in this exact order; Click derives the option order from it.
    option_decorators = (
        click.option('--use-vlm/--no-vlm', default=False,
                     help='Use Vision Language Model for table/chart extraction'),
        click.option('--vlm-provider', type=click.Choice(['gemini', 'openai']), default='gemini',
                     help='VLM provider to use (default: gemini)'),
        click.option('--vlm-model', type=str, default=None,
                     help='Model name to use (defaults to provider-specific defaults)'),
        click.option('--vlm-api-key', type=str, envvar='VLM_API_KEY',
                     help='API key for VLM provider (or set VLM_API_KEY env var)'),
    )
    for attach in option_decorators:
        func = attach(func)
    return func
|
86
|
+
|
87
|
+
|
88
|
+
# Common options for layout detection
|
89
|
+
def layout_options(func):
    """
    Attach the shared layout-detection options to a Click command.

    Options added:
    - --layout-model: layout detection model name (default 'PP-DocLayout_plus-L')
    - --dpi: DPI for PDF rendering (default 200)
    - --min-score: minimum confidence score for detections (default 0.0)

    :param func: The Click command function to decorate
    :return: The same function with the layout options attached
    """
    # Applied in this exact order; Click derives the option order from it.
    option_decorators = (
        click.option('--layout-model', default='PP-DocLayout_plus-L',
                     help='Layout detection model name (default: PP-DocLayout_plus-L)'),
        click.option('--dpi', type=int, default=200,
                     help='DPI for PDF rendering (default: 200)'),
        click.option('--min-score', type=float, default=0.0,
                     help='Minimum confidence score for layout detection (default: 0.0)'),
    )
    for attach in option_decorators:
        func = attach(func)
    return func
|
108
|
+
|
109
|
+
|
110
|
+
# Common options for OCR
|
111
|
+
def ocr_options(func):
    """
    Attach the shared OCR options to a Click command.

    Options added:
    - --ocr-lang: OCR language code (default 'eng')
    - --ocr-psm: Tesseract page segmentation mode (default 4)
    - --ocr-oem: Tesseract OCR engine mode (default 3)
    - --ocr-config: extra Tesseract configuration string (default empty)

    :param func: The Click command function to decorate
    :return: The same function with the OCR options attached
    """
    # Applied in this exact order; Click derives the option order from it.
    option_decorators = (
        click.option('--ocr-lang', default='eng',
                     help='OCR language code (default: eng)'),
        click.option('--ocr-psm', type=int, default=4,
                     help='Tesseract page segmentation mode (default: 4)'),
        click.option('--ocr-oem', type=int, default=3,
                     help='Tesseract OCR engine mode (default: 3)'),
        click.option('--ocr-config', default='',
                     help='Additional Tesseract configuration string'),
    )
    for attach in option_decorators:
        func = attach(func)
    return func
|
133
|
+
|
134
|
+
|
135
|
+
def validate_vlm_config(use_vlm: bool, vlm_api_key: Optional[str]) -> None:
    """
    Abort the process when VLM is requested without an API key.

    Returns silently when VLM is disabled or a non-empty key is present.
    Otherwise prints a three-line explanation to stderr and exits with
    status 1.

    :param use_vlm: Whether VLM processing is enabled
    :param vlm_api_key: The VLM API key (may be None when VLM is disabled)
    :return: None
    :raises SystemExit: If VLM is enabled but no API key is provided
    """
    # Guard clause: nothing to validate in the common case.
    if not use_vlm or vlm_api_key:
        return

    error_lines = (
        "ā Error: VLM API key is required when using --use-vlm",
        " Set the VLM_API_KEY environment variable or use --vlm-api-key",
        " Example: export VLM_API_KEY=your_api_key",
    )
    for line in error_lines:
        click.echo(line, err=True)
    sys.exit(1)
|
152
|
+
|
153
|
+
|
154
|
+
@cli.command()
@click.argument('pdf_path', type=click.Path(exists=True, path_type=Path))
@click.option('--output-dir', '-o', type=click.Path(path_type=Path),
              help='Output directory (default: outputs/{pdf_filename})')
@vlm_options
@layout_options
@ocr_options
@click.option('--box-separator', default='\n',
              help='Separator between text boxes in output (default: newline)')
@click.option('--verbose', '-v', is_flag=True,
              help='Enable verbose output')
def parse(pdf_path: Path, output_dir: Optional[Path], use_vlm: bool,
          vlm_provider: str, vlm_model: Optional[str], vlm_api_key: Optional[str],
          layout_model: str, dpi: int, min_score: float,
          ocr_lang: str, ocr_psm: int, ocr_oem: int, ocr_config: str,
          box_separator: str, verbose: bool):
    """
    Parse a PDF document and extract all structured content.

    Performs comprehensive document processing including text extraction,
    layout detection, OCR, and optional VLM-based table/chart extraction.
    Outputs markdown file and optionally Excel file with structured data.

    \b
    Examples:
      doctra parse document.pdf
      doctra parse document.pdf --use-vlm --vlm-api-key your_key
      doctra parse document.pdf -o ./results --dpi 300
      doctra parse document.pdf --vlm-provider openai --use-vlm

    \b
    VLM Setup:
      Set environment variable: export VLM_API_KEY=your_api_key
      Or use: --vlm-api-key your_api_key
    \f

    Everything after the form feed above is developer documentation and is
    not shown in ``--help`` output.

    When ``output_dir`` is given, the process temporarily changes its working
    directory to it while the parser runs and restores the previous one
    afterwards. Exits with status 1 on any error and 130 on Ctrl-C.

    :param pdf_path: Path to the input PDF file
    :param output_dir: Output directory for results (optional)
    :param use_vlm: Whether to use VLM for enhanced extraction
    :param vlm_provider: VLM provider ('gemini' or 'openai')
    :param vlm_model: Model name to use (defaults to provider-specific defaults)
    :param vlm_api_key: API key for VLM provider
    :param layout_model: Layout detection model name
    :param dpi: DPI for PDF rendering
    :param min_score: Minimum confidence score for layout detection
    :param ocr_lang: OCR language code
    :param ocr_psm: Tesseract page segmentation mode
    :param ocr_oem: Tesseract OCR engine mode
    :param ocr_config: Additional Tesseract configuration
    :param box_separator: Separator between text boxes in output
    :param verbose: Whether to enable verbose output
    :return: None
    """
    validate_vlm_config(use_vlm, vlm_api_key)

    if verbose:
        click.echo("š Starting full PDF parsing...")
        click.echo(f" Input: {pdf_path}")
        if output_dir:
            click.echo(f" Output: {output_dir}")

    # Create parser instance
    try:
        if verbose:
            click.echo("š§ Initializing full parser...")
            if use_vlm:
                click.echo(f" VLM Provider: {vlm_provider}")
                click.echo(f" VLM Model: {vlm_model or 'default'}")
            click.echo(f" Layout Model: {layout_model}")
            click.echo(f" DPI: {dpi}")
            click.echo(f" OCR Language: {ocr_lang}")
        else:
            click.echo("š Initializing full document parser...")
            if use_vlm:
                click.echo(f" Using VLM: {vlm_provider}")

        parser = StructuredPDFParser(
            use_vlm=use_vlm,
            vlm_provider=vlm_provider,
            vlm_model=vlm_model,
            vlm_api_key=vlm_api_key,
            layout_model_name=layout_model,
            dpi=dpi,
            min_score=min_score,
            ocr_lang=ocr_lang,
            ocr_psm=ocr_psm,
            ocr_oem=ocr_oem,
            ocr_extra_config=ocr_config,
            box_separator=box_separator
        )
    except Exception as e:
        click.echo(f"ā Error initializing parser: {e}", err=True)
        if verbose:
            import traceback
            click.echo(traceback.format_exc(), err=True)
        sys.exit(1)

    # Path.absolute() is computed against the current working directory, so
    # both paths must be made absolute BEFORE os.chdir(); otherwise a relative
    # PDF path would be looked up inside the output directory.
    pdf_abs_path = pdf_path.absolute()
    original_cwd = os.getcwd()

    # Change to output directory if specified
    if output_dir:
        output_abs_dir = output_dir.absolute()
        output_abs_dir.mkdir(parents=True, exist_ok=True)
        os.chdir(output_abs_dir)
        click.echo(f"š Output directory: {output_abs_dir}")

    try:
        # Parse the document
        click.echo(f"š Processing: {pdf_path.name}")
        parser.parse(str(pdf_abs_path))
        click.echo("✅ Full document processing completed successfully!")

    except KeyboardInterrupt:
        click.echo("\nā ļø Processing interrupted by user", err=True)
        sys.exit(130)
    except Exception as e:
        click.echo(f"ā Error during parsing: {e}", err=True)
        if verbose:
            import traceback
            click.echo(traceback.format_exc(), err=True)
        sys.exit(1)
    finally:
        # Restore original working directory (runs even on sys.exit above)
        os.chdir(original_cwd)
|
275
|
+
|
276
|
+
|
277
|
+
@cli.group(invoke_without_command=True)
@click.pass_context
def extract(ctx):
    """
    Extract charts and/or tables from PDF documents.

    This command focuses specifically on chart and table extraction,
    providing faster processing when you only need these elements.

    \b
    Subcommands:
      charts  Extract only charts from the document
      tables  Extract only tables from the document
      both    Extract both charts and tables

    \b
    Examples:
      doctra extract charts document.pdf
      doctra extract tables document.pdf --use-vlm
      doctra extract both document.pdf --output-dir ./results
    \f

    Everything after the form feed above is developer documentation and is
    not shown in ``--help`` output.

    :param ctx: Click context object containing command information
    :return: None
    """
    # The group is invokable without a subcommand; in that case show the help.
    if ctx.invoked_subcommand is None:
        click.echo(ctx.get_help())
|
303
|
+
|
304
|
+
|
305
|
+
@extract.command()
@click.argument('pdf_path', type=click.Path(exists=True, path_type=Path))
@click.option('--output-dir', '-o', type=click.Path(path_type=Path), default=Path("outputs"),
              help='Output base directory (default: outputs)')
@vlm_options
@layout_options
@click.option('--verbose', '-v', is_flag=True, help='Enable verbose output')
def charts(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
           vlm_model: Optional[str], vlm_api_key: Optional[str],
           layout_model: str, dpi: int, min_score: float, verbose: bool):
    """
    Extract only charts from a PDF document.

    Saves chart images and optionally converts them to structured data using VLM.

    \b
    Examples:
      doctra extract charts document.pdf
      doctra extract charts document.pdf --use-vlm --vlm-api-key your_key
      doctra extract charts document.pdf -o ./my_outputs --dpi 300
    \f

    Everything after the form feed above is developer documentation and is
    not shown in ``--help`` output. Exits with status 1 on any error and
    130 on Ctrl-C.

    :param pdf_path: Path to the input PDF file
    :param output_dir: Output base directory for results
    :param use_vlm: Whether to use VLM for enhanced chart extraction
    :param vlm_provider: VLM provider ('gemini' or 'openai')
    :param vlm_model: Model name to use (defaults to provider-specific defaults)
    :param vlm_api_key: API key for VLM provider
    :param layout_model: Layout detection model name
    :param dpi: DPI for PDF rendering
    :param min_score: Minimum confidence score for layout detection
    :param verbose: Whether to enable verbose output
    :return: None
    """
    validate_vlm_config(use_vlm, vlm_api_key)

    if verbose:
        click.echo("š Starting chart extraction...")
        click.echo(f" Input: {pdf_path}")
        click.echo(f" Output base: {output_dir}")

    try:
        if verbose:
            click.echo("š§ Initializing chart extractor...")
            if use_vlm:
                click.echo(f" VLM Provider: {vlm_provider}")
        else:
            click.echo("š Initializing chart extractor...")
            if use_vlm:
                click.echo(f" Using VLM: {vlm_provider}")

        # Charts only: table extraction is switched off.
        parser = ChartTablePDFParser(
            extract_charts=True,
            extract_tables=False,
            use_vlm=use_vlm,
            vlm_provider=vlm_provider,
            vlm_model=vlm_model,
            vlm_api_key=vlm_api_key,
            layout_model_name=layout_model,
            dpi=dpi,
            min_score=min_score
        )

        click.echo(f"š Processing: {pdf_path.name}")
        parser.parse(str(pdf_path), str(output_dir))
        click.echo("✅ Chart extraction completed successfully!")

    except KeyboardInterrupt:
        click.echo("\nā ļø Extraction interrupted by user", err=True)
        sys.exit(130)
    except Exception as e:
        click.echo(f"ā Error during chart extraction: {e}", err=True)
        if verbose:
            import traceback
            click.echo(traceback.format_exc(), err=True)
        sys.exit(1)
|
380
|
+
|
381
|
+
|
382
|
+
@extract.command()
@click.argument('pdf_path', type=click.Path(exists=True, path_type=Path))
@click.option('--output-dir', '-o', type=click.Path(path_type=Path), default=Path("outputs"),
              help='Output base directory (default: outputs)')
@vlm_options
@layout_options
@click.option('--verbose', '-v', is_flag=True, help='Enable verbose output')
def tables(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
           vlm_model: Optional[str], vlm_api_key: Optional[str],
           layout_model: str, dpi: int, min_score: float, verbose: bool):
    """
    Extract only tables from a PDF document.

    Saves table images and optionally converts them to structured data using VLM.

    \b
    Examples:
      doctra extract tables document.pdf
      doctra extract tables document.pdf --use-vlm --vlm-api-key your_key
      doctra extract tables document.pdf -o ./my_outputs --min-score 0.5
    \f

    Everything after the form feed above is developer documentation and is
    not shown in ``--help`` output. Exits with status 1 on any error and
    130 on Ctrl-C.

    :param pdf_path: Path to the input PDF file
    :param output_dir: Output base directory for results
    :param use_vlm: Whether to use VLM for enhanced table extraction
    :param vlm_provider: VLM provider ('gemini' or 'openai')
    :param vlm_model: Model name to use (defaults to provider-specific defaults)
    :param vlm_api_key: API key for VLM provider
    :param layout_model: Layout detection model name
    :param dpi: DPI for PDF rendering
    :param min_score: Minimum confidence score for layout detection
    :param verbose: Whether to enable verbose output
    :return: None
    """
    validate_vlm_config(use_vlm, vlm_api_key)

    if verbose:
        click.echo("š Starting table extraction...")
        click.echo(f" Input: {pdf_path}")
        click.echo(f" Output base: {output_dir}")

    try:
        if verbose:
            click.echo("š§ Initializing table extractor...")
            if use_vlm:
                click.echo(f" VLM Provider: {vlm_provider}")
        else:
            click.echo("š Initializing table extractor...")
            if use_vlm:
                click.echo(f" Using VLM: {vlm_provider}")

        # Tables only: chart extraction is switched off.
        parser = ChartTablePDFParser(
            extract_charts=False,
            extract_tables=True,
            use_vlm=use_vlm,
            vlm_provider=vlm_provider,
            vlm_model=vlm_model,
            vlm_api_key=vlm_api_key,
            layout_model_name=layout_model,
            dpi=dpi,
            min_score=min_score
        )

        click.echo(f"š Processing: {pdf_path.name}")
        parser.parse(str(pdf_path), str(output_dir))
        click.echo("✅ Table extraction completed successfully!")

    except KeyboardInterrupt:
        click.echo("\nā ļø Extraction interrupted by user", err=True)
        sys.exit(130)
    except Exception as e:
        click.echo(f"ā Error during table extraction: {e}", err=True)
        if verbose:
            import traceback
            click.echo(traceback.format_exc(), err=True)
        sys.exit(1)
|
457
|
+
|
458
|
+
|
459
|
+
@extract.command()
@click.argument('pdf_path', type=click.Path(exists=True, path_type=Path))
@click.option('--output-dir', '-o', type=click.Path(path_type=Path), default=Path("outputs"),
              help='Output base directory (default: outputs)')
@vlm_options
@layout_options
@click.option('--verbose', '-v', is_flag=True, help='Enable verbose output')
def both(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
         vlm_model: Optional[str], vlm_api_key: Optional[str],
         layout_model: str, dpi: int, min_score: float, verbose: bool):
    """
    Extract both charts and tables from a PDF document.

    Saves both chart and table images, and optionally converts them
    to structured data using VLM.

    \b
    Examples:
      doctra extract both document.pdf
      doctra extract both document.pdf --use-vlm --vlm-api-key your_key
      doctra extract both document.pdf -o ./my_outputs --dpi 300
    \f

    Everything after the form feed above is developer documentation and is
    not shown in ``--help`` output. Exits with status 1 on any error and
    130 on Ctrl-C.

    :param pdf_path: Path to the input PDF file
    :param output_dir: Output base directory for results
    :param use_vlm: Whether to use VLM for enhanced extraction
    :param vlm_provider: VLM provider ('gemini' or 'openai')
    :param vlm_model: Model name to use (defaults to provider-specific defaults)
    :param vlm_api_key: API key for VLM provider
    :param layout_model: Layout detection model name
    :param dpi: DPI for PDF rendering
    :param min_score: Minimum confidence score for layout detection
    :param verbose: Whether to enable verbose output
    :return: None
    """
    validate_vlm_config(use_vlm, vlm_api_key)

    if verbose:
        click.echo("šš Starting chart and table extraction...")
        click.echo(f" Input: {pdf_path}")
        click.echo(f" Output base: {output_dir}")

    try:
        if verbose:
            click.echo("š§ Initializing chart/table extractor...")
            if use_vlm:
                click.echo(f" VLM Provider: {vlm_provider}")
        else:
            click.echo("šš Initializing chart and table extractor...")
            if use_vlm:
                click.echo(f" Using VLM: {vlm_provider}")

        # Both element kinds are enabled for this subcommand.
        parser = ChartTablePDFParser(
            extract_charts=True,
            extract_tables=True,
            use_vlm=use_vlm,
            vlm_provider=vlm_provider,
            vlm_model=vlm_model,
            vlm_api_key=vlm_api_key,
            layout_model_name=layout_model,
            dpi=dpi,
            min_score=min_score
        )

        click.echo(f"š Processing: {pdf_path.name}")
        parser.parse(str(pdf_path), str(output_dir))
        click.echo("✅ Chart and table extraction completed successfully!")

    except KeyboardInterrupt:
        click.echo("\nā ļø Extraction interrupted by user", err=True)
        sys.exit(130)
    except Exception as e:
        click.echo(f"ā Error during extraction: {e}", err=True)
        if verbose:
            import traceback
            click.echo(traceback.format_exc(), err=True)
        sys.exit(1)
|
535
|
+
|
536
|
+
|
537
|
+
@cli.command()
@click.argument('pdf_path', type=click.Path(exists=True, path_type=Path))
@click.option('--pages', '-p', type=int, default=3,
              help='Number of pages to visualize (default: 3)')
@click.option('--columns', '-c', type=int, default=2,
              help='Number of columns in grid layout (default: 2)')
@click.option('--width', '-w', type=int, default=800,
              help='Width of each page in pixels (default: 800)')
@click.option('--spacing', '-s', type=int, default=40,
              help='Spacing between pages in pixels (default: 40)')
@click.option('--output', '-o', type=click.Path(path_type=Path),
              help='Save visualization to file (PNG/JPG)')
@layout_options
@click.option('--verbose', '-v', is_flag=True, help='Enable verbose output')
def visualize(pdf_path: Path, pages: int, columns: int, width: int,
              spacing: int, output: Optional[Path], dpi: int, min_score: float,
              layout_model: str, verbose: bool):
    """
    Visualize layout detection results for a PDF.

    Shows detected layout elements (text, tables, figures, charts)
    with bounding boxes and confidence scores in a modern grid layout.

    \b
    Examples:
      doctra visualize document.pdf
      doctra visualize document.pdf --pages 5 --output layout.png
      doctra visualize document.pdf --columns 3 --width 600
    \f

    Everything after the form feed above is developer documentation and is
    not shown in ``--help`` output. Exits with status 1 on any error and
    130 on Ctrl-C.

    :param pdf_path: Path to the input PDF file
    :param pages: Number of pages to visualize
    :param columns: Number of columns in the grid layout
    :param width: Width of each page in pixels
    :param spacing: Spacing between pages in pixels
    :param output: Optional path to save visualization as image file
    :param dpi: DPI for PDF rendering
    :param min_score: Minimum confidence score for layout detection
    :param layout_model: Layout detection model name
    :param verbose: Whether to enable verbose output
    :return: None
    """
    try:
        # The banner is printed in both modes; verbose only adds the settings.
        click.echo("šØ Creating layout visualization...")
        if verbose:
            click.echo(f" Input: {pdf_path}")
            click.echo(f" Pages: {pages}, Columns: {columns}")
            click.echo(f" Page width: {width}px, Spacing: {spacing}px")
            click.echo(f" DPI: {dpi}, Min score: {min_score}")

        # Create parser instance (no VLM needed for visualization)
        parser = StructuredPDFParser(
            layout_model_name=layout_model,
            dpi=dpi,
            min_score=min_score
        )

        click.echo(f"š Processing: {pdf_path.name}")
        if output:
            click.echo(f"š¾ Saving to: {output}")
        else:
            click.echo("šļø Will display visualization window")

        parser.display_pages_with_boxes(
            pdf_path=str(pdf_path),
            num_pages=pages,
            cols=columns,
            page_width=width,
            spacing=spacing,
            save_path=str(output) if output else None
        )

        if not output:
            click.echo(" Close the window to continue...")
        else:
            click.echo("✅ Visualization saved successfully!")

    except KeyboardInterrupt:
        click.echo("\nā ļø Visualization interrupted by user", err=True)
        sys.exit(130)
    except Exception as e:
        click.echo(f"ā Error creating visualization: {e}", err=True)
        if verbose:
            import traceback
            click.echo(traceback.format_exc(), err=True)
        sys.exit(1)
|
624
|
+
|
625
|
+
|
626
|
+
@cli.command()
@click.argument('pdf_path', type=click.Path(exists=True, path_type=Path))
@layout_options
@click.option('--verbose', '-v', is_flag=True, help='Show detailed per-page breakdown')
def analyze(pdf_path: Path, dpi: int, min_score: float, layout_model: str, verbose: bool):
    """
    Analyze a PDF and show statistics without processing.

    Quick analysis to understand document structure before full processing.
    Shows total pages, element counts, and distribution statistics.

    \b
    Examples:
      doctra analyze document.pdf
      doctra analyze document.pdf --verbose
      doctra analyze document.pdf --min-score 0.5
    \f

    Everything after the form feed above is developer documentation and is
    not shown in ``--help`` output. Exits with status 1 on any error and
    130 on Ctrl-C.

    :param pdf_path: Path to the input PDF file
    :param dpi: DPI for PDF rendering
    :param min_score: Minimum confidence score for layout detection
    :param layout_model: Layout detection model name
    :param verbose: Whether to show detailed per-page breakdown
    :return: None
    """
    try:
        click.echo(f"š Analyzing: {pdf_path.name}")

        # Create layout engine for analysis only (imported lazily, inside the
        # try, so an import failure is reported like any other error)
        from doctra.engines.layout.paddle_layout import PaddleLayoutEngine

        if verbose:
            click.echo(f" Using model: {layout_model}")
            click.echo(f" DPI: {dpi}, Min score: {min_score}")

        layout_engine = PaddleLayoutEngine(model_name=layout_model)
        pages = layout_engine.predict_pdf(str(pdf_path), dpi=dpi, min_score=min_score)

        click.echo("\nš Document Analysis Results:")
        click.echo(f" Total pages: {len(pages)}")

        # Collect statistics
        total_elements = 0
        element_counts = {}   # label -> number of boxes across the document
        page_elements = []    # number of boxes per page, in page order

        for page in pages:
            page_element_count = len(page.boxes)
            total_elements += page_element_count
            page_elements.append(page_element_count)

            for box in page.boxes:
                element_counts[box.label] = element_counts.get(box.label, 0) + 1

        click.echo(f" Total elements: {total_elements}")

        if total_elements > 0:
            # Average elements per page (pages is non-empty here, since at
            # least one element was counted)
            avg_elements = total_elements / len(pages)
            click.echo(f" Average per page: {avg_elements:.1f}")

            click.echo("\n š Elements by type:")
            for element_type, count in sorted(element_counts.items(), key=lambda x: x[1], reverse=True):
                percentage = (count / total_elements) * 100
                click.echo(f" ⢠{element_type.ljust(10)}: {str(count).rjust(3)} ({percentage:4.1f}%)")

            # Chart and table specific analysis. Named *_count so they do not
            # shadow the ``charts``/``tables`` subcommands defined in this module.
            chart_count = element_counts.get('chart', 0)
            table_count = element_counts.get('table', 0)
            vlm_candidates = table_count + chart_count

            if chart_count > 0 or table_count > 0:
                click.echo("\n šÆ Extraction recommendations:")
                if chart_count > 0 and table_count > 0:
                    click.echo(" ⢠Use: doctra extract both document.pdf")
                    click.echo(f" ⢠Charts: {chart_count}, Tables: {table_count}")
                elif chart_count > 0:
                    click.echo(" ⢠Use: doctra extract charts document.pdf")
                    click.echo(f" ⢠Charts found: {chart_count}")
                elif table_count > 0:
                    click.echo(" ⢠Use: doctra extract tables document.pdf")
                    click.echo(f" ⢠Tables found: {table_count}")

            # Page-by-page breakdown
            if verbose:
                click.echo("\n š Page-by-page breakdown:")
                for page in pages[:20]:  # Show first 20 pages in verbose mode
                    page_stats = {}
                    for box in page.boxes:
                        page_stats[box.label] = page_stats.get(box.label, 0) + 1

                    stats_str = ", ".join([f"{k}: {v}" for k, v in sorted(page_stats.items())])
                    # NOTE(review): this prints page.page_index as-is, while the
                    # summary branch below prints list position + 1 - confirm
                    # both use the same page numbering.
                    click.echo(f" Page {page.page_index:3d}: {len(page.boxes):2d} elements ({stats_str})")

                if len(pages) > 20:
                    click.echo(f" ... and {len(pages) - 20} more pages")
            else:
                click.echo("\n š Page summary:")
                if page_elements:
                    min_elements = min(page_elements)
                    max_elements = max(page_elements)
                    click.echo(f" Range: {min_elements} - {max_elements} elements per page")

                    # Show pages with most/least elements (first match wins on ties)
                    max_page = page_elements.index(max_elements) + 1
                    min_page = page_elements.index(min_elements) + 1
                    click.echo(f" Most elements: Page {max_page} ({max_elements} elements)")
                    click.echo(f" Least elements: Page {min_page} ({min_elements} elements)")

            # Processing time estimate.
            # Rough estimate: 2 seconds per page plus 5 seconds per table/chart.
            estimated_time = len(pages) * 2 + vlm_candidates * 5

            click.echo(f"\n ā±ļø Estimated processing time: ~{estimated_time} seconds")
            if vlm_candidates > 0:
                # Rough estimate: 3 extra seconds per table/chart with VLM.
                vlm_time = vlm_candidates * 3
                click.echo(
                    f" (Add ~{vlm_time}s more with VLM for {vlm_candidates} tables/charts)")
        else:
            click.echo(" ā ļø No elements detected (try lowering --min-score)")

    except KeyboardInterrupt:
        click.echo("\nā ļø Analysis interrupted by user", err=True)
        sys.exit(130)
    except Exception as e:
        click.echo(f"ā Error analyzing PDF: {e}", err=True)
        if verbose:
            import traceback
            click.echo(traceback.format_exc(), err=True)
        sys.exit(1)
|
755
|
+
|
756
|
+
|
757
|
+
def _report_dependencies(dependencies, missing_marker):
    """
    Print one status line per dependency, including its version when importable.

    :param dependencies: Iterable of (module_name, package_name, description) tuples
    :param missing_marker: Marker text printed in front of dependencies that
                           cannot be imported
    :return: None
    """
    import importlib

    for module_name, package_name, description in dependencies:
        try:
            # import_module returns the module that was actually named (e.g. the
            # google.generativeai submodule). The builtin __import__ returns only
            # the top-level package for dotted names, so __version__ would be
            # looked up on the wrong object.
            module = importlib.import_module(module_name)
            version = getattr(module, '__version__', 'unknown')
        except ImportError:
            click.echo(f" {missing_marker} {package_name} - {description} (not installed)")
        else:
            click.echo(f" ā {package_name} ({version}) - {description}")


@cli.command()
def info():
    """
    Show system information and available models.

    Displays Python version, dependency status, available VLM providers,
    layout models, and OCR language information. Also reports whether the
    VLM_API_KEY environment variable is set, printing it only in masked form.

    :return: None
    """
    click.echo("š¬ Doctra System Information")
    click.echo("=" * 50)

    # Check Python version
    import sys
    python_version = f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}"
    click.echo(f"Python version: {python_version}")

    # Check key dependencies: (importable module, distribution name, purpose)
    dependencies = [
        ('PIL', 'Pillow', 'Image processing'),
        ('paddle', 'PaddlePaddle', 'Layout detection engine'),
        ('pytesseract', 'pytesseract', 'OCR engine'),
        ('tqdm', 'tqdm', 'Progress bars'),
        ('click', 'click', 'CLI framework'),
    ]

    click.echo("\nCore Dependencies:")
    _report_dependencies(dependencies, "ā")

    # Optional VLM dependencies (a missing one is flagged with a different marker)
    click.echo("\nVLM Dependencies (Optional):")
    vlm_deps = [
        ('google.generativeai', 'google-generativeai', 'Gemini VLM support'),
        ('openai', 'openai', 'OpenAI VLM support'),
    ]
    _report_dependencies(vlm_deps, "ā ļø")

    # Available commands
    click.echo("\nAvailable Commands:")
    click.echo(" š parse - Full document processing (text, tables, charts, figures)")
    click.echo(" š extract - Chart/table extraction only")
    click.echo(" āā charts - Extract only charts")
    click.echo(" āā tables - Extract only tables")
    click.echo(" āā both - Extract charts and tables")
    click.echo(" šØ visualize - Layout detection visualization")
    click.echo(" š analyze - Document structure analysis")
    click.echo(" ā¹ļø info - System information (this command)")

    # VLM providers
    click.echo("\nVLM Providers:")
    click.echo(" ⢠Gemini (Google) - gemini-1.5-flash-latest, gemini-1.5-pro")
    click.echo(" ⢠OpenAI - gpt-4o, gpt-4o-mini, gpt-4-vision-preview")

    # Available layout models
    click.echo("\nLayout Detection Models:")
    click.echo(" ⢠PP-DocLayout_plus-L (default) - High accuracy layout detection")
    click.echo(" ⢠PP-DocLayout_plus-M - Balanced speed and accuracy")
    click.echo(" ⢠PP-DocLayout_plus-S - Fast inference")

    # OCR information
    click.echo("\nOCR Configuration:")
    click.echo(" Engine: Tesseract OCR")
    click.echo(" Common languages: eng, fra, deu, spa, ita, por, rus, ara, chi_sim, jpn")
    click.echo(" Use 'tesseract --list-langs' for complete language list")

    # Environment variables
    click.echo("\nEnvironment Variables:")
    vlm_key = os.environ.get('VLM_API_KEY')
    if vlm_key:
        # Never echo the full secret: keys longer than 12 characters keep their
        # first 8 and last 4 characters, shorter keys are starred out entirely.
        if len(vlm_key) > 12:
            masked_key = vlm_key[:8] + '*' * (len(vlm_key) - 12) + vlm_key[-4:]
        else:
            masked_key = '*' * len(vlm_key)
        click.echo(f" VLM_API_KEY: {masked_key}")
    else:
        click.echo(" VLM_API_KEY: (not set)")

    # Usage examples
    click.echo("\nš” Quick Start Examples:")
    click.echo(" doctra parse document.pdf # Full document parsing")
    click.echo(" doctra extract both document.pdf --use-vlm # Charts & tables with VLM")
    click.echo(" doctra extract charts document.pdf # Only charts")
    click.echo(" doctra extract tables document.pdf # Only tables")
    click.echo(" doctra visualize document.pdf # Visualize layout")
    click.echo(" doctra analyze document.pdf # Quick analysis")
|
853
|
+
|
854
|
+
|
855
|
+
# Script entry point: dispatch to the click command group when run directly.
if __name__ == "__main__":
    cli()
|