doctra 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctra/cli/main.py +2 -2
- doctra/cli/utils.py +12 -3
- doctra/engines/layout/paddle_layout.py +3 -2
- doctra/engines/vlm/provider.py +34 -6
- doctra/engines/vlm/service.py +5 -2
- doctra/parsers/structured_pdf_parser.py +23 -8
- doctra/parsers/table_chart_extractor.py +19 -6
- doctra/utils/progress.py +277 -0
- doctra/version.py +1 -1
- {doctra-0.1.0.dist-info → doctra-0.2.0.dist-info}/METADATA +12 -10
- {doctra-0.1.0.dist-info → doctra-0.2.0.dist-info}/RECORD +14 -13
- {doctra-0.1.0.dist-info → doctra-0.2.0.dist-info}/WHEEL +0 -0
- {doctra-0.1.0.dist-info → doctra-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {doctra-0.1.0.dist-info → doctra-0.2.0.dist-info}/top_level.txt +0 -0
doctra/cli/main.py
CHANGED
@@ -818,8 +818,8 @@ def info():
|
|
818
818
|
|
819
819
|
# VLM providers
|
820
820
|
click.echo("\nVLM Providers:")
|
821
|
-
click.echo(" • Gemini (Google) - gemini-
|
822
|
-
click.echo(" • OpenAI - gpt-
|
821
|
+
click.echo(" • Gemini (Google) - gemini-2.5-pro, gemini-2.5-flash, gemini-2.5-flash-lite, gemini-2.0-flash")
|
822
|
+
click.echo(" • OpenAI - gpt-5, gpt-5-mini, gpt-4.1, gpt-4.1-mini, gpt-4o")
|
823
823
|
|
824
824
|
# Available layout models
|
825
825
|
click.echo("\nLayout Detection Models:")
|
doctra/cli/utils.py
CHANGED
@@ -263,7 +263,7 @@ def create_progress_callback(description: str, total: int):
|
|
263
263
|
"""
|
264
264
|
Create a progress callback function for use with processing operations.
|
265
265
|
|
266
|
-
Creates a tqdm progress bar and returns a callback function that
|
266
|
+
Creates a beautiful tqdm progress bar and returns a callback function that
|
267
267
|
can be used to update the progress during long-running operations.
|
268
268
|
|
269
269
|
:param description: Description text for the progress bar
|
@@ -271,9 +271,18 @@ def create_progress_callback(description: str, total: int):
|
|
271
271
|
:return: Callable progress callback function that takes an integer
|
272
272
|
representing the number of completed items
|
273
273
|
"""
|
274
|
-
|
274
|
+
import sys
|
275
|
+
from doctra.utils.progress import create_beautiful_progress_bar, create_notebook_friendly_bar
|
275
276
|
|
276
|
-
|
277
|
+
# Enhanced environment detection
|
278
|
+
is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
|
279
|
+
is_terminal = hasattr(sys.stdout, 'isatty') and sys.stdout.isatty()
|
280
|
+
|
281
|
+
# Choose appropriate progress bar based on environment
|
282
|
+
if is_notebook:
|
283
|
+
pbar = create_notebook_friendly_bar(total=total, desc=description)
|
284
|
+
else:
|
285
|
+
pbar = create_beautiful_progress_bar(total=total, desc=description, leave=True)
|
277
286
|
|
278
287
|
def callback(completed: int):
|
279
288
|
pbar.n = completed
|
@@ -14,6 +14,7 @@ from paddleocr import LayoutDetection # pip install paddleocr>=2.7.0.3
|
|
14
14
|
from doctra.utils.pdf_io import render_pdf_to_images
|
15
15
|
from doctra.engines.layout.layout_models import LayoutBox, LayoutPage
|
16
16
|
from doctra.utils.quiet import suppress_output
|
17
|
+
from doctra.utils.progress import create_loading_bar
|
17
18
|
|
18
19
|
|
19
20
|
class PaddleLayoutEngine:
|
@@ -53,8 +54,8 @@ class PaddleLayoutEngine:
|
|
53
54
|
if self.model is not None:
|
54
55
|
return
|
55
56
|
|
56
|
-
#
|
57
|
-
with
|
57
|
+
# Beautiful loading progress bar
|
58
|
+
with create_loading_bar(f'Loading PaddleOCR layout model: "{self.model_name}"') as bar:
|
58
59
|
# Monkey patch tqdm to disable it completely during model loading
|
59
60
|
original_tqdm_init = tqdm.__init__
|
60
61
|
original_tqdm_update = tqdm.update
|
doctra/engines/vlm/provider.py
CHANGED
@@ -8,6 +8,7 @@ import outlines
|
|
8
8
|
from pydantic import BaseModel
|
9
9
|
from google.genai import Client
|
10
10
|
from outlines.inputs import Image
|
11
|
+
from anthropic import Anthropic
|
11
12
|
# ------------------------------------------------------
|
12
13
|
|
13
14
|
def make_model(
|
@@ -19,12 +20,12 @@ def make_model(
|
|
19
20
|
"""
|
20
21
|
Build a callable Outlines model for VLM processing.
|
21
22
|
|
22
|
-
Creates an Outlines model instance configured for
|
23
|
+
Creates an Outlines model instance configured for Gemini, OpenAI, Anthropic, or OpenRouter
|
23
24
|
providers. Only one backend is active at a time, with Gemini as the default.
|
24
25
|
|
25
|
-
:param vlm_provider: VLM provider to use ("gemini" or "
|
26
|
+
:param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
|
26
27
|
:param vlm_model: Model name to use (defaults to provider-specific defaults)
|
27
|
-
:param api_key: API key for the VLM provider (required for
|
28
|
+
:param api_key: API key for the VLM provider (required for all providers)
|
28
29
|
:return: Configured Outlines model instance
|
29
30
|
:raises ValueError: If provider is unsupported or API key is missing
|
30
31
|
"""
|
@@ -33,9 +34,13 @@ def make_model(
|
|
33
34
|
# Set default models if not provided
|
34
35
|
if vlm_model is None:
|
35
36
|
if vlm_provider == "gemini":
|
36
|
-
vlm_model = "gemini-
|
37
|
+
vlm_model = "gemini-2.5-pro"
|
37
38
|
elif vlm_provider == "openai":
|
38
|
-
vlm_model = "gpt-
|
39
|
+
vlm_model = "gpt-5"
|
40
|
+
elif vlm_provider == "anthropic":
|
41
|
+
vlm_model = "claude-opus-4-1"
|
42
|
+
elif vlm_provider == "openrouter":
|
43
|
+
vlm_model = "x-ai/grok-4"
|
39
44
|
|
40
45
|
if vlm_provider == "gemini":
|
41
46
|
if not api_key:
|
@@ -55,4 +60,27 @@ def make_model(
|
|
55
60
|
vlm_model,
|
56
61
|
)
|
57
62
|
|
58
|
-
|
63
|
+
if vlm_provider == "anthropic":
|
64
|
+
if not api_key:
|
65
|
+
raise ValueError("Anthropic provider requires api_key to be passed to make_model(...).")
|
66
|
+
# Create the Anthropic client and wrap it as an Outlines model
|
67
|
+
client = Anthropic(api_key=api_key)
|
68
|
+
return outlines.from_anthropic(
|
69
|
+
client,
|
70
|
+
vlm_model,
|
71
|
+
)
|
72
|
+
|
73
|
+
if vlm_provider == "openrouter":
|
74
|
+
if not api_key:
|
75
|
+
raise ValueError("OpenRouter provider requires api_key to be passed to make_model(...).")
|
76
|
+
# Create an OpenAI-compatible client pointed at the OpenRouter endpoint and wrap it as an Outlines model
|
77
|
+
client = openai.OpenAI(
|
78
|
+
base_url="https://openrouter.ai/api/v1",
|
79
|
+
api_key=api_key,
|
80
|
+
)
|
81
|
+
return outlines.from_openai(
|
82
|
+
client,
|
83
|
+
vlm_model
|
84
|
+
)
|
85
|
+
|
86
|
+
raise ValueError(f"Unsupported provider: {vlm_provider}. Use 'gemini', 'openai', or 'anthropic'.")
|
doctra/engines/vlm/service.py
CHANGED
@@ -18,6 +18,9 @@ class VLMStructuredExtractor:
|
|
18
18
|
vlm = VLMStructuredExtractor(vlm_provider="gemini", api_key="YOUR_KEY", debug=True)
|
19
19
|
chart = vlm.extract_chart("/abs/path/chart.jpg")
|
20
20
|
table = vlm.extract_table("/abs/path/table.jpg")
|
21
|
+
|
22
|
+
# Or with Anthropic:
|
23
|
+
vlm = VLMStructuredExtractor(vlm_provider="anthropic", api_key="YOUR_KEY", debug=True)
|
21
24
|
"""
|
22
25
|
|
23
26
|
def __init__(
|
@@ -34,9 +37,9 @@ class VLMStructuredExtractor:
|
|
34
37
|
Sets up the VLM model and debug settings for structured data extraction
|
35
38
|
from images.
|
36
39
|
|
37
|
-
:param vlm_provider: VLM provider to use ("gemini" or "
|
40
|
+
:param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
|
38
41
|
:param vlm_model: Model name to use (defaults to provider-specific defaults)
|
39
|
-
:param api_key: API key for the VLM provider (required for
|
42
|
+
:param api_key: API key for the VLM provider (required for all providers)
|
40
43
|
:param debug: Whether to enable debug output for error handling (default: True)
|
41
44
|
"""
|
42
45
|
self.model = make_model(
|
@@ -1,6 +1,7 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
import os
|
3
3
|
import re
|
4
|
+
import sys
|
4
5
|
from typing import List, Dict, Any
|
5
6
|
from contextlib import ExitStack
|
6
7
|
from PIL import Image, ImageDraw, ImageFont
|
@@ -19,6 +20,7 @@ from doctra.exporters.excel_writer import write_structured_excel
|
|
19
20
|
from doctra.utils.structured_utils import to_structured_dict
|
20
21
|
from doctra.exporters.markdown_table import render_markdown_table
|
21
22
|
from doctra.exporters.markdown_writer import write_markdown
|
23
|
+
from doctra.utils.progress import create_beautiful_progress_bar, create_multi_progress_bars, create_notebook_friendly_bar
|
22
24
|
|
23
25
|
|
24
26
|
class StructuredPDFParser:
|
@@ -30,7 +32,7 @@ class StructuredPDFParser:
|
|
30
32
|
converting visual elements into structured data.
|
31
33
|
|
32
34
|
:param use_vlm: Whether to use VLM for structured data extraction (default: False)
|
33
|
-
:param vlm_provider: VLM provider to use ("gemini" or "
|
35
|
+
:param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
|
34
36
|
:param vlm_model: Model name to use (defaults to provider-specific defaults)
|
35
37
|
:param vlm_api_key: API key for VLM provider (required if use_vlm is True)
|
36
38
|
:param layout_model_name: Layout detection model name (default: "PP-DocLayout_plus-L")
|
@@ -66,7 +68,7 @@ class StructuredPDFParser:
|
|
66
68
|
the VLM service for comprehensive document processing.
|
67
69
|
|
68
70
|
:param use_vlm: Whether to use VLM for structured data extraction
|
69
|
-
:param vlm_provider: VLM provider to use ("gemini" or "
|
71
|
+
:param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter")
|
70
72
|
:param vlm_model: Model name to use (defaults to provider-specific defaults)
|
71
73
|
:param vlm_api_key: API key for VLM provider
|
72
74
|
:param layout_model_name: Layout detection model name
|
@@ -130,12 +132,25 @@ class StructuredPDFParser:
|
|
130
132
|
figures_desc = "Figures (cropped)"
|
131
133
|
|
132
134
|
with ExitStack() as stack:
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
135
|
+
# Enhanced environment detection
|
136
|
+
is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
|
137
|
+
is_terminal = hasattr(sys.stdout, 'isatty') and sys.stdout.isatty()
|
138
|
+
|
139
|
+
# Use appropriate progress bars based on environment
|
140
|
+
if is_notebook:
|
141
|
+
charts_bar = stack.enter_context(
|
142
|
+
create_notebook_friendly_bar(total=chart_count, desc=charts_desc)) if chart_count else None
|
143
|
+
tables_bar = stack.enter_context(
|
144
|
+
create_notebook_friendly_bar(total=table_count, desc=tables_desc)) if table_count else None
|
145
|
+
figures_bar = stack.enter_context(
|
146
|
+
create_notebook_friendly_bar(total=fig_count, desc=figures_desc)) if fig_count else None
|
147
|
+
else:
|
148
|
+
charts_bar = stack.enter_context(
|
149
|
+
create_beautiful_progress_bar(total=chart_count, desc=charts_desc, leave=True)) if chart_count else None
|
150
|
+
tables_bar = stack.enter_context(
|
151
|
+
create_beautiful_progress_bar(total=table_count, desc=tables_desc, leave=True)) if table_count else None
|
152
|
+
figures_bar = stack.enter_context(
|
153
|
+
create_beautiful_progress_bar(total=fig_count, desc=figures_desc, leave=True)) if fig_count else None
|
139
154
|
|
140
155
|
for p in pages:
|
141
156
|
page_num = p.page_index
|
@@ -1,6 +1,7 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
import os
|
4
|
+
import sys
|
4
5
|
from typing import List, Dict, Any
|
5
6
|
from contextlib import ExitStack
|
6
7
|
from pathlib import Path
|
@@ -9,6 +10,7 @@ from PIL import Image
|
|
9
10
|
from tqdm import tqdm
|
10
11
|
|
11
12
|
from doctra.utils.pdf_io import render_pdf_to_images
|
13
|
+
from doctra.utils.progress import create_beautiful_progress_bar, create_multi_progress_bars, create_notebook_friendly_bar
|
12
14
|
from doctra.engines.layout.paddle_layout import PaddleLayoutEngine
|
13
15
|
from doctra.engines.layout.layout_models import LayoutPage
|
14
16
|
|
@@ -34,7 +36,7 @@ class ChartTablePDFParser:
|
|
34
36
|
:param extract_charts: Whether to extract charts from the document (default: True)
|
35
37
|
:param extract_tables: Whether to extract tables from the document (default: True)
|
36
38
|
:param use_vlm: Whether to use VLM for structured data extraction (default: False)
|
37
|
-
:param vlm_provider: VLM provider to use ("gemini" or "
|
39
|
+
:param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
|
38
40
|
:param vlm_model: Model name to use (defaults to provider-specific defaults)
|
39
41
|
:param vlm_api_key: API key for VLM provider (required if use_vlm is True)
|
40
42
|
:param layout_model_name: Layout detection model name (default: "PP-DocLayout_plus-L")
|
@@ -64,7 +66,7 @@ class ChartTablePDFParser:
|
|
64
66
|
:param extract_charts: Whether to extract charts from the document
|
65
67
|
:param extract_tables: Whether to extract tables from the document
|
66
68
|
:param use_vlm: Whether to use VLM for structured data extraction
|
67
|
-
:param vlm_provider: VLM provider to use ("gemini" or "
|
69
|
+
:param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter")
|
68
70
|
:param vlm_model: Model name to use (defaults to provider-specific defaults)
|
69
71
|
:param vlm_api_key: API key for VLM provider
|
70
72
|
:param layout_model_name: Layout detection model name
|
@@ -149,10 +151,21 @@ class ChartTablePDFParser:
|
|
149
151
|
table_counter = 1
|
150
152
|
|
151
153
|
with ExitStack() as stack:
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
154
|
+
# Enhanced environment detection
|
155
|
+
is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
|
156
|
+
is_terminal = hasattr(sys.stdout, 'isatty') and sys.stdout.isatty()
|
157
|
+
|
158
|
+
# Use appropriate progress bars based on environment
|
159
|
+
if is_notebook:
|
160
|
+
charts_bar = stack.enter_context(
|
161
|
+
create_notebook_friendly_bar(total=chart_count, desc=charts_desc)) if chart_count else None
|
162
|
+
tables_bar = stack.enter_context(
|
163
|
+
create_notebook_friendly_bar(total=table_count, desc=tables_desc)) if table_count else None
|
164
|
+
else:
|
165
|
+
charts_bar = stack.enter_context(
|
166
|
+
create_beautiful_progress_bar(total=chart_count, desc=charts_desc, leave=True)) if chart_count else None
|
167
|
+
tables_bar = stack.enter_context(
|
168
|
+
create_beautiful_progress_bar(total=table_count, desc=tables_desc, leave=True)) if table_count else None
|
156
169
|
|
157
170
|
for p in pages:
|
158
171
|
page_num = p.page_index
|
doctra/utils/progress.py
ADDED
@@ -0,0 +1,277 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import os
|
4
|
+
import sys
|
5
|
+
from typing import Optional, Dict, Any
|
6
|
+
from tqdm import tqdm
|
7
|
+
from tqdm.auto import tqdm as tqdm_auto
|
8
|
+
|
9
|
+
|
10
|
+
def create_beautiful_progress_bar(
|
11
|
+
total: int,
|
12
|
+
desc: str,
|
13
|
+
leave: bool = True,
|
14
|
+
position: Optional[int] = None,
|
15
|
+
**kwargs
|
16
|
+
) -> tqdm:
|
17
|
+
"""
|
18
|
+
Create a beautiful and interactive tqdm progress bar with enhanced styling.
|
19
|
+
|
20
|
+
Features:
|
21
|
+
- Colour-coded progress bars chosen from the description (applied only in a terminal, not in notebooks)
|
22
|
+
- Emoji icons for different operations
|
23
|
+
- Better formatting and spacing
|
24
|
+
- Interactive features
|
25
|
+
- Responsive design
|
26
|
+
|
27
|
+
:param total: Total number of items to process
|
28
|
+
:param desc: Description text for the progress bar
|
29
|
+
:param leave: Whether to leave the progress bar after completion
|
30
|
+
:param position: Position of the progress bar (for multiple bars)
|
31
|
+
:param kwargs: Additional tqdm parameters
|
32
|
+
:return: Configured tqdm progress bar instance
|
33
|
+
"""
|
34
|
+
|
35
|
+
# Enhanced styling parameters - notebook-friendly format
|
36
|
+
if "ipykernel" in sys.modules:
|
37
|
+
# Simpler format for notebooks to avoid display issues
|
38
|
+
bar_format = "{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]"
|
39
|
+
else:
|
40
|
+
# Full format for terminal
|
41
|
+
bar_format = (
|
42
|
+
"{l_bar}{bar:30}| {n_fmt}/{total_fmt} "
|
43
|
+
"[{elapsed}<{remaining}, {rate_fmt}{postfix}]"
|
44
|
+
)
|
45
|
+
|
46
|
+
# Color schemes based on operation type
|
47
|
+
color_schemes = {
|
48
|
+
"loading": {"colour": "cyan", "ncols": 100},
|
49
|
+
"charts": {"colour": "green", "ncols": 100},
|
50
|
+
"tables": {"colour": "blue", "ncols": 100},
|
51
|
+
"figures": {"colour": "magenta", "ncols": 100},
|
52
|
+
"ocr": {"colour": "yellow", "ncols": 100},
|
53
|
+
"vlm": {"colour": "red", "ncols": 100},
|
54
|
+
"processing": {"colour": "white", "ncols": 100},
|
55
|
+
}
|
56
|
+
|
57
|
+
# Determine color scheme based on description
|
58
|
+
desc_lower = desc.lower()
|
59
|
+
if "loading" in desc_lower or "model" in desc_lower:
|
60
|
+
color_scheme = color_schemes["loading"]
|
61
|
+
elif "chart" in desc_lower:
|
62
|
+
color_scheme = color_schemes["charts"]
|
63
|
+
elif "table" in desc_lower:
|
64
|
+
color_scheme = color_schemes["tables"]
|
65
|
+
elif "figure" in desc_lower:
|
66
|
+
color_scheme = color_schemes["figures"]
|
67
|
+
elif "ocr" in desc_lower:
|
68
|
+
color_scheme = color_schemes["ocr"]
|
69
|
+
elif "vlm" in desc_lower:
|
70
|
+
color_scheme = color_schemes["vlm"]
|
71
|
+
else:
|
72
|
+
color_scheme = color_schemes["processing"]
|
73
|
+
|
74
|
+
# Add emoji icons to descriptions
|
75
|
+
emoji_map = {
|
76
|
+
"loading": "🔄",
|
77
|
+
"charts": "📊",
|
78
|
+
"tables": "📋",
|
79
|
+
"figures": "🖼️",
|
80
|
+
"ocr": "🔍",
|
81
|
+
"vlm": "🤖",
|
82
|
+
"processing": "⚙️",
|
83
|
+
}
|
84
|
+
|
85
|
+
# Add appropriate emoji to description
|
86
|
+
for key, emoji in emoji_map.items():
|
87
|
+
if key in desc_lower:
|
88
|
+
desc = f"{emoji} {desc}"
|
89
|
+
break
|
90
|
+
else:
|
91
|
+
desc = f"⚙️ {desc}"
|
92
|
+
|
93
|
+
# Enhanced tqdm configuration
|
94
|
+
tqdm_config = {
|
95
|
+
"total": total,
|
96
|
+
"desc": desc,
|
97
|
+
"leave": leave,
|
98
|
+
"bar_format": bar_format,
|
99
|
+
"ncols": color_scheme["ncols"],
|
100
|
+
"ascii": False, # Use Unicode characters for better appearance
|
101
|
+
"dynamic_ncols": True, # Responsive width
|
102
|
+
"smoothing": 0.3, # Smooth progress updates
|
103
|
+
"mininterval": 0.1, # Minimum update interval
|
104
|
+
"maxinterval": 1.0, # Maximum update interval
|
105
|
+
"position": position,
|
106
|
+
**kwargs
|
107
|
+
}
|
108
|
+
|
109
|
+
# Enhanced environment detection
|
110
|
+
is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
|
111
|
+
is_terminal = hasattr(sys.stdout, 'isatty') and sys.stdout.isatty()
|
112
|
+
|
113
|
+
# Add color only for terminal environments (not notebooks)
|
114
|
+
if not is_notebook and is_terminal:
|
115
|
+
tqdm_config["colour"] = color_scheme["colour"]
|
116
|
+
|
117
|
+
# Use auto tqdm for better Jupyter notebook support
|
118
|
+
if is_notebook:
|
119
|
+
# In notebooks, don't use color to avoid ANSI code issues
|
120
|
+
tqdm_config.pop("colour", None) # Remove color in notebooks
|
121
|
+
return tqdm_auto(**tqdm_config)
|
122
|
+
else:
|
123
|
+
# In terminal/cmd/powershell, we can use colors
|
124
|
+
return tqdm(**tqdm_config)
|
125
|
+
|
126
|
+
|
127
|
+
def create_multi_progress_bars(
|
128
|
+
descriptions: list[str],
|
129
|
+
totals: list[int],
|
130
|
+
positions: Optional[list[int]] = None
|
131
|
+
) -> list[tqdm]:
|
132
|
+
"""
|
133
|
+
Create multiple beautiful progress bars for concurrent operations.
|
134
|
+
|
135
|
+
:param descriptions: List of descriptions for each progress bar
|
136
|
+
:param totals: List of totals for each progress bar
|
137
|
+
:param positions: Optional list of positions for each bar
|
138
|
+
:return: List of configured tqdm progress bar instances
|
139
|
+
"""
|
140
|
+
if positions is None:
|
141
|
+
positions = list(range(len(descriptions)))
|
142
|
+
|
143
|
+
bars = []
|
144
|
+
for desc, total, pos in zip(descriptions, totals, positions):
|
145
|
+
bar = create_beautiful_progress_bar(
|
146
|
+
total=total,
|
147
|
+
desc=desc,
|
148
|
+
position=pos,
|
149
|
+
leave=True
|
150
|
+
)
|
151
|
+
bars.append(bar)
|
152
|
+
|
153
|
+
return bars
|
154
|
+
|
155
|
+
|
156
|
+
def update_progress_with_info(
|
157
|
+
bar: tqdm,
|
158
|
+
increment: int = 1,
|
159
|
+
info: Optional[Dict[str, Any]] = None
|
160
|
+
) -> None:
|
161
|
+
"""
|
162
|
+
Update progress bar with additional information.
|
163
|
+
|
164
|
+
:param bar: tqdm progress bar instance
|
165
|
+
:param increment: Number to increment the progress
|
166
|
+
:param info: Optional dictionary of information to display
|
167
|
+
"""
|
168
|
+
if info:
|
169
|
+
# Format info as postfix
|
170
|
+
postfix_parts = []
|
171
|
+
for key, value in info.items():
|
172
|
+
if isinstance(value, float):
|
173
|
+
postfix_parts.append(f"{key}: {value:.2f}")
|
174
|
+
else:
|
175
|
+
postfix_parts.append(f"{key}: {value}")
|
176
|
+
|
177
|
+
bar.set_postfix_str(", ".join(postfix_parts))
|
178
|
+
|
179
|
+
bar.update(increment)
|
180
|
+
|
181
|
+
|
182
|
+
def create_loading_bar(desc: str = "Loading", **kwargs) -> tqdm:
|
183
|
+
"""
|
184
|
+
Create a special loading progress bar for model initialization.
|
185
|
+
|
186
|
+
:param desc: Description for the loading operation
|
187
|
+
:param kwargs: Additional tqdm parameters
|
188
|
+
:return: Configured loading progress bar
|
189
|
+
"""
|
190
|
+
return create_beautiful_progress_bar(
|
191
|
+
total=1,
|
192
|
+
desc=desc,
|
193
|
+
leave=True,
|
194
|
+
**kwargs
|
195
|
+
)
|
196
|
+
|
197
|
+
|
198
|
+
def create_processing_bar(
|
199
|
+
total: int,
|
200
|
+
operation: str,
|
201
|
+
**kwargs
|
202
|
+
) -> tqdm:
|
203
|
+
"""
|
204
|
+
Create a processing progress bar for data operations.
|
205
|
+
|
206
|
+
:param total: Total number of items to process
|
207
|
+
:param operation: Type of operation (charts, tables, figures, etc.)
|
208
|
+
:param kwargs: Additional tqdm parameters
|
209
|
+
:return: Configured processing progress bar
|
210
|
+
"""
|
211
|
+
desc = f"{operation.title()} (processing)"
|
212
|
+
return create_beautiful_progress_bar(
|
213
|
+
total=total,
|
214
|
+
desc=desc,
|
215
|
+
leave=True,
|
216
|
+
**kwargs
|
217
|
+
)
|
218
|
+
|
219
|
+
|
220
|
+
def create_notebook_friendly_bar(
|
221
|
+
total: int,
|
222
|
+
desc: str,
|
223
|
+
**kwargs
|
224
|
+
) -> tqdm:
|
225
|
+
"""
|
226
|
+
Create a notebook-friendly progress bar with minimal formatting.
|
227
|
+
|
228
|
+
This function creates progress bars specifically optimized for Jupyter notebooks
|
229
|
+
to avoid display issues and ANSI code problems.
|
230
|
+
|
231
|
+
:param total: Total number of items to process
|
232
|
+
:param desc: Description text for the progress bar
|
233
|
+
:param kwargs: Additional tqdm parameters
|
234
|
+
:return: Configured notebook-friendly progress bar
|
235
|
+
"""
|
236
|
+
# Force notebook mode
|
237
|
+
kwargs["disable"] = False
|
238
|
+
kwargs["ascii"] = True # Use ASCII characters for better notebook compatibility
|
239
|
+
|
240
|
+
# Add emoji icons to descriptions (same as beautiful bars)
|
241
|
+
emoji_map = {
|
242
|
+
"loading": "🔄",
|
243
|
+
"charts": "📊",
|
244
|
+
"tables": "📋",
|
245
|
+
"figures": "🖼️",
|
246
|
+
"ocr": "🔍",
|
247
|
+
"vlm": "🤖",
|
248
|
+
"processing": "⚙️",
|
249
|
+
}
|
250
|
+
|
251
|
+
# Add appropriate emoji to description
|
252
|
+
desc_lower = desc.lower()
|
253
|
+
for key, emoji in emoji_map.items():
|
254
|
+
if key in desc_lower:
|
255
|
+
desc = f"{emoji} {desc}"
|
256
|
+
break
|
257
|
+
else:
|
258
|
+
desc = f"⚙️ {desc}"
|
259
|
+
|
260
|
+
# Simple format for notebooks
|
261
|
+
bar_format = "{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt}"
|
262
|
+
|
263
|
+
tqdm_config = {
|
264
|
+
"total": total,
|
265
|
+
"desc": desc,
|
266
|
+
"leave": True,
|
267
|
+
"bar_format": bar_format,
|
268
|
+
"ncols": 80,
|
269
|
+
"ascii": True,
|
270
|
+
"dynamic_ncols": False, # Fixed width for notebooks
|
271
|
+
"smoothing": 0.1, # Faster updates
|
272
|
+
"mininterval": 0.05,
|
273
|
+
"maxinterval": 0.5,
|
274
|
+
**kwargs
|
275
|
+
}
|
276
|
+
|
277
|
+
return tqdm_auto(**tqdm_config)
|
doctra/version.py
CHANGED
@@ -1,2 +1,2 @@
|
|
1
1
|
"""Version information for Doctra."""
|
2
|
-
__version__ = '0.
|
2
|
+
__version__ = '0.2.0'
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: doctra
|
3
|
-
Version: 0.
|
3
|
+
Version: 0.2.0
|
4
4
|
Summary: Parse, extract, and analyze documents with ease
|
5
5
|
Home-page: https://github.com/AdemBoukhris457/Doctra
|
6
6
|
Author: Adem Boukhris
|
@@ -241,6 +241,8 @@ Provides-Extra: openai
|
|
241
241
|
Requires-Dist: openai>=1.0.0; extra == "openai"
|
242
242
|
Provides-Extra: gemini
|
243
243
|
Requires-Dist: google-generativeai>=0.3.0; extra == "gemini"
|
244
|
+
Provides-Extra: anthropic
|
245
|
+
Requires-Dist: anthropic>=0.40.0; extra == "anthropic"
|
244
246
|
Provides-Extra: dev
|
245
247
|
Requires-Dist: pytest>=6.0; extra == "dev"
|
246
248
|
Requires-Dist: pytest-cov>=2.0; extra == "dev"
|
@@ -256,13 +258,13 @@ Dynamic: requires-python
|
|
256
258
|
|
257
259
|
# 🚀 **Doctra - Document Parser Library** 📑🔎
|
258
260
|
|
259
|
-

|
260
262
|
|
261
263
|
<div align="center">
|
262
264
|
|
263
|
-
[](https://github.com/AdemBoukhris457/Doctra)
|
266
|
+
[](https://github.com/AdemBoukhris457/Doctra)
|
267
|
+
[](https://pypi.org/project/doctra/)
|
266
268
|
</div>
|
267
269
|
|
268
270
|
## 📋 Table of Contents
|
@@ -329,7 +331,7 @@ parser = StructuredPDFParser()
|
|
329
331
|
# Parser with VLM for structured data extraction
|
330
332
|
parser = StructuredPDFParser(
|
331
333
|
use_vlm=True,
|
332
|
-
vlm_provider="openai", # or "gemini"
|
334
|
+
vlm_provider="openai", # or "gemini", "anthropic", or "openrouter"
|
333
335
|
vlm_api_key="your_api_key_here"
|
334
336
|
)
|
335
337
|
|
@@ -344,7 +346,7 @@ parser = StructuredPDFParser(
|
|
344
346
|
# VLM Settings
|
345
347
|
use_vlm=True,
|
346
348
|
vlm_provider="openai",
|
347
|
-
vlm_model="gpt-
|
349
|
+
vlm_model="gpt-5",
|
348
350
|
vlm_api_key="your_api_key",
|
349
351
|
|
350
352
|
# Layout Detection Settings
|
@@ -406,7 +408,7 @@ parser = ChartTablePDFParser(
|
|
406
408
|
# VLM Settings
|
407
409
|
use_vlm=True,
|
408
410
|
vlm_provider="openai",
|
409
|
-
vlm_model="gpt-
|
411
|
+
vlm_model="gpt-5",
|
410
412
|
vlm_api_key="your_api_key",
|
411
413
|
|
412
414
|
# Layout Detection Settings
|
@@ -545,7 +547,7 @@ parser = StructuredPDFParser(
|
|
545
547
|
use_vlm=True,
|
546
548
|
vlm_provider="openai",
|
547
549
|
vlm_api_key="your_openai_api_key",
|
548
|
-
vlm__model="gpt-
|
550
|
+
vlm_model="gpt-5",
|
549
551
|
layout_model_name="PP-DocLayout_plus-L",
|
550
552
|
dpi=300, # Higher DPI for better quality
|
551
553
|
min_score=0.5, # Higher confidence threshold
|
@@ -623,4 +625,4 @@ parser.display_pages_with_boxes("document.pdf")
|
|
623
625
|
- **Pandas**: Data manipulation
|
624
626
|
- **OpenPyXL**: Excel file generation
|
625
627
|
- **Google Generative AI**: For Gemini VLM integration
|
626
|
-
- **OpenAI**: For GPT-
|
628
|
+
- **OpenAI**: For GPT-5 VLM integration
|
@@ -1,20 +1,20 @@
|
|
1
1
|
doctra/__init__.py,sha256=-Pkx0Vh4Hz3EQvLaxlL6Mo4lVig59FTN5LvUcxThn4U,519
|
2
|
-
doctra/version.py,sha256=
|
2
|
+
doctra/version.py,sha256=oXtS5MRUB2QfE2Q8GOIq0p_iwA9QH5_2LxFqVKJlb_I,60
|
3
3
|
doctra/cli/__init__.py,sha256=4PTujjYRShOOUlZ7PwuWckShPWLC4v4CYIhJpzgyv1k,911
|
4
|
-
doctra/cli/main.py,sha256=
|
5
|
-
doctra/cli/utils.py,sha256=
|
4
|
+
doctra/cli/main.py,sha256=aRxV0yMtswwXKcBrIE7rxMvZCsFSjCVrE5rIqKzYGOY,35368
|
5
|
+
doctra/cli/utils.py,sha256=IghiUZQCOmXODC5-5smHGz2KeV4xqbP4avmA1Mggln0,11800
|
6
6
|
doctra/engines/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
7
|
doctra/engines/layout/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
8
8
|
doctra/engines/layout/layout_models.py,sha256=vuTzjWd3FD-SkFPngktmUVhOJ6Xvff6ufwFEq796PQs,3162
|
9
|
-
doctra/engines/layout/paddle_layout.py,sha256=
|
9
|
+
doctra/engines/layout/paddle_layout.py,sha256=N9Bzt6372BfWUtQspYqh6PpYWOndjoIYET0_OJU85cs,9405
|
10
10
|
doctra/engines/ocr/__init__.py,sha256=h6bFiveGXdI59fsKzCqOXki3C74DCndEmvloOtMqnR0,133
|
11
11
|
doctra/engines/ocr/api.py,sha256=YOBKDLExXpvSiOsc_TDJasaMPxzdVx1llQCtYlsruWo,1280
|
12
12
|
doctra/engines/ocr/path_resolver.py,sha256=2_7Nsekt3dCDU3oVsgdr62iMrlAhbGNfYwgh4G7S3pA,1492
|
13
13
|
doctra/engines/ocr/pytesseract_engine.py,sha256=Imz2uwju6himkBiS8CH7DLxBRe-LtmMYZiOdb_6PoQw,2911
|
14
14
|
doctra/engines/vlm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
15
15
|
doctra/engines/vlm/outlines_types.py,sha256=qL-G6MNiA5mxp1qAPVEFhOANp4NqVt_MQKseJCr_xXE,970
|
16
|
-
doctra/engines/vlm/provider.py,sha256=
|
17
|
-
doctra/engines/vlm/service.py,sha256=
|
16
|
+
doctra/engines/vlm/provider.py,sha256=njkz99NXZQjkPlRKeje9M_tlaktXyw3VnpFT7enNalk,3216
|
17
|
+
doctra/engines/vlm/service.py,sha256=uD4BXz3u7B_3iq-xU3MTdDDyjrj1Jm8MDeJU1KXHTZc,5121
|
18
18
|
doctra/exporters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
19
19
|
doctra/exporters/excel_writer.py,sha256=U5Eb5SF7_ll1QveUapSWSkCRt3OEoisKEVUQ_7X8Wjo,7762
|
20
20
|
doctra/exporters/image_saver.py,sha256=zsPoQ0CwoE643ui4iZMdXk96kv5mU8L_zC2JfF22N1A,1639
|
@@ -22,8 +22,8 @@ doctra/exporters/markdown_table.py,sha256=4_OJIwG_WoIPYBzJx1njy_3tNVdkK6QKSP-P9r
|
|
22
22
|
doctra/exporters/markdown_writer.py,sha256=L7EjF2MB8jYX7XkZ3a3NeeEC8gnb0qzRPTzIN9tdfuw,1027
|
23
23
|
doctra/parsers/__init__.py,sha256=8M6LVzcWGpuTIK_1SMXML3ll7zK1CTHXGI5qXvqdm-A,206
|
24
24
|
doctra/parsers/layout_order.py,sha256=W6b-T11H907RZ2FaZwNvnYhmvH11rpUzxC5yLkdf28k,640
|
25
|
-
doctra/parsers/structured_pdf_parser.py,sha256=
|
26
|
-
doctra/parsers/table_chart_extractor.py,sha256
|
25
|
+
doctra/parsers/structured_pdf_parser.py,sha256=4T4zYZWbqqtRua_TPSRmjT1tOc1RE-XSMOLC5fVFJk0,21070
|
26
|
+
doctra/parsers/table_chart_extractor.py,sha256=-pyJFYzFVdxDwCD1z5BHhT6qyf9BljbfccKGoMJiD90,13591
|
27
27
|
doctra/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
28
28
|
doctra/utils/bbox.py,sha256=R2-95p0KiWvet3TH27TQVvCar7WJg6z0u3L21iEDF-A,674
|
29
29
|
doctra/utils/constants.py,sha256=ZWOvNDrvETbQ_pxHiX7vUW4J5Oj8_qnov0QacUOBizI,189
|
@@ -31,10 +31,11 @@ doctra/utils/file_ops.py,sha256=3IS0EQncs6Kaj27fcg2zxQX3xRSvtItIsyKGLYgeOgw,815
|
|
31
31
|
doctra/utils/io_utils.py,sha256=L1bWV4-ybs2j_3ZEN7GfQVgdC73JKVECVnpwKbP0dy0,219
|
32
32
|
doctra/utils/ocr_utils.py,sha256=Doa1uYBg3kRgRYd2aPq9fICHgHfrM_efdhZfI7jl6OM,780
|
33
33
|
doctra/utils/pdf_io.py,sha256=c8EY47Z1iqVtlLFHS_n0qGuXJ5ERFaMUd84ivXV0b9E,706
|
34
|
+
doctra/utils/progress.py,sha256=GSjHkNulwqX-Uh_QNP-g-nZH6F-zAwQC120KeTRkRlo,8752
|
34
35
|
doctra/utils/quiet.py,sha256=5XPS-1CtJ0sVk6qgSQctdhr_wR8mP1xoJLoUbmkXROA,387
|
35
36
|
doctra/utils/structured_utils.py,sha256=EdNhCUDLKvYcLqXbTGveNtIRGyQ3yzYhTh-zy_awwM4,1450
|
36
|
-
doctra-0.
|
37
|
-
doctra-0.
|
38
|
-
doctra-0.
|
39
|
-
doctra-0.
|
40
|
-
doctra-0.
|
37
|
+
doctra-0.2.0.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
|
38
|
+
doctra-0.2.0.dist-info/METADATA,sha256=zxVmrkHUI4puc1D8fdUFbRb8WQAL0M4X92v-UaZswPI,26862
|
39
|
+
doctra-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
40
|
+
doctra-0.2.0.dist-info/top_level.txt,sha256=jI7E8jHci2gP9y0GYaWxlg9jG0O5n3FjHJJPLXDXMds,7
|
41
|
+
doctra-0.2.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|