doctra 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctra/__init__.py +19 -0
- doctra/cli/__init__.py +27 -0
- doctra/cli/main.py +856 -0
- doctra/cli/utils.py +340 -0
- doctra/engines/__init__.py +0 -0
- doctra/engines/layout/__init__.py +0 -0
- doctra/engines/layout/layout_models.py +90 -0
- doctra/engines/layout/paddle_layout.py +225 -0
- doctra/engines/ocr/__init__.py +4 -0
- doctra/engines/ocr/api.py +36 -0
- doctra/engines/ocr/path_resolver.py +48 -0
- doctra/engines/ocr/pytesseract_engine.py +76 -0
- doctra/engines/vlm/__init__.py +0 -0
- doctra/engines/vlm/outlines_types.py +31 -0
- doctra/engines/vlm/provider.py +58 -0
- doctra/engines/vlm/service.py +117 -0
- doctra/exporters/__init__.py +0 -0
- doctra/exporters/excel_writer.py +197 -0
- doctra/exporters/image_saver.py +42 -0
- doctra/exporters/markdown_table.py +56 -0
- doctra/exporters/markdown_writer.py +29 -0
- doctra/parsers/__init__.py +6 -0
- doctra/parsers/layout_order.py +16 -0
- doctra/parsers/structured_pdf_parser.py +434 -0
- doctra/parsers/table_chart_extractor.py +283 -0
- doctra/utils/__init__.py +0 -0
- doctra/utils/bbox.py +18 -0
- doctra/utils/constants.py +8 -0
- doctra/utils/file_ops.py +26 -0
- doctra/utils/io_utils.py +10 -0
- doctra/utils/ocr_utils.py +20 -0
- doctra/utils/pdf_io.py +19 -0
- doctra/utils/quiet.py +13 -0
- doctra/utils/structured_utils.py +49 -0
- doctra/version.py +2 -0
- doctra-0.1.0.dist-info/METADATA +626 -0
- doctra-0.1.0.dist-info/RECORD +40 -0
- doctra-0.1.0.dist-info/WHEEL +5 -0
- doctra-0.1.0.dist-info/licenses/LICENSE +201 -0
- doctra-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,48 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import os
|
4
|
+
import platform
|
5
|
+
import shutil
|
6
|
+
from typing import Optional
|
7
|
+
|
8
|
+
def resolve_tesseract_cmd(tesseract_cmd: Optional[str] = None) -> Optional[str]:
    """
    Best-effort discovery of the Tesseract executable.

    Candidates are tried in priority order and the first usable one wins:
    1. Explicitly provided path or command name
    2. TESSERACT_CMD environment variable (path or command name)
    3. ``tesseract`` on the system PATH
    4. Common installation paths for the current platform

    A configured value (steps 1 and 2) is accepted when it names a regular
    file; otherwise it is looked up with ``shutil.which`` so that a bare
    command name such as ``"tesseract"`` also works. Directories are never
    returned.

    :param tesseract_cmd: Optional explicit path (or command name) of the tesseract executable
    :return: Resolved path to tesseract executable, or None if not found
    """
    for configured in (tesseract_cmd, os.getenv("TESSERACT_CMD")):
        if not configured:
            continue
        # A plain existence check would also accept a directory, which can
        # never be executed; require a regular file instead.
        if os.path.isfile(configured):
            return configured
        # Not a file path: maybe it is a command name resolvable via PATH.
        located = shutil.which(configured)
        if located:
            return located

    on_path = shutil.which("tesseract")
    if on_path:
        return on_path

    system = platform.system()
    if system == "Windows":
        candidates = [
            r"C:\Program Files\Tesseract-OCR\tesseract.exe",
            r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe",
        ]
    elif system == "Darwin":
        candidates = ["/opt/homebrew/bin/tesseract", "/usr/local/bin/tesseract"]
    else:  # Linux/Unix
        candidates = ["/usr/bin/tesseract", "/usr/local/bin/tesseract"]

    return next((path for path in candidates if os.path.isfile(path)), None)
|
@@ -0,0 +1,76 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from typing import Optional
|
4
|
+
from PIL import Image
|
5
|
+
import pytesseract
|
6
|
+
|
7
|
+
from .path_resolver import resolve_tesseract_cmd
|
8
|
+
|
9
|
+
|
10
|
+
class PytesseractOCREngine:
    """
    Small wrapper that runs Tesseract (through pytesseract) on one image.

    The engine stores the recognition settings once and applies them to
    every call of :meth:`recognize`, which takes a PIL image and returns
    the recognised text.

    :param tesseract_cmd: Optional path to tesseract executable
    :param lang: OCR language code (default: "eng")
    :param psm: Tesseract page segmentation mode (default: 4)
    :param oem: Tesseract OCR engine mode (default: 3)
    :param extra_config: Additional Tesseract configuration string (default: "")
    """

    def __init__(
        self,
        tesseract_cmd: Optional[str] = None,
        lang: str = "eng",
        psm: int = 4,
        oem: int = 3,
        extra_config: str = "",
    ):
        """
        Store the OCR settings and point pytesseract at the Tesseract binary.

        When ``resolve_tesseract_cmd`` returns a value, it is written to the
        module-level ``pytesseract.pytesseract.tesseract_cmd``; that setting
        is shared by every user of pytesseract in the process.

        :param tesseract_cmd: Optional path to tesseract executable
        :param lang: OCR language code (default: "eng")
        :param psm: Tesseract page segmentation mode (default: 4)
        :param oem: Tesseract OCR engine mode (default: 3)
        :param extra_config: Additional Tesseract configuration string (default: "")
        """
        resolved = resolve_tesseract_cmd(tesseract_cmd)
        if resolved:
            pytesseract.pytesseract.tesseract_cmd = resolved
        # Nothing resolved: pytesseract keeps its own setting and any
        # failure surfaces when recognize() is called.

        self.lang, self.psm, self.oem = lang, psm, oem
        self.extra_config = (extra_config or "").strip()

    def recognize(self, image: Image.Image) -> str:
        """
        Recognise the text in *image* and return it without surrounding whitespace.

        The Tesseract config string is built as ``--psm <psm> --oem <oem>``
        followed by ``extra_config`` when that is non-empty.

        :param image: PIL Image object to perform OCR on
        :return: Extracted text string with leading/trailing whitespace removed
        :raises TypeError: If the input is not a PIL Image object
        """
        if not isinstance(image, Image.Image):
            raise TypeError("PytesseractOCREngine expects a PIL.Image.Image as input.")

        config = f"--psm {self.psm} --oem {self.oem}"
        if self.extra_config:
            config = f"{config} {self.extra_config}"

        return pytesseract.image_to_string(image, lang=self.lang, config=config).strip()
|
File without changes
|
@@ -0,0 +1,31 @@
|
|
1
|
+
from pydantic import BaseModel
|
2
|
+
|
3
|
+
class Chart(BaseModel):
    """
    Structured representation of a chart extracted from an image.

    Contains the title, headers, and data rows extracted from a chart
    using VLM (Vision Language Model) processing.

    :param title: Title or caption of the chart
    :param headers: Column headers for the chart data
    :param rows: Data rows containing the chart values
    """
    # NOTE(review): pydantic appears to expose the class docstring and the
    # field names/types below in the generated JSON schema, so edits here may
    # change what the VLM is asked to produce -- confirm before rewording.
    # All three fields are required (no defaults are declared).
    title: str
    # One string per column.
    headers: list[str]
    # One inner list per data row; every cell is declared as a string.
    rows: list[list[str]]
|
17
|
+
|
18
|
+
class Table(BaseModel):
    """
    Structured representation of a table extracted from an image.

    Contains the title, headers, and data rows extracted from a table
    using VLM (Vision Language Model) processing.

    :param title: Title or caption of the table
    :param headers: Column headers for the table data
    :param rows: Data rows containing the table values
    """
    # NOTE(review): pydantic appears to expose the class docstring and the
    # field names/types below in the generated JSON schema, so edits here may
    # change what the VLM is asked to produce -- confirm before rewording.
    # All three fields are required (no defaults are declared).
    title: str
    # One string per column.
    headers: list[str]
    # One inner list per data row; every cell is declared as a string.
    rows: list[list[str]]
|
@@ -0,0 +1,58 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
# --- imports (io, PIL, BaseModel and Image are not referenced in this module) ---
|
4
|
+
import io
|
5
|
+
import PIL
|
6
|
+
import openai
|
7
|
+
import outlines
|
8
|
+
from pydantic import BaseModel
|
9
|
+
from google.genai import Client
|
10
|
+
from outlines.inputs import Image
|
11
|
+
# ------------------------------------------------------
|
12
|
+
|
13
|
+
def make_model(
    vlm_provider: str | None = "gemini",
    vlm_model: str | None = None,
    *,
    api_key: str | None = None,
):
    """
    Create the Outlines model object used for VLM calls.

    The provider name is lower-cased (``None`` or empty means "gemini") and
    must be "gemini" or "openai". When no model name is given, the provider
    default is used: "gemini-1.5-flash-latest" for Gemini and "gpt-4o" for
    OpenAI.

    :param vlm_provider: VLM provider to use ("gemini" or "openai", default: "gemini")
    :param vlm_model: Model name to use (defaults to provider-specific defaults)
    :param api_key: API key for the VLM provider (required for both Gemini and OpenAI)
    :return: Configured Outlines model instance
    :raises ValueError: If provider is unsupported or API key is missing
    """
    provider = (vlm_provider or "gemini").lower()

    default_models = {
        "gemini": "gemini-1.5-flash-latest",
        "openai": "gpt-4o",
    }
    # Unknown providers are rejected before the API key is looked at.
    if provider not in default_models:
        raise ValueError(f"Unsupported provider: {provider}. Use 'gemini' or 'openai'.")

    model_name = default_models[provider] if vlm_model is None else vlm_model

    if provider == "gemini":
        if not api_key:
            raise ValueError("Gemini provider requires api_key to be passed to make_model(...).")
        gemini_client = Client(api_key=api_key)
        return outlines.from_gemini(gemini_client, model_name)

    # Only "openai" can reach this point.
    if not api_key:
        raise ValueError("OpenAI provider requires api_key to be passed to make_model(...).")
    openai_client = openai.OpenAI(api_key=api_key)
    return outlines.from_openai(openai_client, model_name)
|
@@ -0,0 +1,117 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
import os
|
3
|
+
from outlines.inputs import Image
|
4
|
+
|
5
|
+
from ...utils.io_utils import get_image_from_local
|
6
|
+
from .outlines_types import Chart, Table
|
7
|
+
from .provider import make_model
|
8
|
+
|
9
|
+
|
10
|
+
class VLMStructuredExtractor:
    """
    Thin service around prompts + Outlines calls for structured data extraction.

    Builds one model via ``make_model`` at construction time and exposes
    ``extract_chart`` / ``extract_table``, each of which sends a fixed prompt
    plus the image to that model together with the ``Chart`` or ``Table``
    schema.

    Usage:
        vlm = VLMStructuredExtractor(vlm_provider="gemini", api_key="YOUR_KEY", debug=True)
        chart = vlm.extract_chart("/abs/path/chart.jpg")
        table = vlm.extract_table("/abs/path/table.jpg")
    """

    # Prompts are built from adjacent string literals; every fragment except
    # the last must end with a space, otherwise two sentences are fused
    # together (e.g. "accurately.The number ...").
    _CHART_PROMPT = (
        "Convert the given chart into a table format with headers and rows. "
        "If the title is not present in the image, generate a suitable title. "
        "Ensure that the table represents the data from the chart accurately. "
        "The number of columns in the headers must match the number of columns in each row."
    )
    _TABLE_PROMPT = (
        "Extract the data from the given table in image format. "
        "Provide the headers and rows of the table, ensuring accuracy in the extraction. "
        "If the title is not present in the image, generate a suitable title. "
        "The number of columns in the headers must match the number of columns in each row."
    )

    def __init__(
        self,
        vlm_provider: str = "gemini",
        vlm_model: str | None = None,
        *,
        api_key: str | None = None,
        debug: bool = True,
    ):
        """
        Initialize the VLMStructuredExtractor with provider configuration.

        :param vlm_provider: VLM provider to use ("gemini" or "openai", default: "gemini")
        :param vlm_model: Model name to use (defaults to provider-specific defaults)
        :param api_key: API key for the VLM provider (required for both Gemini and OpenAI)
        :param debug: Whether to print error details before re-raising (default: True)
        """
        self.model = make_model(
            vlm_provider,
            vlm_model,
            api_key=api_key,
        )
        self.debug = debug

    def _call(self, prompt_text: str, image_path: str, schema):
        """
        Load the image, make sure it is RGB, and invoke the model with the schema.

        :param prompt_text: Text prompt to send to the VLM
        :param image_path: Path to the image file to process
        :param schema: Pydantic schema class for structured output
        :return: Whatever the model returns for ``(prompt, schema)``
        :raises Exception: Any error from image loading or the model call is
            re-raised unchanged (after optional debug output)
        """
        try:
            # Per the original note, get_image_from_local absolutizes the path
            # and raises if the file is missing (helper not visible here).
            img = get_image_from_local(image_path)
            if img.mode != "RGB":
                img = img.convert("RGB")

            prompt = [prompt_text, Image(img)]
            return self.model(prompt, schema)
        except Exception as e:
            if self.debug:
                import traceback
                print(f"[VLM ERROR] while processing: {image_path}")
                traceback.print_exc()
                print(f"[VLM ERROR] type={type(e).__name__} msg={e}")
            # Re-raise so caller can handle/log too
            raise

    def extract_chart(self, image_path: str) -> Chart:
        """
        Extract structured chart data from an image.

        :param image_path: Path to the chart image file
        :return: Result of the model call made with the ``Chart`` schema
        :raises Exception: If image processing or VLM extraction fails
        """
        return self._call(self._CHART_PROMPT, image_path, Chart)

    def extract_table(self, image_path: str) -> Table:
        """
        Extract structured table data from an image.

        :param image_path: Path to the table image file
        :return: Result of the model call made with the ``Table`` schema
        :raises Exception: If image processing or VLM extraction fails
        """
        return self._call(self._TABLE_PROMPT, image_path, Table)
|
File without changes
|
@@ -0,0 +1,197 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
import os
|
3
|
+
import re
|
4
|
+
from typing import Dict, Any, List, Set
|
5
|
+
import pandas as pd # pip install pandas openpyxl
|
6
|
+
from openpyxl.styles import PatternFill, Font, Alignment
|
7
|
+
from openpyxl.utils import get_column_letter
|
8
|
+
|
9
|
+
# Regex character class matching the characters replaced in sheet names.
_INVALID_SHEET_CHARS = r'[:\\/*?\[\]]' # Excel-invalid characters
# Sheet names are cut to at most this many characters.
_MAX_SHEET_LEN = 31

# Header style: solid green background + white bold font
_HEADER_FILL = PatternFill(fill_type="solid", start_color="FF2E7D32", end_color="FF2E7D32") # #2E7D32
_HEADER_FONT = Font(color="FFFFFFFF", bold=True)
# Header text is centred both ways and allowed to wrap.
_HEADER_ALIGN = Alignment(horizontal="center", vertical="center", wrap_text=True)
|
16
|
+
|
17
|
+
|
18
|
+
def _safe_sheet_name(raw_title: str, taken: Set[str]) -> str:
    """
    Create a safe Excel sheet name from a raw title.

    The title is stripped, characters matching ``_INVALID_SHEET_CHARS`` are
    replaced with ``_``, whitespace runs are collapsed to a single space and
    the result is cut to ``_MAX_SHEET_LEN`` characters. If the name is
    already used, a numeric suffix (``_1``, ``_2``, ...) is appended,
    shortening the base as needed to stay within the length limit.

    Uniqueness is checked case-insensitively because Excel does not allow
    two sheets whose names differ only by letter case.

    :param raw_title: Original title to convert to sheet name
    :param taken: Set of already used sheet names; the chosen name is added to it
    :return: Safe Excel sheet name that doesn't conflict with existing names
    """
    name = (raw_title or "Untitled").strip()
    name = re.sub(_INVALID_SHEET_CHARS, "_", name)
    name = re.sub(r"\s+", " ", name)
    base = name[:_MAX_SHEET_LEN] or "Sheet"

    # Compare lower-cased names: "Sales" and "sales" are the same sheet to Excel.
    used = {existing.lower() for existing in taken}

    candidate = base
    counter = 1
    while candidate.lower() in used:
        suffix = f"_{counter}"
        # Slicing is a no-op when base + suffix already fits.
        candidate = base[:_MAX_SHEET_LEN - len(suffix)] + suffix
        counter += 1

    taken.add(candidate)
    return candidate
|
44
|
+
|
45
|
+
|
46
|
+
def _style_header(ws, ncols: int) -> None:
    """
    Format row 1 of a worksheet as a header row.

    For a positive column count, the panes are frozen at "A2" and the first
    ``ncols`` cells of row 1 receive the module's header fill, font and
    alignment. Nothing happens when ``ncols`` is zero or negative.

    :param ws: OpenPyXL worksheet object to style
    :param ncols: Number of columns in the worksheet
    :return: None
    """
    if ncols <= 0:
        return

    ws.freeze_panes = "A2"
    for column_index in range(1, ncols + 1):
        header_cell = ws.cell(row=1, column=column_index)
        header_cell.fill = _HEADER_FILL
        header_cell.font = _HEADER_FONT
        header_cell.alignment = _HEADER_ALIGN
|
64
|
+
|
65
|
+
|
66
|
+
def _autosize_columns(ws, df: pd.DataFrame) -> None:
    """
    Set worksheet column widths from the DataFrame's content.

    For each column, the width is the longest of the header text and the
    string form of the values in the first 200 rows, plus 2, clamped to the
    range 10..60.

    :param ws: OpenPyXL worksheet object to resize
    :param df: Pandas DataFrame containing the data
    :return: None
    """
    sample_size = min(200, len(df))  # only a prefix is measured, for speed
    has_rows = not df.empty

    for position, column in enumerate(df.columns, start=1):
        widest = len("" if column is None else str(column))
        if has_rows:
            sampled = df.iloc[:sample_size, position - 1].astype(str).values
            widest = max([widest, *(len(text) for text in sampled)])
        letter = get_column_letter(position)
        ws.column_dimensions[letter].width = min(max(10, widest + 2), 60)
|
86
|
+
|
87
|
+
|
88
|
+
def _normalize_data(headers: List[str], rows: List[List]) -> tuple[List[str], List[List]]:
    """
    Make headers and rows agree on a single column count.

    The target width is the larger of the header count and the longest row.
    Missing headers are named ``Column_<n>`` (1-based position) and short
    rows are padded with ``None``. With no rows, the headers are returned
    unchanged together with an empty list.

    :param headers: List of column headers
    :param rows: List of data rows
    :return: Tuple of (normalized_headers, normalized_rows)
    """
    if not rows:
        return headers, []

    widest_row = max(map(len, rows))
    if headers:
        width = max(len(headers), widest_row)
        padded_headers = list(headers)
    else:
        width = widest_row
        padded_headers = []

    # Generated names continue from the first missing 1-based position.
    padded_headers += [f"Column_{n}" for n in range(len(padded_headers) + 1, width + 1)]

    padded_rows = [list(row) + [None] * (width - len(row)) for row in rows]
    return padded_headers, padded_rows
|
124
|
+
|
125
|
+
|
126
|
+
def _item_has_data(item: Dict[str, Any]) -> bool:
    """Return True when *item* has headers or at least one non-blank cell."""
    if item.get("headers"):
        return True
    # A cell counts as data when it is not None and its string form is not
    # blank. Numeric 0 and False therefore count ("0" / "False" are text).
    return any(
        cell is not None and str(cell).strip()
        for row in (item.get("rows") or [])
        for cell in row
    )


def write_structured_excel(excel_path: str, items: List[Dict[str, Any]]) -> str | None:
    """
    Write a list of structured data items into an Excel workbook.

    Each item with headers or non-blank cells becomes its own worksheet
    with a sanitized, unique sheet name, a styled header row and auto-sized
    columns. Header/row width mismatches are reconciled with
    ``_normalize_data``. A failure while processing one item is printed and
    that item is skipped; the remaining items are still written.

    :param excel_path: Path where the Excel file will be saved
    :param items: List of dictionaries, each containing:
        - 'title': Sheet title (optional)
        - 'headers': List of column headers (optional)
        - 'rows': List of data rows (optional)
    :return: Path to the written Excel file, or None when there are no
        items or none of them contains data
    """
    if not items:
        return None

    # Filter out items that have no meaningful data
    valid_items = [item for item in items if _item_has_data(item)]

    if not valid_items:
        print("Warning: No valid items to write to Excel")
        return None

    os.makedirs(os.path.dirname(excel_path) or ".", exist_ok=True)
    taken: Set[str] = set()

    with pd.ExcelWriter(excel_path, engine="openpyxl", mode="w") as writer:
        for item in valid_items:
            try:
                title = item.get("title") or "Untitled"
                headers = item.get("headers") or []
                rows = item.get("rows") or []

                sheet_name = _safe_sheet_name(title, taken)

                # Normalize data to handle mismatched dimensions
                normalized_headers, normalized_rows = _normalize_data(headers, rows)

                if not normalized_rows and not normalized_headers:
                    print(f"Skipping empty item: {title}")
                    continue

                # Create DataFrame with normalized data
                try:
                    df = pd.DataFrame(normalized_rows, columns=normalized_headers)
                except Exception as e:
                    print(f"Error creating DataFrame for '{title}': {e}")
                    # Fallback: write a placeholder sheet rather than dropping the item
                    df = pd.DataFrame([["Error processing data"]], columns=["Message"])

                # Write to Excel
                df.to_excel(writer, sheet_name=sheet_name, index=False)

                # Style header + autosize
                ws = writer.sheets[sheet_name]
                _style_header(ws, ncols=df.shape[1])
                _autosize_columns(ws, df)

            except Exception as e:
                # Best effort: report the item and keep writing the others.
                print(f"Error processing item '{item.get('title', 'Unknown')}': {e}")
                continue

    return excel_path
|
@@ -0,0 +1,42 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import os
|
4
|
+
from PIL import Image
|
5
|
+
from typing import Dict
|
6
|
+
|
7
|
+
from doctra.utils.file_ops import sanitize_filename
|
8
|
+
from doctra.utils.bbox import clip_bbox_to_image
|
9
|
+
from doctra.engines.layout.layout_models import LayoutBox
|
10
|
+
|
11
|
+
def save_box_image(
    page_img: Image.Image,
    box: LayoutBox,
    out_dir: str,
    page_idx: int,
    box_idx: int,
    image_subdirs: Dict[str, str],
) -> str:
    """
    Crop and save a labeled box to the appropriate images/<subdir>/ folder.

    The box coordinates are passed through ``clip_bbox_to_image`` together
    with the page size, the resulting rectangle is cropped from the page and
    written as a JPEG (quality 95) named
    ``page_<page_idx>_<label>_<box_idx>.jpg`` under
    ``<out_dir>/images/<image_subdirs[label]>/``. The target directory is
    created if it does not exist yet.

    :param page_img: PIL Image object of the full page
    :param box: LayoutBox object containing coordinates and label
    :param out_dir: Base output directory for saving images
    :param page_idx: Page index for naming the output file
    :param box_idx: Box index for naming the output file
    :param image_subdirs: Dictionary mapping box labels to subdirectory names
    :return: Absolute file path to the saved image
    :raises KeyError: If ``box.label`` has no entry in ``image_subdirs``
    """
    page_w, page_h = page_img.size
    left, top, right, bottom = clip_bbox_to_image(
        box.x1, box.y1, box.x2, box.y2, page_w, page_h
    )
    crop = page_img.crop((left, top, right, bottom))

    # JPEG cannot store alpha or palette images; convert such crops to RGB
    # instead of letting save() fail. Modes JPEG can hold are left untouched.
    if crop.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
        crop = crop.convert("RGB")

    sub = image_subdirs[box.label]  # e.g., 'figures' | 'charts' | 'tables'
    fname = f"page_{page_idx:03d}_{box.label}_{box_idx:03d}.jpg"
    fpath = os.path.join(out_dir, "images", sub, sanitize_filename(fname))

    # Do not rely on the caller having created images/<sub>/ beforehand.
    os.makedirs(os.path.dirname(fpath), exist_ok=True)
    crop.save(fpath, format="JPEG", quality=95)
    return os.path.abspath(fpath)
|
@@ -0,0 +1,56 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
from typing import List, Optional
|
3
|
+
|
4
|
+
def _esc(cell: object) -> str:
    """
    Escape and clean a cell value for Markdown table formatting.

    ``None`` becomes an empty string, pipe characters are escaped, and every
    line break (``\\r\\n``, ``\\r`` or ``\\n``) is replaced by a single space
    so the cell stays on one table line.

    :param cell: Cell value to escape (can be any object)
    :return: Escaped string safe for Markdown table cells
    """
    text = "" if cell is None else str(cell)
    # Normalise Windows/old-Mac line endings first so no stray "\r" survives.
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    return text.replace("|", r"\|").replace("\n", " ").strip()


def render_markdown_table(
    headers: List[str] | None,
    rows: List[List[str]] | None,
    title: Optional[str] = None,
) -> str:
    """
    Render a Markdown table from headers, rows, and optional title.

    The table width is the larger of the header count and the longest row
    (at least one column), so no cell is dropped when a row is wider than
    the headers. Missing headers are named ``col<n>`` and short rows are
    padded with empty cells. The output ends with a newline.

    :param headers: List of column headers (optional, will be auto-generated if None)
    :param rows: List of data rows, where each row is a list of cell values
    :param title: Optional title to display above the table
    :return: Formatted Markdown table string
    """
    header_cells = list(headers or [])
    # Copy each row to a list so tuples (or a None row) are handled too.
    data_rows = [list(r) if r is not None else [] for r in (rows or [])]

    lines: List[str] = []
    if title:
        lines.append(f"**{title}**")

    # Widen to the longest row instead of truncating data to the header count.
    width = max([len(header_cells), 1] + [len(r) for r in data_rows])
    header_cells += [f"col{i + 1}" for i in range(len(header_cells), width)]

    lines.append("| " + " | ".join(_esc(h) for h in header_cells) + " |")
    lines.append("| " + " | ".join(["---"] * width) + " |")

    for r in data_rows:
        padded = r + [""] * (width - len(r))
        lines.append("| " + " | ".join(_esc(c) for c in padded) + " |")

    lines.append("")  # blank line after table block
    return "\n".join(lines)
|
@@ -0,0 +1,29 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
import os
|
3
|
+
import re
|
4
|
+
from typing import List
|
5
|
+
|
6
|
+
|
7
|
+
def write_markdown(md_lines: List[str], out_dir: str, filename: str = "result.md") -> str:
    """
    Join Markdown fragments and write them to ``<out_dir>/<filename>``.

    The fragments are joined with newlines, surrounding whitespace is
    trimmed, a single trailing newline is added, and any run of three or
    more newlines is reduced to two. The output directory is created when
    missing and the file is written as UTF-8.

    :param md_lines: List of markdown strings to join into a single file
    :param out_dir: Directory where the markdown file will be saved
    :param filename: Name of the markdown file (default: "result.md")
    :return: The absolute path of the written markdown file
    """
    os.makedirs(out_dir, exist_ok=True)

    joined = "\n".join(md_lines).strip() + "\n"
    # At most one blank line between blocks.
    document = re.sub(r"\n{3,}", "\n\n", joined)

    target = os.path.join(out_dir, filename)
    with open(target, "w", encoding="utf-8") as handle:
        handle.write(document)

    return os.path.abspath(target)
|