doctra 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. doctra/__init__.py +19 -0
  2. doctra/cli/__init__.py +27 -0
  3. doctra/cli/main.py +856 -0
  4. doctra/cli/utils.py +340 -0
  5. doctra/engines/__init__.py +0 -0
  6. doctra/engines/layout/__init__.py +0 -0
  7. doctra/engines/layout/layout_models.py +90 -0
  8. doctra/engines/layout/paddle_layout.py +225 -0
  9. doctra/engines/ocr/__init__.py +4 -0
  10. doctra/engines/ocr/api.py +36 -0
  11. doctra/engines/ocr/path_resolver.py +48 -0
  12. doctra/engines/ocr/pytesseract_engine.py +76 -0
  13. doctra/engines/vlm/__init__.py +0 -0
  14. doctra/engines/vlm/outlines_types.py +31 -0
  15. doctra/engines/vlm/provider.py +58 -0
  16. doctra/engines/vlm/service.py +117 -0
  17. doctra/exporters/__init__.py +0 -0
  18. doctra/exporters/excel_writer.py +197 -0
  19. doctra/exporters/image_saver.py +42 -0
  20. doctra/exporters/markdown_table.py +56 -0
  21. doctra/exporters/markdown_writer.py +29 -0
  22. doctra/parsers/__init__.py +6 -0
  23. doctra/parsers/layout_order.py +16 -0
  24. doctra/parsers/structured_pdf_parser.py +434 -0
  25. doctra/parsers/table_chart_extractor.py +283 -0
  26. doctra/utils/__init__.py +0 -0
  27. doctra/utils/bbox.py +18 -0
  28. doctra/utils/constants.py +8 -0
  29. doctra/utils/file_ops.py +26 -0
  30. doctra/utils/io_utils.py +10 -0
  31. doctra/utils/ocr_utils.py +20 -0
  32. doctra/utils/pdf_io.py +19 -0
  33. doctra/utils/quiet.py +13 -0
  34. doctra/utils/structured_utils.py +49 -0
  35. doctra/version.py +2 -0
  36. doctra-0.1.0.dist-info/METADATA +626 -0
  37. doctra-0.1.0.dist-info/RECORD +40 -0
  38. doctra-0.1.0.dist-info/WHEEL +5 -0
  39. doctra-0.1.0.dist-info/licenses/LICENSE +201 -0
  40. doctra-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,48 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import platform
5
+ import shutil
6
+ from typing import Optional
7
+
8
def resolve_tesseract_cmd(tesseract_cmd: Optional[str] = None) -> Optional[str]:
    """
    Best-effort discovery of the Tesseract executable.

    Candidates are tried in priority order; the first usable one is returned:
    1. Explicitly provided path or command name
    2. TESSERACT_CMD environment variable
    3. ``tesseract`` on the system PATH
    4. Common installation paths for the current platform

    A candidate from (1) or (2) is usable when it names an existing *file*
    (``~`` is expanded) or when ``shutil.which`` can resolve it, which lets
    callers pass a bare command name such as ``"tesseract"``. Directories are
    never accepted as an executable.

    :param tesseract_cmd: Optional explicit path (or command name) of tesseract
    :return: Resolved path to tesseract executable, or None if not found
    """

    def _usable(cmd: Optional[str]) -> Optional[str]:
        # Resolve one user-supplied candidate, or None when it cannot be used.
        if not cmd:
            return None
        expanded = os.path.expanduser(cmd)
        if os.path.isfile(expanded):
            return expanded
        # Falls back to PATH lookup for bare names; returns None for directories.
        return shutil.which(cmd)

    for user_choice in (tesseract_cmd, os.getenv("TESSERACT_CMD")):
        resolved = _usable(user_choice)
        if resolved:
            return resolved

    on_path = shutil.which("tesseract")
    if on_path:
        return on_path

    system = platform.system()
    if system == "Windows":
        candidates = [
            r"C:\Program Files\Tesseract-OCR\tesseract.exe",
            r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe",
        ]
    elif system == "Darwin":
        candidates = ["/opt/homebrew/bin/tesseract", "/usr/local/bin/tesseract"]
    else:  # Linux/Unix
        candidates = ["/usr/bin/tesseract", "/usr/local/bin/tesseract"]

    # isfile (not exists) so a directory with a matching name is not returned.
    return next((c for c in candidates if os.path.isfile(c)), None)
@@ -0,0 +1,76 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Optional
4
+ from PIL import Image
5
+ import pytesseract
6
+
7
+ from .path_resolver import resolve_tesseract_cmd
8
+
9
+
10
class PytesseractOCREngine:
    """
    Thin pytesseract-backed OCR engine for already-cropped images.

    Takes a PIL image (for example a text block cut out after layout
    detection) and returns the recognized text. The Tesseract options are
    fixed at construction time and reused for every call.

    :param tesseract_cmd: Optional path to tesseract executable
    :param lang: OCR language code (default: "eng")
    :param psm: Tesseract page segmentation mode (default: 4)
    :param oem: Tesseract OCR engine mode (default: 3)
    :param extra_config: Additional Tesseract configuration string (default: "")
    """

    def __init__(
        self,
        tesseract_cmd: Optional[str] = None,
        lang: str = "eng",
        psm: int = 4,
        oem: int = 3,
        extra_config: str = "",
    ):
        """
        Store the OCR options and point pytesseract at the Tesseract binary.

        :param tesseract_cmd: Optional path to tesseract executable
        :param lang: OCR language code (default: "eng")
        :param psm: Tesseract page segmentation mode (default: 4)
        :param oem: Tesseract OCR engine mode (default: 3)
        :param extra_config: Additional Tesseract configuration string (default: "")
        """
        resolved = resolve_tesseract_cmd(tesseract_cmd)
        if resolved:
            # NOTE: this is a module-level pytesseract setting, so it is
            # shared by every engine instance in the process.
            pytesseract.pytesseract.tesseract_cmd = resolved
        # When nothing was resolved, pytesseract is left untouched and will
        # report the missing binary itself on the first OCR call.

        self.lang, self.psm, self.oem = lang, psm, oem
        self.extra_config = (extra_config or "").strip()

    def recognize(self, image: Image.Image) -> str:
        """
        Run OCR on a PIL image and return the recognized text.

        :param image: PIL Image object to perform OCR on
        :return: Extracted text with leading/trailing whitespace removed
        :raises TypeError: If the input is not a PIL Image object
        """
        if not isinstance(image, Image.Image):
            raise TypeError("PytesseractOCREngine expects a PIL.Image.Image as input.")

        flags = f"--psm {self.psm} --oem {self.oem}"
        config = f"{flags} {self.extra_config}" if self.extra_config else flags

        raw_text = pytesseract.image_to_string(image, lang=self.lang, config=config)
        return raw_text.strip()
File without changes
@@ -0,0 +1,31 @@
1
+ from pydantic import BaseModel
2
+
3
class Chart(BaseModel):
    """
    Tabular form of a chart, used as the structured-output schema for the VLM.

    The chart is flattened into a table: one list of column headers plus a
    list of rows, where every cell is kept as a string.

    :param title: Title or caption of the chart
    :param headers: Column headers for the chart data
    :param rows: Data rows containing the chart values
    """

    title: str
    headers: list[str]
    rows: list[list[str]]
17
+
18
class Table(BaseModel):
    """
    Tabular data read from a table image, used as the structured-output
    schema for the VLM.

    Holds one list of column headers plus a list of rows, where every cell
    is kept as a string.

    :param title: Title or caption of the table
    :param headers: Column headers for the table data
    :param rows: Data rows containing the table values
    """

    title: str
    headers: list[str]
    rows: list[list[str]]
@@ -0,0 +1,58 @@
1
+ from __future__ import annotations
2
+
3
+ # --- keep these imports to match your snippet style ---
4
+ import io
5
+ import PIL
6
+ import openai
7
+ import outlines
8
+ from pydantic import BaseModel
9
+ from google.genai import Client
10
+ from outlines.inputs import Image
11
+ # ------------------------------------------------------
12
+
13
def make_model(
    vlm_provider: str | None = "gemini",
    vlm_model: str | None = None,
    *,
    api_key: str | None = None,
):
    """
    Build an Outlines model bound to a single VLM backend.

    The provider name is matched case-insensitively and falls back to
    "gemini" when empty. When no model name is given, a provider-specific
    default is used ("gemini-1.5-flash-latest" or "gpt-4o").

    :param vlm_provider: VLM provider to use ("gemini" or "openai", default: "gemini")
    :param vlm_model: Model name to use (defaults to provider-specific defaults)
    :param api_key: API key for the VLM provider (required for both Gemini and OpenAI)
    :return: Configured Outlines model instance
    :raises ValueError: If provider is unsupported or API key is missing
    """
    provider = (vlm_provider or "gemini").lower()

    if provider == "gemini":
        if not api_key:
            raise ValueError("Gemini provider requires api_key to be passed to make_model(...).")
        model_name = "gemini-1.5-flash-latest" if vlm_model is None else vlm_model
        return outlines.from_gemini(Client(api_key=api_key), model_name)

    if provider == "openai":
        if not api_key:
            raise ValueError("OpenAI provider requires api_key to be passed to make_model(...).")
        model_name = "gpt-4o" if vlm_model is None else vlm_model
        return outlines.from_openai(openai.OpenAI(api_key=api_key), model_name)

    raise ValueError(f"Unsupported provider: {provider}. Use 'gemini' or 'openai'.")
@@ -0,0 +1,117 @@
1
+ from __future__ import annotations
2
+ import os
3
+ from outlines.inputs import Image
4
+
5
+ from ...utils.io_utils import get_image_from_local
6
+ from .outlines_types import Chart, Table
7
+ from .provider import make_model
8
+
9
+
10
class VLMStructuredExtractor:
    """
    Thin service around prompts + Outlines calls for structured data extraction.

    Provides a high-level interface for extracting structured data (charts and
    tables) from image files using a Vision Language Model, with the output
    constrained to the ``Chart`` / ``Table`` schemas.

    Usage:
        vlm = VLMStructuredExtractor(vlm_provider="gemini", api_key="YOUR_KEY", debug=True)
        chart = vlm.extract_chart("/abs/path/chart.jpg")
        table = vlm.extract_table("/abs/path/table.jpg")
    """

    # The prompts are built by implicit string concatenation, so every
    # fragment except the last must end with a space; otherwise two sentences
    # are glued together ("...accurately.The number...").
    _CHART_PROMPT = (
        "Convert the given chart into a table format with headers and rows. "
        "If the title is not present in the image, generate a suitable title. "
        "Ensure that the table represents the data from the chart accurately. "
        "The number of columns in the headers must match the number of columns in each row."
    )
    _TABLE_PROMPT = (
        "Extract the data from the given table in image format. "
        "Provide the headers and rows of the table, ensuring accuracy in the extraction. "
        "If the title is not present in the image, generate a suitable title. "
        "The number of columns in the headers must match the number of columns in each row."
    )

    def __init__(
        self,
        vlm_provider: str = "gemini",
        vlm_model: str | None = None,
        *,
        api_key: str | None = None,
        debug: bool = True,
    ):
        """
        Initialize the VLMStructuredExtractor with provider configuration.

        :param vlm_provider: VLM provider to use ("gemini" or "openai", default: "gemini")
        :param vlm_model: Model name to use (defaults to provider-specific defaults)
        :param api_key: API key for the VLM provider (required for both Gemini and OpenAI)
        :param debug: Whether to print diagnostics when a call fails (default: True)
        """
        self.model = make_model(
            vlm_provider,
            vlm_model,
            api_key=api_key,
        )
        self.debug = debug

    def _call(self, prompt_text: str, image_path: str, schema):
        """
        Load the image, convert it to RGB, and invoke the model with a schema.

        :param prompt_text: Text prompt to send to the VLM
        :param image_path: Path to the image file to process
        :param schema: Pydantic schema class for structured output
        :return: Whatever the model returns for the given prompt and schema
        :raises Exception: If image loading or the VLM call fails (re-raised
            unchanged after optional debug output)
        """
        try:
            # Per the original note, get_image_from_local is expected to
            # absolutize the path and raise if the file is missing - TODO
            # confirm against utils.io_utils.
            img = get_image_from_local(image_path)
            if img.mode != "RGB":
                img = img.convert("RGB")

            prompt = [prompt_text, Image(img)]
            return self.model(prompt, schema)
        except Exception as e:
            if self.debug:
                import traceback

                print(f"[VLM ERROR] while processing: {image_path}")
                traceback.print_exc()
                print(f"[VLM ERROR] type={type(e).__name__} msg={e}")
            # Re-raise so the caller can handle/log too.
            raise

    def extract_chart(self, image_path: str) -> Chart:
        """
        Extract structured chart data from an image.

        Sends the chart prompt together with the image and asks for output
        matching the ``Chart`` schema (title, headers, rows).

        :param image_path: Path to the chart image file
        :return: Result of the model call for the ``Chart`` schema
        :raises Exception: If image processing or VLM extraction fails
        """
        return self._call(self._CHART_PROMPT, image_path, Chart)

    def extract_table(self, image_path: str) -> Table:
        """
        Extract structured table data from an image.

        Sends the table prompt together with the image and asks for output
        matching the ``Table`` schema (title, headers, rows).

        :param image_path: Path to the table image file
        :return: Result of the model call for the ``Table`` schema
        :raises Exception: If image processing or VLM extraction fails
        """
        return self._call(self._TABLE_PROMPT, image_path, Table)
File without changes
@@ -0,0 +1,197 @@
1
+ from __future__ import annotations
2
+ import os
3
+ import re
4
+ from typing import Dict, Any, List, Set
5
+ import pandas as pd # pip install pandas openpyxl
6
+ from openpyxl.styles import PatternFill, Font, Alignment
7
+ from openpyxl.utils import get_column_letter
8
+
9
# Characters Excel does not allow in a worksheet name, and the name length cap.
_INVALID_SHEET_CHARS = r'[:\\/*?\[\]]'
_MAX_SHEET_LEN = 31

# Header row look: solid green (#2E7D32) background, bold white text,
# centered both ways with wrapping enabled.
_HEADER_FILL = PatternFill(
    fill_type="solid",
    start_color="FF2E7D32",
    end_color="FF2E7D32",
)
_HEADER_FONT = Font(color="FFFFFFFF", bold=True)
_HEADER_ALIGN = Alignment(
    horizontal="center",
    vertical="center",
    wrap_text=True,
)
16
+
17
+
18
def _safe_sheet_name(raw_title: str, taken: Set[str]) -> str:
    """
    Create a safe, unique Excel sheet name from a raw title.

    Invalid characters are replaced with ``_``, whitespace runs are collapsed
    to a single space, and the result is cut to the Excel length limit. If
    the name is already in use, a numeric suffix (``_1``, ``_2``, ...) is
    appended, shortening the base when needed to stay within the limit.

    Uniqueness is checked case-insensitively because Excel treats "Sales"
    and "sales" as the same sheet name.

    :param raw_title: Original title to convert to sheet name
    :param taken: Set of already used sheet names; the chosen name is added to it
    :return: Safe Excel sheet name that doesn't conflict with existing names
    """
    name = (raw_title or "Untitled").strip()
    name = re.sub(_INVALID_SHEET_CHARS, "_", name)
    name = re.sub(r"\s+", " ", name)
    base = name[:_MAX_SHEET_LEN] or "Sheet"

    used = {existing.casefold() for existing in taken}
    candidate = base
    counter = 1
    while candidate.casefold() in used:
        suffix = f"_{counter}"
        # Slicing is a no-op when base + suffix already fits the limit.
        candidate = base[:_MAX_SHEET_LEN - len(suffix)] + suffix
        counter += 1

    taken.add(candidate)
    return candidate
44
+
45
+
46
def _style_header(ws, ncols: int) -> None:
    """
    Style the first row of a worksheet as a header and freeze it.

    Applies the module's header fill, font and alignment to cells 1..ncols of
    row 1 and freezes the panes at "A2". Does nothing when ``ncols`` is not
    positive.

    :param ws: OpenPyXL worksheet object to style
    :param ncols: Number of columns in the worksheet
    :return: None
    """
    if ncols <= 0:
        return

    ws.freeze_panes = "A2"
    for column in range(1, ncols + 1):
        header_cell = ws.cell(row=1, column=column)
        header_cell.fill = _HEADER_FILL
        header_cell.font = _HEADER_FONT
        header_cell.alignment = _HEADER_ALIGN
64
+
65
+
66
def _autosize_columns(ws, df: pd.DataFrame) -> None:
    """
    Set each worksheet column width from the header and a sample of values.

    The width is the longest text among the header and the first 200 rows
    (as strings) plus 2, clamped to the range 10..60.

    :param ws: OpenPyXL worksheet object to resize
    :param df: Pandas DataFrame containing the data
    :return: None
    """
    # Only the leading rows are inspected, to keep this cheap on large frames.
    sample = df.iloc[:200]
    has_rows = not df.empty

    for position, column_name in enumerate(df.columns):
        longest = len("" if column_name is None else str(column_name))
        if has_rows:
            texts = sample.iloc[:, position].astype(str).values
            longest = max(longest, max((len(text) for text in texts), default=0))

        letter = get_column_letter(position + 1)
        ws.column_dimensions[letter].width = min(max(10, longest + 2), 60)
86
+
87
+
88
+ def _normalize_data(headers: List[str], rows: List[List]) -> tuple[List[str], List[List]]:
89
+ """
90
+ Normalize headers and rows to ensure consistent dimensions.
91
+
92
+ :param headers: List of column headers
93
+ :param rows: List of data rows
94
+ :return: Tuple of (normalized_headers, normalized_rows)
95
+ """
96
+ if not rows:
97
+ return headers, []
98
+
99
+ # Find the maximum number of columns across all rows
100
+ max_cols = max(len(row) for row in rows) if rows else 0
101
+
102
+ # If we have headers, use them as the basis, otherwise use max columns
103
+ if headers:
104
+ target_cols = max(len(headers), max_cols)
105
+ else:
106
+ target_cols = max_cols
107
+ headers = [f"Column_{i + 1}" for i in range(target_cols)]
108
+
109
+ # Normalize headers: pad with generic names if too short, truncate if too long
110
+ normalized_headers = list(headers)
111
+ while len(normalized_headers) < target_cols:
112
+ normalized_headers.append(f"Column_{len(normalized_headers) + 1}")
113
+ normalized_headers = normalized_headers[:target_cols]
114
+
115
+ # Normalize rows: pad with None if too short, truncate if too long
116
+ normalized_rows = []
117
+ for row in rows:
118
+ normalized_row = list(row)
119
+ while len(normalized_row) < target_cols:
120
+ normalized_row.append(None)
121
+ normalized_rows.append(normalized_row[:target_cols])
122
+
123
+ return normalized_headers, normalized_rows
124
+
125
+
126
+ def write_structured_excel(excel_path: str, items: List[Dict[str, Any]]) -> str | None:
127
+ """
128
+ Write a list of structured data items into an Excel workbook.
129
+
130
+ Each item becomes a separate worksheet with styled headers. The function
131
+ handles sheet name sanitization, header styling, and column autosizing.
132
+ Automatically handles mismatched headers and data columns.
133
+
134
+ :param excel_path: Path where the Excel file will be saved
135
+ :param items: List of dictionaries, each containing:
136
+ - 'title': Sheet title (optional)
137
+ - 'headers': List of column headers (optional)
138
+ - 'rows': List of data rows (optional)
139
+ :return: Path to the written Excel file if successful, None if no items provided
140
+ """
141
+ if not items:
142
+ return None
143
+
144
+ # Filter out items that have no meaningful data
145
+ valid_items = []
146
+ for item in items:
147
+ headers = item.get("headers") or []
148
+ rows = item.get("rows") or []
149
+ # Keep items that have either headers or rows with data
150
+ if headers or (rows and any(
151
+ row for row in rows if any(cell for cell in row if cell is not None and str(cell).strip()))):
152
+ valid_items.append(item)
153
+
154
+ if not valid_items:
155
+ print("Warning: No valid items to write to Excel")
156
+ return None
157
+
158
+ os.makedirs(os.path.dirname(excel_path) or ".", exist_ok=True)
159
+ taken: Set[str] = set()
160
+
161
+ with pd.ExcelWriter(excel_path, engine="openpyxl", mode="w") as writer:
162
+ for item in valid_items:
163
+ try:
164
+ title = item.get("title") or "Untitled"
165
+ headers = item.get("headers") or []
166
+ rows = item.get("rows") or []
167
+
168
+ sheet_name = _safe_sheet_name(title, taken)
169
+
170
+ # Normalize data to handle mismatched dimensions
171
+ normalized_headers, normalized_rows = _normalize_data(headers, rows)
172
+
173
+ if not normalized_rows and not normalized_headers:
174
+ print(f"Skipping empty item: {title}")
175
+ continue
176
+
177
+ # Create DataFrame with normalized data
178
+ try:
179
+ df = pd.DataFrame(normalized_rows, columns=normalized_headers)
180
+ except Exception as e:
181
+ print(f"Error creating DataFrame for '{title}': {e}")
182
+ # Fallback: create a simple DataFrame
183
+ df = pd.DataFrame([["Error processing data"]], columns=["Message"])
184
+
185
+ # Write to Excel
186
+ df.to_excel(writer, sheet_name=sheet_name, index=False)
187
+
188
+ # Style header + autosize
189
+ ws = writer.sheets[sheet_name]
190
+ _style_header(ws, ncols=df.shape[1])
191
+ _autosize_columns(ws, df)
192
+
193
+ except Exception as e:
194
+ print(f"Error processing item '{item.get('title', 'Unknown')}': {e}")
195
+ continue
196
+
197
+ return excel_path
@@ -0,0 +1,42 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from PIL import Image
5
+ from typing import Dict
6
+
7
+ from doctra.utils.file_ops import sanitize_filename
8
+ from doctra.utils.bbox import clip_bbox_to_image
9
+ from doctra.engines.layout.layout_models import LayoutBox
10
+
11
def save_box_image(
    page_img: Image.Image,
    box: LayoutBox,
    out_dir: str,
    page_idx: int,
    box_idx: int,
    image_subdirs: Dict[str, str],
) -> str:
    """
    Crop and save a labeled box to the appropriate images/<subdir>/ folder.

    The box coordinates are clipped to the page bounds, the region is cropped
    from the page image, and the crop is written as a JPEG named
    ``page_<page_idx>_<label>_<box_idx>.jpg`` under
    ``<out_dir>/images/<subdir>/``. The target folder is created if missing.

    :param page_img: PIL Image object of the full page
    :param box: LayoutBox object containing coordinates and label
    :param out_dir: Base output directory for saving images
    :param page_idx: Page index for naming the output file
    :param box_idx: Box index for naming the output file
    :param image_subdirs: Dictionary mapping box labels to subdirectory names
    :return: Absolute file path to the saved image
    :raises KeyError: If ``box.label`` has no entry in ``image_subdirs``
    """
    w, h = page_img.size
    l, t, r, b = clip_bbox_to_image(box.x1, box.y1, box.x2, box.y2, w, h)
    crop = page_img.crop((l, t, r, b))

    # The JPEG writer rejects modes such as RGBA or P; convert those to RGB
    # instead of failing on save.
    if crop.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
        crop = crop.convert("RGB")

    sub = image_subdirs[box.label]  # e.g., 'figures' | 'charts' | 'tables'
    fname = f"page_{page_idx:03d}_{box.label}_{box_idx:03d}.jpg"
    fpath = os.path.join(out_dir, "images", sub, sanitize_filename(fname))

    # Do not rely on the caller having created images/<subdir>/ beforehand.
    os.makedirs(os.path.dirname(fpath), exist_ok=True)
    crop.save(fpath, format="JPEG", quality=95)
    return os.path.abspath(fpath)
@@ -0,0 +1,56 @@
1
+ from __future__ import annotations
2
+ from typing import List, Optional
3
+
4
+ def _esc(cell: object) -> str:
5
+ """
6
+ Escape and clean a cell value for Markdown table formatting.
7
+
8
+ Handles None values, escapes pipe characters, and collapses newlines
9
+ to ensure proper Markdown table formatting.
10
+
11
+ :param cell: Cell value to escape (can be any object)
12
+ :return: Escaped string safe for Markdown table cells
13
+ """
14
+ s = "" if cell is None else str(cell)
15
+ # Escape pipes and collapse newlines for MD
16
+ return s.replace("|", r"\|").replace("\n", " ").strip()
17
+
18
def render_markdown_table(
    headers: List[str] | None,
    rows: List[List[str]] | None,
    title: Optional[str] = None,
) -> str:
    """
    Render a Markdown table from headers, rows, and optional title.

    The table width is the number of headers; when there are no headers it
    is the widest row (at least 1) and ``col1..colN`` headers are generated.
    Every row is padded with empty cells or truncated to that width. Rows
    may be any sequence (list or tuple).

    :param headers: List of column headers (optional, auto-generated if missing)
    :param rows: List of data rows, where each row is a sequence of cell values
    :param title: Optional title rendered in bold above the table
    :return: Formatted Markdown table string, ending with a blank line
    """
    headers = headers or []
    rows = rows or []

    lines: List[str] = []
    if title:
        lines.append(f"**{title}**")

    # Determine width.
    width = len(headers) if headers else max((len(r) for r in rows), default=1)

    # Header row and separator.
    if not headers:
        headers = [f"col{i+1}" for i in range(width)]
    lines.append("| " + " | ".join(_esc(h) for h in headers[:width]) + " |")
    lines.append("| " + " | ".join(["---"] * width) + " |")

    # Data rows (pad/truncate to width). list(r) lets tuple rows through,
    # which would otherwise fail on "tuple + list".
    for r in rows:
        cells = (list(r) + [""] * width)[:width]
        lines.append("| " + " | ".join(_esc(c) for c in cells) + " |")

    lines.append("")  # blank line after table block
    return "\n".join(lines)
@@ -0,0 +1,29 @@
1
+ from __future__ import annotations
2
+ import os
3
+ import re
4
+ from typing import List
5
+
6
+
7
def write_markdown(md_lines: List[str], out_dir: str, filename: str = "result.md") -> str:
    """
    Join Markdown fragments into one document and write it to disk.

    The fragments are joined with newlines, surrounding whitespace is
    trimmed, a single trailing newline is added, and any run of three or
    more newlines is reduced to two. The output directory is created when
    it does not exist and the file is written as UTF-8.

    :param md_lines: List of markdown strings to join into a single file
    :param out_dir: Directory where the markdown file will be saved
    :param filename: Name of the markdown file (default: "result.md")
    :return: The absolute path of the written markdown file
    """
    os.makedirs(out_dir, exist_ok=True)
    target = os.path.join(out_dir, filename)

    body = "\n".join(md_lines).strip()
    # Collapse excessive blank lines after adding the final newline.
    document = re.sub(r"\n{3,}", "\n\n", body + "\n")

    with open(target, "w", encoding="utf-8") as handle:
        handle.write(document)

    return os.path.abspath(target)
@@ -0,0 +1,6 @@
1
+ """Parsers module for Doctra."""
2
+
3
+ from .structured_pdf_parser import StructuredPDFParser
4
+ from .table_chart_extractor import ChartTablePDFParser
5
+
6
+ __all__ = ['StructuredPDFParser', 'ChartTablePDFParser']