pdf2mj 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdf2mj/__init__.py +3 -0
- pdf2mj/chunker.py +58 -0
- pdf2mj/cli.py +207 -0
- pdf2mj/config.py +70 -0
- pdf2mj/console_util.py +26 -0
- pdf2mj/converter.py +197 -0
- pdf2mj/doctor.py +165 -0
- pdf2mj/image_extractor.py +48 -0
- pdf2mj/json_export.py +57 -0
- pdf2mj/markdown.py +75 -0
- pdf2mj/metadata.py +31 -0
- pdf2mj/models.py +52 -0
- pdf2mj/ocr.py +42 -0
- pdf2mj/table_extractor.py +64 -0
- pdf2mj/welcome.py +147 -0
- pdf2mj-0.1.0.dist-info/METADATA +241 -0
- pdf2mj-0.1.0.dist-info/RECORD +19 -0
- pdf2mj-0.1.0.dist-info/WHEEL +4 -0
- pdf2mj-0.1.0.dist-info/entry_points.txt +2 -0
pdf2mj/__init__.py
ADDED
pdf2mj/chunker.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Text chunking for RAG pipelines."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
from pdf2mj.models import Chunk, Document
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _estimate_tokens(text: str) -> int:
    """Approximate the token count of *text* as its number of non-whitespace runs."""
    return sum(1 for _ in re.finditer(r"\S+", text))
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def chunk_document(
    document: Document,
    chunk_size: int = 1000,
    overlap: int = 200,
) -> list[Chunk]:
    """Split document text into overlapping word-based chunks with page references.

    Each page is chunked on its own, so a chunk never spans two pages. Blocks of
    type ``"image"`` are skipped and pages without any text produce no chunks.
    Chunk ids (``c1``, ``c2``, ...) are numbered continuously across pages.

    Args:
        document: Parsed document whose pages and blocks supply the text.
        chunk_size: Maximum number of whitespace-separated words per chunk.
        overlap: Number of trailing words of one chunk that are repeated at the
            start of the next chunk on the same page.

    Returns:
        The chunks in page order.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        # With overlap >= chunk_size the window start would never advance and
        # the chunking loop would run forever while appending chunks.
        raise ValueError(
            f"overlap must be in the range 0..chunk_size-1 "
            f"(chunk_size={chunk_size}), got {overlap}"
        )

    # Distance between the starts of two consecutive chunks; always >= 1 here.
    stride = chunk_size - overlap
    chunks: list[Chunk] = []

    for page in document.pages:
        stripped_texts = (
            block.text.strip() for block in page.blocks if block.type != "image"
        )
        words = "\n\n".join(part for part in stripped_texts if part).split()

        for start in range(0, len(words), stride):
            end = min(start + chunk_size, len(words))
            chunk_text = " ".join(words[start:end])
            chunks.append(
                Chunk(
                    chunk_id=f"c{len(chunks) + 1}",
                    page=page.page_number,
                    text=chunk_text,
                    tokens=_estimate_tokens(chunk_text),
                )
            )
            # Once a chunk reaches the end of the page, later windows would
            # only repeat its tail, so stop.
            if end >= len(words):
                break

    return chunks
|
pdf2mj/cli.py
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
"""PDF2MJ command-line interface."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Annotated, Optional
|
|
8
|
+
|
|
9
|
+
import typer
|
|
10
|
+
from rich.console import Console
|
|
11
|
+
from rich.panel import Panel
|
|
12
|
+
|
|
13
|
+
from pdf2mj.chunker import chunk_document
|
|
14
|
+
from pdf2mj.console_util import make_console
|
|
15
|
+
from pdf2mj.converter import PDFConversionError, parse_pdf
|
|
16
|
+
from pdf2mj.doctor import run_doctor
|
|
17
|
+
from pdf2mj.json_export import write_chunks, write_json, write_metadata_json
|
|
18
|
+
from pdf2mj.markdown import write_markdown
|
|
19
|
+
from pdf2mj.ocr import ocr_available
|
|
20
|
+
from pdf2mj.welcome import show_welcome, show_welcome_if_needed
|
|
21
|
+
|
|
22
|
+
# Shared console used by every command in this module for user-facing output.
console = make_console()

# Root application: hosts the `welcome` and `doctor` subcommands and shows the
# help text when invoked without a subcommand.
app = typer.Typer(
    name="pdf2mj",
    help="Convert PDF documents to Markdown and structured JSON.",
    rich_markup_mode="rich",
    add_completion=False,
    invoke_without_command=True,
    no_args_is_help=False,
)

# Separate application holding only the conversion command; `_dispatch` routes
# every invocation that is not a root subcommand to it.
convert_app = typer.Typer(
    add_completion=False,
    help="Convert a PDF to Markdown and/or structured JSON.",
)

# First-argument values that `_dispatch` hands to the root application.
_SUBCOMMANDS = frozenset(("welcome", "doctor"))
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@app.callback()
def root(ctx: typer.Context) -> None:
    """Show first-run welcome and help when invoked with no subcommand."""
    # When a subcommand was requested, all output is left to that subcommand.
    if ctx.invoked_subcommand is None:
        show_welcome_if_needed(console=console)
        typer.echo(ctx.get_help())
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@app.command()
def welcome() -> None:
    """Display the onboarding welcome screen."""
    # Explicitly requested, so the screen is shown and flagged as seen.
    show_welcome(mark_seen=True, console=console)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@app.command()
def doctor(
    output_dir: Annotated[
        Optional[Path],
        typer.Option(
            "--output",
            "-o",
            help="Directory to verify write permissions for.",
            file_okay=False,
        ),
    ] = None,
) -> None:
    """Verify environment, dependencies, and optional OCR support."""
    # The value returned by the doctor run is used as the process exit code.
    raise typer.Exit(code=run_doctor(output_dir=output_dir, console=console))
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
@convert_app.command()
def convert(
    pdf: Annotated[Path, typer.Argument(help="Path to the input PDF file.", exists=True)],
    markdown: Annotated[bool, typer.Option("--markdown/--no-markdown", help="Generate markdown.")] = True,
    json_out: Annotated[bool, typer.Option("--json/--no-json", help="Generate structured JSON.")] = True,
    ocr: Annotated[bool, typer.Option("--ocr", help="OCR scanned PDF pages.")] = False,
    extract_images: Annotated[
        bool, typer.Option("--extract-images", help="Extract embedded images.")
    ] = False,
    chunk_size: Annotated[
        Optional[int], typer.Option("--chunk-size", help="Chunk size (words) for RAG.")
    ] = None,
    chunk_overlap: Annotated[
        int, typer.Option("--chunk-overlap", help="Overlap (words) between chunks.")
    ] = 200,
    output: Annotated[
        Optional[Path], typer.Option("--output", "-o", help="Output directory.")
    ] = None,
    verbose: Annotated[bool, typer.Option("--verbose", "-v", help="Detailed logs.")] = False,
    metadata: Annotated[bool, typer.Option("--metadata", help="Export metadata.json.")] = False,
    tables: Annotated[bool, typer.Option("--tables/--no-tables", help="Extract tables.")] = True,
    figures: Annotated[
        bool, typer.Option("--figures", help="Extract figures (same as --extract-images).")
    ] = False,
    all_outputs: Annotated[
        bool, typer.Option("--all", help="Enable all outputs.")
    ] = False,
) -> None:
    """Convert a PDF to Markdown and/or structured JSON."""
    # Local import: escapes text that is interpolated into Rich markup strings.
    from rich.markup import escape

    # --all switches every output on and enables chunking with a default size
    # unless the user chose one explicitly.
    if all_outputs:
        markdown = True
        json_out = True
        extract_images = True
        metadata = True
        tables = True
        if chunk_size is None:
            chunk_size = 1000

    # --figures is an alias for --extract-images.
    if figures:
        extract_images = True

    # Reject chunk settings that cannot make progress before doing any work:
    # a window that does not advance would never finish chunking.
    if chunk_size is not None:
        if chunk_size < 1:
            raise typer.BadParameter("must be at least 1.", param_hint="--chunk-size")
        if not 0 <= chunk_overlap < chunk_size:
            raise typer.BadParameter(
                f"must be between 0 and {chunk_size - 1} (one less than --chunk-size).",
                param_hint="--chunk-overlap",
            )

    if ocr and not ocr_available():
        # The bracket is backslash-escaped so Rich prints "[ocr]" literally
        # instead of consuming it as a markup tag.
        console.print(
            "[yellow]Warning:[/yellow] OCR requested but pytesseract is not installed. "
            "Install with: pip install pdf2mj\\[ocr]"
        )
        ocr = False

    stem = pdf.stem
    out_dir = output if output is not None else pdf.parent
    images_dir = out_dir / "images"

    def log(msg: str, style: str = "green") -> None:
        console.print(f"[{style}][OK][/] {msg}")

    try:
        # Created inside the try so a filesystem error is reported like any
        # other failure instead of surfacing as a raw traceback.
        out_dir.mkdir(parents=True, exist_ok=True)

        if verbose:
            console.print(
                Panel.fit(f"Processing [bold]{escape(pdf.name)}[/bold]", title="PDF2MJ")
            )

        document = parse_pdf(
            pdf,
            use_ocr=ocr,
            extract_images=extract_images,
            extract_tables=tables,
            images_output_dir=images_dir if extract_images else None,
        )
        log("PDF Loaded")

        if metadata:
            write_metadata_json(document, out_dir / f"{stem}_metadata.json")
            log("Metadata Extracted")

        if markdown:
            md_path = out_dir / f"{stem}.md"
            write_markdown(document, md_path, pdf_path=pdf, use_pymupdf4llm=True)
            log("Markdown Generated")

        if json_out:
            json_path = out_dir / f"{stem}.json"
            write_json(document, json_path)
            log("JSON Generated")

        if extract_images:
            # The images were already written while parsing the PDF.
            log("Images Extracted")

        if chunk_size is not None:
            chunks = chunk_document(document, chunk_size=chunk_size, overlap=chunk_overlap)
            write_chunks(chunks, out_dir / "chunks.json")
            log(f"Chunks Generated ({len(chunks)} chunks)")

        console.print(
            f"\n[bold green]Done.[/] Output written to [cyan]{escape(str(out_dir.resolve()))}[/]"
        )

    except PDFConversionError as exc:
        console.print(f"[bold red]Error:[/bold red] {escape(str(exc))}")
        raise typer.Exit(code=1) from exc
    except Exception as exc:
        # Top-level boundary of the command: report the failure, optionally
        # with a traceback, and exit with a non-zero status.
        console.print(f"[bold red]Unexpected error:[/bold red] {escape(str(exc))}")
        if verbose:
            console.print_exception()
        raise typer.Exit(code=1) from exc
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _dispatch(argv: list[str] | None = None) -> None:
    """Send the command line to the root app or to the convert app.

    The root app receives an empty command line, a command line starting with
    a known subcommand, and a lone ``--help`` / ``-h``. Everything else is
    passed to the convert app. ``sys.argv[1:]`` is used when *argv* is ``None``.
    """
    args = list(sys.argv[1:] if argv is None else argv)

    lone_help_flag = len(args) == 1 and args[0] in ("--help", "-h")
    if not args or args[0] in _SUBCOMMANDS or lone_help_flag:
        app(args=args)
        return

    convert_app(args=args, prog_name="pdf2mj")
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def run() -> None:
    """Console script entry point: prepare stdio, then dispatch the command line."""
    from pdf2mj.console_util import ensure_utf8_stdio as _prepare_stdio

    _prepare_stdio()
    _dispatch()


if __name__ == "__main__":
    run()
|
pdf2mj/config.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""User configuration and onboarding state."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from platformdirs import user_config_dir
|
|
9
|
+
from pydantic import BaseModel
|
|
10
|
+
|
|
11
|
+
# Application name passed to platformdirs to locate the per-user config directory.
APP_NAME = "pdf2mj"
# File name of the JSON config file stored inside that directory.
CONFIG_FILENAME = "config.json"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class UserConfig(BaseModel):
    """Persisted user preferences and onboarding flags."""

    # True once the first-run welcome screen has been displayed; False by default.
    welcome_shown: bool = False
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def get_config_dir() -> Path:
    """Return the per-user config directory that platformdirs reports for PDF2MJ."""
    location = user_config_dir(APP_NAME, appauthor=False)
    return Path(location)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def get_config_path() -> Path:
    """Return the full path of the config file inside the config directory."""
    return get_config_dir().joinpath(CONFIG_FILENAME)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def load_config() -> UserConfig:
    """Load config from disk, returning defaults if missing, unreadable, or invalid.

    Returns:
        The validated config read from the config file, or a default
        ``UserConfig`` when the file does not exist, cannot be read or decoded,
        is not valid JSON, or fails model validation.
    """
    path = get_config_path()
    try:
        # Read directly instead of checking for existence first: a missing or
        # inaccessible file raises OSError, which is handled below, and there
        # is no window between the check and the read.
        raw = path.read_text(encoding="utf-8")
        return UserConfig.model_validate(json.loads(raw))
    except (json.JSONDecodeError, OSError, ValueError):
        return UserConfig()
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def save_config(config: UserConfig) -> None:
    """Write *config* to the config file as indented JSON, creating parent directories."""
    target = get_config_path()
    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(config.model_dump(), indent=2)
    target.write_text(serialized + "\n", encoding="utf-8")
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def is_welcome_shown() -> bool:
    """Report whether the persisted config says the welcome screen was already shown."""
    current = load_config()
    return current.welcome_shown
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def mark_welcome_shown() -> None:
    """Set the welcome-shown flag in the stored config and save it."""
    state = load_config()
    state.welcome_shown = True
    save_config(state)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def reset_welcome_shown() -> None:
    """Clear the welcome-shown flag in the stored config and save it (mainly for tests)."""
    state = load_config()
    state.welcome_shown = False
    save_config(state)
|
pdf2mj/console_util.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""Shared Rich console helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
from rich.console import Console
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def ensure_utf8_stdio() -> None:
    """Switch stdout and stderr to UTF-8 on Windows; do nothing on other platforms.

    Streams without a callable ``reconfigure`` attribute are skipped, and any
    error raised while reconfiguring is ignored.
    """
    if sys.platform == "win32":
        for stream in (sys.stdout, sys.stderr):
            reconfigure = getattr(stream, "reconfigure", None)
            if not callable(reconfigure):
                continue
            try:
                reconfigure(encoding="utf-8")
            except Exception:
                # Best effort only: keep the stream's current encoding.
                pass
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def make_console() -> Console:
    """Build the Rich console after attempting to put stdio into UTF-8 mode."""
    ensure_utf8_stdio()
    rich_console = Console(legacy_windows=False)
    return rich_console
|
pdf2mj/converter.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
"""PDF parsing pipeline that builds the common Document model."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import fitz
|
|
9
|
+
|
|
10
|
+
from pdf2mj.image_extractor import extract_images_from_page
|
|
11
|
+
from pdf2mj.metadata import extract_metadata
|
|
12
|
+
from pdf2mj.models import Block, Document, Page
|
|
13
|
+
from pdf2mj.ocr import ocr_page
|
|
14
|
+
from pdf2mj.table_extractor import extract_tables_from_page
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class PDFConversionError(Exception):
    """Signals that a PDF could not be processed.

    The parser raises it for a missing file, a file that cannot be opened,
    an encrypted document, and a document without pages.
    """
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# Leading list marker: a bullet glyph, "-" or "*", or a number followed by
# "." or ")", then at least one whitespace character.
_LIST_ITEM_RE = re.compile(
    r"^(\u2022|\u2023|\u25E6|\u2043|\u2219|[-*•]|\d+[.)])\s+",
    re.MULTILINE,
)


def _classify_block(
    text: str,
    font_size: float,
    body_size: float,
    is_bold: bool,
    is_mono: bool,
    is_italic: bool,
) -> tuple[str, int | None]:
    """Heuristically classify a text block by its content and font properties.

    The checks run in this order; the first one that applies wins:
    code, quote, list, heading, paragraph.

    Args:
        text: Raw text of the block.
        font_size: Font size of the block.
        body_size: Reference body font size that ``font_size`` is compared with.
        is_bold: Whether the block is set in bold.
        is_mono: Whether the block uses a monospaced font.
        is_italic: Whether the block is set in italics.

    Returns:
        A ``(block_type, level)`` pair. ``block_type`` is one of ``"code"``,
        ``"quote"``, ``"list"``, ``"heading"`` or ``"paragraph"``. ``level`` is
        the heading level (1-3) for headings and ``None`` otherwise.
    """
    stripped = text.strip()
    if not stripped:
        return "paragraph", None

    # Indentation has to be tested on the raw text: the stripped text can
    # never start with whitespace.
    if is_mono or re.match(r"( {4}|\t)", text):
        return "code", None

    if stripped.startswith(">") or (is_italic and len(stripped) < 200):
        return "quote", None

    # A block is a list when it starts with a list marker. A separate
    # "every line starts with a marker" test would add nothing, because it
    # can only hold when the first line already matches.
    if _LIST_ITEM_RE.match(stripped):
        return "list", None

    is_heading = font_size >= body_size * 1.35 or (
        is_bold and font_size >= body_size * 1.15
    )
    if is_heading:
        # Larger size ratios map to more prominent heading levels.
        if font_size >= body_size * 1.8:
            return "heading", 1
        if font_size >= body_size * 1.5:
            return "heading", 2
        return "heading", 3

    return "paragraph", None
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _parse_text_blocks(page: fitz.Page, block_id_start: int) -> tuple[list[Block], int, bool]:
    """Parse the text blocks of a page into classified ``Block`` objects.

    Args:
        page: Page whose text is read through ``page.get_text("dict", ...)``.
        block_id_start: Number used for the id of the first block created here.

    Returns:
        A tuple ``(blocks, next_id, has_text)``: the created blocks, the first
        block number not used yet, and whether the page yielded any non-empty
        text block.
    """
    data = page.get_text("dict", flags=fitz.TEXT_PRESERVE_WHITESPACE)

    # Sizes of all spans on the page; their mean serves as the body font size.
    font_sizes: list[float] = []
    raw_blocks: list[dict] = []

    for b in data.get("blocks", []):
        # Only entries of type 0 are processed (text blocks in PyMuPDF's
        # dict output); everything else is skipped.
        if b.get("type") != 0:
            continue

        spans_info: list[dict] = []
        line_texts: list[str] = []
        for line in b.get("lines", []):
            span_texts: list[str] = []
            for span in line.get("spans", []):
                span_texts.append(span.get("text", ""))
                size = span.get("size", 12.0)
                font_sizes.append(size)
                spans_info.append(
                    {
                        "size": size,
                        "flags": span.get("flags", 0),
                        "font": span.get("font", ""),
                    }
                )
            line_texts.append("".join(span_texts))

        # Spans of one line belong together, but separate lines must stay
        # separated: without the newline the last word of a line would be
        # glued to the first word of the next one.
        text = "\n".join(line_texts).strip()
        if not text:
            continue

        raw_blocks.append(
            {
                "text": text,
                "bbox": list(b.get("bbox", [])),
                "spans": spans_info,
            }
        )

    body_size = sum(font_sizes) / len(font_sizes) if font_sizes else 12.0

    blocks: list[Block] = []
    next_id = block_id_start
    for rb in raw_blocks:
        spans = rb["spans"]
        avg_size = sum(s["size"] for s in spans) / len(spans) if spans else body_size
        # Style flags and font name are taken from the first span of the block.
        flags = spans[0]["flags"] if spans else 0
        font_name = spans[0]["font"].lower() if spans else ""
        is_bold = bool(flags & 2**4)
        is_italic = bool(flags & 2**1)
        is_mono = any(marker in font_name for marker in ("mono", "courier", "consolas"))

        block_type, level = _classify_block(
            rb["text"], avg_size, body_size, is_bold, is_mono, is_italic
        )
        blocks.append(
            Block(
                id=f"blk_{next_id}",
                type=block_type,
                text=rb["text"],
                bbox=rb["bbox"],
                level=level,
            )
        )
        next_id += 1

    return blocks, next_id, bool(blocks)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def parse_pdf(
    pdf_path: Path,
    *,
    use_ocr: bool = False,
    extract_images: bool = False,
    extract_tables: bool = True,
    images_output_dir: Path | None = None,
) -> Document:
    """Parse a PDF file into the common Document model.

    On every page the blocks are collected in this order: images, text blocks
    (or OCR blocks), tables. Block numbers are shared across all pages.

    Args:
        pdf_path: Path of the PDF file to read.
        use_ocr: Run OCR on pages on which no text block was found.
        extract_images: Extract images; only effective together with
            ``images_output_dir``.
        extract_tables: Extract tables from every page.
        images_output_dir: Directory passed to the image extractor.

    Returns:
        The parsed document with file name, page count, metadata and pages.

    Raises:
        PDFConversionError: If the file does not exist, cannot be opened,
            is encrypted, or has no pages.
    """
    if not pdf_path.exists():
        raise PDFConversionError(f"File not found: {pdf_path}")

    try:
        doc = fitz.open(pdf_path)
    except Exception as exc:
        raise PDFConversionError(f"Cannot open PDF (corrupt or invalid): {pdf_path.name}") from exc

    # From here on the document is closed on every exit path, including
    # errors raised by one of the extractors while processing a page.
    try:
        if doc.is_encrypted:
            raise PDFConversionError(
                f"PDF is encrypted and cannot be read without a password: {pdf_path.name}"
            )

        if doc.page_count == 0:
            raise PDFConversionError(f"PDF has no pages: {pdf_path.name}")

        metadata = extract_metadata(doc)
        pages: list[Page] = []
        global_block_id = 1

        for page_index in range(doc.page_count):
            page = doc[page_index]
            page_number = page_index + 1
            page_blocks: list[Block] = []

            table_blocks: list[Block] = []
            if extract_tables:
                table_blocks, global_block_id = extract_tables_from_page(
                    page, page_number, global_block_id
                )

            text_blocks, global_block_id, has_text = _parse_text_blocks(page, global_block_id)

            # OCR is only a fallback for pages without any text block.
            if use_ocr and not has_text:
                text_blocks, global_block_id = ocr_page(page, global_block_id)

            if extract_images and images_output_dir is not None:
                img_blocks, global_block_id = extract_images_from_page(
                    page, page_number, images_output_dir, global_block_id
                )
                page_blocks.extend(img_blocks)

            page_blocks.extend(text_blocks)
            page_blocks.extend(table_blocks)

            pages.append(Page(page_number=page_number, blocks=page_blocks))

        page_count = doc.page_count
    finally:
        doc.close()

    return Document(
        file_name=pdf_path.name,
        page_count=page_count,
        metadata=metadata,
        pages=pages,
    )
|