pdf2mj 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pdf2mj/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """PDF2MJ — convert PDF documents to Markdown and structured JSON."""
2
+
3
# Package version string.
__version__ = "0.1.0"
pdf2mj/chunker.py ADDED
@@ -0,0 +1,58 @@
1
+ """Text chunking for RAG pipelines."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+
7
+ from pdf2mj.models import Chunk, Document
8
+
9
+
10
+ def _estimate_tokens(text: str) -> int:
11
+ return len(re.findall(r"\S+", text))
12
+
13
+
14
def chunk_document(
    document: Document,
    chunk_size: int = 1000,
    overlap: int = 200,
) -> list[Chunk]:
    """Split document text into overlapping word-based chunks with page references.

    Each page is chunked independently, so a chunk never spans two pages.
    Blocks of type ``"image"`` are skipped; the stripped text of the remaining
    blocks is joined with blank lines and split on whitespace into words.

    Args:
        document: Parsed document whose ``pages`` (and their ``blocks``) are read.
        chunk_size: Maximum number of words per chunk; must be at least 1.
        overlap: Number of words shared by consecutive chunks of a page. Values
            outside ``0 .. chunk_size - 1`` are clamped into that range so the
            window always moves forward.

    Returns:
        Chunks in reading order, with ids ``c1``, ``c2``, ... numbered across
        the whole document.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    # An overlap >= chunk_size would restart every window at (or before) the
    # previous start and loop forever; a negative one would skip words.
    effective_overlap = min(max(overlap, 0), chunk_size - 1)
    step = chunk_size - effective_overlap

    chunks: list[Chunk] = []

    for page in document.pages:
        page_text_parts: list[str] = []
        for block in page.blocks:
            if block.type == "image":
                continue
            text = block.text.strip()
            if text:
                page_text_parts.append(text)

        words = "\n\n".join(page_text_parts).split()
        if not words:
            continue

        total = len(words)
        for start in range(0, total, step):
            end = min(start + chunk_size, total)
            chunk_text = " ".join(words[start:end])
            chunks.append(
                Chunk(
                    chunk_id=f"c{len(chunks) + 1}",
                    page=page.page_number,
                    text=chunk_text,
                    tokens=_estimate_tokens(chunk_text),
                )
            )
            # The window reached the end of the page: a further chunk would
            # only repeat the overlapping tail.
            if end >= total:
                break

    return chunks
pdf2mj/cli.py ADDED
@@ -0,0 +1,207 @@
1
+ """PDF2MJ command-line interface."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+ from pathlib import Path
7
+ from typing import Annotated, Optional
8
+
9
+ import typer
10
+ from rich.console import Console
11
+ from rich.panel import Panel
12
+
13
+ from pdf2mj.chunker import chunk_document
14
+ from pdf2mj.console_util import make_console
15
+ from pdf2mj.converter import PDFConversionError, parse_pdf
16
+ from pdf2mj.doctor import run_doctor
17
+ from pdf2mj.json_export import write_chunks, write_json, write_metadata_json
18
+ from pdf2mj.markdown import write_markdown
19
+ from pdf2mj.ocr import ocr_available
20
+ from pdf2mj.welcome import show_welcome, show_welcome_if_needed
21
+
22
# Shared Rich console used by the commands defined in this module.
console = make_console()

# Root application; the ``welcome`` and ``doctor`` commands are registered on it.
app = typer.Typer(
    name="pdf2mj",
    help="Convert PDF documents to Markdown and structured JSON.",
    rich_markup_mode="rich",
    add_completion=False,
    invoke_without_command=True,
    no_args_is_help=False,
)

# Separate application that carries only the ``convert`` command.
convert_app = typer.Typer(
    add_completion=False,
    help="Convert a PDF to Markdown and/or structured JSON.",
)

# First-argument values that are routed to ``app`` instead of ``convert_app``.
_SUBCOMMANDS = frozenset(("welcome", "doctor"))
39
+
40
+
41
@app.callback()
def root(ctx: typer.Context) -> None:
    """Show first-run welcome and help when invoked with no subcommand."""
    # Only act when the root app was invoked without a subcommand; otherwise
    # the selected command does all the work.
    if ctx.invoked_subcommand is None:
        show_welcome_if_needed(console=console)
        typer.echo(ctx.get_help())
48
+
49
+
50
@app.command()
def welcome() -> None:
    """Display the onboarding welcome screen."""
    # The docstring above doubles as the command's help text, so it is kept short.
    # mark_seen=True: presumably records that the screen was displayed (confirm
    # against pdf2mj.welcome).
    show_welcome(mark_seen=True, console=console)
54
+
55
+
56
+ @app.command()
57
+ def doctor(
58
+ output_dir: Annotated[
59
+ Optional[Path],
60
+ typer.Option(
61
+ "--output",
62
+ "-o",
63
+ help="Directory to verify write permissions for.",
64
+ file_okay=False,
65
+ ),
66
+ ] = None,
67
+ ) -> None:
68
+ """Verify environment, dependencies, and optional OCR support."""
69
+ code = run_doctor(console=console, output_dir=output_dir)
70
+ raise typer.Exit(code=code)
71
+
72
+
73
@convert_app.command()
def convert(
    pdf: Annotated[Path, typer.Argument(help="Path to the input PDF file.", exists=True)],
    markdown: Annotated[bool, typer.Option("--markdown/--no-markdown", help="Generate markdown.")] = True,
    json_out: Annotated[bool, typer.Option("--json/--no-json", help="Generate structured JSON.")] = True,
    ocr: Annotated[bool, typer.Option("--ocr", help="OCR scanned PDF pages.")] = False,
    extract_images: Annotated[
        bool, typer.Option("--extract-images", help="Extract embedded images.")
    ] = False,
    chunk_size: Annotated[
        Optional[int], typer.Option("--chunk-size", help="Chunk size (words) for RAG.")
    ] = None,
    chunk_overlap: Annotated[
        int, typer.Option("--chunk-overlap", help="Overlap (words) between chunks.")
    ] = 200,
    output: Annotated[
        Optional[Path], typer.Option("--output", "-o", help="Output directory.")
    ] = None,
    verbose: Annotated[bool, typer.Option("--verbose", "-v", help="Detailed logs.")] = False,
    metadata: Annotated[bool, typer.Option("--metadata", help="Export metadata.json.")] = False,
    tables: Annotated[bool, typer.Option("--tables/--no-tables", help="Extract tables.")] = True,
    figures: Annotated[
        bool, typer.Option("--figures", help="Extract figures (same as --extract-images).")
    ] = False,
    all_outputs: Annotated[
        bool, typer.Option("--all", help="Enable all outputs.")
    ] = False,
) -> None:
    """Convert a PDF to Markdown and/or structured JSON."""
    # The docstring above is what the CLI shows as help text, so the details
    # live in comments:
    #   * outputs are written to --output, or next to the PDF when it is omitted;
    #   * --all switches on markdown, JSON, images, metadata and tables, and
    #     enables chunking with 1000-word chunks unless --chunk-size was given;
    #   * any failure is reported on the console and exits with status 1.
    from rich.markup import escape

    if all_outputs:
        markdown = json_out = extract_images = metadata = tables = True
        if chunk_size is None:
            chunk_size = 1000

    if figures:
        extract_images = True

    if ocr and not ocr_available():
        # The bracket is backslash-escaped: Rich would otherwise parse "[ocr]"
        # as a markup tag and print the hint without the extra name.
        console.print(
            "[yellow]Warning:[/yellow] OCR requested but pytesseract is not installed. "
            "Install with: pip install pdf2mj\\[ocr]"
        )
        ocr = False

    stem = pdf.stem
    out_dir = output if output is not None else pdf.parent
    out_dir.mkdir(parents=True, exist_ok=True)
    images_dir = out_dir / "images"

    def log(msg: str, style: str = "green") -> None:
        console.print(f"[{style}][OK][/] {msg}")

    try:
        if verbose:
            # File names are arbitrary text; escape them so square brackets
            # are printed literally instead of being read as markup.
            console.print(
                Panel.fit(f"Processing [bold]{escape(pdf.name)}[/bold]", title="PDF2MJ")
            )

        document = parse_pdf(
            pdf,
            use_ocr=ocr,
            extract_images=extract_images,
            extract_tables=tables,
            images_output_dir=images_dir if extract_images else None,
        )
        log("PDF Loaded")

        if metadata:
            write_metadata_json(document, out_dir / f"{stem}_metadata.json")
            log("Metadata Extracted")

        if markdown:
            md_path = out_dir / f"{stem}.md"
            write_markdown(document, md_path, pdf_path=pdf, use_pymupdf4llm=True)
            log("Markdown Generated")

        if json_out:
            json_path = out_dir / f"{stem}.json"
            write_json(document, json_path)
            log("JSON Generated")

        if extract_images:
            # Image extraction was requested from parse_pdf above; this only reports it.
            log("Images Extracted")

        if chunk_size is not None:
            chunks = chunk_document(document, chunk_size=chunk_size, overlap=chunk_overlap)
            write_chunks(chunks, out_dir / "chunks.json")
            log(f"Chunks Generated ({len(chunks)} chunks)")

        console.print(
            f"\n[bold green]Done.[/] Output written to [cyan]{escape(str(out_dir.resolve()))}[/]"
        )

    except PDFConversionError as exc:
        # Exception text may contain file names; escape it so a stray bracket
        # cannot raise a markup error while the failure is being reported.
        console.print(f"[bold red]Error:[/bold red] {escape(str(exc))}")
        raise typer.Exit(code=1) from exc
    except Exception as exc:
        console.print(f"[bold red]Unexpected error:[/bold red] {escape(str(exc))}")
        if verbose:
            console.print_exception()
        raise typer.Exit(code=1) from exc
176
+
177
+
178
def _dispatch(argv: list[str] | None = None) -> None:
    """Route argv to subcommands or the convert command.

    Args:
        argv: Command-line arguments without the program name. ``None`` means
            ``sys.argv[1:]``.

    Routing rules:
        * no arguments, or a first argument listed in ``_SUBCOMMANDS``: the
          root ``app`` receives the arguments unchanged;
        * a lone ``--help`` or ``-h``: the root ``app`` prints its help;
        * anything else: ``convert_app`` receives the arguments under the
          program name ``pdf2mj``.
    """
    args = list(sys.argv[1:] if argv is None else argv)

    if not args or args[0] in _SUBCOMMANDS:
        app(args=args)
        return

    if len(args) == 1 and args[0] in ("--help", "-h"):
        # The root app is created without custom help option names, so only
        # the default ``--help`` is registered; translate the short form rather
        # than forwarding ``-h`` and failing with "No such option".
        app(args=["--help"])
        return

    convert_app(args=args, prog_name="pdf2mj")
196
+
197
+
198
def run() -> None:
    """Entry point for the console script: prepare stdio, then dispatch argv."""
    from pdf2mj import console_util

    console_util.ensure_utf8_stdio()
    _dispatch()
204
+
205
+
206
# Allow running this module directly as a script.
if __name__ == "__main__":
    run()
pdf2mj/config.py ADDED
@@ -0,0 +1,70 @@
1
+ """User configuration and onboarding state."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from pathlib import Path
7
+
8
+ from platformdirs import user_config_dir
9
+ from pydantic import BaseModel
10
+
11
# Application name used to look up the per-user config directory.
APP_NAME: str = "pdf2mj"
# Name of the JSON file stored inside that directory.
CONFIG_FILENAME: str = "config.json"
13
+
14
+
15
class UserConfig(BaseModel):
    """Settings and onboarding state kept in the user's config file."""

    # True once the first-run welcome screen has been displayed.
    welcome_shown: bool = False
19
+
20
+
21
def get_config_dir() -> Path:
    """Return the per-user configuration directory for PDF2MJ as a ``Path``."""
    directory = user_config_dir(APP_NAME, appauthor=False)
    return Path(directory)
24
+
25
+
26
def get_config_path() -> Path:
    """Return the location of the config file inside the config directory."""
    return get_config_dir().joinpath(CONFIG_FILENAME)
29
+
30
+
31
def load_config() -> UserConfig:
    """Read the user config from disk, falling back to defaults.

    A default ``UserConfig`` is returned when the file does not exist, cannot
    be read, is not valid JSON, or validation raises ``ValueError``.
    """
    config_file = get_config_path()
    if config_file.exists():
        try:
            raw = json.loads(config_file.read_text(encoding="utf-8"))
            return UserConfig.model_validate(raw)
        except (json.JSONDecodeError, OSError, ValueError):
            # Unreadable or invalid config: fall through to the defaults.
            pass
    return UserConfig()
42
+
43
+
44
def save_config(config: UserConfig) -> None:
    """Write *config* to the config file as indented JSON.

    Missing parent directories are created first; the file is UTF-8 encoded
    and ends with a newline.
    """
    target = get_config_path()
    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(config.model_dump(), indent=2) + "\n"
    target.write_text(serialized, encoding="utf-8")
52
+
53
+
54
def is_welcome_shown() -> bool:
    """Report the ``welcome_shown`` flag of the config loaded from disk."""
    current = load_config()
    return current.welcome_shown
57
+
58
+
59
def mark_welcome_shown() -> None:
    """Set ``welcome_shown`` to True in the stored config."""
    current = load_config()
    current.welcome_shown = True
    save_config(current)
64
+
65
+
66
def reset_welcome_shown() -> None:
    """Set ``welcome_shown`` back to False in the stored config (mainly for tests)."""
    current = load_config()
    current.welcome_shown = False
    save_config(current)
pdf2mj/console_util.py ADDED
@@ -0,0 +1,26 @@
1
+ """Shared Rich console helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+
7
+ from rich.console import Console
8
+
9
+
10
def ensure_utf8_stdio() -> None:
    """Switch stdout and stderr to UTF-8 on Windows; do nothing elsewhere.

    Best effort: a stream without a callable ``reconfigure`` attribute is
    skipped, and any error raised while reconfiguring is ignored.
    """
    if sys.platform == "win32":
        for stream in (sys.stdout, sys.stderr):
            reconfigure = getattr(stream, "reconfigure", None)
            if not callable(reconfigure):
                continue
            try:
                reconfigure(encoding="utf-8")
            except Exception:
                # Deliberately ignored: the stream simply keeps its encoding.
                pass
21
+
22
+
23
def make_console() -> Console:
    """Build the Rich console used by the CLI.

    Standard streams are switched to UTF-8 first (Windows only), then a
    console is created with ``legacy_windows`` disabled.
    """
    ensure_utf8_stdio()
    rich_console = Console(legacy_windows=False)
    return rich_console
pdf2mj/converter.py ADDED
@@ -0,0 +1,197 @@
1
+ """PDF parsing pipeline that builds the common Document model."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from pathlib import Path
7
+
8
+ import fitz
9
+
10
+ from pdf2mj.image_extractor import extract_images_from_page
11
+ from pdf2mj.metadata import extract_metadata
12
+ from pdf2mj.models import Block, Document, Page
13
+ from pdf2mj.ocr import ocr_page
14
+ from pdf2mj.table_extractor import extract_tables_from_page
15
+
16
+
17
class PDFConversionError(Exception):
    """Signal that a PDF could not be opened or processed."""
19
+
20
+
21
+ def _classify_block(
22
+ text: str,
23
+ font_size: float,
24
+ body_size: float,
25
+ is_bold: bool,
26
+ is_mono: bool,
27
+ is_italic: bool,
28
+ ) -> tuple[str, int | None]:
29
+ stripped = text.strip()
30
+ if not stripped:
31
+ return "paragraph", None
32
+
33
+ if is_mono or re.match(r"^(\s{4}|\t)", stripped):
34
+ return "code", None
35
+
36
+ if stripped.startswith(">") or (is_italic and len(stripped) < 200):
37
+ return "quote", None
38
+
39
+ list_pattern = re.compile(
40
+ r"^(\u2022|\u2023|\u25E6|\u2043|\u2219|[-*•]|\d+[.)])\s+",
41
+ re.MULTILINE,
42
+ )
43
+ if list_pattern.match(stripped) or "\n" in stripped and all(
44
+ list_pattern.match(line.strip()) or not line.strip()
45
+ for line in stripped.splitlines()
46
+ if line.strip()
47
+ ):
48
+ return "list", None
49
+
50
+ if font_size >= body_size * 1.35 or (is_bold and font_size >= body_size * 1.15):
51
+ if font_size >= body_size * 1.8:
52
+ return "heading", 1
53
+ if font_size >= body_size * 1.5:
54
+ return "heading", 2
55
+ return "heading", 3
56
+
57
+ return "paragraph", None
58
+
59
+
60
def _parse_text_blocks(page: fitz.Page, block_id_start: int) -> tuple[list[Block], int, bool]:
    """Parse text blocks from a page; returns (blocks, next_id, has_text).

    The page is read through ``page.get_text("dict")``. Entries whose ``type``
    is not 0 are skipped. Within a block, the spans of one line are
    concatenated and the lines are joined with a newline, so words at a line
    break stay separated and multi-line lists keep one item per line.

    The body font size is the mean size of all spans seen on the page (12.0
    when there are none). Each block is classified from its own mean span
    size and from the flags and font name of its first span.

    Args:
        page: Page to extract text from.
        block_id_start: Number used for the id of the first block created.

    Returns:
        ``(blocks, next_id, has_text)``: the created blocks, the first unused
        id number, and whether any non-empty text block was found.
    """
    data = page.get_text("dict", flags=fitz.TEXT_PRESERVE_WHITESPACE)

    font_sizes: list[float] = []
    raw_blocks: list[dict] = []
    for b in data.get("blocks", []):
        if b.get("type") != 0:
            continue

        spans_info: list[dict] = []
        line_texts: list[str] = []
        for line in b.get("lines", []):
            spans = line.get("spans", [])
            line_texts.append("".join(span.get("text", "") for span in spans))
            for span in spans:
                size = span.get("size", 12.0)
                font_sizes.append(size)
                spans_info.append(
                    {
                        "size": size,
                        "flags": span.get("flags", 0),
                        "font": span.get("font", ""),
                    }
                )

        # Joining lines with "\n" keeps the last word of one line apart from
        # the first word of the next.
        text = "\n".join(line_texts).strip()
        if not text:
            continue

        raw_blocks.append(
            {
                "text": text,
                "bbox": list(b.get("bbox", [])),
                "spans": spans_info,
            }
        )

    body_size = sum(font_sizes) / len(font_sizes) if font_sizes else 12.0

    blocks: list[Block] = []
    next_id = block_id_start
    for rb in raw_blocks:
        spans = rb["spans"]
        avg_size = sum(s["size"] for s in spans) / len(spans) if spans else body_size
        flags = spans[0]["flags"] if spans else 0
        font_name = spans[0]["font"].lower() if spans else ""
        is_bold = bool(flags & 2**4)  # bit 4 of the span flags
        is_italic = bool(flags & 2**1)  # bit 1 of the span flags
        is_mono = "mono" in font_name or "courier" in font_name or "consolas" in font_name

        block_type, level = _classify_block(
            rb["text"], avg_size, body_size, is_bold, is_mono, is_italic
        )
        blocks.append(
            Block(
                id=f"blk_{next_id}",
                type=block_type,
                text=rb["text"],
                bbox=rb["bbox"],
                level=level,
            )
        )
        next_id += 1

    return blocks, next_id, bool(raw_blocks)
128
+
129
+
130
def parse_pdf(
    pdf_path: Path,
    *,
    use_ocr: bool = False,
    extract_images: bool = False,
    extract_tables: bool = True,
    images_output_dir: Path | None = None,
) -> Document:
    """Parse a PDF file into the common Document model.

    Args:
        pdf_path: Path of the PDF to read.
        use_ocr: Run OCR on pages where no text block was found.
        extract_images: Extract images; only takes effect when
            ``images_output_dir`` is also given.
        extract_tables: Extract tables from every page.
        images_output_dir: Directory handed to the image extractor.

    Returns:
        A ``Document`` holding the file name, page count, metadata and one
        ``Page`` per PDF page. Within a page, image blocks come first, then
        text (or OCR) blocks, then table blocks.

    Raises:
        PDFConversionError: If the file does not exist, cannot be opened, is
            encrypted, or has no pages.
    """
    if not pdf_path.exists():
        raise PDFConversionError(f"File not found: {pdf_path}")

    try:
        doc = fitz.open(pdf_path)
    except Exception as exc:
        raise PDFConversionError(f"Cannot open PDF (corrupt or invalid): {pdf_path.name}") from exc

    # From here on the document is closed on every exit path, including
    # failures raised by the extractors.
    try:
        if doc.is_encrypted:
            raise PDFConversionError(
                f"PDF is encrypted and cannot be read without a password: {pdf_path.name}"
            )

        page_count = doc.page_count
        if page_count == 0:
            raise PDFConversionError(f"PDF has no pages: {pdf_path.name}")

        metadata = extract_metadata(doc)
        pages: list[Page] = []
        # Block ids are numbered across the whole document; each extractor
        # returns the next unused number.
        global_block_id = 1

        for page_index in range(page_count):
            page = doc[page_index]
            page_number = page_index + 1
            page_blocks: list[Block] = []

            table_blocks: list[Block] = []
            if extract_tables:
                table_blocks, global_block_id = extract_tables_from_page(
                    page, page_number, global_block_id
                )

            text_blocks, global_block_id, has_text = _parse_text_blocks(page, global_block_id)

            if use_ocr and not has_text:
                # No text block was found on this page: use OCR output instead.
                text_blocks, global_block_id = ocr_page(page, global_block_id)

            if extract_images and images_output_dir is not None:
                img_blocks, global_block_id = extract_images_from_page(
                    page, page_number, images_output_dir, global_block_id
                )
                page_blocks.extend(img_blocks)

            page_blocks.extend(text_blocks)
            page_blocks.extend(table_blocks)

            pages.append(Page(page_number=page_number, blocks=page_blocks))
    finally:
        doc.close()

    return Document(
        file_name=pdf_path.name,
        page_count=page_count,
        metadata=metadata,
        pages=pages,
    )