tocsmith 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tocsmith/__init__.py ADDED
@@ -0,0 +1,9 @@
1
+ __all__ = [
2
+ "Heading",
3
+ "parse_toc_lines",
4
+ "generate_bookmarks",
5
+ ]
6
+
7
+ from .core import Heading, parse_toc_lines, generate_bookmarks # noqa: E402
8
+
9
+
tocsmith/cli.py ADDED
@@ -0,0 +1,195 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ from pathlib import Path
5
+ from typing import Any, Dict, List, Optional
6
+ import sys
7
+
8
+ from .core import generate_bookmarks, parse_toc_lines
9
+
10
+ try: # Python 3.11+
11
+ import tomllib # type: ignore[attr-defined]
12
+ except ModuleNotFoundError: # Python 3.9-3.10
13
+ try:
14
+ import tomli as tomllib # type: ignore[assignment]
15
+ except ModuleNotFoundError:
16
+ tomllib = None # type: ignore[assignment]
17
+
18
+
19
def parse_args(argv: List[str] | None = None) -> argparse.Namespace:
    """Parse the ``tocsmith`` command line.

    Args:
        argv: Argument strings to parse. ``None`` lets argparse fall back to
            ``sys.argv[1:]``.

    Returns:
        Namespace with the attributes ``pdf``, ``out``, ``min_len``,
        ``page_offset``, ``toc_file`` and ``config``.
    """
    parser = argparse.ArgumentParser(
        prog="tocsmith",
        description="Auto add bookmarks to PDF",
    )
    # (flags, keyword options) pairs, registered in this order so that the
    # generated --help output keeps its layout.
    argument_table = [
        (("pdf",), {"nargs": "?", "help": "Input PDF path"}),
        (("-o", "--out"), {"help": "Output PDF path; default: <name>.bookmarked.pdf"}),
        (("--min-len",), {"type": int, "default": 3, "help": "Minimum heading text length"}),
        (
            ("--page-offset",),
            {"type": int, "default": 0, "help": "Page offset: actual - book page"},
        ),
        (("--toc-file",), {"help": "Path to a text file containing TOC lines"}),
        (
            ("-c", "--config"),
            {"help": "Path to a TOML config file for batch tasks (overrides single-run args)"},
        ),
    ]
    for flags, options in argument_table:
        parser.add_argument(*flags, **options)
    return parser.parse_args(argv)
32
+
33
+
34
def _resolve_relative(base_dir: Path, maybe_path: Optional[str]) -> Optional[Path]:
    """Turn an optional path string into a ``Path`` anchored at ``base_dir``.

    Args:
        base_dir: Directory that relative paths are interpreted against.
        maybe_path: Path string, or ``None``/empty when nothing was configured.

    Returns:
        ``None`` for a missing/empty value, the path unchanged when it is
        already absolute, otherwise ``base_dir / maybe_path`` resolved.
    """
    if maybe_path:
        candidate = Path(maybe_path)
        if candidate.is_absolute():
            return candidate
        return (base_dir / candidate).resolve()
    return None
40
+
41
+
42
def _run_single(
    src: Path,
    out: Optional[Path],
    toc_file: Optional[Path],
    page_offset: int,
    min_len: int,
    toc_text: Optional[str] = None,
) -> int:
    """Bookmark one PDF and return a process exit code.

    The TOC comes from ``toc_text`` when it is non-blank, otherwise from
    ``toc_file``. With neither, the PDF is copied without an outline.

    Args:
        src: Input PDF.
        out: Output PDF; defaults to ``<src stem>.bookmarked.pdf`` next to ``src``.
        toc_file: Optional text file holding TOC lines.
        page_offset: Added to every parsed page number.
        min_len: Lines shorter than this are ignored by the parser.
        toc_text: Optional inline TOC text; takes precedence over ``toc_file``.

    Returns:
        ``0`` on success, ``2`` when ``src`` or ``toc_file`` does not exist.
    """
    if not src.exists():
        print(f"File not found: {src}")
        return 2
    out_path = out if out else src.with_suffix(".bookmarked.pdf")

    headings = []
    if toc_text is not None and toc_text.strip():
        headings = parse_toc_lines(toc_text, page_offset=page_offset, min_len=min_len)
        toc_supplied = True
    elif toc_file:
        toc_path = Path(toc_file)
        # Report a missing TOC file the same way as a missing PDF instead of
        # letting read_text() surface a raw traceback to the CLI user.
        if not toc_path.is_file():
            print(f"TOC file not found: {toc_path}")
            return 2
        # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading BOM,
        # which would otherwise stick to the first heading.
        file_text = toc_path.read_text(encoding="utf-8-sig")
        headings = parse_toc_lines(file_text, page_offset=page_offset, min_len=min_len)
        toc_supplied = True
    else:
        print("No TOC source provided (use --toc-file). Producing a copy without outline.")
        toc_supplied = False

    # Only worth saying when a TOC was supplied but nothing could be parsed;
    # the branch above has already announced the no-TOC case.
    if toc_supplied and not headings:
        print("No headings; output will be a copy without outline.")
    generate_bookmarks(str(src), str(out_path), headings)
    print(f"Wrote: {out_path}")
    return 0
70
+
71
+
72
+ def _run_batch(config_path: Path) -> int:
73
+ '''Run batch tasks from a TOML config file.
74
+
75
+ Config schema (customized):
76
+ [defaults]
77
+ page_offset = 0
78
+ min_len = 3
79
+ input_prefix = "input" # optional; base dir for input files
80
+ output_prefix = "output" # optional; base dir for outputs
81
+ output_suffix = ".bookmarked.pdf" # optional; appended to stem
82
+
83
+ [[tasks]]
84
+ input_file = "book1.pdf" # required; relative to input_prefix
85
+ toc = """...""" # optional inline TOC text
86
+ # Alternatively: toc_file = "toc.txt"
87
+ page_offset = 10 # optional overrides default
88
+ min_len = 2 # optional overrides default
89
+ '''
90
+ if tomllib is None:
91
+ print("Error: TOML support not available. Please install 'tomli' for Python < 3.11.")
92
+ return 2
93
+
94
+ if not config_path.exists():
95
+ print(f"Config file not found: {config_path}")
96
+ return 2
97
+
98
+ with open(config_path, "rb") as f:
99
+ data = tomllib.load(f)
100
+
101
+ base_dir = config_path.parent
102
+ defaults: Dict[str, Any] = data.get("defaults", {}) or {}
103
+ tasks: List[Dict[str, Any]] = data.get("tasks", []) or []
104
+ if not isinstance(tasks, list) or not tasks:
105
+ print("No tasks found in config (expected [[tasks]] array)")
106
+ return 2
107
+
108
+ default_page_offset = int(defaults.get("page_offset", 0) or 0)
109
+ default_min_len = int(defaults.get("min_len", 3) or 3)
110
+ input_prefix = str(defaults.get("input_prefix", "")).strip() or ""
111
+ output_prefix = str(defaults.get("output_prefix", "")).strip() or ""
112
+ output_suffix = (
113
+ str(defaults.get("output_suffix", ".bookmarked.pdf")).strip() or ".bookmarked.pdf"
114
+ )
115
+
116
+ input_base = (base_dir / input_prefix).resolve() if input_prefix else base_dir
117
+ output_base = (base_dir / output_prefix).resolve() if output_prefix else base_dir
118
+
119
+ failures = 0
120
+ for idx, t in enumerate(tasks, start=1):
121
+ input_file_val = t.get("input_file")
122
+ if not input_file_val:
123
+ print(f"[Task {idx}] Skipped: missing 'input_file'")
124
+ failures += 1
125
+ continue
126
+
127
+ # Resolve input file relative to input_base
128
+ src = (input_base / str(input_file_val)).resolve()
129
+
130
+ # Determine output path {output_base}/{stem}{output_suffix}
131
+ try:
132
+ out_stem = Path(str(input_file_val)).stem
133
+ except Exception:
134
+ out_stem = "output"
135
+ out = (output_base / f"{out_stem}{output_suffix}").resolve()
136
+
137
+ # Obtain TOC from inline 'toc' or optional 'toc_file' fallback
138
+ toc_inline: Optional[str] = t.get("toc")
139
+ toc_file = _resolve_relative(base_dir, t.get("toc_file"))
140
+ page_offset = int(t.get("page_offset", default_page_offset) or default_page_offset)
141
+ min_len = int(t.get("min_len", default_min_len) or default_min_len)
142
+
143
+ print(
144
+ f"[Task {idx}] Running: src={src} out={out} "
145
+ f"toc={'inline' if (toc_inline and toc_inline.strip()) else (toc_file or '<none>')} "
146
+ f"offset={page_offset} min_len={min_len}"
147
+ )
148
+ try:
149
+ # Ensure output directory exists
150
+ out.parent.mkdir(parents=True, exist_ok=True)
151
+ code = _run_single(
152
+ src=Path(src),
153
+ out=out,
154
+ toc_file=toc_file,
155
+ page_offset=page_offset,
156
+ min_len=min_len,
157
+ toc_text=toc_inline,
158
+ )
159
+ if code != 0:
160
+ failures += 1
161
+ except Exception as e:
162
+ failures += 1
163
+ print(f"[Task {idx}] Failed: {e}")
164
+
165
+ if failures:
166
+ print(f"Completed with {failures} failure(s)")
167
+ return 1
168
+ print("All tasks completed successfully")
169
+ return 0
170
+
171
+
172
def main(argv: List[str] | None = None) -> int:
    """CLI entry point: dispatch to batch mode or a single-PDF run.

    Args:
        argv: Argument strings; ``None`` means use the process arguments.

    Returns:
        Exit code from ``_run_batch`` when ``--config`` is given, ``2`` when
        neither a PDF nor a config was supplied, otherwise the exit code from
        ``_run_single``.
    """
    options = parse_args(argv)

    # Batch mode wins over any single-run arguments.
    if options.config:
        return _run_batch(Path(options.config))

    if not options.pdf:
        print("Error: either specify a PDF or use --config for batch mode.")
        return 2

    output_path = Path(options.out) if options.out else None
    toc_path = Path(options.toc_file) if options.toc_file else None
    return _run_single(
        src=Path(options.pdf),
        out=output_path,
        toc_file=toc_path,
        page_offset=options.page_offset,
        min_len=options.min_len,
    )
190
+
191
+
192
+ if __name__ == "__main__": # pragma: no cover
193
+ raise SystemExit(main())
194
+
195
+
tocsmith/core.py ADDED
@@ -0,0 +1,122 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ import re
5
+ from typing import Iterable, List, Tuple, Optional
6
+
7
+ from pypdf import PdfReader, PdfWriter
8
+
9
+
10
@dataclass
class Heading:
    """One outline entry: the bookmark text, its target page and nesting depth."""

    # Text shown for the bookmark.
    title: str
    # Target page, 1-based.
    page: int
    # Nesting depth, 1..6 (1 is top level).
    level: int
15
+
16
+
17
def generate_bookmarks(src_pdf: str, out_pdf: str, headings: Iterable[Heading]) -> None:
    """Copy ``src_pdf`` page by page into ``out_pdf`` and attach an outline.

    Args:
        src_pdf: Path of the PDF to read.
        out_pdf: Path of the PDF to write.
        headings: Outline entries in display order; each entry's ``level``
            decides which earlier entry becomes its parent.
    """
    source = PdfReader(src_pdf)
    target = PdfWriter()
    for pdf_page in source.pages:
        target.add_page(pdf_page)

    last_index = len(source.pages) - 1

    # Chain of (level, outline reference) for the entries that can still act
    # as a parent of the next heading.
    ancestors: List[Tuple[int, object]] = []

    for heading in headings:
        # Convert the 1-based page to a 0-based index clamped into the document.
        zero_based = heading.page - 1
        if zero_based > last_index:
            zero_based = last_index
        if zero_based < 0:
            zero_based = 0

        # Drop entries at the same or a deeper level: they cannot be parents.
        while ancestors and ancestors[-1][0] >= heading.level:
            ancestors.pop()
        parent_ref = ancestors[-1][1] if ancestors else None

        item_ref = target.add_outline_item(heading.title, zero_based, parent=parent_ref)
        ancestors.append((heading.level, item_ref))

    with open(out_pdf, "wb") as sink:
        target.write(sink)
37
+
38
+
39
+ # -------------------- TOC parsing utilities --------------------
40
+
41
# Optional leading numbering. Alternatives, in order:
#   "第1章" / "第2部分" style (the two-character suffix "部分" must be tried
#   before the single-character suffixes, otherwise "第1部分" is cut after "部"),
#   dotted numbers such as "1.2" or "1.2.3" with an optional trailing dot,
#   a plain integer with an optional trailing dot ("1" or "1.").
_NUM_PREFIX_RE = re.compile(
    r"^\s*(?P<num>"
    r"第\s*\d+[一二三四五六七八九十百千]*(?:部分|[章节部分编])?"
    r"|(?:\d+\.)+\d+\.?"
    r"|\d+\.?"
    r")?\s*"
)
_TRAILING_PAGE_RE = re.compile(r"(?P<page>\d{1,5})\s*$")
# One or more leading asterisks plus any spacing after them.
_STAR_PREFIX_RE = re.compile(r"^\*+\s*")
# Dot leaders between a title and its page number ("Intro ........ 12").
# Four or more characters are required so a title ending in "..." is kept.
_LEADER_RE = re.compile(r"[\s.·•…]{4,}$")


def _infer_level_from_numbering(num: Optional[str]) -> int:
    """Map a numbering prefix to an outline level in 1..6.

    No numbering, "第…" numbering and a plain integer are level 1; dotted
    numbering is one level per segment ("1.2" is 2, "1.2.3" is 3). A trailing
    dot ("1." or "1.2.") is punctuation and does not add a level.
    """
    if not num:
        return 1
    num = num.strip()
    if num.startswith("第"):
        # "第1章" style => top-level
        return 1
    segments = num.rstrip(".").count(".") + 1
    return min(6, max(1, segments))


def _parse_toc_entry(raw_line: str, page_offset: int = 0) -> Optional[Tuple[str, int, int]]:
    """Parse one TOC line into ``(title, pdf_page, level)``.

    Returns ``None`` when the line does not end in a page number or when
    nothing is left for the title (for example a stray page-number line).
    """
    line = raw_line.strip()

    # Detect and temporarily strip leading asterisk marker(s); they are put
    # back in front of the title without any spacing.
    star_prefix = ""
    m_star = _STAR_PREFIX_RE.match(line)
    if m_star:
        star_prefix = "*" * m_star.group(0).count("*")
        line = line[m_star.end():].lstrip()

    # Extract trailing page digits
    page_m = _TRAILING_PAGE_RE.search(line)
    if not page_m:
        return None
    page_num = int(page_m.group("page"))

    # Everything before the page number, minus dot leaders
    body = _LEADER_RE.sub("", line[: page_m.start()].rstrip())

    # Split off the numbering prefix; the pattern always matches because
    # every part of it is optional, so ``num`` may simply be None.
    num_m = _NUM_PREFIX_RE.match(body)
    numbering = num_m.group("num") if num_m else None
    rest = body[num_m.end():].strip() if num_m else body

    # Keep the numbering in the title ("第1章 基础", "1.1 Intro")
    combined = f"{numbering.strip()} {rest}".strip() if numbering else rest
    title = re.sub(r"\s+", " ", combined)
    if not title:
        return None
    if star_prefix:
        title = f"{star_prefix}{title}"

    level = _infer_level_from_numbering(numbering)
    pdf_page = max(1, page_num + page_offset)
    return title, pdf_page, level


def parse_toc_lines(toc_text: str, page_offset: int = 0, min_len: int = 1) -> List[Heading]:
    """Parse a pasted TOC text into Heading entries.

    - Each line should end with the book page number (digits).
    - Leading numbering like "第1章" or "1.2" is kept in the title and used to
      infer the level.
    - A leading "*" marker is kept, normalized to "*<title>".
    - Dot leaders before the page number are dropped.
    - ``page_offset`` is added to the parsed page number to map to actual PDF
      pages; the result is never less than 1.

    Args:
        toc_text: The TOC, one entry per line.
        page_offset: Actual PDF page minus printed book page.
        min_len: Stripped lines shorter than this are skipped.

    Returns:
        Headings ordered by page. Entries on the same page keep their input
        order, so a section is never moved ahead of the entry that preceded
        it in the TOC.
    """
    headings: List[Heading] = []
    for raw_line in toc_text.splitlines():
        if len(raw_line.strip()) < min_len:
            continue
        entry = _parse_toc_entry(raw_line, page_offset)
        if entry is None:
            continue
        title, pdf_page, level = entry
        headings.append(Heading(title=title, page=pdf_page, level=level))

    # Stable sort on the page alone: sorting on level/title as well would
    # reorder same-page entries and attach sections to the wrong parent.
    headings.sort(key=lambda h: h.page)
    return headings
118
+
119
+
120
+ ## URL/website TOC fetching intentionally removed; only manual text input is supported.
121
+
122
+
tocsmith/gui.py ADDED
@@ -0,0 +1,253 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import threading
5
+ import tkinter as tk
6
+ from tkinter import filedialog, messagebox, ttk
7
+ from tkinter import font as tkfont
8
+ from pathlib import Path
9
+ from typing import Optional
10
+ import platform
11
+ import subprocess
12
+ import os
13
+
14
+ from .core import generate_bookmarks, parse_toc_lines
15
+
16
+
17
+ # Run blocking CPU/IO bound function in thread to keep UI responsive
18
async def run_in_thread(func, *args, **kwargs):
    """Await ``func(*args, **kwargs)`` executed on the loop's default executor.

    Must be awaited from a coroutine running on an event loop; the call is
    handed to that loop's default executor so the loop itself is not blocked.

    Returns:
        Whatever ``func`` returns. An exception raised by ``func`` is
        re-raised at the ``await``.
    """
    # get_running_loop() is the supported way to reach the current loop from
    # inside a coroutine.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, lambda: func(*args, **kwargs))
21
+
22
+
23
class App:
    """Tk front end: choose a PDF, paste TOC text, preview it, write bookmarks.

    Blocking work (TOC parsing, PDF writing) runs on the executor of a
    background asyncio loop. Tk widgets are only touched from the Tk main
    thread: background work hands its result back as a callable placed on
    ``self._ui_queue``, and ``_poll_loop`` (a Tk timer) runs those callables.
    """

    def __init__(self, root: tk.Tk) -> None:
        self.root = root
        self.root.title("TocSmith")
        self.root.geometry("820x640")

        self.input_path: Optional[Path] = None
        self.output_path: Optional[Path] = None

        self._build_ui()
        self._setup_event_loop()

    def _build_ui(self) -> None:
        """Create and lay out all widgets."""
        frm = ttk.Frame(self.root, padding=10)
        frm.pack(fill=tk.BOTH, expand=True)

        # Prominent primary action. The Font object is stored on the instance:
        # a Font that is garbage-collected deletes its underlying Tk font.
        style = ttk.Style(self.root)
        self._primary_font = tkfont.Font(size=12, weight="bold")
        style.configure("Primary.TButton", font=self._primary_font, padding=(10, 12))

        # Input selector
        in_row = ttk.Frame(frm)
        in_row.pack(fill=tk.X)
        ttk.Label(in_row, text="Input PDF:").pack(side=tk.LEFT)
        self.in_var = tk.StringVar()
        self.in_entry = ttk.Entry(in_row, textvariable=self.in_var)
        self.in_entry.pack(side=tk.LEFT, fill=tk.X, expand=True, padx=6)
        ttk.Button(in_row, text="Browse", command=self.choose_input).pack(side=tk.LEFT)

        # Output path
        out_row = ttk.Frame(frm)
        out_row.pack(fill=tk.X, pady=(8, 0))
        ttk.Label(out_row, text="Output PDF:").pack(side=tk.LEFT)
        self.out_var = tk.StringVar()
        self.out_entry = ttk.Entry(out_row, textvariable=self.out_var)
        self.out_entry.pack(side=tk.LEFT, fill=tk.X, expand=True, padx=6)
        ttk.Button(out_row, text="Browse", command=self.choose_output).pack(side=tk.LEFT)
        ttk.Button(out_row, text="Open Folder", command=self.open_output_folder).pack(
            side=tk.LEFT, padx=(6, 0)
        )

        # Offset + Controls
        ctrl = ttk.Frame(frm)
        ctrl.pack(fill=tk.X, pady=10)
        ttk.Label(ctrl, text="Page Offset:").pack(side=tk.LEFT)
        self.offset_var = tk.StringVar(value="0")
        self.offset_entry = ttk.Entry(ctrl, textvariable=self.offset_var, width=6)
        self.offset_entry.pack(side=tk.LEFT, padx=(4, 12))

        # TOC input
        toc_row = ttk.Frame(frm)
        toc_row.pack(fill=tk.BOTH, expand=True)
        left = ttk.Frame(toc_row)
        left.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        right = ttk.Frame(toc_row, width=260)
        right.pack(side=tk.LEFT, fill=tk.Y)

        ttk.Label(left, text="TOC text:").pack(anchor=tk.W)
        self.toc_text = tk.Text(left, height=10)
        self.toc_text.pack(fill=tk.BOTH, expand=True)
        btns = ttk.Frame(left)
        btns.pack(fill=tk.X, pady=4)
        ttk.Button(btns, text="Parse TOC Text", command=self._on_parse_toc_text).pack(
            side=tk.LEFT
        )

        # Tree view for headings
        self.tree = ttk.Treeview(
            right, columns=("title", "page", "level"), show="headings", height=15
        )
        self.tree.heading("title", text="Title")
        self.tree.heading("page", text="Page")
        self.tree.heading("level", text="Level")
        self.tree.column("title", width=160)
        self.tree.pack(fill=tk.BOTH, expand=True)

        ttk.Button(
            frm, text="Generate", command=self._on_generate, style="Primary.TButton"
        ).pack(fill=tk.X, pady=(0, 10))

        self.status_var = tk.StringVar(value="Ready")
        ttk.Label(frm, textvariable=self.status_var).pack(anchor=tk.W, pady=(8, 0))

    def _setup_event_loop(self) -> None:
        """Start the background asyncio loop and the Tk-side result poller."""
        import queue  # local import: only this class needs it

        # Callables queued by background work, to be run on the Tk thread.
        self._ui_queue = queue.SimpleQueue()

        # tkinter mainloop is blocking, so asyncio runs on its own daemon thread.
        self.loop = asyncio.new_event_loop()
        self.loop_thread = threading.Thread(target=self.loop.run_forever, daemon=True)
        self.loop_thread.start()
        self.root.after(50, self._poll_loop)

    def _poll_loop(self) -> None:
        """Run queued UI callbacks on the Tk thread, then re-arm the timer."""
        # Only this method consumes the queue, so a non-empty queue guarantees
        # get_nowait() succeeds. Each callback is scheduled through Tk so an
        # error inside one is reported by Tk and cannot stop the polling.
        while not self._ui_queue.empty():
            self.root.after_idle(self._ui_queue.get_nowait())
        if self.root.winfo_exists():
            self.root.after(50, self._poll_loop)

    def _run_in_background(self, work, on_success, failure_title: str) -> None:
        """Run ``work()`` off the Tk thread and deliver the outcome back to it.

        Args:
            work: Zero-argument callable doing the blocking job; it must not
                touch any widget.
            on_success: Called on the Tk thread with the value ``work`` returned.
            failure_title: Title of the error dialog shown if ``work`` raises.
        """

        async def runner() -> None:
            try:
                result = await run_in_thread(work)
            except Exception as exc:  # boundary: surface every failure to the user
                message = f"{type(exc).__name__}: {exc}"
                self._ui_queue.put(lambda: self._report_failure(failure_title, message))
            else:
                self._ui_queue.put(lambda: on_success(result))

        asyncio.run_coroutine_threadsafe(runner(), self.loop)

    def _report_failure(self, title: str, message: str) -> None:
        """Show a failed background job in the status line and an error dialog."""
        self._set_status(f"{title}: {message}")
        messagebox.showerror(title, message)

    def _set_status(self, text: str) -> None:
        self.status_var.set(text)
        self.root.update_idletasks()

    def _read_offset(self) -> int:
        """Return the page offset from the entry; blank or non-numeric counts as 0."""
        try:
            return int(self.offset_var.get() or 0)
        except ValueError:
            return 0

    def choose_input(self) -> None:
        """Ask for the input PDF and, if no output is set yet, suggest one."""
        path = filedialog.askopenfilename(filetypes=[("PDF", "*.pdf")])
        if path:
            self.input_path = Path(path)
            self.in_var.set(path)
            if not self.out_var.get():
                # Default output to Downloads directory with suffixed name
                downloads = self._get_downloads_dir()
                default_out = downloads / f"{self.input_path.stem}.bookmarked.pdf"
                self.output_path = default_out
                self.out_var.set(str(default_out))

    def choose_output(self) -> None:
        """Ask where to save the bookmarked PDF."""
        # Suggest Downloads as default directory, and a sensible default filename
        downloads = self._get_downloads_dir()
        initialdir = str(downloads)
        initialfile = ""
        if self.input_path:
            initialfile = f"{self.input_path.stem}.bookmarked.pdf"
        elif self.out_var.get():
            current = Path(self.out_var.get())
            initialdir = str(current.parent)
            initialfile = current.name

        path = filedialog.asksaveasfilename(
            defaultextension=".pdf",
            filetypes=[("PDF", "*.pdf")],
            initialdir=initialdir,
            initialfile=initialfile,
        )
        if path:
            self.output_path = Path(path)
            self.out_var.set(path)

    def _clear_tree(self) -> None:
        for item in self.tree.get_children():
            self.tree.delete(item)

    def _populate_tree(self, headings) -> None:
        """Replace the preview rows with ``headings`` (title, page, level)."""
        self._clear_tree()
        for h in headings:
            self.tree.insert("", tk.END, values=(h.title, h.page, h.level))

    def _on_generate(self) -> None:
        """Write the bookmarked PDF from the current form values."""
        # Every widget value is read here, on the Tk thread, before any
        # background work starts.
        src = self.in_var.get()
        dst = self.out_var.get()
        if not src:
            messagebox.showwarning("Missing", "Please choose an input PDF")
            return
        if not dst:
            messagebox.showwarning("Missing", "Please choose an output path")
            return
        text = self.toc_text.get("1.0", tk.END).strip()
        offset = self._read_offset()

        def work():
            # Without TOC text the output is a copy without outline.
            headings = parse_toc_lines(text, offset) if text else []
            generate_bookmarks(src, dst, headings)
            return dst

        def done(written) -> None:
            self._set_status("Done")
            messagebox.showinfo("Success", f"Wrote: {written}")

        self._set_status("Generating…")
        self._run_in_background(work, done, "Generate failed")

    def _on_parse_toc_text(self) -> None:
        """Parse the pasted TOC text and show the result in the preview."""
        text = self.toc_text.get("1.0", tk.END).strip()
        if not text:
            messagebox.showwarning("Empty", "Please paste TOC text first")
            return
        offset = self._read_offset()

        def done(headings) -> None:
            self._populate_tree(headings)
            self._set_status(f"Parsed {len(headings)} entries")

        self._set_status("Parsing TOC…")
        self._run_in_background(lambda: parse_toc_lines(text, offset), done, "Parse failed")

    def _get_downloads_dir(self) -> Path:
        """Return the user's Downloads directory, fallback to home if missing."""
        downloads = Path.home() / "Downloads"
        return downloads if downloads.exists() else Path.home()

    def open_output_folder(self) -> None:
        """Open the output directory in the system file manager.

        If an explicit output path is set, opens its parent directory;
        otherwise opens the Downloads directory.
        """
        target_dir: Path
        try:
            if self.out_var.get():
                target_dir = Path(self.out_var.get()).expanduser().resolve().parent
            else:
                target_dir = self._get_downloads_dir()
        except (OSError, RuntimeError, ValueError):
            # An output path that cannot be expanded/resolved falls back to Downloads.
            target_dir = self._get_downloads_dir()

        if not target_dir.exists():
            messagebox.showwarning("Missing", f"Directory does not exist: {target_dir}")
            return

        system = platform.system().lower()
        try:
            if system == "windows":
                os.startfile(str(target_dir))  # type: ignore[attr-defined]
            elif system == "darwin":
                subprocess.run(["open", str(target_dir)], check=False)
            else:
                subprocess.run(["xdg-open", str(target_dir)], check=False)
        except Exception as e:
            messagebox.showerror("Error", f"Failed to open folder: {e}")
246
+
247
+
248
def main() -> None:  # pragma: no cover
    """GUI entry point: create the Tk window, attach the App, run the main loop."""
    window = tk.Tk()
    App(window)
    window.mainloop()
252
+
253
+
@@ -0,0 +1,165 @@
1
+ from pathlib import Path
2
+ import re
3
+
4
+ import pytest
5
+
6
+ from tocsmith.core import Heading, generate_bookmarks, parse_toc_lines
7
+ from tocsmith import cli
8
+ import textwrap
9
+
10
+
11
@pytest.fixture()
def tmp_pdf(tmp_path: Path) -> Path:
    """Write a one-page blank A4 PDF into ``tmp_path`` and return its path."""
    from pypdf import PdfWriter

    pdf_path = tmp_path / "a.pdf"
    blank = PdfWriter()
    blank.add_blank_page(width=595, height=842)  # A4
    with pdf_path.open("wb") as handle:
        blank.write(handle)
    return pdf_path
22
+
23
+
24
def test_generate_bookmarks_no_headings(tmp_pdf: Path, tmp_path: Path):
    """An empty heading list still produces a non-empty output file."""
    target = tmp_path / "out.pdf"
    generate_bookmarks(str(tmp_pdf), str(target), [])
    assert target.exists()
    assert target.stat().st_size > 0
28
+
29
+
30
def test_generate_bookmarks_with_headings(tmp_pdf: Path, tmp_path: Path):
    """A single level-1 heading on page 1 produces a non-empty output file."""
    target = tmp_path / "out.pdf"
    intro = Heading(title="Intro", page=1, level=1)
    generate_bookmarks(str(tmp_pdf), str(target), [intro])
    assert target.exists()
    assert target.stat().st_size > 0
35
+
36
+
37
def test_no_auto_analysis_copy_only(tmp_pdf: Path):
    """Without headings the output is a readable copy with the same page count."""
    from pypdf import PdfReader

    copy_path = tmp_pdf.with_name("copy.pdf")
    generate_bookmarks(str(tmp_pdf), str(copy_path), [])
    page_count = len(PdfReader(str(copy_path)).pages)
    assert page_count == 1
44
+
45
+
46
def test_parse_toc_lines_basic_offset():
    """The page offset is added to every page; dotted numbering nests deeper."""
    toc = "\n".join(
        (
            "第1章 基础 1",
            "1.1 Scala解释器 3",
            "1.2 声明值和变量 4",
            "2 进阶 10",
        )
    )
    parsed = parse_toc_lines(toc, page_offset=14)
    pages = [entry.page for entry in parsed]
    assert pages == [15, 17, 18, 24]
    # Ensure titles exist and have reasonable levels
    chapter, first_section = parsed[0], parsed[1]
    assert chapter.level == 1
    assert first_section.level >= 2
58
+
59
+
60
def test_parse_toc_lines_robust_trailing_spaces_and_tabs():
    """Tabs and stray spaces around numbering, title and page are tolerated."""
    lines = (
        "第1章 基础\t 1",
        " 1.1\tScala解释器 \t 3 ",
        "附录 A 100",
    )
    parsed = parse_toc_lines("\n".join(lines), page_offset=0)
    assert parsed[0].page == 1
    assert parsed[1].page == 3
    # When no numeric prefix (like "附录 A"), default to level 1
    appendix_hits = [h for h in parsed if h.title.startswith("附录") and h.level == 1]
    assert appendix_hits
71
+
72
+
73
def test_parse_toc_lines_preserve_asterisk_prefix():
    """A leading '*' survives parsing; unstarred lines gain none."""
    lines = (
        "*1.1 subdirectory 12",
        "* 1.2 another subdirectory 13",
        "1.3 normal 14",
    )
    parsed = parse_toc_lines("\n".join(lines), page_offset=0)
    first, second, third = (entry.title for entry in parsed)
    assert first.startswith("*")
    assert "subdirectory" in first
    assert second.startswith("*")
    assert "another subdirectory" in second
    assert not third.startswith("*")
84
+
85
+
86
def test_batch_config_custom_format(tmp_path: Path, monkeypatch):
    """Batch mode resolves input/output paths from the prefixes and parses the inline TOC."""
    # Arrange input/output structure
    input_dir = tmp_path / "input"
    output_dir = tmp_path / "output"
    input_pdf = input_dir / "book1.pdf"
    input_dir.mkdir(parents=True, exist_ok=True)

    # Create a tiny but valid one-page PDF for reading
    from pypdf import PdfWriter

    blank = PdfWriter()
    blank.add_blank_page(width=100, height=100)
    with input_pdf.open("wb") as handle:
        blank.write(handle)

    config_text = textwrap.dedent('''
        [defaults]
        page_offset = 1
        min_len = 1
        input_prefix = "input"
        output_prefix = "output"
        output_suffix = ".bookmarked.pdf"

        [[tasks]]
        input_file = "book1.pdf"
        toc = """
        第一章 绪论 1
        1.1 引言 2
        """
        page_offset = 2
        min_len = 1
    ''').strip()

    config_path = tmp_path / "config.toml"
    config_path.write_text(config_text, encoding="utf-8")

    # Capture calls to generate_bookmarks instead of writing a PDF
    seen = {}

    def record_call(src: str, out: str, headings):
        seen["src"] = Path(src)
        seen["out"] = Path(out)
        seen["headings"] = list(headings)

    monkeypatch.setattr(cli, "generate_bookmarks", record_call)

    # Act
    exit_code = cli._run_batch(config_path)

    # Assert
    assert exit_code == 0
    assert seen["src"].resolve() == input_pdf.resolve()
    expected_out = (output_dir / "book1.bookmarked.pdf").resolve()
    assert seen["out"].resolve() == expected_out
    assert len(seen["headings"]) == 2
138
+
139
+
140
def test_parse_toc_lines_preserve_numbering_prefix_in_title():
    """Numbering such as "第1章", "1.1" and "2" stays at the front of the title."""
    lines = (
        "第1章 计算机系统概述 1",
        "1.1 操作系统的基本概念 2",
        "2 其他章节 10",
    )
    titles = [h.title for h in parse_toc_lines("\n".join(lines), page_offset=0)]

    def has_title(prefix: str, fragment: str) -> bool:
        return any(t.startswith(prefix) and fragment in t for t in titles)

    # Ensure numbering like "第1章" and "1.1" are preserved in the final title
    assert has_title("第1章 ", "计算机系统概述")
    assert has_title("1.1 ", "操作系统的基本概念")
    assert has_title("2 ", "其他章节")
152
+
153
+
154
def test_parse_toc_lines_preserve_numbering_with_asterisk():
    """The star goes directly before the numbering, and the numbering is kept."""
    lines = (
        "*1.1 星标小节 12",
        "* 2 星标章节 13",
    )
    titles = [h.title for h in parse_toc_lines("\n".join(lines), page_offset=0)]

    def has_title(prefix: str, fragment: str) -> bool:
        return any(t.startswith(prefix) and fragment in t for t in titles)

    # Star prefix should precede the numbering, and numbering should remain
    assert has_title("*1.1 ", "星标小节")
    assert has_title("*2 ", "星标章节")
164
+
165
+
@@ -0,0 +1,155 @@
1
+ Metadata-Version: 2.4
2
+ Name: tocsmith
3
+ Version: 0.1.0
4
+ Summary: Create PDF bookmarks automatically using heuristics, with CLI and async tkinter GUI
5
+ Author-email: Wesley Yang <yxnian@outlook.com>
6
+ Project-URL: Homepage, https://github.com/wesleyel/pdf-bookmark
7
+ Project-URL: Source, https://github.com/wesleyel/pdf-bookmark
8
+ Requires-Python: >=3.9
9
+ Description-Content-Type: text/markdown
10
+ Requires-Dist: pypdf>=4.2.0
11
+ Requires-Dist: tomli>=2.0.1; python_version < "3.11"
12
+ Provides-Extra: dev
13
+ Requires-Dist: pytest>=8.2; extra == "dev"
14
+ Requires-Dist: pytest-cov>=5.0; extra == "dev"
15
+ Requires-Dist: ruff>=0.5.0; extra == "dev"
16
+ Requires-Dist: mypy>=1.10.0; extra == "dev"
17
+
18
+ # TocSmith
19
+
20
+ 为 PDF 添加目录书签的实用工具,支持命令行与简易 GUI。通过“手动粘贴目录文本 + 页码偏移”的方式生成 PDF 书签(大纲/Outline)。
21
+
22
+ - 运行环境:Python 3.9+
23
+ - 依赖:pypdf(写书签)
24
+ - 提供方式:CLI、Tk GUI、Python API
25
+
26
+ ## 功能概览
27
+ - 手动粘贴目录文本(每行以书中页码结尾),自动解析标题、页码与层级(1..6)
28
+ - 支持页码偏移(实际页码 - 书籍页码),用于扫描件/前置页差异
29
+ - 编号前缀会被保留到标题中:如 `第1章`、`1.1` 将出现在最终书签标题里
30
+ - 支持行首星号标记:允许输入 `*1.1 Title` 或 `* 1.1 Title`,输出统一为 `*1.1 Title`
31
+ - 将条目以父子层级写入 PDF 书签
32
+ - 提供 CLI 与 GUI;亦可通过 Python API 使用
33
+
34
+ ## 快速开始
35
+
36
+ ### 安装与运行(uv 推荐)
37
+ 本仓库使用 uv 管理与分发工具。
38
+
39
+ 1) 通过 uv 安装命令行工具(推荐):
40
+ ```bash
41
+ uv tool install tocsmith
42
+ # 安装后可直接使用:
43
+ tocsmith --help
44
+ tocsmith-gui
45
+ ```
46
+
47
+ 2) 使用 pip 安装(备选):
48
+ ```bash
49
+ pip install tocsmith
50
+
51
+ # 现在可直接使用:
52
+ tocsmith --help
53
+ tocsmith-gui
54
+ ```
55
+
56
+ 3) 本地开发
57
+ ```bash
58
+ git clone https://github.com/wesleyel/pdf-bookmark.git
59
+ cd pdf-bookmark
60
+ uv sync
61
+
62
+ uv tool install . --reinstall
63
+
64
+ tocsmith --help
65
+ tocsmith-gui
66
+ ```
67
+
68
+ ## 命令行使用(CLI)
69
+
70
+ ```bash
71
+ tocsmith --help
72
+ ```
73
+
74
+ ### 通过 TOML 批量执行(自定义格式)
75
+ 支持通过 TOML 配置批量执行多个任务。相对路径均以配置文件所在目录为基准;还可以通过 `defaults.input_prefix` 与 `defaults.output_prefix` 设定输入/输出根目录。
76
+
77
+ 示例 `config.toml`:
78
+
79
+ ```toml
80
+ [defaults]
81
+ # global page offset
82
+ page_offset = 0
83
+ # global minimum length
84
+ min_len = 3
85
+
86
+ # input folder
87
+ input_prefix = "input"
88
+ # output folder
89
+ output_prefix = "output"
90
+ # output file name append
91
+ output_suffix = ".bookmarked.pdf"
92
+
93
+ [[tasks]]
94
+ # input file name. relative to input_prefix
95
+ input_file = "book1.pdf"
96
+ toc = """
97
+ 第一章 绪论 1
98
+ 1.1 引言 3
99
+ 1.2 数学分析的基本概念 5
100
+ """
101
+ page_offset = 10
102
+ min_len = 2
103
+ ```
104
+
105
+ 运行:
106
+
107
+ ```bash
108
+ tocsmith --config config.toml
109
+ ```
110
+
111
+ 说明:
112
+ - `defaults` 中的 `page_offset`、`min_len` 可被每个任务覆盖。
113
+ - `input_prefix` 用于解析任务中的 `input_file`;`output_prefix` 为输出目录根。
114
+ - 输出文件名为 `{stem}{output_suffix}`,其中 `stem` 来源于 `input_file`。
115
+ - 任务可直接内联 `toc` 文本;也兼容 `toc_file` 指定外部文件。
116
+
117
+ ## 图形界面(GUI)
118
+ 提供一个基于 Tk 的简易界面,便于在桌面环境下操作:
119
+ ```bash
120
+ tocsmith-gui
121
+ # 或
122
+ uv run python -m tocsmith.gui
123
+ ```
124
+ 基本流程:
125
+ - 选择输入 PDF
126
+ - 可选:修改输出路径
127
+ - 在 “TOC text” 中粘贴目录文本;在 “Page Offset” 填写偏移(实际 - 书籍)
128
+ - 点击 “Parse TOC Text” 查看解析结果
129
+ - 点击 “Generate” 生成带书签的 PDF
130
+
131
+ 提示:Linux 上若缺少 tkinter,可通过安装系统包启用(例如 Debian/Ubuntu:`sudo apt-get update && sudo apt-get install -y python3-tk`)。
132
+
133
+ ## 开发与测试
134
+
135
+ - 代码检查与测试:
136
+ ```bash
137
+ uv tool install . # 安装命令,便于本地手动验证
138
+ uv run pytest -q
139
+ # 可选:
140
+ uv run ruff check
141
+ uv run mypy tocsmith
142
+ ```
143
+
144
+ - 项目结构:
145
+ ```
146
+ tocsmith/
147
+ core.py # 目录解析与书签生成核心逻辑
148
+ cli.py # 命令行入口
149
+ gui.py # Tk GUI 入口
150
+ tests/ # 单元测试(pytest)
151
+ ```
152
+
153
+ ## 许可证
154
+
155
+ MIT
@@ -0,0 +1,10 @@
1
+ tocsmith/__init__.py,sha256=L7gRE2la7fHpuyFckdKxmIHJVv0nIIxMvtfEFGS20jg,158
2
+ tocsmith/cli.py,sha256=jAV978BgSeps4_NoD0_QN74H62LvR5zYKGn8ltI0hXs,6926
3
+ tocsmith/core.py,sha256=s4tbGRrkzsxpEdLIAc0qi9r2pedDGnTrcudTcuw_IAs,4223
4
+ tocsmith/gui.py,sha256=0cNmYezkBW2-G_whvpdsrp4E5LFpzn1u9RG_NC9CMDY,9330
5
+ tocsmith/tests/test_core.py,sha256=GrV5L3F1E1b5KK9xmyV24EKt7QaeSK0bFMpfxiVQ0T0,5130
6
+ tocsmith-0.1.0.dist-info/METADATA,sha256=HjjPBWVk8xYYexzVIKKii0r7O-fNLTM_IW1e6ydMtcw,4153
7
+ tocsmith-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
8
+ tocsmith-0.1.0.dist-info/entry_points.txt,sha256=AtJWiNYNXMUBHsk_baLqvWIakUgWtdpoT7jvj_iupBc,80
9
+ tocsmith-0.1.0.dist-info/top_level.txt,sha256=Sh_2ols-P55IZJEr4UDleISnjvu8WlxW-aYipACWYDI,9
10
+ tocsmith-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ tocsmith = tocsmith.cli:main
3
+ tocsmith-gui = tocsmith.gui:main
@@ -0,0 +1 @@
1
+ tocsmith