tocsmith 0.1.0__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tocsmith
3
- Version: 0.1.0
3
+ Version: 0.2.0
4
4
  Summary: Create PDF bookmarks automatically using heuristics, with CLI and async tkinter GUI
5
5
  Author-email: Wesley Yang <yxnian@outlook.com>
6
6
  Project-URL: Homepage, https://github.com/wesleyel/pdf-bookmark
@@ -25,8 +25,12 @@ Requires-Dist: mypy>=1.10.0; extra == "dev"
25
25
 
26
26
  ## 功能概览
27
27
  - 手动粘贴目录文本(每行以书中页码结尾),自动解析标题、页码与层级(1..6)
28
+ - 支持两种层级解析方式:
29
+ - **按序号**:`1 标题 1` / `1.1 子标题 2` / `1.1.1 子子标题 3`,可通过 `keep_numbering` 控制序号是否写入书签标题
30
+ - **按缩进**:通过行首空格/Tab 缩进表示层级,标题不含序号
31
+ - 默认 `auto` 自动识别;也可通过 `--toc-mode` 或配置 `toc_mode` 显式指定
28
32
  - 支持页码偏移(实际页码 - 书籍页码),用于扫描件/前置页差异
29
- - 编号前缀会被保留到标题中:如 `第1章`、`1.1` 将出现在最终书签标题里
33
+ - 按序号模式下,默认保留编号前缀到标题中(如 `第1章`、`1.1`);设置 `keep_numbering = false` 或 `--no-keep-numbering` 可仅用于推断层级
30
34
  - 支持行首星号标记:允许输入 `*1.1 Title` 或 `* 1.1 Title`,输出统一为 `*1.1 Title`
31
35
  - 将条目以父子层级写入 PDF 书签
32
36
  - 提供 CLI 与 GUI;亦可通过 Python API 使用
@@ -82,6 +86,10 @@ tocsmith --help
82
86
  page_offset = 0
83
87
  # global minimum length
84
88
  min_len = 3
89
+ # TOC hierarchy mode: auto | numbering | indent
90
+ toc_mode = "auto"
91
+ # keep numbering prefix in bookmark titles (numbering mode only)
92
+ keep_numbering = true
85
93
 
86
94
  # input folder
87
95
  input_prefix = "input"
@@ -100,6 +108,26 @@ toc = """
100
108
  """
101
109
  page_offset = 10
102
110
  min_len = 2
111
+ toc_mode = "numbering"
112
+ keep_numbering = false
113
+ ```
114
+
115
+ ### 目录文本格式
116
+
117
+ **按序号**(`toc_mode = "numbering"` 或自动识别):
118
+
119
+ ```
120
+ 1 我是标题 1
121
+ 1.1 我是子标题 2
122
+ 1.1.1 我是子子标题 3
123
+ ```
124
+
125
+ **按缩进**(`toc_mode = "indent"` 或自动识别):
126
+
127
+ ```
128
+ 我是标题 1
129
+ 我是子标题 2
130
+ 我是子子标题 3
103
131
  ```
104
132
 
105
133
  运行:
@@ -109,7 +137,7 @@ tocsmith --config config.toml
109
137
  ```
110
138
 
111
139
  说明:
112
- - `defaults` 中的 `page_offset`、`min_len` 可被每个任务覆盖。
140
+ - `defaults` 中的 `page_offset`、`min_len`、`toc_mode`、`keep_numbering` 可被每个任务覆盖。
113
141
  - `input_prefix` 用于解析任务中的 `input_file`;`output_prefix` 为输出目录根。
114
142
  - 输出文件名为 `{stem}{output_suffix}`,其中 `stem` 来源于 `input_file`。
115
143
  - 任务可直接内联 `toc` 文本;也兼容 `toc_file` 指定外部文件。
@@ -125,6 +153,8 @@ uv run python -m tocsmith.gui
125
153
  - 选择输入 PDF
126
154
  - 可选:修改输出路径
127
155
  - 在 “TOC text” 中粘贴目录文本;在 “Page Offset” 填写偏移(实际 - 书籍)
156
+ - 选择 “TOC Mode”:`auto`(自动识别)、`numbering`(按序号)、`indent`(按缩进)
157
+ - 勾选 “Keep numbering” 控制按序号模式下是否保留标题中的序号(默认保留)
128
158
  - 点击 “Parse TOC Text” 查看解析结果
129
159
  - 点击 “Generate” 生成带书签的 PDF
130
160
 
@@ -8,8 +8,12 @@
8
8
 
9
9
  ## 功能概览
10
10
  - 手动粘贴目录文本(每行以书中页码结尾),自动解析标题、页码与层级(1..6)
11
+ - 支持两种层级解析方式:
12
+ - **按序号**:`1 标题 1` / `1.1 子标题 2` / `1.1.1 子子标题 3`,可通过 `keep_numbering` 控制序号是否写入书签标题
13
+ - **按缩进**:通过行首空格/Tab 缩进表示层级,标题不含序号
14
+ - 默认 `auto` 自动识别;也可通过 `--toc-mode` 或配置 `toc_mode` 显式指定
11
15
  - 支持页码偏移(实际页码 - 书籍页码),用于扫描件/前置页差异
12
- - 编号前缀会被保留到标题中:如 `第1章`、`1.1` 将出现在最终书签标题里
16
+ - 按序号模式下,默认保留编号前缀到标题中(如 `第1章`、`1.1`);设置 `keep_numbering = false` 或 `--no-keep-numbering` 可仅用于推断层级
13
17
  - 支持行首星号标记:允许输入 `*1.1 Title` 或 `* 1.1 Title`,输出统一为 `*1.1 Title`
14
18
  - 将条目以父子层级写入 PDF 书签
15
19
  - 提供 CLI 与 GUI;亦可通过 Python API 使用
@@ -65,6 +69,10 @@ tocsmith --help
65
69
  page_offset = 0
66
70
  # global minimum length
67
71
  min_len = 3
72
+ # TOC hierarchy mode: auto | numbering | indent
73
+ toc_mode = "auto"
74
+ # keep numbering prefix in bookmark titles (numbering mode only)
75
+ keep_numbering = true
68
76
 
69
77
  # input folder
70
78
  input_prefix = "input"
@@ -83,6 +91,26 @@ toc = """
83
91
  """
84
92
  page_offset = 10
85
93
  min_len = 2
94
+ toc_mode = "numbering"
95
+ keep_numbering = false
96
+ ```
97
+
98
+ ### 目录文本格式
99
+
100
+ **按序号**(`toc_mode = "numbering"` 或自动识别):
101
+
102
+ ```
103
+ 1 我是标题 1
104
+ 1.1 我是子标题 2
105
+ 1.1.1 我是子子标题 3
106
+ ```
107
+
108
+ **按缩进**(`toc_mode = "indent"` 或自动识别):
109
+
110
+ ```
111
+ 我是标题 1
112
+ 我是子标题 2
113
+ 我是子子标题 3
86
114
  ```
87
115
 
88
116
  运行:
@@ -92,7 +120,7 @@ tocsmith --config config.toml
92
120
  ```
93
121
 
94
122
  说明:
95
- - `defaults` 中的 `page_offset`、`min_len` 可被每个任务覆盖。
123
+ - `defaults` 中的 `page_offset`、`min_len`、`toc_mode`、`keep_numbering` 可被每个任务覆盖。
96
124
  - `input_prefix` 用于解析任务中的 `input_file`;`output_prefix` 为输出目录根。
97
125
  - 输出文件名为 `{stem}{output_suffix}`,其中 `stem` 来源于 `input_file`。
98
126
  - 任务可直接内联 `toc` 文本;也兼容 `toc_file` 指定外部文件。
@@ -108,6 +136,8 @@ uv run python -m tocsmith.gui
108
136
  - 选择输入 PDF
109
137
  - 可选:修改输出路径
110
138
  - 在 “TOC text” 中粘贴目录文本;在 “Page Offset” 填写偏移(实际 - 书籍)
139
+ - 选择 “TOC Mode”:`auto`(自动识别)、`numbering`(按序号)、`indent`(按缩进)
140
+ - 勾选 “Keep numbering” 控制按序号模式下是否保留标题中的序号(默认保留)
111
141
  - 点击 “Parse TOC Text” 查看解析结果
112
142
  - 点击 “Generate” 生成带书签的 PDF
113
143
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "tocsmith"
3
- version = "0.1.0"
3
+ version = "0.2.0"
4
4
  description = "Create PDF bookmarks automatically using heuristics, with CLI and async tkinter GUI"
5
5
  authors = [{ name = "Wesley Yang", email = "yxnian@outlook.com" }]
6
6
  urls = { Homepage = "https://github.com/wesleyel/pdf-bookmark", Source = "https://github.com/wesleyel/pdf-bookmark" }
@@ -5,7 +5,7 @@ from pathlib import Path
5
5
  from typing import Any, Dict, List, Optional
6
6
  import sys
7
7
 
8
- from .core import generate_bookmarks, parse_toc_lines
8
+ from .core import TocMode, generate_bookmarks, parse_toc_lines
9
9
 
10
10
  try: # Python 3.11+
11
11
  import tomllib # type: ignore[attr-defined]
@@ -23,6 +23,18 @@ def parse_args(argv: List[str] | None = None) -> argparse.Namespace:
23
23
  p.add_argument("--min-len", type=int, default=3, help="Minimum heading text length")
24
24
  p.add_argument("--page-offset", type=int, default=0, help="Page offset: actual - book page")
25
25
  p.add_argument("--toc-file", help="Path to a text file containing TOC lines")
26
+ p.add_argument(
27
+ "--toc-mode",
28
+ choices=["auto", "numbering", "indent"],
29
+ default="auto",
30
+ help="TOC hierarchy mode: numbering (1/1.1), indent (spaces), or auto-detect",
31
+ )
32
+ p.add_argument(
33
+ "--keep-numbering",
34
+ action=argparse.BooleanOptionalAction,
35
+ default=True,
36
+ help="Keep numbering prefix in bookmark titles (numbering mode only)",
37
+ )
26
38
  p.add_argument(
27
39
  "-c",
28
40
  "--config",
@@ -46,6 +58,8 @@ def _run_single(
46
58
  page_offset: int,
47
59
  min_len: int,
48
60
  toc_text: Optional[str] = None,
61
+ toc_mode: TocMode = "auto",
62
+ keep_numbering: bool = True,
49
63
  ) -> int:
50
64
  """Run a single task and return process exit code."""
51
65
  if not src.exists():
@@ -55,10 +69,22 @@ def _run_single(
55
69
 
56
70
  headings = []
57
71
  if toc_text is not None and toc_text.strip():
58
- headings = parse_toc_lines(toc_text, page_offset=page_offset, min_len=min_len)
72
+ headings = parse_toc_lines(
73
+ toc_text,
74
+ page_offset=page_offset,
75
+ min_len=min_len,
76
+ mode=toc_mode,
77
+ keep_numbering=keep_numbering,
78
+ )
59
79
  elif toc_file:
60
80
  file_text = Path(toc_file).read_text(encoding="utf-8")
61
- headings = parse_toc_lines(file_text, page_offset=page_offset, min_len=min_len)
81
+ headings = parse_toc_lines(
82
+ file_text,
83
+ page_offset=page_offset,
84
+ min_len=min_len,
85
+ mode=toc_mode,
86
+ keep_numbering=keep_numbering,
87
+ )
62
88
  else:
63
89
  print("No TOC source provided (use --toc-file). Producing a copy without outline.")
64
90
  headings = []
@@ -86,6 +112,8 @@ def _run_batch(config_path: Path) -> int:
86
112
  # Alternatively: toc_file = "toc.txt"
87
113
  page_offset = 10 # optional overrides default
88
114
  min_len = 2 # optional overrides default
115
+ toc_mode = "auto" # optional: auto | numbering | indent
116
+ keep_numbering = true # optional: keep numbering in bookmark titles
89
117
  '''
90
118
  if tomllib is None:
91
119
  print("Error: TOML support not available. Please install 'tomli' for Python < 3.11.")
@@ -107,6 +135,11 @@ def _run_batch(config_path: Path) -> int:
107
135
 
108
136
  default_page_offset = int(defaults.get("page_offset", 0) or 0)
109
137
  default_min_len = int(defaults.get("min_len", 3) or 3)
138
+ default_toc_mode = str(defaults.get("toc_mode", "auto") or "auto").strip() or "auto"
139
+ if default_toc_mode not in ("auto", "numbering", "indent"):
140
+ print(f"Invalid defaults.toc_mode: {default_toc_mode!r}")
141
+ return 2
142
+ default_keep_numbering = bool(defaults.get("keep_numbering", True))
110
143
  input_prefix = str(defaults.get("input_prefix", "")).strip() or ""
111
144
  output_prefix = str(defaults.get("output_prefix", "")).strip() or ""
112
145
  output_suffix = (
@@ -139,11 +172,18 @@ def _run_batch(config_path: Path) -> int:
139
172
  toc_file = _resolve_relative(base_dir, t.get("toc_file"))
140
173
  page_offset = int(t.get("page_offset", default_page_offset) or default_page_offset)
141
174
  min_len = int(t.get("min_len", default_min_len) or default_min_len)
175
+ toc_mode = str(t.get("toc_mode", default_toc_mode) or default_toc_mode).strip() or "auto"
176
+ if toc_mode not in ("auto", "numbering", "indent"):
177
+ print(f"[Task {idx}] Skipped: invalid toc_mode {toc_mode!r}")
178
+ failures += 1
179
+ continue
180
+ keep_numbering = bool(t.get("keep_numbering", default_keep_numbering))
142
181
 
143
182
  print(
144
183
  f"[Task {idx}] Running: src={src} out={out} "
145
184
  f"toc={'inline' if (toc_inline and toc_inline.strip()) else (toc_file or '<none>')} "
146
- f"offset={page_offset} min_len={min_len}"
185
+ f"offset={page_offset} min_len={min_len} toc_mode={toc_mode} "
186
+ f"keep_numbering={keep_numbering}"
147
187
  )
148
188
  try:
149
189
  # Ensure output directory exists
@@ -155,6 +195,8 @@ def _run_batch(config_path: Path) -> int:
155
195
  page_offset=page_offset,
156
196
  min_len=min_len,
157
197
  toc_text=toc_inline,
198
+ toc_mode=toc_mode, # type: ignore[arg-type]
199
+ keep_numbering=keep_numbering,
158
200
  )
159
201
  if code != 0:
160
202
  failures += 1
@@ -186,6 +228,8 @@ def main(argv: List[str] | None = None) -> int:
186
228
  toc_file=Path(ns.toc_file) if ns.toc_file else None,
187
229
  page_offset=ns.page_offset,
188
230
  min_len=ns.min_len,
231
+ toc_mode=ns.toc_mode,
232
+ keep_numbering=ns.keep_numbering,
189
233
  )
190
234
 
191
235
 
@@ -0,0 +1,223 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ import re
5
+ from functools import reduce
6
+ from math import gcd
7
+ from typing import Iterable, List, Literal, Tuple, Optional
8
+
9
+ TocMode = Literal["numbering", "indent", "auto"]
10
+
11
+ from pypdf import PdfReader, PdfWriter
12
+
13
+
14
+ @dataclass
15
+ class Heading:
16
+ title: str
17
+ page: int # 1-based
18
+ level: int # 1..6
19
+
20
+
21
def generate_bookmarks(src_pdf: str, out_pdf: str, headings: Iterable[Heading]) -> None:
    """Copy ``src_pdf`` to ``out_pdf`` and attach ``headings`` as its outline.

    Every page of the source is added to a fresh writer, then one outline
    item is created per heading, in the order given. A heading is nested
    under the closest preceding heading whose level is strictly smaller;
    a heading with no such predecessor becomes a top-level item.
    """
    source = PdfReader(src_pdf)
    target = PdfWriter()
    for pdf_page in source.pages:
        target.add_page(pdf_page)

    # Currently open ancestors, outermost first: (level, outline item).
    ancestors: List[Tuple[int, object]] = []

    for heading in headings:
        # Clamp the 1-based heading page into the 0-based index range.
        last_index = len(source.pages) - 1
        target_index = max(0, min(last_index, heading.page - 1))

        # Discard entries that are too deep to be this heading's parent.
        while ancestors and ancestors[-1][0] >= heading.level:
            ancestors.pop()

        parent_item = ancestors[-1][1] if ancestors else None
        item = target.add_outline_item(heading.title, target_index, parent=parent_item)
        ancestors.append((heading.level, item))

    with open(out_pdf, "wb") as out_stream:
        target.write(out_stream)
41
+
42
+
43
+ # -------------------- TOC parsing utilities --------------------
44
+
45
+ _NUM_PREFIX_RE = re.compile(
46
+ r"^\s*(?P<num>(第\s*\d+[一二三四五六七八九十百千]*[章节节部分编]?)|((\d+\.)+\d+)|\d+)?\s*"
47
+ )
48
+ _TRAILING_PAGE_RE = re.compile(r"(?P<page>\d{1,5})\s*$")
49
+
50
+
51
+ def _infer_level_from_numbering(num: Optional[str]) -> int:
52
+ if not num:
53
+ return 1
54
+ num = num.strip()
55
+ if num.startswith("第"):
56
+ # "第1章" style => top-level
57
+ return 1
58
+ if "." in num:
59
+ # "1.2.3" => level = segments + 1 (so 1.2 is level 2)
60
+ return min(6, max(1, num.count(".") + 1))
61
+ # Simple leading integer like "1" => level 1
62
+ return 1
63
+
64
+
65
+ def _leading_indent_width(raw_line: str) -> int:
66
+ width = 0
67
+ for ch in raw_line:
68
+ if ch == " ":
69
+ width += 1
70
+ elif ch == "\t":
71
+ width += 4
72
+ else:
73
+ break
74
+ return width
75
+
76
+
77
+ def _detect_indent_unit(indents: Iterable[int]) -> int:
78
+ non_zero = sorted({i for i in indents if i > 0})
79
+ if not non_zero:
80
+ return 4
81
+ unit = non_zero[0]
82
+ if all(i % unit == 0 for i in indents):
83
+ return max(1, unit)
84
+ return max(1, reduce(gcd, non_zero))
85
+
86
+
87
+ def _infer_level_from_indent(indent: int, unit: int) -> int:
88
+ if indent <= 0:
89
+ return 1
90
+ return min(6, max(1, indent // unit + 1))
91
+
92
+
93
+ def _strip_star_prefix(line: str) -> Tuple[str, str]:
94
+ star_prefix = ""
95
+ m_star = re.match(r"^\*+\s*", line)
96
+ if m_star:
97
+ star_prefix = "*" * m_star.group(0).count("*")
98
+ line = line[m_star.end() :].lstrip()
99
+ return star_prefix, line
100
+
101
+
102
def _detect_toc_mode(toc_text: str, min_len: int = 1) -> TocMode:
    """Decide whether ``toc_text`` encodes hierarchy by numbering or by indent.

    Only lines that are at least ``min_len`` characters long after stripping
    and that end in a page number are considered. Such a line votes for
    "numbering" when it starts with a numbering prefix; otherwise it votes
    for "indent" when it has leading indentation; other lines do not vote.
    "indent" is returned only when it has strictly more votes, so ties and
    texts without any vote resolve to "numbering".
    """
    votes = {"numbering": 0, "indent": 0}
    for raw_line in toc_text.splitlines():
        if len(raw_line.strip()) < min_len:
            continue
        # Star markers are irrelevant for detection; keep only the remainder.
        body = _strip_star_prefix(raw_line.lstrip())[1]
        page_match = _TRAILING_PAGE_RE.search(body)
        if page_match is None:
            continue
        prefix_match = _NUM_PREFIX_RE.match(body[: page_match.start()].rstrip())
        if prefix_match and prefix_match.group("num"):
            votes["numbering"] += 1
        elif _leading_indent_width(raw_line) > 0:
            votes["indent"] += 1
    if votes["indent"] > votes["numbering"]:
        return "indent"
    return "numbering"
123
+
124
+
125
def _parse_toc_lines_numbering(
    toc_text: str,
    page_offset: int = 0,
    min_len: int = 1,
    keep_numbering: bool = True,
) -> List[Heading]:
    """Parse TOC text whose hierarchy is expressed by numbering prefixes.

    Args:
        toc_text: Pasted TOC, one entry per line, each ending in a page number.
        page_offset: Added to every parsed page number; the result is never
            allowed to drop below 1.
        min_len: Lines shorter than this (after stripping) are ignored.
        keep_numbering: When True the numbering prefix stays in the title;
            when False it is only used to infer the level.

    Returns:
        Headings ordered by page. Entries that share a page keep the order in
        which they appear in ``toc_text``.

    Lines without a trailing page number are skipped. Leading ``*`` markers
    are preserved and re-attached directly in front of the title.
    """
    headings: List[Heading] = []
    for raw_line in toc_text.splitlines():
        line = raw_line.strip()
        if len(line) < min_len:
            continue
        star_prefix, line = _strip_star_prefix(line)

        page_m = _TRAILING_PAGE_RE.search(line)
        if not page_m:
            continue
        page_num = int(page_m.group("page"))
        line_wo_page = line[: page_m.start()].rstrip()

        num_m = _NUM_PREFIX_RE.match(line_wo_page)
        numbering = None
        title_part = line_wo_page
        if num_m:
            numbering = num_m.group("num")
            title_part = line_wo_page[num_m.end() :].strip()

        if numbering and keep_numbering:
            combined = f"{numbering.strip()} {title_part}".strip()
        else:
            combined = title_part
        title = re.sub(r"\s+", " ", combined)
        if not title:
            # The line holds nothing but numbering (e.g. "1.1 12"). Use the
            # numbering itself as the title even when keep_numbering is
            # False, so the bookmark is never blank.
            title = line_wo_page.strip()
        if star_prefix:
            title = f"{star_prefix}{title}".strip()

        level = _infer_level_from_numbering(numbering)
        pdf_page = max(1, page_num + page_offset)
        headings.append(Heading(title=title, page=pdf_page, level=level))

    # Order by page only. list.sort is stable, so entries on the same page
    # stay in document order; also sorting by level or title would move e.g.
    # "2 ..." in front of a preceding "1.3 ..." on the same page and attach
    # the section to the wrong chapter.
    headings.sort(key=lambda h: h.page)
    return headings
164
+
165
+
166
def _parse_toc_lines_indent(toc_text: str, page_offset: int = 0, min_len: int = 1) -> List[Heading]:
    """Parse TOC text whose hierarchy is expressed by leading indentation.

    Args:
        toc_text: Pasted TOC, one entry per line, each ending in a page number.
        page_offset: Added to every parsed page number; the result is never
            allowed to drop below 1.
        min_len: Lines shorter than this (after stripping) are ignored.

    Returns:
        Headings ordered by page. Entries that share a page keep the order in
        which they appear in ``toc_text``.

    Lines without a trailing page number are skipped. The indentation step is
    detected from all accepted lines before any level is assigned, which is
    why parsing happens in two passes. Leading ``*`` markers are preserved
    and re-attached directly in front of the title.
    """
    # First pass: (indent width, title, book page) for every usable line.
    entries: List[Tuple[int, str, int]] = []
    for raw_line in toc_text.splitlines():
        if len(raw_line.strip()) < min_len:
            continue
        indent = _leading_indent_width(raw_line)
        star_prefix, line = _strip_star_prefix(raw_line.lstrip())

        page_m = _TRAILING_PAGE_RE.search(line)
        if not page_m:
            continue
        page_num = int(page_m.group("page"))
        title = re.sub(r"\s+", " ", line[: page_m.start()].rstrip())
        if star_prefix:
            title = f"{star_prefix}{title}".strip()
        entries.append((indent, title, page_num))

    # Second pass: turn indent widths into levels using the detected step.
    unit = _detect_indent_unit([indent for indent, _, _ in entries])
    headings: List[Heading] = [
        Heading(
            title=title,
            page=max(1, page_num + page_offset),
            level=_infer_level_from_indent(indent, unit),
        )
        for indent, title, page_num in entries
    ]

    # Order by page only. list.sort is stable, so entries on the same page
    # stay in document order; also sorting by level or title would lift a
    # later top-level entry above the preceding sub-entries on that page.
    headings.sort(key=lambda h: h.page)
    return headings
195
+
196
+
197
def parse_toc_lines(
    toc_text: str,
    page_offset: int = 0,
    min_len: int = 1,
    mode: TocMode = "auto",
    keep_numbering: bool = True,
) -> List[Heading]:
    """Turn pasted TOC text into a list of ``Heading`` entries.

    Every usable line ends with the book page number (digits).

    Args:
        toc_text: The TOC, one entry per line.
        page_offset: Added to each parsed page number to obtain the PDF page.
        min_len: Minimum stripped line length; shorter lines are ignored.
        mode: "numbering" derives the hierarchy from leading numbers such as
            "1", "1.1" or "第1章"; "indent" derives it from leading
            spaces/tabs; "auto" picks one of the two by inspecting the text.
        keep_numbering: When True (default) the numbering prefix is kept in
            bookmark titles. Only used by the numbering parser.

    Any resolved mode other than "indent" is handled by the numbering parser.
    """
    if mode == "auto":
        mode = _detect_toc_mode(toc_text, min_len)
    if mode != "indent":
        return _parse_toc_lines_numbering(
            toc_text, page_offset=page_offset, min_len=min_len, keep_numbering=keep_numbering
        )
    return _parse_toc_lines_indent(toc_text, page_offset=page_offset, min_len=min_len)
219
+
220
+
221
+ ## URL/website TOC fetching intentionally removed; only manual text input is supported.
222
+
223
+
@@ -68,6 +68,24 @@ class App:
68
68
  self.offset_entry = ttk.Entry(ctrl, textvariable=self.offset_var, width=6)
69
69
  self.offset_entry.pack(side=tk.LEFT, padx=(4, 12))
70
70
 
71
+ ttk.Label(ctrl, text="TOC Mode:").pack(side=tk.LEFT)
72
+ self.toc_mode_var = tk.StringVar(value="auto")
73
+ self.toc_mode_combo = ttk.Combobox(
74
+ ctrl,
75
+ textvariable=self.toc_mode_var,
76
+ values=["auto", "numbering", "indent"],
77
+ state="readonly",
78
+ width=10,
79
+ )
80
+ self.toc_mode_combo.pack(side=tk.LEFT, padx=(4, 12))
81
+
82
+ self.keep_numbering_var = tk.BooleanVar(value=True)
83
+ ttk.Checkbutton(
84
+ ctrl,
85
+ text="Keep numbering",
86
+ variable=self.keep_numbering_var,
87
+ ).pack(side=tk.LEFT)
88
+
71
89
  # TOC input
72
90
  toc_row = ttk.Frame(frm)
73
91
  toc_row.pack(fill=tk.BOTH, expand=True)
@@ -162,6 +180,20 @@ class App:
162
180
 
163
181
  # Auto analysis removed
164
182
 
183
+ def _get_parse_kwargs(self) -> dict:
184
+ try:
185
+ offset = int(self.offset_var.get() or 0)
186
+ except ValueError:
187
+ offset = 0
188
+ mode = self.toc_mode_var.get() or "auto"
189
+ if mode not in ("auto", "numbering", "indent"):
190
+ mode = "auto"
191
+ return {
192
+ "page_offset": offset,
193
+ "mode": mode,
194
+ "keep_numbering": self.keep_numbering_var.get(),
195
+ }
196
+
165
197
  def _on_generate(self) -> None:
166
198
  if not self.in_var.get():
167
199
  messagebox.showwarning("Missing", "Please choose an input PDF")
@@ -176,11 +208,7 @@ class App:
176
208
  text = self.toc_text.get("1.0", tk.END).strip()
177
209
  hs = []
178
210
  if text:
179
- try:
180
- offset = int(self.offset_var.get() or 0)
181
- except ValueError:
182
- offset = 0
183
- hs = await run_in_thread(parse_toc_lines, text, offset)
211
+ hs = await run_in_thread(parse_toc_lines, text, **self._get_parse_kwargs())
184
212
  else:
185
213
  hs = []
186
214
  await run_in_thread(generate_bookmarks, self.in_var.get(), self.out_var.get(), hs)
@@ -194,14 +222,9 @@ class App:
194
222
  if not text:
195
223
  messagebox.showwarning("Empty", "Please paste TOC text or URL first")
196
224
  return
197
- try:
198
- offset = int(self.offset_var.get() or 0)
199
- except ValueError:
200
- offset = 0
201
-
202
225
  async def task():
203
226
  self._set_status("Parsing TOC…")
204
- hs = await run_in_thread(parse_toc_lines, text, offset)
227
+ hs = await run_in_thread(parse_toc_lines, text, **self._get_parse_kwargs())
205
228
  self._populate_tree(hs)
206
229
  self._set_status(f"Parsed {len(hs)} entries")
207
230
 
@@ -163,3 +163,82 @@ def test_parse_toc_lines_preserve_numbering_with_asterisk():
163
163
  assert any(t.startswith("*2 ") and "星标章节" in t for t in titles)
164
164
 
165
165
 
166
+ def test_parse_toc_lines_numbering_mode_example():
167
+ toc = "\n".join([
168
+ "1 我是标题 1",
169
+ "1.1 我是子标题 2",
170
+ "1.1.1 我是子子标题 3",
171
+ ])
172
+ hs = parse_toc_lines(toc, page_offset=0, mode="numbering")
173
+ assert len(hs) == 3
174
+ assert hs[0].title == "1 我是标题"
175
+ assert hs[0].level == 1
176
+ assert hs[1].title == "1.1 我是子标题"
177
+ assert hs[1].level == 2
178
+ assert hs[2].title == "1.1.1 我是子子标题"
179
+ assert hs[2].level == 3
180
+
181
+
182
+ def test_parse_toc_lines_indent_mode_example():
183
+ toc = "\n".join([
184
+ "我是标题 1",
185
+ " 我是子标题 2",
186
+ " 我是子子标题 3",
187
+ ])
188
+ hs = parse_toc_lines(toc, page_offset=0, mode="indent")
189
+ assert len(hs) == 3
190
+ assert hs[0].title == "我是标题"
191
+ assert hs[0].level == 1
192
+ assert hs[1].title == "我是子标题"
193
+ assert hs[1].level == 2
194
+ assert hs[2].title == "我是子子标题"
195
+ assert hs[2].level == 3
196
+
197
+
198
+ def test_parse_toc_lines_auto_detect_indent():
199
+ toc = "\n".join([
200
+ "Chapter A 1",
201
+ " Section B 2",
202
+ " Subsection C 3",
203
+ ])
204
+ hs = parse_toc_lines(toc, page_offset=0, mode="auto")
205
+ assert [h.level for h in hs] == [1, 2, 3]
206
+
207
+
208
+ def test_parse_toc_lines_strip_numbering_when_disabled():
209
+ toc = "\n".join([
210
+ "第1章 计算机系统概述 1",
211
+ "1.1 操作系统的基本概念 2",
212
+ "2 其他章节 10",
213
+ ])
214
+ hs = parse_toc_lines(toc, page_offset=0, keep_numbering=False)
215
+ titles = [h.title for h in hs]
216
+ assert any(t == "计算机系统概述" for t in titles)
217
+ assert any(t == "操作系统的基本概念" for t in titles)
218
+ assert any(t == "其他章节" for t in titles)
219
+ assert [h.level for h in hs] == [1, 2, 1]
220
+
221
+
222
+ def test_parse_toc_lines_strip_numbering_with_asterisk():
223
+ toc = "\n".join([
224
+ "*1.1 星标小节 12",
225
+ "* 2 星标章节 13",
226
+ ])
227
+ hs = parse_toc_lines(toc, page_offset=0, keep_numbering=False)
228
+ titles = [h.title for h in hs]
229
+ assert any(t == "*星标小节" for t in titles)
230
+ assert any(t == "*星标章节" for t in titles)
231
+
232
+
233
+ def test_parse_toc_lines_auto_detect_numbering():
234
+ toc = "\n".join([
235
+ "第1章 基础 1",
236
+ "1.1 小节 2",
237
+ ])
238
+ hs = parse_toc_lines(toc, page_offset=0, mode="auto")
239
+ assert hs[0].level == 1
240
+ assert hs[1].level == 2
241
+ assert hs[0].title.startswith("第1章")
242
+ assert hs[1].title.startswith("1.1")
243
+
244
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tocsmith
3
- Version: 0.1.0
3
+ Version: 0.2.0
4
4
  Summary: Create PDF bookmarks automatically using heuristics, with CLI and async tkinter GUI
5
5
  Author-email: Wesley Yang <yxnian@outlook.com>
6
6
  Project-URL: Homepage, https://github.com/wesleyel/pdf-bookmark
@@ -25,8 +25,12 @@ Requires-Dist: mypy>=1.10.0; extra == "dev"
25
25
 
26
26
  ## 功能概览
27
27
  - 手动粘贴目录文本(每行以书中页码结尾),自动解析标题、页码与层级(1..6)
28
+ - 支持两种层级解析方式:
29
+ - **按序号**:`1 标题 1` / `1.1 子标题 2` / `1.1.1 子子标题 3`,可通过 `keep_numbering` 控制序号是否写入书签标题
30
+ - **按缩进**:通过行首空格/Tab 缩进表示层级,标题不含序号
31
+ - 默认 `auto` 自动识别;也可通过 `--toc-mode` 或配置 `toc_mode` 显式指定
28
32
  - 支持页码偏移(实际页码 - 书籍页码),用于扫描件/前置页差异
29
- - 编号前缀会被保留到标题中:如 `第1章`、`1.1` 将出现在最终书签标题里
33
+ - 按序号模式下,默认保留编号前缀到标题中(如 `第1章`、`1.1`);设置 `keep_numbering = false` 或 `--no-keep-numbering` 可仅用于推断层级
30
34
  - 支持行首星号标记:允许输入 `*1.1 Title` 或 `* 1.1 Title`,输出统一为 `*1.1 Title`
31
35
  - 将条目以父子层级写入 PDF 书签
32
36
  - 提供 CLI 与 GUI;亦可通过 Python API 使用
@@ -82,6 +86,10 @@ tocsmith --help
82
86
  page_offset = 0
83
87
  # global minimum length
84
88
  min_len = 3
89
+ # TOC hierarchy mode: auto | numbering | indent
90
+ toc_mode = "auto"
91
+ # keep numbering prefix in bookmark titles (numbering mode only)
92
+ keep_numbering = true
85
93
 
86
94
  # input folder
87
95
  input_prefix = "input"
@@ -100,6 +108,26 @@ toc = """
100
108
  """
101
109
  page_offset = 10
102
110
  min_len = 2
111
+ toc_mode = "numbering"
112
+ keep_numbering = false
113
+ ```
114
+
115
+ ### 目录文本格式
116
+
117
+ **按序号**(`toc_mode = "numbering"` 或自动识别):
118
+
119
+ ```
120
+ 1 我是标题 1
121
+ 1.1 我是子标题 2
122
+ 1.1.1 我是子子标题 3
123
+ ```
124
+
125
+ **按缩进**(`toc_mode = "indent"` 或自动识别):
126
+
127
+ ```
128
+ 我是标题 1
129
+ 我是子标题 2
130
+ 我是子子标题 3
103
131
  ```
104
132
 
105
133
  运行:
@@ -109,7 +137,7 @@ tocsmith --config config.toml
109
137
  ```
110
138
 
111
139
  说明:
112
- - `defaults` 中的 `page_offset`、`min_len` 可被每个任务覆盖。
140
+ - `defaults` 中的 `page_offset`、`min_len`、`toc_mode`、`keep_numbering` 可被每个任务覆盖。
113
141
  - `input_prefix` 用于解析任务中的 `input_file`;`output_prefix` 为输出目录根。
114
142
  - 输出文件名为 `{stem}{output_suffix}`,其中 `stem` 来源于 `input_file`。
115
143
  - 任务可直接内联 `toc` 文本;也兼容 `toc_file` 指定外部文件。
@@ -125,6 +153,8 @@ uv run python -m tocsmith.gui
125
153
  - 选择输入 PDF
126
154
  - 可选:修改输出路径
127
155
  - 在 “TOC text” 中粘贴目录文本;在 “Page Offset” 填写偏移(实际 - 书籍)
156
+ - 选择 “TOC Mode”:`auto`(自动识别)、`numbering`(按序号)、`indent`(按缩进)
157
+ - 勾选 “Keep numbering” 控制按序号模式下是否保留标题中的序号(默认保留)
128
158
  - 点击 “Parse TOC Text” 查看解析结果
129
159
  - 点击 “Generate” 生成带书签的 PDF
130
160
 
@@ -1,122 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from dataclasses import dataclass
4
- import re
5
- from typing import Iterable, List, Tuple, Optional
6
-
7
- from pypdf import PdfReader, PdfWriter
8
-
9
-
10
- @dataclass
11
- class Heading:
12
- title: str
13
- page: int # 1-based
14
- level: int # 1..6
15
-
16
-
17
- def generate_bookmarks(src_pdf: str, out_pdf: str, headings: Iterable[Heading]) -> None:
18
- """Write given headings into a new PDF file as outline/bookmarks."""
19
- reader = PdfReader(src_pdf)
20
- writer = PdfWriter()
21
- for page in reader.pages:
22
- writer.add_page(page)
23
-
24
- # Build hierarchical outlines using a simple stack by levels
25
- stack: List[Tuple[int, object]] = [] # (level, parent_ref)
26
-
27
- for h in headings:
28
- page_index = max(0, min(len(reader.pages) - 1, h.page - 1))
29
- while stack and stack[-1][0] >= h.level:
30
- stack.pop()
31
- parent = stack[-1][1] if stack else None
32
- dest = writer.add_outline_item(h.title, page_index, parent=parent)
33
- stack.append((h.level, dest))
34
-
35
- with open(out_pdf, "wb") as f:
36
- writer.write(f)
37
-
38
-
39
- # -------------------- TOC parsing utilities --------------------
40
-
41
- _NUM_PREFIX_RE = re.compile(
42
- r"^\s*(?P<num>(第\s*\d+[一二三四五六七八九十百千]*[章节节部分编]?)|((\d+\.)+\d+)|\d+)?\s*"
43
- )
44
- _TRAILING_PAGE_RE = re.compile(r"(?P<page>\d{1,5})\s*$")
45
-
46
-
47
- def _infer_level_from_numbering(num: Optional[str]) -> int:
48
- if not num:
49
- return 1
50
- num = num.strip()
51
- if num.startswith("第"):
52
- # "第1章" style => top-level
53
- return 1
54
- if "." in num:
55
- # "1.2.3" => level = segments + 1 (so 1.2 is level 2)
56
- return min(6, max(1, num.count(".") + 1))
57
- # Simple leading integer like "1" => level 1
58
- return 1
59
-
60
-
61
- def parse_toc_lines(toc_text: str, page_offset: int = 0, min_len: int = 1) -> List[Heading]:
62
- """
63
- Parse a pasted TOC text into Heading entries.
64
- - Each line should end with the book page number (digits)
65
- - Leading numbering like "第1章" or "1.2" is used to infer the level
66
- - page_offset is added to the parsed page number to map to PDF actual pages
67
- """
68
- headings: List[Heading] = []
69
- for raw_line in toc_text.splitlines():
70
- line = raw_line.strip()
71
- if len(line) < min_len:
72
- continue
73
- # Detect and temporarily strip leading asterisk marker(s)
74
- star_prefix = ""
75
- m_star = re.match(r"^\*+\s*", line)
76
- if m_star:
77
- stars = m_star.group(0)
78
- star_count = stars.count("*")
79
- # Preserve star(s) without trailing space; spacing will be normalized later
80
- star_prefix = ("*" * star_count)
81
- line = line[m_star.end() :].lstrip()
82
-
83
- # Extract trailing page digits
84
- page_m = _TRAILING_PAGE_RE.search(line)
85
- if not page_m:
86
- continue
87
- page_num = int(page_m.group("page"))
88
- # Remove trailing page from the line
89
- line_wo_page = line[: page_m.start()].rstrip()
90
- # Extract leading numbering if exists
91
- num_m = _NUM_PREFIX_RE.match(line_wo_page)
92
- numbering = None
93
- title_part = line_wo_page
94
- if num_m:
95
- numbering = num_m.group("num")
96
- title_part = line_wo_page[num_m.end() :].strip()
97
- # Build title while preserving numbering prefix (e.g., "第1章" or "1.1")
98
- if numbering:
99
- combined = f"{numbering.strip()} {title_part}".strip()
100
- else:
101
- combined = title_part
102
- # Cleanup whitespace
103
- title = re.sub(r"\s+", " ", combined)
104
- if not title:
105
- # fallback to raw without numbering
106
- title = line_wo_page.strip()
107
- # Restore asterisk prefix if present
108
- if star_prefix:
109
- # No space between star(s) and numbering/title
110
- title = f"{star_prefix}{title}".strip()
111
- level = _infer_level_from_numbering(numbering)
112
- pdf_page = max(1, page_num + page_offset)
113
- headings.append(Heading(title=title, page=pdf_page, level=level))
114
-
115
- # Sort by page then by inferred level
116
- headings.sort(key=lambda h: (h.page, h.level, h.title.lower()))
117
- return headings
118
-
119
-
120
- ## URL/website TOC fetching intentionally removed; only manual text input is supported.
121
-
122
-
File without changes
File without changes