tocsmith 0.1.0__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tocsmith-0.1.0 → tocsmith-0.2.0}/PKG-INFO +33 -3
- {tocsmith-0.1.0 → tocsmith-0.2.0}/README.md +32 -2
- {tocsmith-0.1.0 → tocsmith-0.2.0}/pyproject.toml +1 -1
- {tocsmith-0.1.0 → tocsmith-0.2.0}/tocsmith/cli.py +48 -4
- tocsmith-0.2.0/tocsmith/core.py +223 -0
- {tocsmith-0.1.0 → tocsmith-0.2.0}/tocsmith/gui.py +34 -11
- {tocsmith-0.1.0 → tocsmith-0.2.0}/tocsmith/tests/test_core.py +79 -0
- {tocsmith-0.1.0 → tocsmith-0.2.0}/tocsmith.egg-info/PKG-INFO +33 -3
- tocsmith-0.1.0/tocsmith/core.py +0 -122
- {tocsmith-0.1.0 → tocsmith-0.2.0}/setup.cfg +0 -0
- {tocsmith-0.1.0 → tocsmith-0.2.0}/tocsmith/__init__.py +0 -0
- {tocsmith-0.1.0 → tocsmith-0.2.0}/tocsmith.egg-info/SOURCES.txt +0 -0
- {tocsmith-0.1.0 → tocsmith-0.2.0}/tocsmith.egg-info/dependency_links.txt +0 -0
- {tocsmith-0.1.0 → tocsmith-0.2.0}/tocsmith.egg-info/entry_points.txt +0 -0
- {tocsmith-0.1.0 → tocsmith-0.2.0}/tocsmith.egg-info/requires.txt +0 -0
- {tocsmith-0.1.0 → tocsmith-0.2.0}/tocsmith.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: tocsmith
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: Create PDF bookmarks automatically using heuristics, with CLI and async tkinter GUI
|
|
5
5
|
Author-email: Wesley Yang <yxnian@outlook.com>
|
|
6
6
|
Project-URL: Homepage, https://github.com/wesleyel/pdf-bookmark
|
|
@@ -25,8 +25,12 @@ Requires-Dist: mypy>=1.10.0; extra == "dev"
|
|
|
25
25
|
|
|
26
26
|
## 功能概览
|
|
27
27
|
- 手动粘贴目录文本(每行以书中页码结尾),自动解析标题、页码与层级(1..6)
|
|
28
|
+
- 支持两种层级解析方式:
|
|
29
|
+
- **按序号**:`1 标题 1` / `1.1 子标题 2` / `1.1.1 子子标题 3`,可通过 `keep_numbering` 控制序号是否写入书签标题
|
|
30
|
+
- **按缩进**:通过行首空格/Tab 缩进表示层级,标题不含序号
|
|
31
|
+
- 默认 `auto` 自动识别;也可通过 `--toc-mode` 或配置 `toc_mode` 显式指定
|
|
28
32
|
- 支持页码偏移(实际页码 - 书籍页码),用于扫描件/前置页差异
|
|
29
|
-
-
|
|
33
|
+
- 按序号模式下,默认保留编号前缀到标题中(如 `第1章`、`1.1`);设置 `keep_numbering = false` 或 `--no-keep-numbering` 可仅用于推断层级
|
|
30
34
|
- 支持行首星号标记:允许输入 `*1.1 Title` 或 `* 1.1 Title`,输出统一为 `*1.1 Title`
|
|
31
35
|
- 将条目以父子层级写入 PDF 书签
|
|
32
36
|
- 提供 CLI 与 GUI;亦可通过 Python API 使用
|
|
@@ -82,6 +86,10 @@ tocsmith --help
|
|
|
82
86
|
page_offset = 0
|
|
83
87
|
# global minimum length
|
|
84
88
|
min_len = 3
|
|
89
|
+
# TOC hierarchy mode: auto | numbering | indent
|
|
90
|
+
toc_mode = "auto"
|
|
91
|
+
# keep numbering prefix in bookmark titles (numbering mode only)
|
|
92
|
+
keep_numbering = true
|
|
85
93
|
|
|
86
94
|
# input folder
|
|
87
95
|
input_prefix = "input"
|
|
@@ -100,6 +108,26 @@ toc = """
|
|
|
100
108
|
"""
|
|
101
109
|
page_offset = 10
|
|
102
110
|
min_len = 2
|
|
111
|
+
toc_mode = "numbering"
|
|
112
|
+
keep_numbering = false
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
### 目录文本格式
|
|
116
|
+
|
|
117
|
+
**按序号**(`toc_mode = "numbering"` 或自动识别):
|
|
118
|
+
|
|
119
|
+
```
|
|
120
|
+
1 我是标题 1
|
|
121
|
+
1.1 我是子标题 2
|
|
122
|
+
1.1.1 我是子子标题 3
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
**按缩进**(`toc_mode = "indent"` 或自动识别):
|
|
126
|
+
|
|
127
|
+
```
|
|
128
|
+
我是标题 1
|
|
129
|
+
我是子标题 2
|
|
130
|
+
我是子子标题 3
|
|
103
131
|
```
|
|
104
132
|
|
|
105
133
|
运行:
|
|
@@ -109,7 +137,7 @@ tocsmith --config config.toml
|
|
|
109
137
|
```
|
|
110
138
|
|
|
111
139
|
说明:
|
|
112
|
-
- `defaults` 中的 `page_offset`、`min_len` 可被每个任务覆盖。
|
|
140
|
+
- `defaults` 中的 `page_offset`、`min_len`、`toc_mode`、`keep_numbering` 可被每个任务覆盖。
|
|
113
141
|
- `input_prefix` 用于解析任务中的 `input_file`;`output_prefix` 为输出目录根。
|
|
114
142
|
- 输出文件名为 `{stem}{output_suffix}`,其中 `stem` 来源于 `input_file`。
|
|
115
143
|
- 任务可直接内联 `toc` 文本;也兼容 `toc_file` 指定外部文件。
|
|
@@ -125,6 +153,8 @@ uv run python -m tocsmith.gui
|
|
|
125
153
|
- 选择输入 PDF
|
|
126
154
|
- 可选:修改输出路径
|
|
127
155
|
- 在 “TOC text” 中粘贴目录文本;在 “Page Offset” 填写偏移(实际 - 书籍)
|
|
156
|
+
- 选择 “TOC Mode”:`auto`(自动识别)、`numbering`(按序号)、`indent`(按缩进)
|
|
157
|
+
- 勾选 “Keep numbering” 控制按序号模式下是否保留标题中的序号(默认保留)
|
|
128
158
|
- 点击 “Parse TOC Text” 查看解析结果
|
|
129
159
|
- 点击 “Generate” 生成带书签的 PDF
|
|
130
160
|
|
|
@@ -8,8 +8,12 @@
|
|
|
8
8
|
|
|
9
9
|
## 功能概览
|
|
10
10
|
- 手动粘贴目录文本(每行以书中页码结尾),自动解析标题、页码与层级(1..6)
|
|
11
|
+
- 支持两种层级解析方式:
|
|
12
|
+
- **按序号**:`1 标题 1` / `1.1 子标题 2` / `1.1.1 子子标题 3`,可通过 `keep_numbering` 控制序号是否写入书签标题
|
|
13
|
+
- **按缩进**:通过行首空格/Tab 缩进表示层级,标题不含序号
|
|
14
|
+
- 默认 `auto` 自动识别;也可通过 `--toc-mode` 或配置 `toc_mode` 显式指定
|
|
11
15
|
- 支持页码偏移(实际页码 - 书籍页码),用于扫描件/前置页差异
|
|
12
|
-
-
|
|
16
|
+
- 按序号模式下,默认保留编号前缀到标题中(如 `第1章`、`1.1`);设置 `keep_numbering = false` 或 `--no-keep-numbering` 可仅用于推断层级
|
|
13
17
|
- 支持行首星号标记:允许输入 `*1.1 Title` 或 `* 1.1 Title`,输出统一为 `*1.1 Title`
|
|
14
18
|
- 将条目以父子层级写入 PDF 书签
|
|
15
19
|
- 提供 CLI 与 GUI;亦可通过 Python API 使用
|
|
@@ -65,6 +69,10 @@ tocsmith --help
|
|
|
65
69
|
page_offset = 0
|
|
66
70
|
# global minimum length
|
|
67
71
|
min_len = 3
|
|
72
|
+
# TOC hierarchy mode: auto | numbering | indent
|
|
73
|
+
toc_mode = "auto"
|
|
74
|
+
# keep numbering prefix in bookmark titles (numbering mode only)
|
|
75
|
+
keep_numbering = true
|
|
68
76
|
|
|
69
77
|
# input folder
|
|
70
78
|
input_prefix = "input"
|
|
@@ -83,6 +91,26 @@ toc = """
|
|
|
83
91
|
"""
|
|
84
92
|
page_offset = 10
|
|
85
93
|
min_len = 2
|
|
94
|
+
toc_mode = "numbering"
|
|
95
|
+
keep_numbering = false
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
### 目录文本格式
|
|
99
|
+
|
|
100
|
+
**按序号**(`toc_mode = "numbering"` 或自动识别):
|
|
101
|
+
|
|
102
|
+
```
|
|
103
|
+
1 我是标题 1
|
|
104
|
+
1.1 我是子标题 2
|
|
105
|
+
1.1.1 我是子子标题 3
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
**按缩进**(`toc_mode = "indent"` 或自动识别):
|
|
109
|
+
|
|
110
|
+
```
|
|
111
|
+
我是标题 1
|
|
112
|
+
我是子标题 2
|
|
113
|
+
我是子子标题 3
|
|
86
114
|
```
|
|
87
115
|
|
|
88
116
|
运行:
|
|
@@ -92,7 +120,7 @@ tocsmith --config config.toml
|
|
|
92
120
|
```
|
|
93
121
|
|
|
94
122
|
说明:
|
|
95
|
-
- `defaults` 中的 `page_offset`、`min_len` 可被每个任务覆盖。
|
|
123
|
+
- `defaults` 中的 `page_offset`、`min_len`、`toc_mode`、`keep_numbering` 可被每个任务覆盖。
|
|
96
124
|
- `input_prefix` 用于解析任务中的 `input_file`;`output_prefix` 为输出目录根。
|
|
97
125
|
- 输出文件名为 `{stem}{output_suffix}`,其中 `stem` 来源于 `input_file`。
|
|
98
126
|
- 任务可直接内联 `toc` 文本;也兼容 `toc_file` 指定外部文件。
|
|
@@ -108,6 +136,8 @@ uv run python -m tocsmith.gui
|
|
|
108
136
|
- 选择输入 PDF
|
|
109
137
|
- 可选:修改输出路径
|
|
110
138
|
- 在 “TOC text” 中粘贴目录文本;在 “Page Offset” 填写偏移(实际 - 书籍)
|
|
139
|
+
- 选择 “TOC Mode”:`auto`(自动识别)、`numbering`(按序号)、`indent`(按缩进)
|
|
140
|
+
- 勾选 “Keep numbering” 控制按序号模式下是否保留标题中的序号(默认保留)
|
|
111
141
|
- 点击 “Parse TOC Text” 查看解析结果
|
|
112
142
|
- 点击 “Generate” 生成带书签的 PDF
|
|
113
143
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "tocsmith"
|
|
3
|
-
version = "0.1.0"
|
|
3
|
+
version = "0.2.0"
|
|
4
4
|
description = "Create PDF bookmarks automatically using heuristics, with CLI and async tkinter GUI"
|
|
5
5
|
authors = [{ name = "Wesley Yang", email = "yxnian@outlook.com" }]
|
|
6
6
|
urls = { Homepage = "https://github.com/wesleyel/pdf-bookmark", Source = "https://github.com/wesleyel/pdf-bookmark" }
|
|
@@ -5,7 +5,7 @@ from pathlib import Path
|
|
|
5
5
|
from typing import Any, Dict, List, Optional
|
|
6
6
|
import sys
|
|
7
7
|
|
|
8
|
-
from .core import generate_bookmarks, parse_toc_lines
|
|
8
|
+
from .core import TocMode, generate_bookmarks, parse_toc_lines
|
|
9
9
|
|
|
10
10
|
try: # Python 3.11+
|
|
11
11
|
import tomllib # type: ignore[attr-defined]
|
|
@@ -23,6 +23,18 @@ def parse_args(argv: List[str] | None = None) -> argparse.Namespace:
|
|
|
23
23
|
p.add_argument("--min-len", type=int, default=3, help="Minimum heading text length")
|
|
24
24
|
p.add_argument("--page-offset", type=int, default=0, help="Page offset: actual - book page")
|
|
25
25
|
p.add_argument("--toc-file", help="Path to a text file containing TOC lines")
|
|
26
|
+
p.add_argument(
|
|
27
|
+
"--toc-mode",
|
|
28
|
+
choices=["auto", "numbering", "indent"],
|
|
29
|
+
default="auto",
|
|
30
|
+
help="TOC hierarchy mode: numbering (1/1.1), indent (spaces), or auto-detect",
|
|
31
|
+
)
|
|
32
|
+
p.add_argument(
|
|
33
|
+
"--keep-numbering",
|
|
34
|
+
action=argparse.BooleanOptionalAction,
|
|
35
|
+
default=True,
|
|
36
|
+
help="Keep numbering prefix in bookmark titles (numbering mode only)",
|
|
37
|
+
)
|
|
26
38
|
p.add_argument(
|
|
27
39
|
"-c",
|
|
28
40
|
"--config",
|
|
@@ -46,6 +58,8 @@ def _run_single(
|
|
|
46
58
|
page_offset: int,
|
|
47
59
|
min_len: int,
|
|
48
60
|
toc_text: Optional[str] = None,
|
|
61
|
+
toc_mode: TocMode = "auto",
|
|
62
|
+
keep_numbering: bool = True,
|
|
49
63
|
) -> int:
|
|
50
64
|
"""Run a single task and return process exit code."""
|
|
51
65
|
if not src.exists():
|
|
@@ -55,10 +69,22 @@ def _run_single(
|
|
|
55
69
|
|
|
56
70
|
headings = []
|
|
57
71
|
if toc_text is not None and toc_text.strip():
|
|
58
|
-
headings = parse_toc_lines(
|
|
72
|
+
headings = parse_toc_lines(
|
|
73
|
+
toc_text,
|
|
74
|
+
page_offset=page_offset,
|
|
75
|
+
min_len=min_len,
|
|
76
|
+
mode=toc_mode,
|
|
77
|
+
keep_numbering=keep_numbering,
|
|
78
|
+
)
|
|
59
79
|
elif toc_file:
|
|
60
80
|
file_text = Path(toc_file).read_text(encoding="utf-8")
|
|
61
|
-
headings = parse_toc_lines(
|
|
81
|
+
headings = parse_toc_lines(
|
|
82
|
+
file_text,
|
|
83
|
+
page_offset=page_offset,
|
|
84
|
+
min_len=min_len,
|
|
85
|
+
mode=toc_mode,
|
|
86
|
+
keep_numbering=keep_numbering,
|
|
87
|
+
)
|
|
62
88
|
else:
|
|
63
89
|
print("No TOC source provided (use --toc-file). Producing a copy without outline.")
|
|
64
90
|
headings = []
|
|
@@ -86,6 +112,8 @@ def _run_batch(config_path: Path) -> int:
|
|
|
86
112
|
# Alternatively: toc_file = "toc.txt"
|
|
87
113
|
page_offset = 10 # optional overrides default
|
|
88
114
|
min_len = 2 # optional overrides default
|
|
115
|
+
toc_mode = "auto" # optional: auto | numbering | indent
|
|
116
|
+
keep_numbering = true # optional: keep numbering in bookmark titles
|
|
89
117
|
'''
|
|
90
118
|
if tomllib is None:
|
|
91
119
|
print("Error: TOML support not available. Please install 'tomli' for Python < 3.11.")
|
|
@@ -107,6 +135,11 @@ def _run_batch(config_path: Path) -> int:
|
|
|
107
135
|
|
|
108
136
|
default_page_offset = int(defaults.get("page_offset", 0) or 0)
|
|
109
137
|
default_min_len = int(defaults.get("min_len", 3) or 3)
|
|
138
|
+
default_toc_mode = str(defaults.get("toc_mode", "auto") or "auto").strip() or "auto"
|
|
139
|
+
if default_toc_mode not in ("auto", "numbering", "indent"):
|
|
140
|
+
print(f"Invalid defaults.toc_mode: {default_toc_mode!r}")
|
|
141
|
+
return 2
|
|
142
|
+
default_keep_numbering = bool(defaults.get("keep_numbering", True))
|
|
110
143
|
input_prefix = str(defaults.get("input_prefix", "")).strip() or ""
|
|
111
144
|
output_prefix = str(defaults.get("output_prefix", "")).strip() or ""
|
|
112
145
|
output_suffix = (
|
|
@@ -139,11 +172,18 @@ def _run_batch(config_path: Path) -> int:
|
|
|
139
172
|
toc_file = _resolve_relative(base_dir, t.get("toc_file"))
|
|
140
173
|
page_offset = int(t.get("page_offset", default_page_offset) or default_page_offset)
|
|
141
174
|
min_len = int(t.get("min_len", default_min_len) or default_min_len)
|
|
175
|
+
toc_mode = str(t.get("toc_mode", default_toc_mode) or default_toc_mode).strip() or "auto"
|
|
176
|
+
if toc_mode not in ("auto", "numbering", "indent"):
|
|
177
|
+
print(f"[Task {idx}] Skipped: invalid toc_mode {toc_mode!r}")
|
|
178
|
+
failures += 1
|
|
179
|
+
continue
|
|
180
|
+
keep_numbering = bool(t.get("keep_numbering", default_keep_numbering))
|
|
142
181
|
|
|
143
182
|
print(
|
|
144
183
|
f"[Task {idx}] Running: src={src} out={out} "
|
|
145
184
|
f"toc={'inline' if (toc_inline and toc_inline.strip()) else (toc_file or '<none>')} "
|
|
146
|
-
f"offset={page_offset} min_len={min_len}"
|
|
185
|
+
f"offset={page_offset} min_len={min_len} toc_mode={toc_mode} "
|
|
186
|
+
f"keep_numbering={keep_numbering}"
|
|
147
187
|
)
|
|
148
188
|
try:
|
|
149
189
|
# Ensure output directory exists
|
|
@@ -155,6 +195,8 @@ def _run_batch(config_path: Path) -> int:
|
|
|
155
195
|
page_offset=page_offset,
|
|
156
196
|
min_len=min_len,
|
|
157
197
|
toc_text=toc_inline,
|
|
198
|
+
toc_mode=toc_mode, # type: ignore[arg-type]
|
|
199
|
+
keep_numbering=keep_numbering,
|
|
158
200
|
)
|
|
159
201
|
if code != 0:
|
|
160
202
|
failures += 1
|
|
@@ -186,6 +228,8 @@ def main(argv: List[str] | None = None) -> int:
|
|
|
186
228
|
toc_file=Path(ns.toc_file) if ns.toc_file else None,
|
|
187
229
|
page_offset=ns.page_offset,
|
|
188
230
|
min_len=ns.min_len,
|
|
231
|
+
toc_mode=ns.toc_mode,
|
|
232
|
+
keep_numbering=ns.keep_numbering,
|
|
189
233
|
)
|
|
190
234
|
|
|
191
235
|
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
import re
|
|
5
|
+
from functools import reduce
|
|
6
|
+
from math import gcd
|
|
7
|
+
from typing import Iterable, List, Literal, Tuple, Optional
|
|
8
|
+
|
|
9
|
+
TocMode = Literal["numbering", "indent", "auto"]
|
|
10
|
+
|
|
11
|
+
from pypdf import PdfReader, PdfWriter
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class Heading:
    """One bookmark entry: the text to show, the page it points at, and its depth."""

    # Text displayed for the bookmark.
    title: str
    # Target page number, counted from 1.
    page: int
    # Nesting depth; the parsers in this module emit values in 1..6.
    level: int
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def generate_bookmarks(src_pdf: str, out_pdf: str, headings: Iterable[Heading]) -> None:
    """Copy ``src_pdf`` to ``out_pdf`` and attach ``headings`` as a nested outline.

    Every page of the source is added to a fresh writer. Each heading is then
    attached beneath the closest preceding heading whose level is strictly
    smaller; when there is none it becomes a top-level outline item.

    Args:
        src_pdf: Path of the PDF to read.
        out_pdf: Path the bookmarked PDF is written to.
        headings: Entries to add, consumed in iteration order.
    """
    reader = PdfReader(src_pdf)
    writer = PdfWriter()
    for source_page in reader.pages:
        writer.add_page(source_page)

    # Highest valid 0-based page index; heading pages are clamped into range.
    last_index = len(reader.pages) - 1

    # Chain of currently open ancestors as (level, outline item reference).
    ancestors: List[Tuple[int, object]] = []

    for heading in headings:
        target_index = max(0, min(last_index, heading.page - 1))
        # Close every open entry that is not shallower than this heading.
        while ancestors and ancestors[-1][0] >= heading.level:
            ancestors.pop()
        parent_ref = ancestors[-1][1] if ancestors else None
        item_ref = writer.add_outline_item(heading.title, target_index, parent=parent_ref)
        ancestors.append((heading.level, item_ref))

    with open(out_pdf, "wb") as sink:
        writer.write(sink)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# -------------------- TOC parsing utilities --------------------
|
|
44
|
+
|
|
45
|
+
_NUM_PREFIX_RE = re.compile(
|
|
46
|
+
r"^\s*(?P<num>(第\s*\d+[一二三四五六七八九十百千]*[章节节部分编]?)|((\d+\.)+\d+)|\d+)?\s*"
|
|
47
|
+
)
|
|
48
|
+
_TRAILING_PAGE_RE = re.compile(r"(?P<page>\d{1,5})\s*$")
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _infer_level_from_numbering(num: Optional[str]) -> int:
|
|
52
|
+
if not num:
|
|
53
|
+
return 1
|
|
54
|
+
num = num.strip()
|
|
55
|
+
if num.startswith("第"):
|
|
56
|
+
# "第1章" style => top-level
|
|
57
|
+
return 1
|
|
58
|
+
if "." in num:
|
|
59
|
+
# "1.2.3" => level = segments + 1 (so 1.2 is level 2)
|
|
60
|
+
return min(6, max(1, num.count(".") + 1))
|
|
61
|
+
# Simple leading integer like "1" => level 1
|
|
62
|
+
return 1
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _leading_indent_width(raw_line: str) -> int:
|
|
66
|
+
width = 0
|
|
67
|
+
for ch in raw_line:
|
|
68
|
+
if ch == " ":
|
|
69
|
+
width += 1
|
|
70
|
+
elif ch == "\t":
|
|
71
|
+
width += 4
|
|
72
|
+
else:
|
|
73
|
+
break
|
|
74
|
+
return width
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _detect_indent_unit(indents: Iterable[int]) -> int:
|
|
78
|
+
non_zero = sorted({i for i in indents if i > 0})
|
|
79
|
+
if not non_zero:
|
|
80
|
+
return 4
|
|
81
|
+
unit = non_zero[0]
|
|
82
|
+
if all(i % unit == 0 for i in indents):
|
|
83
|
+
return max(1, unit)
|
|
84
|
+
return max(1, reduce(gcd, non_zero))
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _infer_level_from_indent(indent: int, unit: int) -> int:
|
|
88
|
+
if indent <= 0:
|
|
89
|
+
return 1
|
|
90
|
+
return min(6, max(1, indent // unit + 1))
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _strip_star_prefix(line: str) -> Tuple[str, str]:
|
|
94
|
+
star_prefix = ""
|
|
95
|
+
m_star = re.match(r"^\*+\s*", line)
|
|
96
|
+
if m_star:
|
|
97
|
+
star_prefix = "*" * m_star.group(0).count("*")
|
|
98
|
+
line = line[m_star.end() :].lstrip()
|
|
99
|
+
return star_prefix, line
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _detect_toc_mode(toc_text: str, min_len: int = 1) -> TocMode:
    """Decide whether the TOC expresses hierarchy by numbering or by indentation.

    Only lines that are at least ``min_len`` characters long after stripping
    and that end in a page number are considered. A line with a numbering
    prefix counts toward "numbering"; otherwise an indented line counts
    toward "indent". "indent" is returned only when it has strictly more
    votes, so ties and empty input resolve to "numbering".
    """
    numbered = 0
    indented = 0
    for raw_line in toc_text.splitlines():
        if len(raw_line.strip()) < min_len:
            continue
        _, body = _strip_star_prefix(raw_line.lstrip())
        page_match = _TRAILING_PAGE_RE.search(body)
        if page_match is None:
            continue
        prefix = _NUM_PREFIX_RE.match(body[: page_match.start()].rstrip())
        if prefix and prefix.group("num"):
            numbered += 1
        elif _leading_indent_width(raw_line) > 0:
            indented += 1
    if indented > numbered:
        return "indent"
    return "numbering"
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _parse_toc_lines_numbering(
    toc_text: str,
    page_offset: int = 0,
    min_len: int = 1,
    keep_numbering: bool = True,
) -> List[Heading]:
    """Parse TOC text whose hierarchy is expressed by leading numbering.

    Args:
        toc_text: Pasted TOC, one entry per line, each ending in a page number.
        page_offset: Added to every parsed page number; results below 1 are
            raised to 1.
        min_len: Lines shorter than this (after stripping) are ignored.
        keep_numbering: When True the numbering prefix stays in the title.
            When False it is only used to infer the level, unless the line has
            no other title text, in which case the numbering is kept so the
            bookmark is not blank.

    Returns:
        Headings ordered by page. Entries that share a page keep their input
        order.
    """
    headings: List[Heading] = []
    for raw_line in toc_text.splitlines():
        line = raw_line.strip()
        if len(line) < min_len:
            continue
        star_prefix, line = _strip_star_prefix(line)

        page_m = _TRAILING_PAGE_RE.search(line)
        if not page_m:
            # No trailing page number: not a TOC entry.
            continue
        page_num = int(page_m.group("page"))
        line_wo_page = line[: page_m.start()].rstrip()

        num_m = _NUM_PREFIX_RE.match(line_wo_page)
        numbering = None
        title_part = line_wo_page
        if num_m:
            numbering = num_m.group("num")
            title_part = line_wo_page[num_m.end() :].strip()

        # Keep the numbering when requested, and also when it is the only text
        # on the line: the former fallback could not produce a non-empty title
        # with keep_numbering=False, so such lines became blank bookmarks.
        if numbering and (keep_numbering or not title_part):
            combined = f"{numbering.strip()} {title_part}".strip()
        else:
            combined = title_part
        title = re.sub(r"\s+", " ", combined)
        if star_prefix:
            # Stars are re-attached directly, with no space before the title.
            title = f"{star_prefix}{title}".strip()

        level = _infer_level_from_numbering(numbering)
        pdf_page = max(1, page_num + page_offset)
        headings.append(Heading(title=title, page=pdf_page, level=level))

    # Stable sort on the page alone. Also sorting by level and title reordered
    # entries sharing a page (e.g. "1.10" before "1.2" as text, or all level-1
    # entries ahead of every child), which attaches children to the wrong
    # parent when the outline is built in list order.
    headings.sort(key=lambda h: h.page)
    return headings
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _parse_toc_lines_indent(toc_text: str, page_offset: int = 0, min_len: int = 1) -> List[Heading]:
    """Parse TOC text whose hierarchy is expressed by leading spaces/tabs.

    Args:
        toc_text: Pasted TOC, one entry per line, each ending in a page number.
        page_offset: Added to every parsed page number; results below 1 are
            raised to 1.
        min_len: Lines shorter than this (after stripping) are ignored.

    Returns:
        Headings ordered by page. Entries that share a page keep their input
        order.
    """
    # First pass: collect (indent, title, page) for every usable line. Levels
    # can only be assigned once the indentation unit of the whole text is known.
    lines_data: List[Tuple[int, str, int]] = []
    for raw_line in toc_text.splitlines():
        if len(raw_line.strip()) < min_len:
            continue
        indent = _leading_indent_width(raw_line)
        star_prefix, line = _strip_star_prefix(raw_line.lstrip())

        page_m = _TRAILING_PAGE_RE.search(line)
        if not page_m:
            # No trailing page number: not a TOC entry.
            continue
        page_num = int(page_m.group("page"))
        title = re.sub(r"\s+", " ", line[: page_m.start()].rstrip())
        if star_prefix:
            title = f"{star_prefix}{title}".strip()
        lines_data.append((indent, title, page_num))

    unit = _detect_indent_unit([indent for indent, _, _ in lines_data])

    # Second pass: turn indentation widths into levels.
    headings: List[Heading] = [
        Heading(
            title=title,
            page=max(1, page_num + page_offset),
            level=_infer_level_from_indent(indent, unit),
        )
        for indent, title, page_num in lines_data
    ]

    # Stable sort on the page alone. In indent mode the input order is the only
    # thing tying a child to its parent; also sorting by level and title moved
    # same-page children behind every same-page parent and re-parented them.
    headings.sort(key=lambda h: h.page)
    return headings
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def parse_toc_lines(
    toc_text: str,
    page_offset: int = 0,
    min_len: int = 1,
    mode: TocMode = "auto",
    keep_numbering: bool = True,
) -> List[Heading]:
    """
    Turn pasted TOC text into a list of Heading entries.

    - Every line is expected to end with the book page number (digits).
    - mode="numbering": levels come from leading numbers such as "1", "1.1", "第1章".
    - mode="indent": levels come from leading spaces/tabs.
    - mode="auto": the mode is detected from the text.
    - keep_numbering: when True (default) the numbering prefix stays in the
      bookmark title; it is only passed to the numbering parser.
    - page_offset is added to each parsed page number to obtain the PDF page.
    """
    effective_mode = mode
    if effective_mode == "auto":
        effective_mode = _detect_toc_mode(toc_text, min_len)

    # Anything other than "indent" is handled by the numbering parser.
    if effective_mode != "indent":
        return _parse_toc_lines_numbering(
            toc_text,
            page_offset=page_offset,
            min_len=min_len,
            keep_numbering=keep_numbering,
        )
    return _parse_toc_lines_indent(toc_text, page_offset=page_offset, min_len=min_len)
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
## URL/website TOC fetching intentionally removed; only manual text input is supported.
|
|
222
|
+
|
|
223
|
+
|
|
@@ -68,6 +68,24 @@ class App:
|
|
|
68
68
|
self.offset_entry = ttk.Entry(ctrl, textvariable=self.offset_var, width=6)
|
|
69
69
|
self.offset_entry.pack(side=tk.LEFT, padx=(4, 12))
|
|
70
70
|
|
|
71
|
+
ttk.Label(ctrl, text="TOC Mode:").pack(side=tk.LEFT)
|
|
72
|
+
self.toc_mode_var = tk.StringVar(value="auto")
|
|
73
|
+
self.toc_mode_combo = ttk.Combobox(
|
|
74
|
+
ctrl,
|
|
75
|
+
textvariable=self.toc_mode_var,
|
|
76
|
+
values=["auto", "numbering", "indent"],
|
|
77
|
+
state="readonly",
|
|
78
|
+
width=10,
|
|
79
|
+
)
|
|
80
|
+
self.toc_mode_combo.pack(side=tk.LEFT, padx=(4, 12))
|
|
81
|
+
|
|
82
|
+
self.keep_numbering_var = tk.BooleanVar(value=True)
|
|
83
|
+
ttk.Checkbutton(
|
|
84
|
+
ctrl,
|
|
85
|
+
text="Keep numbering",
|
|
86
|
+
variable=self.keep_numbering_var,
|
|
87
|
+
).pack(side=tk.LEFT)
|
|
88
|
+
|
|
71
89
|
# TOC input
|
|
72
90
|
toc_row = ttk.Frame(frm)
|
|
73
91
|
toc_row.pack(fill=tk.BOTH, expand=True)
|
|
@@ -162,6 +180,20 @@ class App:
|
|
|
162
180
|
|
|
163
181
|
# Auto analysis removed
|
|
164
182
|
|
|
183
|
+
def _get_parse_kwargs(self) -> dict:
|
|
184
|
+
try:
|
|
185
|
+
offset = int(self.offset_var.get() or 0)
|
|
186
|
+
except ValueError:
|
|
187
|
+
offset = 0
|
|
188
|
+
mode = self.toc_mode_var.get() or "auto"
|
|
189
|
+
if mode not in ("auto", "numbering", "indent"):
|
|
190
|
+
mode = "auto"
|
|
191
|
+
return {
|
|
192
|
+
"page_offset": offset,
|
|
193
|
+
"mode": mode,
|
|
194
|
+
"keep_numbering": self.keep_numbering_var.get(),
|
|
195
|
+
}
|
|
196
|
+
|
|
165
197
|
def _on_generate(self) -> None:
|
|
166
198
|
if not self.in_var.get():
|
|
167
199
|
messagebox.showwarning("Missing", "Please choose an input PDF")
|
|
@@ -176,11 +208,7 @@ class App:
|
|
|
176
208
|
text = self.toc_text.get("1.0", tk.END).strip()
|
|
177
209
|
hs = []
|
|
178
210
|
if text:
|
|
179
|
-
|
|
180
|
-
offset = int(self.offset_var.get() or 0)
|
|
181
|
-
except ValueError:
|
|
182
|
-
offset = 0
|
|
183
|
-
hs = await run_in_thread(parse_toc_lines, text, offset)
|
|
211
|
+
hs = await run_in_thread(parse_toc_lines, text, **self._get_parse_kwargs())
|
|
184
212
|
else:
|
|
185
213
|
hs = []
|
|
186
214
|
await run_in_thread(generate_bookmarks, self.in_var.get(), self.out_var.get(), hs)
|
|
@@ -194,14 +222,9 @@ class App:
|
|
|
194
222
|
if not text:
|
|
195
223
|
messagebox.showwarning("Empty", "Please paste TOC text or URL first")
|
|
196
224
|
return
|
|
197
|
-
try:
|
|
198
|
-
offset = int(self.offset_var.get() or 0)
|
|
199
|
-
except ValueError:
|
|
200
|
-
offset = 0
|
|
201
|
-
|
|
202
225
|
async def task():
|
|
203
226
|
self._set_status("Parsing TOC…")
|
|
204
|
-
hs = await run_in_thread(parse_toc_lines, text,
|
|
227
|
+
hs = await run_in_thread(parse_toc_lines, text, **self._get_parse_kwargs())
|
|
205
228
|
self._populate_tree(hs)
|
|
206
229
|
self._set_status(f"Parsed {len(hs)} entries")
|
|
207
230
|
|
|
@@ -163,3 +163,82 @@ def test_parse_toc_lines_preserve_numbering_with_asterisk():
|
|
|
163
163
|
assert any(t.startswith("*2 ") and "星标章节" in t for t in titles)
|
|
164
164
|
|
|
165
165
|
|
|
166
|
+
def test_parse_toc_lines_numbering_mode_example():
    """Numbering mode keeps the prefix and maps "1" / "1.1" / "1.1.1" to levels 1-3."""
    toc = "1 我是标题 1\n1.1 我是子标题 2\n1.1.1 我是子子标题 3"
    hs = parse_toc_lines(toc, page_offset=0, mode="numbering")
    assert len(hs) == 3
    assert [(h.title, h.level) for h in hs] == [
        ("1 我是标题", 1),
        ("1.1 我是子标题", 2),
        ("1.1.1 我是子子标题", 3),
    ]
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def test_parse_toc_lines_indent_mode_example():
    """Indent mode maps increasing indentation to levels 1-3 and keeps titles as-is."""
    # NOTE(review): in the rendered diff every indented fixture line shows a
    # single leading space, which cannot produce the asserted levels 1/2/3.
    # Two- and four-space indents are used here; confirm against the released
    # test file.
    toc = "\n".join([
        "我是标题 1",
        "  我是子标题 2",
        "    我是子子标题 3",
    ])
    hs = parse_toc_lines(toc, page_offset=0, mode="indent")
    assert len(hs) == 3
    assert hs[0].title == "我是标题"
    assert hs[0].level == 1
    assert hs[1].title == "我是子标题"
    assert hs[1].level == 2
    assert hs[2].title == "我是子子标题"
    assert hs[2].level == 3
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def test_parse_toc_lines_auto_detect_indent():
    """Auto mode picks indentation when lines are indented and carry no numbering."""
    # NOTE(review): in the rendered diff every indented fixture line shows a
    # single leading space, which cannot produce the asserted levels [1, 2, 3].
    # Two- and four-space indents are used here; confirm against the released
    # test file.
    toc = "\n".join([
        "Chapter A 1",
        "  Section B 2",
        "    Subsection C 3",
    ])
    hs = parse_toc_lines(toc, page_offset=0, mode="auto")
    assert [h.level for h in hs] == [1, 2, 3]
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def test_parse_toc_lines_strip_numbering_when_disabled():
    """With keep_numbering=False the prefix is dropped from titles but still sets levels."""
    toc = "第1章 计算机系统概述 1\n1.1 操作系统的基本概念 2\n2 其他章节 10"
    hs = parse_toc_lines(toc, page_offset=0, keep_numbering=False)
    titles = [h.title for h in hs]
    for expected in ("计算机系统概述", "操作系统的基本概念", "其他章节"):
        assert expected in titles
    assert [h.level for h in hs] == [1, 2, 1]
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def test_parse_toc_lines_strip_numbering_with_asterisk():
    """The star marker survives when the numbering prefix is stripped."""
    toc = "*1.1 星标小节 12\n* 2 星标章节 13"
    hs = parse_toc_lines(toc, page_offset=0, keep_numbering=False)
    titles = [h.title for h in hs]
    assert "*星标小节" in titles
    assert "*星标章节" in titles
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def test_parse_toc_lines_auto_detect_numbering():
    """Auto mode picks numbering for numbered lines and keeps the prefix in titles."""
    toc = "第1章 基础 1\n1.1 小节 2"
    hs = parse_toc_lines(toc, page_offset=0, mode="auto")
    chapter, section = hs[0], hs[1]
    assert chapter.level == 1
    assert section.level == 2
    assert chapter.title.startswith("第1章")
    assert section.title.startswith("1.1")
|
|
243
|
+
|
|
244
|
+
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: tocsmith
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: Create PDF bookmarks automatically using heuristics, with CLI and async tkinter GUI
|
|
5
5
|
Author-email: Wesley Yang <yxnian@outlook.com>
|
|
6
6
|
Project-URL: Homepage, https://github.com/wesleyel/pdf-bookmark
|
|
@@ -25,8 +25,12 @@ Requires-Dist: mypy>=1.10.0; extra == "dev"
|
|
|
25
25
|
|
|
26
26
|
## 功能概览
|
|
27
27
|
- 手动粘贴目录文本(每行以书中页码结尾),自动解析标题、页码与层级(1..6)
|
|
28
|
+
- 支持两种层级解析方式:
|
|
29
|
+
- **按序号**:`1 标题 1` / `1.1 子标题 2` / `1.1.1 子子标题 3`,可通过 `keep_numbering` 控制序号是否写入书签标题
|
|
30
|
+
- **按缩进**:通过行首空格/Tab 缩进表示层级,标题不含序号
|
|
31
|
+
- 默认 `auto` 自动识别;也可通过 `--toc-mode` 或配置 `toc_mode` 显式指定
|
|
28
32
|
- 支持页码偏移(实际页码 - 书籍页码),用于扫描件/前置页差异
|
|
29
|
-
-
|
|
33
|
+
- 按序号模式下,默认保留编号前缀到标题中(如 `第1章`、`1.1`);设置 `keep_numbering = false` 或 `--no-keep-numbering` 可仅用于推断层级
|
|
30
34
|
- 支持行首星号标记:允许输入 `*1.1 Title` 或 `* 1.1 Title`,输出统一为 `*1.1 Title`
|
|
31
35
|
- 将条目以父子层级写入 PDF 书签
|
|
32
36
|
- 提供 CLI 与 GUI;亦可通过 Python API 使用
|
|
@@ -82,6 +86,10 @@ tocsmith --help
|
|
|
82
86
|
page_offset = 0
|
|
83
87
|
# global minimum length
|
|
84
88
|
min_len = 3
|
|
89
|
+
# TOC hierarchy mode: auto | numbering | indent
|
|
90
|
+
toc_mode = "auto"
|
|
91
|
+
# keep numbering prefix in bookmark titles (numbering mode only)
|
|
92
|
+
keep_numbering = true
|
|
85
93
|
|
|
86
94
|
# input folder
|
|
87
95
|
input_prefix = "input"
|
|
@@ -100,6 +108,26 @@ toc = """
|
|
|
100
108
|
"""
|
|
101
109
|
page_offset = 10
|
|
102
110
|
min_len = 2
|
|
111
|
+
toc_mode = "numbering"
|
|
112
|
+
keep_numbering = false
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
### 目录文本格式
|
|
116
|
+
|
|
117
|
+
**按序号**(`toc_mode = "numbering"` 或自动识别):
|
|
118
|
+
|
|
119
|
+
```
|
|
120
|
+
1 我是标题 1
|
|
121
|
+
1.1 我是子标题 2
|
|
122
|
+
1.1.1 我是子子标题 3
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
**按缩进**(`toc_mode = "indent"` 或自动识别):
|
|
126
|
+
|
|
127
|
+
```
|
|
128
|
+
我是标题 1
|
|
129
|
+
我是子标题 2
|
|
130
|
+
我是子子标题 3
|
|
103
131
|
```
|
|
104
132
|
|
|
105
133
|
运行:
|
|
@@ -109,7 +137,7 @@ tocsmith --config config.toml
|
|
|
109
137
|
```
|
|
110
138
|
|
|
111
139
|
说明:
|
|
112
|
-
- `defaults` 中的 `page_offset`、`min_len` 可被每个任务覆盖。
|
|
140
|
+
- `defaults` 中的 `page_offset`、`min_len`、`toc_mode`、`keep_numbering` 可被每个任务覆盖。
|
|
113
141
|
- `input_prefix` 用于解析任务中的 `input_file`;`output_prefix` 为输出目录根。
|
|
114
142
|
- 输出文件名为 `{stem}{output_suffix}`,其中 `stem` 来源于 `input_file`。
|
|
115
143
|
- 任务可直接内联 `toc` 文本;也兼容 `toc_file` 指定外部文件。
|
|
@@ -125,6 +153,8 @@ uv run python -m tocsmith.gui
|
|
|
125
153
|
- 选择输入 PDF
|
|
126
154
|
- 可选:修改输出路径
|
|
127
155
|
- 在 “TOC text” 中粘贴目录文本;在 “Page Offset” 填写偏移(实际 - 书籍)
|
|
156
|
+
- 选择 “TOC Mode”:`auto`(自动识别)、`numbering`(按序号)、`indent`(按缩进)
|
|
157
|
+
- 勾选 “Keep numbering” 控制按序号模式下是否保留标题中的序号(默认保留)
|
|
128
158
|
- 点击 “Parse TOC Text” 查看解析结果
|
|
129
159
|
- 点击 “Generate” 生成带书签的 PDF
|
|
130
160
|
|
tocsmith-0.1.0/tocsmith/core.py
DELETED
|
@@ -1,122 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from dataclasses import dataclass
|
|
4
|
-
import re
|
|
5
|
-
from typing import Iterable, List, Tuple, Optional
|
|
6
|
-
|
|
7
|
-
from pypdf import PdfReader, PdfWriter
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
@dataclass
|
|
11
|
-
class Heading:
|
|
12
|
-
title: str
|
|
13
|
-
page: int # 1-based
|
|
14
|
-
level: int # 1..6
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
def generate_bookmarks(src_pdf: str, out_pdf: str, headings: Iterable[Heading]) -> None:
|
|
18
|
-
"""Write given headings into a new PDF file as outline/bookmarks."""
|
|
19
|
-
reader = PdfReader(src_pdf)
|
|
20
|
-
writer = PdfWriter()
|
|
21
|
-
for page in reader.pages:
|
|
22
|
-
writer.add_page(page)
|
|
23
|
-
|
|
24
|
-
# Build hierarchical outlines using a simple stack by levels
|
|
25
|
-
stack: List[Tuple[int, object]] = [] # (level, parent_ref)
|
|
26
|
-
|
|
27
|
-
for h in headings:
|
|
28
|
-
page_index = max(0, min(len(reader.pages) - 1, h.page - 1))
|
|
29
|
-
while stack and stack[-1][0] >= h.level:
|
|
30
|
-
stack.pop()
|
|
31
|
-
parent = stack[-1][1] if stack else None
|
|
32
|
-
dest = writer.add_outline_item(h.title, page_index, parent=parent)
|
|
33
|
-
stack.append((h.level, dest))
|
|
34
|
-
|
|
35
|
-
with open(out_pdf, "wb") as f:
|
|
36
|
-
writer.write(f)
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
# -------------------- TOC parsing utilities --------------------
|
|
40
|
-
|
|
41
|
-
_NUM_PREFIX_RE = re.compile(
|
|
42
|
-
r"^\s*(?P<num>(第\s*\d+[一二三四五六七八九十百千]*[章节节部分编]?)|((\d+\.)+\d+)|\d+)?\s*"
|
|
43
|
-
)
|
|
44
|
-
_TRAILING_PAGE_RE = re.compile(r"(?P<page>\d{1,5})\s*$")
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
def _infer_level_from_numbering(num: Optional[str]) -> int:
|
|
48
|
-
if not num:
|
|
49
|
-
return 1
|
|
50
|
-
num = num.strip()
|
|
51
|
-
if num.startswith("第"):
|
|
52
|
-
# "第1章" style => top-level
|
|
53
|
-
return 1
|
|
54
|
-
if "." in num:
|
|
55
|
-
# "1.2.3" => level = segments + 1 (so 1.2 is level 2)
|
|
56
|
-
return min(6, max(1, num.count(".") + 1))
|
|
57
|
-
# Simple leading integer like "1" => level 1
|
|
58
|
-
return 1
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
def parse_toc_lines(toc_text: str, page_offset: int = 0, min_len: int = 1) -> List[Heading]:
|
|
62
|
-
"""
|
|
63
|
-
Parse a pasted TOC text into Heading entries.
|
|
64
|
-
- Each line should end with the book page number (digits)
|
|
65
|
-
- Leading numbering like "第1章" or "1.2" is used to infer the level
|
|
66
|
-
- page_offset is added to the parsed page number to map to PDF actual pages
|
|
67
|
-
"""
|
|
68
|
-
headings: List[Heading] = []
|
|
69
|
-
for raw_line in toc_text.splitlines():
|
|
70
|
-
line = raw_line.strip()
|
|
71
|
-
if len(line) < min_len:
|
|
72
|
-
continue
|
|
73
|
-
# Detect and temporarily strip leading asterisk marker(s)
|
|
74
|
-
star_prefix = ""
|
|
75
|
-
m_star = re.match(r"^\*+\s*", line)
|
|
76
|
-
if m_star:
|
|
77
|
-
stars = m_star.group(0)
|
|
78
|
-
star_count = stars.count("*")
|
|
79
|
-
# Preserve star(s) without trailing space; spacing will be normalized later
|
|
80
|
-
star_prefix = ("*" * star_count)
|
|
81
|
-
line = line[m_star.end() :].lstrip()
|
|
82
|
-
|
|
83
|
-
# Extract trailing page digits
|
|
84
|
-
page_m = _TRAILING_PAGE_RE.search(line)
|
|
85
|
-
if not page_m:
|
|
86
|
-
continue
|
|
87
|
-
page_num = int(page_m.group("page"))
|
|
88
|
-
# Remove trailing page from the line
|
|
89
|
-
line_wo_page = line[: page_m.start()].rstrip()
|
|
90
|
-
# Extract leading numbering if exists
|
|
91
|
-
num_m = _NUM_PREFIX_RE.match(line_wo_page)
|
|
92
|
-
numbering = None
|
|
93
|
-
title_part = line_wo_page
|
|
94
|
-
if num_m:
|
|
95
|
-
numbering = num_m.group("num")
|
|
96
|
-
title_part = line_wo_page[num_m.end() :].strip()
|
|
97
|
-
# Build title while preserving numbering prefix (e.g., "第1章" or "1.1")
|
|
98
|
-
if numbering:
|
|
99
|
-
combined = f"{numbering.strip()} {title_part}".strip()
|
|
100
|
-
else:
|
|
101
|
-
combined = title_part
|
|
102
|
-
# Cleanup whitespace
|
|
103
|
-
title = re.sub(r"\s+", " ", combined)
|
|
104
|
-
if not title:
|
|
105
|
-
# fallback to raw without numbering
|
|
106
|
-
title = line_wo_page.strip()
|
|
107
|
-
# Restore asterisk prefix if present
|
|
108
|
-
if star_prefix:
|
|
109
|
-
# No space between star(s) and numbering/title
|
|
110
|
-
title = f"{star_prefix}{title}".strip()
|
|
111
|
-
level = _infer_level_from_numbering(numbering)
|
|
112
|
-
pdf_page = max(1, page_num + page_offset)
|
|
113
|
-
headings.append(Heading(title=title, page=pdf_page, level=level))
|
|
114
|
-
|
|
115
|
-
# Sort by page then by inferred level
|
|
116
|
-
headings.sort(key=lambda h: (h.page, h.level, h.title.lower()))
|
|
117
|
-
return headings
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
## URL/website TOC fetching intentionally removed; only manual text input is supported.
|
|
121
|
-
|
|
122
|
-
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|