pdf-auto-outline 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
+ __version__ = '0.1.0'
@@ -0,0 +1,4 @@
1
+ from .main import main
2
+
3
+ if __name__ == '__main__':
4
+ main()
@@ -0,0 +1,283 @@
1
+ import pymupdf.layout
2
+ from pymupdf import Point
3
+ from time import perf_counter
4
+ from multiprocessing import Pool
5
+ import os
6
+ import subprocess
7
+ import argparse
8
+
9
+ SIOYEK = None
10
+
11
def log(message, end='\n'):
    """Show a status message to the user.

    If the module-level ``SIOYEK`` handle is truthy the message is passed to
    its ``set_status_string`` method (``end`` is ignored in that case);
    otherwise it is printed to stdout terminated by ``end``.
    """
    if not SIOYEK:
        print(message, end=end)
        return
    SIOYEK.set_status_string(message)
16
+
17
def get_md5_hash(path):
    """Return the hexadecimal MD5 digest of the file at ``path``.

    The file is hashed in fixed-size chunks so that large PDFs are never
    loaded into memory in one piece.
    """
    import hashlib

    digest = hashlib.md5()
    with open(path, 'rb') as f:
        # 1 MiB per read keeps memory use flat regardless of file size.
        while chunk := f.read(1 << 20):
            digest.update(chunk)
    return digest.hexdigest()
23
+
24
def sioyek_transfer_annots(shared_db_path, from_hash, to_hash):
    """Re-point highlight rows from one document hash to another.

    Runs a single UPDATE on the ``highlights`` table of the SQLite database
    at ``shared_db_path``, replacing ``document_path == from_hash`` with
    ``to_hash``. The operation is best effort: a failure of the update is
    reported through ``log`` rather than raised.
    """
    import sqlite3
    from contextlib import closing

    # closing() guarantees the connection is released on every exit path,
    # including interrupts that are not subclasses of Exception.
    with closing(sqlite3.connect(shared_db_path)) as con:
        try:
            # `with con` commits on success and rolls back on error.
            with con:
                con.execute("UPDATE highlights SET document_path = (?) WHERE document_path == (?)", (to_hash, from_hash))
        except Exception as e:
            log(f'failed to move highlights from {from_hash} to {to_hash}: {e}')
        else:
            # Only report success once the transaction has been committed.
            log(f'moved highlights from {from_hash} to {to_hash}')
35
+
36
+
37
def process_pg(fpath, pg_num) -> tuple[int, list[list]]:
    """Collect outline candidates from one page of a PDF.

    Opens ``fpath``, runs layout analysis on page ``pg_num`` (0-based) and
    returns ``(pg_num, entries)``. Each entry is
    ``[1, title, pg_num + 1, box[1]]`` for every layout box whose class
    (``box[4]``) is ``'section-header'`` or ``'caption'``; ``box[:4]`` is
    used as the rectangle the title text is read from.

    The file is opened here rather than passed in as a document object,
    which lets the function be called with plain ``(path, index)`` values
    from worker processes.
    """

    def get_text(boxclass, box, page):
        text = page.get_textbox(pymupdf.Rect(*box[:4])).replace('\n', ' ').strip()
        if boxclass == 'caption':
            # Keep captions short: cut at the first period found at or
            # after index 15. When there is none, keep the text whole
            # (slicing with -1 would silently drop the last character).
            cut = text.find('.', 15)
            return text if cut == -1 else text[:cut]
        return text

    # Context manager closes the document once the page has been processed.
    with pymupdf.open(fpath) as doc:
        pg = doc[pg_num]
        pg.get_layout()
        page_toc_entries = [
            [1, get_text(box[4], box, pg), pg_num + 1, box[1]]
            for box in pg.layout_information
            if box[4] in ('section-header', 'caption')
        ]

    return pg_num, page_toc_entries
56
+
57
def process_pg_wrapper(args):
    """Single-argument adapter around ``process_pg``.

    ``args`` is an indexable pair ``(fpath, pg_num)``; this shape lets the
    function be handed to ``Pool.imap_unordered``, which passes one item
    per call.
    """
    path = args[0]
    page_index = args[1]
    return process_pg(path, page_index)
59
+
60
def generate_toc_nnet(pdfpath, worker_cnt=3) -> list:
    """Build a flat list of outline candidates for every page of a PDF.

    Each page is processed with ``process_pg``; with ``worker_cnt`` below 2
    the pages are handled sequentially in this process, otherwise they are
    spread over a ``multiprocessing.Pool`` of ``worker_cnt`` workers.
    Progress is reported through ``log`` as a 50-character bar.

    Returns the per-page entries concatenated in page order.

    Raises:
        SystemExit: if the run is interrupted with Ctrl-C.
    """
    with pymupdf.open(pdfpath) as doc:
        pg_cnt = doc.page_count
    pg_nums = range(pg_cnt)

    bar = 50
    results = {}

    def report(done):
        # Only called after at least one page finished, so pg_cnt > 0 here.
        progress = (done * bar) // pg_cnt
        log(f"[{'='*(progress)}{' '*(bar - progress)}] {done}/{pg_cnt} pages", end='\r')

    try:
        if worker_cnt < 2:
            log('Started..')
            for done, i in enumerate(pg_nums, start=1):
                results[i] = process_pg(pdfpath, i)[1]
                report(done)
        else:
            with Pool(processes=worker_cnt) as pool:
                log("Started..")
                tasks = [(pdfpath, i) for i in pg_nums]
                # Results arrive in completion order; they are keyed by page
                # number so the final list can be assembled in page order.
                finished = pool.imap_unordered(process_pg_wrapper, tasks)
                for done, (pg_num, res) in enumerate(finished, start=1):
                    results[pg_num] = res
                    report(done)
    except KeyboardInterrupt:
        log('\nCancelled')
        # The bare exit() helper is injected by the site module and is not
        # guaranteed to exist; raising SystemExit is the portable equivalent.
        raise SystemExit from None

    # Terminate the '\r'-updated progress line on both code paths.
    log('')

    return [entry for i in pg_nums for entry in results[i]]
102
+
103
def align_toc_lvls(toc_entries: list) -> list:
    """Filter outline candidates and assign each a nesting level.

    ``toc_entries`` is a list of ``[level, title, ...]`` lists. The list is
    modified in place and also returned.

    The first entry is always kept exactly as given. Every later entry is
    dropped when its title does not start with an uppercase letter or digit,
    is shorter than 4 characters, or repeats a previously kept title that
    matched none of the heading patterns. Remaining entries are classified
    by the first matching pattern (checked in the order p2, p7, p6, p3, p4,
    p5, falling back to the generic kind ``'l'``) and get a level derived
    from the kind's nominal depth and the previously kept entry.

    NOTE(review): the heuristics are order-sensitive (for example ``p7`` is
    tested before the numbered patterns ``p3``/``p4``); that order is
    preserved here unchanged.
    """
    import re

    # Gate pattern: a title must start with an uppercase letter or a digit.
    p1 = re.compile(r'^[A-Z\d]')

    # (kind, pattern) pairs in the order they are tried.
    classifiers = (
        ('p2', re.compile(r'^(Contents)|(Chapter)|(Appendix)|(Index)|(Bibliograph)|(Preface)')),
        ('p7', re.compile(r'^\d?\s?[A-Z ]{2,}')),
        # Written on one line and grouped: the earlier multi-line form was
        # compiled without re.VERBOSE, so its literal newline/indentation
        # became part of the pattern and "Discussion"/"Acknowledgements"
        # could not match; the optional number prefix also applied only to
        # "Introduction".
        ('p6', re.compile(
            r'\d?\s?((Introduction)|((Materials and )?Methods)|(Results)|'
            r'(Discussion)|(References)|(Summary)|(Conclusion)|(Acknowledgements))',
            re.IGNORECASE)),
        ('p3', re.compile(r'^([IVXC\d])+\.[IVXC\d]\.? \w')),
        ('p4', re.compile(r'^([AIVXC\d]+\.){2}[IVXC\d]\.? \w')),
        ('p5', re.compile(r'^(Fig(ure)?\.?)|(Table\.? [\dIVXC]+)')),
    )

    # Nominal depth per kind. Values may be lowered while processing (see
    # act) so a kind never sits more than one level below its predecessor.
    e = {'p1': 1, 'p2': 1, 'p3': 2, 'p4': 3, 'p5': 5, 'p6': 1, 'p7': 1, 'l': 2}

    def act(lvl, current, prev):
        """Return the level for an entry of kind ``current``.

        ``lvl`` is the level given to the previously kept entry and ``prev``
        is that entry's ``(kind, nominal depth)`` pair.
        """
        prev_kind, prev_depth = prev
        if current == prev_kind:  # same kind as predecessor: sibling
            return lvl
        if current == 'p5':  # figure/table caption: one below predecessor
            return lvl + 1
        if e[current] < prev_depth:  # shallower kind: back up to its depth
            return e[current]
        # Same-or-deeper kind: at most one level below the predecessor, and
        # remember the clamped depth for later entries of this kind.
        e[current] = min(lvl + 1, e[current])
        return e[current]

    log('aligning levels..')
    lvl, prev, titles = 1, ('p1', 1), set()

    kept = toc_entries[:1]
    for entry in toc_entries[1:]:
        title = entry[1]
        if (not p1.match(title)) or len(title) < 4 or title in titles:
            continue  # skip

        kind = next((name for name, pattern in classifiers if pattern.match(title)), 'l')
        if kind == 'l':
            # Only generic titles are tracked for duplicate suppression.
            titles.add(title)

        lvl = act(lvl, kind, prev)
        entry[0] = lvl
        prev = (kind, e[kind])
        kept.append(entry)

    # Slice assignment keeps the caller's list object, as before.
    toc_entries[:] = kept
    return toc_entries
171
+
172
def generate_txtfile(toc_entries, txtfile='outline.txt') -> str:
    """Write an outline to an editable text file and return its path.

    The file starts with a five-line header followed by one line per entry:
    the title indented by four spaces per level below 1, then `` | page``,
    and `` | detail`` when the entry has a fourth element. An empty
    ``toc_entries`` produces a file containing only the header.

    Args:
        toc_entries: list of ``[level, title, page]`` or
            ``[level, title, page, detail]`` sequences.
        txtfile: path of the file to (over)write, encoded as UTF-8.
    """
    import textwrap

    header = textwrap.dedent("""\
        ============================================================
        TABLE OF CONTENTS OUTLINE
        4spaces/lvl text | pg# | {details dictionary} OR y-coord
        ============================================================

        """)

    def render(entry):
        indent = ' ' * 4 * (entry[0] - 1)
        # Decide per entry (not from the first entry only) whether a detail
        # column exists, so an empty or mixed list cannot raise.
        if len(entry) > 3:
            return f"{indent}{entry[1]} | {entry[2]} | {entry[3]}"
        return f"{indent}{entry[1]} | {entry[2]}"

    txt = header + '\n'.join(render(entry) for entry in toc_entries)

    with open(txtfile, 'w', encoding='utf-8') as f:
        f.write(txt)

    return txtfile
192
+
193
+
194
def parse_txtfile(txtfile='outline.txt', tablevel=2) -> list:
    """Parse an outline text file into a list of toc entries.

    If the first line starts with ``=`` the first five lines are treated as
    the generated header and skipped. Every remaining non-blank line must
    look like ``title | page`` or ``title | page | detail``; its level is
    ``indent // 4 + 1`` after each tab has been replaced by ``tablevel``
    spaces.

    Returns:
        A list of ``[level, title, page]`` or
        ``[level, title, page, detail]`` lists.

    Raises:
        ValueError: if a line has no `` | `` separator or its page field is
            not an integer.
    """
    # utf-8-sig matches the UTF-8 the generator writes and also tolerates a
    # byte-order mark added by some editors.
    with open(txtfile, encoding='utf-8-sig') as f:
        lines = f.readlines()

    # Inspect the first line without consuming input, so files without a
    # header are still parsed line by line and keep their first character.
    if lines and lines[0].startswith('='):
        lines = lines[5:]

    toc_entries = []
    for raw in lines:
        if not raw.strip():
            continue  # blank lines carry no entry

        # NOTE(review): a tab becomes `tablevel` spaces while a level is 4
        # spaces, so with the default of 2 a single tab adds no level;
        # confirm this matches the intended meaning of --tablevel.
        line = raw.replace('\t', ' ' * tablevel)
        lvl = (len(line) - len(line.lstrip())) // 4 + 1
        fields = line.strip().split(' | ')
        if len(fields) < 2:
            raise ValueError(
                f"malformed outline line (expected 'title | page'): {raw.rstrip()!r}"
            )

        entry = [lvl, fields[0], int(fields[1])]
        if len(fields) >= 3:
            # SECURITY: eval() executes arbitrary expressions from the
            # outline file; only parse files you trust. It is kept because
            # the detail column may hold values that are not plain literals
            # (presumably reprs such as Point(...) - verify before
            # switching to ast.literal_eval).
            entry.append(eval(fields[2]))
        toc_entries.append(entry)

    return toc_entries
215
+
216
def embed_toc(pdfpath, toc_entries, newfile=''):
    """Write ``toc_entries`` into a PDF as its table of contents.

    Args:
        pdfpath: PDF to open.
        toc_entries: entries passed to ``Document.set_toc`` (with
            ``collapse=2``).
        newfile: when truthy, the result is saved to this path; otherwise
            the change is saved incrementally into ``pdfpath`` itself.
    """
    # Report the entry count through log() like every other message,
    # instead of an unconditional print to stdout.
    log(f'{len(toc_entries)} toc entries')

    # The context manager closes the document after saving.
    with pymupdf.open(pdfpath) as doc:
        doc.set_toc(toc_entries, collapse=2)
        if newfile:
            doc.save(newfile)
            log(f"toc written to '{newfile}'")
        else:
            doc.saveIncr()
            log(f"toc saved to '{pdfpath}'")
226
+
227
+
228
+
229
def edit_txtfile(txtfile='outline.txt'):
    """Open ``txtfile`` in the user's editor and wait for it to exit.

    The editor command is taken from the ``EDITOR`` environment variable,
    falling back to ``notepad`` on Windows and ``vi`` elsewhere when the
    variable is unset or empty. The value may include arguments
    (e.g. ``code -w``); it is split into words before being executed.
    """
    import shlex

    on_windows = os.name == 'nt'
    fallback = 'notepad' if on_windows else 'vi'
    editor = os.environ.get('EDITOR') or fallback

    # Non-POSIX mode leaves backslashes in Windows paths alone; it also
    # keeps surrounding quotes on each word, so those are removed by hand.
    command = shlex.split(editor, posix=not on_windows)
    if on_windows:
        command = [word.strip('"') for word in command]
    if not command:  # EDITOR consisted of whitespace only
        command = [fallback]

    subprocess.run([*command, txtfile])
232
+
233
def main():
    """Command-line entry point.

    Depending on the flags this either lets the user edit the PDF's existing
    outline (``-e`` / ``-se``), writes an outline from a text file (``-i``),
    or generates an outline from the page layout, optionally lets the user
    edit it, and embeds it into the PDF.
    """
    # log() reads the module-level SIOYEK handle, so it must be rebound
    # globally; a plain assignment would only create an unused local.
    global SIOYEK

    parser = argparse.ArgumentParser(prog='pdfao')
    parser.add_argument("filename", help='input pdf')
    parser.add_argument('-s', '--straight', action='store_true', help="write toc straight to pdf; skip editing")
    parser.add_argument('-o', '--out', type=str, metavar='<path>', help='write changes to new pdf')
    parser.add_argument('-mp', '--multiprocess', type=int, metavar='<n>', help='spread job over n processes (faster on linux)', default=1)
    parser.add_argument('-e', '--edit', action='store_true', help='edit pdf toc')
    parser.add_argument('-se', '--superedit', action='store_true', help='edit pdf toc (more attributes available)')
    parser.add_argument('-i', '--infile', type=str, metavar='<file>', help='write toc from file to pdf')
    parser.add_argument('-t', '--tablevel', type=int, metavar='<n>', help='tab = n toc nesting levels (default 2)', default=2)
    parser.add_argument('--sioyek', type=str, metavar='<path>', help='for users of the Sioyek pdf viewer')
    parser.add_argument('--version', action='version', version='%(prog)s 0.1.0')

    args = parser.parse_args()

    if args.sioyek:
        from sioyek.sioyek import Sioyek
        # --sioyek takes a single string, so the whole value is the path
        # (indexing it would pick out only its first character).
        SIOYEK = Sioyek(args.sioyek)

    if args.edit or args.superedit:
        # Read the current outline and close the document again before the
        # file is reopened and saved by embed_toc.
        with pymupdf.Document(args.filename) as doc:
            current_toc = doc.get_toc(not args.superedit)
        generate_txtfile(current_toc)
        edit_txtfile()
        toc_entries = parse_txtfile(tablevel=args.tablevel)
        embed_toc(args.filename, toc_entries, args.out)
    elif args.infile:
        toc_entries = parse_txtfile(args.infile, args.tablevel)
        embed_toc(args.filename, toc_entries, args.out)
    else:  # generate toc
        start = perf_counter()
        toc_entries = generate_toc_nnet(args.filename, args.multiprocess)
        end = perf_counter()
        log(f"finished in {end - start:<4.1f} s")
        toc_entries = align_toc_lvls(toc_entries)
        if not toc_entries:
            # Nothing detected: stop rather than writing an empty outline
            # over whatever the PDF already contains.
            log('no headings detected; toc left unchanged')
            return
        if not args.straight:
            generate_txtfile(toc_entries)
            edit_txtfile()
            toc_entries = parse_txtfile(tablevel=args.tablevel)
        embed_toc(args.filename, toc_entries, args.out)

    # TODO: Sioyek highlight transfer is currently disabled. The intended
    # flow is to hash the file before and after an in-place save with
    # get_md5_hash() and pass both hashes, plus the shared database path,
    # to sioyek_transfer_annots().
283
+
@@ -0,0 +1,46 @@
1
+ Metadata-Version: 2.3
2
+ Name: pdf-auto-outline
3
+ Version: 0.1.0
4
+ Summary: Automatically generate and edit PDF table of contents / outline
5
+ Author: Rossikos
6
+ Author-email: Rossikos <216631970+rossikos@users.noreply.github.com>
7
+ Requires-Dist: pymupdf-layout>=1.26.6
8
+ Requires-Python: >=3.13
9
+ Description-Content-Type: text/markdown
10
+
11
+ # PDF Auto Outline
12
+
13
+ A simple python program to automatically generate and embed a table of contents or outline in a PDF.
14
+
15
+ ## Usage
16
+
17
+ ```
18
+ usage: pdfao [-h] [-s] [-o <path>] [-mp <n>] [-e] [-se] [-i <file>] [-t <n>] [--sioyek <path>] [--version] filename
19
+
20
+ positional arguments:
21
+ filename input pdf
22
+
23
+ options:
24
+ -h, --help show this help message and exit
25
+ -s, --straight write toc straight to pdf; skip editing
26
+ -o, --out <path> write changes to new pdf
27
+ -mp, --multiprocess <n>
28
+ spread job over n processes (faster on linux)
29
+ -e, --edit edit pdf toc
30
+ -se, --superedit edit pdf toc (more attributes available)
31
+ -i, --infile <file> write toc from file to pdf
32
+ -t, --tablevel <n> tab = n toc nesting levels (default 2)
33
+ --sioyek <path> for users of the Sioyek pdf viewer
34
+ --version show program's version number and exit
35
+ ```
36
+
37
+ ## For Sioyek Users
38
+
39
+ Example commands; add to prefs_user.config.
40
+
41
+ ```
42
+ new_command _gen_toc python3 path/to/pdfao.py "%{file_path}" --sioyek path/to/sioyek -mp 4
43
+ new_command _edit_toc python3 path/to/pdfao.py "%{file_path}" --sioyek path/to/sioyek -e
44
+ ```
45
+
46
+
@@ -0,0 +1,7 @@
1
+ pdf_auto_outline/__init__.py,sha256=IMjkMO3twhQzluVTo8Z6rE7Eg-9U79_LGKMcsWLKBkY,22
2
+ pdf_auto_outline/__main__.py,sha256=7tzuGbeA5JiJWE_g9pzlcTXSsKlR-iEXNEbdYd4jZMs,62
3
+ pdf_auto_outline/main.py,sha256=KkLEIGCndRql55jjvAb8Y-onmkPoQwdExHcuNG3MPYw,9977
4
+ pdf_auto_outline-0.1.0.dist-info/WHEEL,sha256=5w2T7AS2mz1-rW9CNagNYWRCaB0iQqBMYLwKdlgiR4Q,78
5
+ pdf_auto_outline-0.1.0.dist-info/entry_points.txt,sha256=HBvhmxJs8hHqbbpJmVTbBH3xy19Hk655O_ySwFC_53w,100
6
+ pdf_auto_outline-0.1.0.dist-info/METADATA,sha256=ZeQBy-6lWQbqhTmt2375o5c-CQymnOffWyFeL7vi3bY,1504
7
+ pdf_auto_outline-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: uv 0.9.7
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,4 @@
1
+ [console_scripts]
2
+ pdf-auto-outline = pdf_auto_outline.main:main
3
+ pdfao = pdf_auto_outline.main:main
4
+