pdf-auto-outline 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = '0.1.0'
|
pdf_auto_outline/main.py
ADDED
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
import pymupdf.layout
|
|
2
|
+
from pymupdf import Point
|
|
3
|
+
from time import perf_counter
|
|
4
|
+
from multiprocessing import Pool
|
|
5
|
+
import os
|
|
6
|
+
import subprocess
|
|
7
|
+
import argparse
|
|
8
|
+
|
|
9
|
+
# Handle to a Sioyek client; while it is None (or otherwise falsy) log()
# prints to stdout instead.
SIOYEK = None


def log(message, end='\n'):
    """Report *message* to the user.

    If the module-level ``SIOYEK`` handle is set, the message is passed to
    its ``set_status_string`` method and ``end`` is ignored. Otherwise the
    message is printed to stdout, terminated by ``end``.
    """
    if not SIOYEK:
        print(message, end=end)
        return
    SIOYEK.set_status_string(message)
|
|
16
|
+
|
|
17
|
+
def get_md5_hash(path):
    """Return the hexadecimal MD5 digest of the file at *path*.

    The file is hashed in fixed-size chunks so that a large PDF is never
    loaded into memory in one piece.
    """
    import hashlib
    digest = hashlib.md5()
    with open(path, 'rb') as f:
        # 1 MiB per read keeps memory flat regardless of file size.
        while chunk := f.read(1 << 20):
            digest.update(chunk)
    return digest.hexdigest()
|
|
23
|
+
|
|
24
|
+
def sioyek_transfer_annots(shared_db_path, from_hash, to_hash):
    """Re-point highlights from one document hash to another.

    Opens the SQLite database at *shared_db_path* and rewrites the
    ``document_path`` column of every ``highlights`` row equal to
    *from_hash* so that it holds *to_hash*. This is best-effort: database
    errors are reported through ``log`` and not raised.
    """
    import sqlite3
    from contextlib import closing

    # closing() guarantees the connection is released on every exit path.
    with closing(sqlite3.connect(shared_db_path)) as con:
        try:
            # `with con` commits on success and rolls back on error.
            with con:
                con.execute("UPDATE highlights SET document_path = (?) WHERE document_path == (?)", (to_hash, from_hash))
        except sqlite3.Error as e:
            log(f'failed to move highlights from {from_hash} to {to_hash}: {e}')
        else:
            # Only announce success once the transaction has been committed.
            log(f'moved highlights from {from_hash} to {to_hash}')
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def process_pg(fpath, pg_num) -> tuple[int, list[list]]:
    """Collect outline candidates from one page of a PDF.

    The document at *fpath* is opened, layout analysis is run on page
    *pg_num* (0-based), and every layout box classified as
    ``'section-header'`` or ``'caption'`` becomes one entry
    ``[1, text, pg_num + 1, box[1]]``. The level is always 1 here.

    Returns:
        ``(pg_num, entries)``; the page number is echoed back so results
        that arrive out of order can be matched to their page.
    """
    def get_text(boxclass, box, pg):
        # box[:4] is passed to pymupdf.Rect as the box rectangle.
        text = pg.get_textbox(pymupdf.Rect(*box[:4])).replace('\n', ' ').strip()
        if boxclass == 'caption':
            # Cut a caption at the first '.' found from index 15 onwards.
            # When there is none, find() returns -1 and slicing with it
            # would silently drop the last character, so keep the full text.
            cut = text.find('.', 15)
            if cut != -1:
                return text[:cut]
        return text

    doc = pymupdf.open(fpath)
    try:
        pg = doc[pg_num]
        pg.get_layout()
        # box[4] is the box class; box[1] is stored as the entry's position
        # (presumably the top y-coordinate -- TODO confirm against pymupdf-layout).
        page_toc_entries = [
            [1, get_text(box[4], box, pg), pg_num + 1, box[1]]
            for box in pg.layout_information
            if box[4] in ('section-header', 'caption')
        ]
    finally:
        # Everything needed has been copied out; release the file handle.
        doc.close()

    return pg_num, page_toc_entries
|
|
56
|
+
|
|
57
|
+
def process_pg_wrapper(args):
    """Call ``process_pg`` with a packed ``(fpath, pg_num)`` argument.

    Pool mapping functions pass a single argument per task, so the two
    parameters travel together as one sequence.
    """
    fpath, pg_num = args[0], args[1]
    return process_pg(fpath, pg_num)
|
|
59
|
+
|
|
60
|
+
def generate_toc_nnet(pdfpath, worker_cnt=3) -> list:
    """Build a flat list of outline entries for every page of *pdfpath*.

    Each page is handled by ``process_pg``. With ``worker_cnt < 2`` the
    pages are processed in this process, otherwise in a
    ``multiprocessing.Pool`` of ``worker_cnt`` workers. A progress bar is
    written through ``log`` as pages complete.

    Returns:
        All entries of all pages, in page order.

    Raises:
        SystemExit: if the run is interrupted with Ctrl-C.
    """
    doc = pymupdf.open(pdfpath)
    try:
        pg_cnt = doc.page_count
    finally:
        doc.close()
    pg_nums = range(pg_cnt)
    bar = 50  # width of the progress bar in characters

    def report(done):
        progress = (done * bar) // pg_cnt
        log(f"[{'=' * progress}{' ' * (bar - progress)}] {done}/{pg_cnt} pages", end='\r')

    results = {}
    try:
        if worker_cnt < 2:
            log('Started..')
            for done, i in enumerate(pg_nums, start=1):
                results[i] = process_pg(pdfpath, i)[1]
                report(done)
        else:
            with Pool(processes=worker_cnt) as pool:
                log("Started..")
                tasks = [(pdfpath, i) for i in pg_nums]
                finished = pool.imap_unordered(process_pg_wrapper, tasks)
                # Pages finish in arbitrary order; key them by page number.
                for done, (pg_num, res) in enumerate(finished, start=1):
                    results[pg_num] = res
                    report(done)
    except KeyboardInterrupt:
        log('\nCancelled')
        # Same effect as exit(), without depending on the `site` module.
        raise SystemExit

    # The progress bar ends in '\r'; finish the line in both modes so the
    # next message does not overwrite it.
    log('')

    return [entry for i in pg_nums for entry in results[i]]
|
|
102
|
+
|
|
103
|
+
def align_toc_lvls(toc_entries: list) -> list:
    """Assign nesting levels to outline entries from the shape of their titles.

    The first entry is left untouched. Every later entry is either dropped
    or classified by the first matching title pattern, and its level
    (``entry[0]``) is derived from that class and the previous entry's class.
    An entry is dropped when its title does not start with an uppercase
    letter or digit, is shorter than 4 characters, or repeats an earlier
    unclassified title.

    The list and its entries are modified in place; the same list is returned.
    """
    import re

    # Nominal depth per title class. A class's depth may be lowered while
    # walking the entries (see act()) and then stays lowered for the run.
    depth = {'p1': 1, 'p2': 1, 'p3': 2, 'p4': 3, 'p5': 5, 'p6': 1, 'p7': 1, 'l': 2}

    def act(lvl, current, prev):
        """Return the level for class *current* following *prev* = (class, depth)."""
        if current == prev[0]:  # same class as the previous entry: sibling
            return lvl
        if current == 'p5':  # figure/table caption: hangs under the current level
            return lvl + 1
        if depth[current] < prev[1]:  # shallower class: jump back up
            return depth[current]
        # Otherwise a child: never descend more than one level at a time.
        depth[current] = min(lvl + 1, depth[current])
        return depth[current]

    keep = re.compile(r'^[A-Z\d]')
    # Checked in this order; the first match wins. Titles matching none of
    # these fall into the generic class 'l'.
    classes = (
        ('p2', re.compile(r'^(Contents|Chapter|Appendix|Index|Bibliograph|Preface)')),
        ('p7', re.compile(r'^\d?\s?[A-Z ]{2,}')),
        # Written on one line with a single group: split across lines
        # without re.VERBOSE, the line break became part of the pattern and
        # "Discussion"/"Acknowledgements" could never match. The optional
        # number prefix now applies to every keyword.
        ('p6', re.compile(
            r'^\d?\s?(Introduction|(Materials and )?Methods|Results|Discussion'
            r'|References|Summary|Conclusion|Acknowledgements)',
            re.IGNORECASE)),
        # The last numbering component may have several characters ("1.10").
        ('p3', re.compile(r'^[IVXC\d]+\.[IVXC\d]+\.? \w')),
        ('p4', re.compile(r'^([AIVXC\d]+\.){2}[IVXC\d]+\.? \w')),
        ('p5', re.compile(r'^(Fig(ure)?\.?|Table\.? [\dIVXC]+)')),
    )

    log('aligning levels..')
    lvl, prev, titles = 1, ('p1', 1), set()

    kept = toc_entries[:1]
    for entry in toc_entries[1:]:
        title = entry[1]
        if not keep.match(title) or len(title) < 4 or title in titles:
            continue  # skip
        cls = next((name for name, pat in classes if pat.match(title)), 'l')
        if cls == 'l':
            # Only unclassified titles are remembered for duplicate removal.
            titles.add(title)
        lvl = act(lvl, cls, prev)
        entry[0] = lvl
        prev = (cls, depth[cls])
        kept.append(entry)

    toc_entries[:] = kept  # keep the in-place contract of the original
    return toc_entries
|
|
171
|
+
|
|
172
|
+
def generate_txtfile(toc_entries, txtfile='outline.txt') -> str:
    """Write *toc_entries* to *txtfile* as an editable text outline.

    The file starts with a four-line banner and a blank line, followed by
    one line per entry: four spaces of indentation per nesting level, then
    ``title | page``, plus `` | detail`` when the entry has a fourth
    element. An empty outline produces a file holding only the banner.

    Returns:
        The path that was written (*txtfile*).
    """
    # Exactly five lines: parse_txtfile() skips the header by line count.
    header = (
        "============================================================\n"
        "TABLE OF CONTENTS OUTLINE\n"
        "4spaces/lvl text | pg# | {details dictionary} OR y-coord\n"
        "============================================================\n"
        "\n"
    )

    def render(entry):
        indent = ' ' * 4 * (entry[0] - 1)
        fields = [f"{indent}{entry[1]}", f"{entry[2]}"]
        # Decide per entry so rows with and without the optional fourth
        # field can be mixed, and an empty outline is not an error.
        if len(entry) > 3:
            fields.append(f"{entry[3]}")
        return ' | '.join(fields)

    txt = header + '\n'.join(render(entry) for entry in toc_entries)

    with open(txtfile, 'w', encoding='utf-8') as f:
        f.write(txt)

    return txtfile
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def parse_txtfile(txtfile='outline.txt', tablevel=2) -> list:
    """Read an outline text file back into a list of TOC entries.

    If the file starts with ``=`` its first five lines are treated as the
    banner and skipped. Every other non-blank line has the form
    ``title | page`` or ``title | page | detail``; its level is one plus
    the number of leading spaces divided by four. Each tab is expanded to
    *tablevel* spaces before the indentation is measured.

    Returns:
        A list of ``[level, title, page]`` or
        ``[level, title, page, detail]`` entries.

    Raises:
        ValueError: if a line has no `` | page`` field or the page is not
            an integer.
    """
    # utf-8-sig reads the UTF-8 file written by generate_txtfile() and also
    # tolerates a byte-order mark added by an editor.
    with open(txtfile, encoding='utf-8-sig') as f:
        lines = f.readlines()
    if lines and lines[0].startswith('='):
        lines = lines[5:]

    toc_entries = []
    for raw in lines:
        if not raw.strip():
            continue  # blank lines, e.g. a trailing newline left by the editor
        # NOTE(review): the --tablevel help text says "tab = n toc nesting
        # levels", but a tab is expanded to n spaces (n/4 of a level) here.
        # Behaviour kept as-is -- confirm which is intended.
        line = raw.replace('\t', ' ' * tablevel)
        body = line.lstrip()
        lvl = (len(line) - len(body)) // 4 + 1
        # A title that itself contains ' | ' is not supported by this format.
        fields = body.split(' | ')
        if len(fields) < 2:
            raise ValueError(f"outline line has no ' | page' field: {raw.rstrip()!r}")
        entry = [lvl, fields[0], int(fields[1])]
        if len(fields) >= 3:
            # SECURITY: eval() runs arbitrary Python taken from the file.
            # Only use this on outline files you wrote or reviewed yourself.
            entry.append(eval(fields[2]))
        toc_entries.append(entry)

    return toc_entries
|
|
215
|
+
|
|
216
|
+
def embed_toc(pdfpath, toc_entries, newfile=''):
    """Write *toc_entries* into the PDF at *pdfpath* as its outline.

    With a non-empty *newfile* the result is saved there and the input file
    is left unchanged; otherwise the input file is saved incrementally.
    The outcome is reported through ``log``.
    """
    doc = pymupdf.open(pdfpath)
    try:
        doc.set_toc(toc_entries, collapse=2)
        if newfile:
            doc.save(newfile)
            log(f"toc written to '{newfile}'")
        else:
            doc.saveIncr()
            log(f"toc saved to '{pdfpath}'")
    finally:
        # Release the file handle even if writing the outline fails.
        doc.close()
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def edit_txtfile(txtfile='outline.txt'):
    """Open *txtfile* in the user's editor and wait until the editor exits.

    The editor comes from the ``EDITOR`` environment variable. When it is
    unset or empty, ``notepad`` is used on Windows and ``vi`` elsewhere.
    ``EDITOR`` may include arguments (for example ``code --wait``).
    """
    import shlex
    import shutil

    default = 'notepad' if os.name == 'nt' else 'vi'
    editor = os.environ.get('EDITOR') or default

    if shutil.which(editor):
        # The whole value names an executable (possibly a path with spaces).
        cmd = [editor]
    else:
        # Otherwise treat it as a command line: program plus arguments.
        posix = os.name != 'nt'
        cmd = shlex.split(editor, posix=posix) or [editor]
        if not posix:
            # Non-POSIX splitting keeps the surrounding quotes; drop them.
            cmd = [part.strip('"') for part in cmd]

    subprocess.run([*cmd, txtfile])
|
|
232
|
+
|
|
233
|
+
def main():
    """Command-line entry point.

    Depending on the options, one of three things happens:

    * ``-e`` / ``-se``: dump the PDF's existing outline to a text file, open
      it in an editor, and write the edited outline back.
    * ``-i <file>``: write the outline described in ``<file>`` to the PDF.
    * otherwise: generate an outline from the page layout, then either embed
      it directly (``-s``) or let the user edit it first.
    """
    # log() reads the module-level handle; without this declaration the
    # assignment below would only create a local that log() never sees.
    global SIOYEK

    parser = argparse.ArgumentParser(prog='pdfao')
    parser.add_argument("filename", help='input pdf')
    parser.add_argument('-s', '--straight', action='store_true', help="write toc straight to pdf; skip editing")
    parser.add_argument('-o', '--out', type=str, metavar='<path>', help='write changes to new pdf')
    parser.add_argument('-mp', '--multiprocess', type=int, metavar='<n>', help='spread job over n processes (faster on linux)', default=1)
    parser.add_argument('-e', '--edit', action='store_true', help='edit pdf toc')
    parser.add_argument('-se', '--superedit', action='store_true', help='edit pdf toc (more attributes available)')
    parser.add_argument('-i', '--infile', type=str, metavar='<file>', help='write toc from file to pdf')
    parser.add_argument('-t', '--tablevel', type=int, metavar='<n>', help='tab = n toc nesting levels (default 2)', default=2)
    parser.add_argument('--sioyek', type=str, metavar='<path>', help='for users of the Sioyek pdf viewer')
    parser.add_argument('--version', action='version', version='%(prog)s 0.1.0')

    args = parser.parse_args()

    if args.sioyek:
        from sioyek.sioyek import Sioyek
        # --sioyek takes a single string; indexing it with [0] would yield
        # only the first character of the path.
        SIOYEK = Sioyek(args.sioyek)
        # TODO: annotation transfer is not wired up yet. Planned inputs:
        #   local_db, shared_db, pdf_path, and
        #   from_hash = get_md5_hash(args.filename)

    if args.edit or args.superedit:
        doc = pymupdf.Document(args.filename)
        try:
            # simple=True for -e, simple=False (detail dicts) for -se
            existing_toc = doc.get_toc(not args.superedit)
        finally:
            # Close before embed_toc() reopens and rewrites the same file.
            doc.close()
        generate_txtfile(existing_toc)
        edit_txtfile()
        toc_entries = parse_txtfile(tablevel=args.tablevel)
        embed_toc(args.filename, toc_entries, args.out)
    elif args.infile:
        toc_entries = parse_txtfile(args.infile, args.tablevel)
        embed_toc(args.filename, toc_entries, args.out)
    else:  # generate toc
        start = perf_counter()
        toc_entries = generate_toc_nnet(args.filename, args.multiprocess)
        end = perf_counter()
        log(f"finished in {end - start:<4.1f} s")
        toc_entries = align_toc_lvls(toc_entries)
        if args.straight:
            embed_toc(args.filename, toc_entries, args.out)
        else:
            generate_txtfile(toc_entries)
            edit_txtfile()
            toc_entries = parse_txtfile(tablevel=args.tablevel)
            embed_toc(args.filename, toc_entries, args.out)

    # TODO: once the file has been rewritten in place, move the Sioyek
    # highlights over to its new hash:
    #   if args.sioyek and not args.out:
    #       to_hash = get_md5_hash(args.filename)
    #       sioyek_transfer_annots(shared_db, from_hash, to_hash)
|
|
283
|
+
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: pdf-auto-outline
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Automatically generate and edit PDF table of contents / outline
|
|
5
|
+
Author: Rossikos
|
|
6
|
+
Author-email: Rossikos <216631970+rossikos@users.noreply.github.com>
|
|
7
|
+
Requires-Dist: pymupdf-layout>=1.26.6
|
|
8
|
+
Requires-Python: >=3.13
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
|
|
11
|
+
# PDF Auto Outline
|
|
12
|
+
|
|
13
|
+
A simple python program to automatically generate and embed a table of contents or outline in a PDF.
|
|
14
|
+
|
|
15
|
+
## Usage
|
|
16
|
+
|
|
17
|
+
```
|
|
18
|
+
usage: pdfao [-h] [-s] [-o <path>] [-mp <n>] [-e] [-se] [-i <file>] [-t <n>] [--sioyek <path>] [--version] filename
|
|
19
|
+
|
|
20
|
+
positional arguments:
|
|
21
|
+
filename input pdf
|
|
22
|
+
|
|
23
|
+
options:
|
|
24
|
+
-h, --help show this help message and exit
|
|
25
|
+
-s, --straight write toc straight to pdf; skip editing
|
|
26
|
+
-o, --out <path> write changes to new pdf
|
|
27
|
+
-mp, --multiprocess <n>
|
|
28
|
+
spread job over n processes (faster on linux)
|
|
29
|
+
-e, --edit edit pdf toc
|
|
30
|
+
-se, --superedit edit pdf toc (more attributes available)
|
|
31
|
+
-i, --infile <file> write toc from file to pdf
|
|
32
|
+
-t, --tablevel <n> tab = n toc nesting levels (default 2)
|
|
33
|
+
--sioyek <path> for users of the Sioyek pdf viewer
|
|
34
|
+
--version show program's version number and exit
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## For Sioyek Users
|
|
38
|
+
|
|
39
|
+
Example commands; add to prefs_user.config.
|
|
40
|
+
|
|
41
|
+
```
|
|
42
|
+
new_command _gen_toc python3 path/to/pdfao.py "%{file_path}" --sioyek path/to/sioyek -mp 4
|
|
43
|
+
new_command _edit_toc python3 path/to/pdfao.py "%{file_path}" --sioyek path/to/sioyek -e
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
pdf_auto_outline/__init__.py,sha256=IMjkMO3twhQzluVTo8Z6rE7Eg-9U79_LGKMcsWLKBkY,22
|
|
2
|
+
pdf_auto_outline/__main__.py,sha256=7tzuGbeA5JiJWE_g9pzlcTXSsKlR-iEXNEbdYd4jZMs,62
|
|
3
|
+
pdf_auto_outline/main.py,sha256=KkLEIGCndRql55jjvAb8Y-onmkPoQwdExHcuNG3MPYw,9977
|
|
4
|
+
pdf_auto_outline-0.1.0.dist-info/WHEEL,sha256=5w2T7AS2mz1-rW9CNagNYWRCaB0iQqBMYLwKdlgiR4Q,78
|
|
5
|
+
pdf_auto_outline-0.1.0.dist-info/entry_points.txt,sha256=HBvhmxJs8hHqbbpJmVTbBH3xy19Hk655O_ySwFC_53w,100
|
|
6
|
+
pdf_auto_outline-0.1.0.dist-info/METADATA,sha256=ZeQBy-6lWQbqhTmt2375o5c-CQymnOffWyFeL7vi3bY,1504
|
|
7
|
+
pdf_auto_outline-0.1.0.dist-info/RECORD,,
|