oldstyle-pdf-toc 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: oldstyle-pdf-toc
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Add Table of Content in scanned German old-style Textbooks
|
|
5
|
+
Keywords: pdf
|
|
6
|
+
Author: Hong-Phuc Bui
|
|
7
|
+
Author-email: Hong-Phuc Bui <hong-phuc.bui@htwsaar.de>
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
Requires-Python: >=3.12
|
|
10
|
+
Project-URL: Homepage, https://codeberg.org/hpb/oldstyle-pdf-toc
|
|
11
|
+
Project-URL: Repository, https://codeberg.org/hpb/oldstyle-pdf-toc
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
14
|
+
# Oldstyle PDF ToC
|
|
15
|
+
|
|
16
|
+
## Install
|
|
17
|
+
|
|
18
|
+
```shell
|
|
19
|
+
pip install oldstyle-pdf-toc
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Usage
|
|
23
|
+
|
|
24
|
+
### 1. Create the `toc.txt`
|
|
25
|
+
|
|
26
|
+
The file must be named `toc.txt`.
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
```txt
|
|
30
|
+
# Lines that begin with # are ignored; empty lines are also ignored
|
|
31
|
+
|
|
32
|
+
# Set page number to Uppercase roman number (I, II, III, ...)
|
|
33
|
+
\pageLabel{1}{1}{UppercaseRomanNumerals}
|
|
34
|
+
|
|
35
|
+
# Copy the original ToC of the book
|
|
36
|
+
|
|
37
|
+
Schrifttum 11
|
|
38
|
+
|
|
39
|
+
# Switch the page numbering to decimal Arabic numerals and reset it to 1; the physical page number is now 13
|
|
40
|
+
\pageLabel{13}{1}{DecimalArabicNumerals}
|
|
41
|
+
|
|
42
|
+
# Copy other parts of the ToC of the Book
|
|
43
|
+
|
|
44
|
+
I. Abbildungsverfahren 1
|
|
45
|
+
1. Aufgabe der Darstellenden Geometrie 1
|
|
46
|
+
2. Zentralprojektion 4
|
|
47
|
+
3. Parallelprojektion 6
|
|
48
|
+
|
|
49
|
+
II. Parallelprojektion und perspektive Affinität 8
|
|
50
|
+
4. Invarianten der Parallelprojektion 8
|
|
51
|
+
5. Der Satz von DESARGUES 11
|
|
52
|
+
6. Perspektive Affinität 12
|
|
53
|
+
7. Analytische Darstellung einer ebenen perspektiven Affinität 14
|
|
54
|
+
8. Das Rechtwinkelpaar einer ebenen perspektiven Affinität 17
|
|
55
|
+
9. Die Ellipse als affines Bild des Kreises 19
|
|
56
|
+
10. Affine Eigenschaften der Ellipse 21
|
|
57
|
+
11. Ellipsenkonstruktionen 23
|
|
58
|
+
|
|
59
|
+
# ... other entries
|
|
60
|
+
Namen- und Sachverzeichnis 339
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
The complete example `toc.txt` can be found in this project repository.
|
|
64
|
+
|
|
65
|
+
### 2. Call the python script
|
|
66
|
+
|
|
67
|
+
```shell
|
|
68
|
+
oldstyle-pdf-toc > pdftk.toc.txt
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### 3. Call the pdftk
|
|
72
|
+
|
|
73
|
+
```shell
|
|
74
|
+
pdftk Strubecker.pdf update_info_utf8 pdftk.toc.txt output Strubecker.pdf-toc.pdf
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
That's it!
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
## Development
|
|
81
|
+
|
|
82
|
+
This project uses `uv` to manage dependencies and tasks during development and deployment.
|
|
83
|
+
|
|
84
|
+
```shell
|
|
85
|
+
uv python install # Install a copy of python to this directory
|
|
86
|
+
uv venv # Create a virtual python environment
|
|
87
|
+
source .venv/bin/activate # Activate venv
|
|
88
|
+
uv pip install -e . # Install this project as a python package into the virtual environment
|
|
89
|
+
uv build
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## Deploy to PyPI
|
|
93
|
+
|
|
94
|
+
See: https://packaging.python.org/en/latest/guides/distributing-packages-using-setuptools/#create-an-account
|
|
95
|
+
|
|
96
|
+
In principle:
|
|
97
|
+
|
|
98
|
+
```shell
|
|
99
|
+
uv build
|
|
100
|
+
```
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# Oldstyle PDF ToC
|
|
2
|
+
|
|
3
|
+
## Install
|
|
4
|
+
|
|
5
|
+
```shell
|
|
6
|
+
pip install oldstyle-pdf-toc
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
## Usage
|
|
10
|
+
|
|
11
|
+
### 1. Create the `toc.txt`
|
|
12
|
+
|
|
13
|
+
The file must be named `toc.txt`.
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
```txt
|
|
17
|
+
# Lines that begin with # are ignored; empty lines are also ignored
|
|
18
|
+
|
|
19
|
+
# Set page number to Uppercase roman number (I, II, III, ...)
|
|
20
|
+
\pageLabel{1}{1}{UppercaseRomanNumerals}
|
|
21
|
+
|
|
22
|
+
# Copy the original ToC of the book
|
|
23
|
+
|
|
24
|
+
Schrifttum 11
|
|
25
|
+
|
|
26
|
+
# Switch the page numbering to decimal Arabic numerals and reset it to 1; the physical page number is now 13
|
|
27
|
+
\pageLabel{13}{1}{DecimalArabicNumerals}
|
|
28
|
+
|
|
29
|
+
# Copy other parts of the ToC of the Book
|
|
30
|
+
|
|
31
|
+
I. Abbildungsverfahren 1
|
|
32
|
+
1. Aufgabe der Darstellenden Geometrie 1
|
|
33
|
+
2. Zentralprojektion 4
|
|
34
|
+
3. Parallelprojektion 6
|
|
35
|
+
|
|
36
|
+
II. Parallelprojektion und perspektive Affinität 8
|
|
37
|
+
4. Invarianten der Parallelprojektion 8
|
|
38
|
+
5. Der Satz von DESARGUES 11
|
|
39
|
+
6. Perspektive Affinität 12
|
|
40
|
+
7. Analytische Darstellung einer ebenen perspektiven Affinität 14
|
|
41
|
+
8. Das Rechtwinkelpaar einer ebenen perspektiven Affinität 17
|
|
42
|
+
9. Die Ellipse als affines Bild des Kreises 19
|
|
43
|
+
10. Affine Eigenschaften der Ellipse 21
|
|
44
|
+
11. Ellipsenkonstruktionen 23
|
|
45
|
+
|
|
46
|
+
# ... other entries
|
|
47
|
+
Namen- und Sachverzeichnis 339
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
The complete example `toc.txt` can be found in this project repository.
|
|
51
|
+
|
|
52
|
+
### 2. Call the python script
|
|
53
|
+
|
|
54
|
+
```shell
|
|
55
|
+
oldstyle-pdf-toc > pdftk.toc.txt
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### 3. Call the pdftk
|
|
59
|
+
|
|
60
|
+
```shell
|
|
61
|
+
pdftk Strubecker.pdf update_info_utf8 pdftk.toc.txt output Strubecker.pdf-toc.pdf
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
That's it!
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
## Development
|
|
68
|
+
|
|
69
|
+
This project uses `uv` to manage dependencies and tasks during development and deployment.
|
|
70
|
+
|
|
71
|
+
```shell
|
|
72
|
+
uv python install # Install a copy of python to this directory
|
|
73
|
+
uv venv # Create a virtual python environment
|
|
74
|
+
source .venv/bin/activate # Activate venv
|
|
75
|
+
uv pip install -e . # Install this project as a python package into the virtual environment
|
|
76
|
+
uv build
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## Deploy to PyPI
|
|
80
|
+
|
|
81
|
+
See: https://packaging.python.org/en/latest/guides/distributing-packages-using-setuptools/#create-an-account
|
|
82
|
+
|
|
83
|
+
In principle:
|
|
84
|
+
|
|
85
|
+
```shell
|
|
86
|
+
uv build
|
|
87
|
+
```
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "oldstyle-pdf-toc"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Add Table of Content in scanned German old-style Textbooks"
|
|
5
|
+
authors = [
|
|
6
|
+
{name="Hong-Phuc Bui", email="hong-phuc.bui@htwsaar.de"}
|
|
7
|
+
]
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
license = "MIT"
|
|
10
|
+
keywords = ["pdf"]
|
|
11
|
+
|
|
12
|
+
requires-python = ">=3.12"
|
|
13
|
+
dependencies = []
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
[project.urls]
|
|
17
|
+
Repository = "https://codeberg.org/hpb/oldstyle-pdf-toc"
|
|
18
|
+
Homepage = "https://codeberg.org/hpb/oldstyle-pdf-toc"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
[project.scripts]
|
|
22
|
+
oldstyle-pdf-toc = "oldstyle_pdf_toc:main"
|
|
23
|
+
|
|
24
|
+
[build-system]
|
|
25
|
+
requires = ["uv_build>=0.9.21,<0.10.0"]
|
|
26
|
+
build-backend = "uv_build"
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import sys
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
|
|
5
|
+
# TODO:
|
|
6
|
+
# * make page-offset for begin
|
|
7
|
+
# * print pdftk command to stderr
|
|
8
|
+
|
|
9
|
+
@dataclass
class Chapter:
    """One ToC entry, rendered by ``str()`` as a pdftk ``Bookmark`` record."""

    # Chapter index: I, II, III, IV, etc.
    idx: str
    # Title of the chapter, e.g. "Grundlage".
    title: str
    # Physical page number; counting begins with 1.
    page: int
    # ToC level; 1 = chapter.
    level: int = 1

    def __str__(self):
        """Return the four-line bookmark record, terminated by a newline.

        Index and title are stripped of surrounding whitespace; a non-empty
        index is separated from the title by a single space.
        """
        prefix = self.idx.strip()
        if prefix:
            prefix = prefix + " "
        record_lines = (
            "BookmarkBegin",
            f"BookmarkTitle: {prefix}{self.title.strip()}",
            f"BookmarkLevel: {self.level}",
            f"BookmarkPageNumber: {self.page}",
        )
        return "\n".join(record_lines) + "\n"
|
|
41
|
+
|
|
42
|
+
@dataclass
class Section(Chapter):
    """A ``Chapter`` whose ToC level defaults to 2 instead of 1."""

    # Only the default level differs from the parent class.
    level: int = 2
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass
class PageLabel:
    """A page-numbering change, rendered by ``str()`` as a pdftk ``PageLabel`` record."""

    # Page number counted from cover to cover.
    physical_page_counter: int
    # Logical page number within one part of the book: Vorwort, Hauptteil, ...
    logical_page_counter: int
    # Numbering style name from pdftk, e.g. "NoNumber",
    # "LowercaseRomanNumerals", "UppercaseRomanNumerals".
    num_style: str

    def __str__(self):
        """Return the four-line page-label record, terminated by a newline."""
        fields = (
            ("PageLabelStart", self.logical_page_counter),
            ("PageLabelNewIndex", self.physical_page_counter),
            ("PageLabelNumStyle", self.num_style),
        )
        body = "".join(f"{key}: {value}\n" for key, value in fields)
        return "PageLabelBegin\n" + body
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
# The entry patterns are compiled once at import time instead of on every call.
# Each one ends with the page number (a run of digits at the end of the line);
# the character just before the page number must be neither a digit nor "$".
_CHAPTER_ENTRY = re.compile(
    r"""(^[IVX]+\.)(\s+)(.+[^\d$])(\d+)$""", re.MULTILINE | re.UNICODE
)
_SECTION_ENTRY = re.compile(
    r"""(^[\d]+\.)(\s+)(.+[^\d$])(\d+)$""", re.MULTILINE | re.UNICODE
)
_UNINDEXED_ENTRY = re.compile(r"(.+[^\d$])(\d+)$", re.MULTILINE | re.UNICODE)


def parse_toc_entry(idx, line, offset) -> "Chapter":
    """Parse one ToC line into a ``Chapter`` or ``Section``.

    Three shapes are recognised, tried in this order:

    * ``I. Title 12``  -- index made of ``I``/``V``/``X`` plus a dot: ``Chapter``
    * ``3. Title 12``  -- decimal index plus a dot: ``Section``
    * ``Title 12``     -- no index: ``Chapter`` with an empty index

    Args:
        idx: Position of the line in the ToC file; not used by the parser.
        line: The ToC line; surrounding whitespace is stripped first.
        offset: Number added to the page number found at the end of the line.

    Returns:
        The parsed entry, with ``page`` set to the parsed number plus ``offset``.

    Raises:
        ValueError: If the line matches none of the three shapes.
    """
    line = line.strip()
    if head := _CHAPTER_ENTRY.match(line):
        return Chapter(head[1], head[3], int(head[4]) + offset)
    if head := _SECTION_ENTRY.match(line):
        return Section(head[1], head[3], int(head[4]) + offset)
    if head := _UNINDEXED_ENTRY.match(line):
        return Chapter("", head[1], int(head[2]) + offset)
    # The exception used to be *returned*, so callers printed its text as if
    # it were a bookmark record; raise it so the bad line is reported instead.
    raise ValueError(f">{line}< is not a ToC-entry")
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
# \pageLabel{<physical page>}{<logical page>}{<style>}; matched case-insensitively.
_PAGE_LABEL_COMMAND = re.compile(
    r"(^\\pageLabel){(\d+)}{(\d+)}{([a-z]+)}", re.IGNORECASE | re.UNICODE
)
# Style names accepted in the third argument (compared case-sensitively).
_ALLOWED_LABEL_STYLES = frozenset(
    (
        "UppercaseRomanNumerals",
        "DecimalArabicNumerals",
        "LowercaseRomanNumerals",
        "NoNumber",
    )
)


def parse_manual_info(idx, line, page_offset):
    """Parse a ``\\pageLabel{physical}{logical}{style}`` line into a ``PageLabel``.

    Args:
        idx: Not used by the parser.
        line: The line to parse; it has to start with the command.
        page_offset: Not used by the parser.

    Returns:
        ``PageLabel(physical, logical, style)`` with both page numbers as ints.

    Raises:
        ValueError: If the line is not a ``\\pageLabel`` command, or if the
            style is not one of the allowed style names.
    """
    found = _PAGE_LABEL_COMMAND.match(line)
    if found is None:
        raise ValueError(f">{line}< is not a pageLabel")
    _command, physical, logical, style = found.groups()
    if style not in _ALLOWED_LABEL_STYLES:
        raise ValueError(f">{style}< is not allowed")
    return PageLabel(int(physical), int(logical), style)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def is_informative(line):
    """Return True when ``line`` is non-empty and does not start with ``#``."""
    if len(line) == 0:
        return False
    return not line.startswith('#')
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def parse_toc_file(tocfilename, out=sys.stdout):
    """Translate a ToC description file into pdftk info records.

    Each line is stripped; empty lines and lines starting with ``#`` are
    skipped.  A line starting with a backslash is parsed as a ``\\pageLabel``
    command and also resets the page offset applied to the ToC entries that
    follow; every other line is parsed as a ToC entry.  Each parsed record is
    written to ``out``.

    Args:
        tocfilename: Path of the ToC description file, read as UTF-8.
        out: Text stream that receives the records.

    Raises:
        ValueError: If a line cannot be parsed; the message starts with the
            1-based line number and the original error is chained.
    """
    page_offset = 0
    # The file holds book titles with non-ASCII characters (e.g. umlauts), so
    # decode it as UTF-8 explicitly rather than with the locale default.
    with open(tocfilename, 'r', encoding="utf-8") as toc:
        for toc_line_nr, raw_line in enumerate(toc, start=1):
            line = raw_line.strip()
            if not is_informative(line):
                continue
            try:
                if line.startswith('\\'):
                    record = parse_manual_info(toc_line_nr, line, page_offset)
                    # A ToC entry for logical page p lies on physical page
                    # p + (physical start - logical start).  For a label that
                    # restarts at 1 this equals the former "physical - 1".
                    page_offset = (
                        record.physical_page_counter - record.logical_page_counter
                    )
                else:
                    record = parse_toc_entry(toc_line_nr, line, page_offset)
                print(record, file=out, end="")
            except ValueError as ex:
                raise ValueError(f"{toc_line_nr} : {repr(ex)}") from ex
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
if __name__ == "__main__":
    # Convert ./toc.txt into pdftk info records on stdout.  parse_toc_file
    # skips comment and blank lines and takes the page offset from the
    # \pageLabel lines, so no per-book offset has to be hard-coded here.
    parse_toc_file("toc.txt")
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
|