oldstyle-pdf-toc 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,100 @@
1
+ Metadata-Version: 2.4
2
+ Name: oldstyle-pdf-toc
3
+ Version: 0.1.0
4
+ Summary: Add Table of Content in scanned German old-style Textbooks
5
+ Keywords: pdf
6
+ Author: Hong-Phuc Bui
7
+ Author-email: Hong-Phuc Bui <hong-phuc.bui@htwsaar.de>
8
+ License-Expression: MIT
9
+ Requires-Python: >=3.12
10
+ Project-URL: Homepage, https://codeberg.org/hpb/oldstyle-pdf-toc
11
+ Project-URL: Repository, https://codeberg.org/hpb/oldstyle-pdf-toc
12
+ Description-Content-Type: text/markdown
13
+
14
+ # Oldstyle PDF ToC
15
+
16
+ ## Install
17
+
18
+ ```shell
19
+ pip install oldstyle-pdf-toc
20
+ ```
21
+
22
+ ## Usage
23
+
24
+ ### 1. Create the `toc.txt`
25
+
26
+ The file must be named `toc.txt`.
27
+
28
+
29
+ ```txt
30
+ # Lines that begin with # are ignored; empty lines are also ignored
31
+
32
+ # Set page number to Uppercase roman number (I, II, III, ...)
33
+ \pageLabel{1}{1}{UppercaseRomanNumerals}
34
+
35
+ # Copy the original ToC of the book
36
+
37
+ Schrifttum 11
38
+
39
+ # Switch to decimal Arabic page numbers and restart at 1; the physical page number here is 13
40
+ \pageLabel{13}{1}{DecimalArabicNumerals}
41
+
42
+ # Copy other parts of the ToC of the Book
43
+
44
+ I. Abbildungsverfahren 1
45
+ 1. Aufgabe der Darstellenden Geometrie 1
46
+ 2. Zentralprojektion 4
47
+ 3. Parallelprojektion 6
48
+
49
+ II. Parallelprojektion und perspektive Affinität 8
50
+ 4. Invarianten der Parallelprojektion 8
51
+ 5. Der Satz von DESARGUES 11
52
+ 6. Perspektive Affinität 12
53
+ 7. Analytische Darstellung einer ebenen perspektiven Affinität 14
54
+ 8. Das Rechtwinkelpaar einer ebenen perspektiven Affinität 17
55
+ 9. Die Ellipse als affines Bild des Kreises 19
56
+ 10. Affine Eigenschaften der Ellipse 21
57
+ 11. Ellipsenkonstruktionen 23
58
+
59
+ # ... other entries
60
+ Namen- und Sachverzeichnis 339
61
+ ```
62
+
63
+ The complete example `toc.txt` can be found in this project repository.
64
+
65
+ ### 2. Call the python script
66
+
67
+ ```shell
68
+ oldstyle-pdf-toc > pdftk.toc.txt
69
+ ```
70
+
71
+ ### 3. Call the pdftk
72
+
73
+ ```shell
74
+ pdftk Strubecker.pdf update_info_utf8 pdftk.toc.txt output Strubecker.pdf-toc.pdf
75
+ ```
76
+
77
+ That's it!
78
+
79
+
80
+ ## Development
81
+
82
+ This project uses `uv` to manage dependencies and tasks during development and deployment.
83
+
84
+ ```shell
85
+ uv python install # Install a copy of python to this directory
86
+ uv venv # Create a virtual Python environment
87
+ source .venv/bin/activate # Activate venv
88
+ uv pip install -e . # Install this project as a python package into the virtual environment
89
+ uv build
90
+ ```
91
+
92
+ ## Deploy in PyPi
93
+
94
+ See: https://packaging.python.org/en/latest/guides/distributing-packages-using-setuptools/#create-an-account
95
+
96
+ In principle:
97
+
98
+ ```shell
99
+ uv build
100
+ ```
@@ -0,0 +1,87 @@
1
+ # Oldstyle PDF ToC
2
+
3
+ ## Install
4
+
5
+ ```shell
6
+ pip install oldstyle-pdf-toc
7
+ ```
8
+
9
+ ## Usage
10
+
11
+ ### 1. Create the `toc.txt`
12
+
13
+ The file must be named `toc.txt`.
14
+
15
+
16
+ ```txt
17
+ # Lines that begin with # are ignored; empty lines are also ignored
18
+
19
+ # Set page number to Uppercase roman number (I, II, III, ...)
20
+ \pageLabel{1}{1}{UppercaseRomanNumerals}
21
+
22
+ # Copy the original ToC of the book
23
+
24
+ Schrifttum 11
25
+
26
+ # Switch to decimal Arabic page numbers and restart at 1; the physical page number here is 13
27
+ \pageLabel{13}{1}{DecimalArabicNumerals}
28
+
29
+ # Copy other parts of the ToC of the Book
30
+
31
+ I. Abbildungsverfahren 1
32
+ 1. Aufgabe der Darstellenden Geometrie 1
33
+ 2. Zentralprojektion 4
34
+ 3. Parallelprojektion 6
35
+
36
+ II. Parallelprojektion und perspektive Affinität 8
37
+ 4. Invarianten der Parallelprojektion 8
38
+ 5. Der Satz von DESARGUES 11
39
+ 6. Perspektive Affinität 12
40
+ 7. Analytische Darstellung einer ebenen perspektiven Affinität 14
41
+ 8. Das Rechtwinkelpaar einer ebenen perspektiven Affinität 17
42
+ 9. Die Ellipse als affines Bild des Kreises 19
43
+ 10. Affine Eigenschaften der Ellipse 21
44
+ 11. Ellipsenkonstruktionen 23
45
+
46
+ # ... other entries
47
+ Namen- und Sachverzeichnis 339
48
+ ```
49
+
50
+ The complete example `toc.txt` can be found in this project repository.
51
+
52
+ ### 2. Call the python script
53
+
54
+ ```shell
55
+ oldstyle-pdf-toc > pdftk.toc.txt
56
+ ```
57
+
58
+ ### 3. Call the pdftk
59
+
60
+ ```shell
61
+ pdftk Strubecker.pdf update_info_utf8 pdftk.toc.txt output Strubecker.pdf-toc.pdf
62
+ ```
63
+
64
+ That's it!
65
+
66
+
67
+ ## Development
68
+
69
+ This project uses `uv` to manage dependencies and tasks during development and deployment.
70
+
71
+ ```shell
72
+ uv python install # Install a copy of python to this directory
73
+ uv venv # Create a virtual Python environment
74
+ source .venv/bin/activate # Activate venv
75
+ uv pip install -e . # Install this project as a python package into the virtual environment
76
+ uv build
77
+ ```
78
+
79
+ ## Deploy in PyPi
80
+
81
+ See: https://packaging.python.org/en/latest/guides/distributing-packages-using-setuptools/#create-an-account
82
+
83
+ In principle:
84
+
85
+ ```shell
86
+ uv build
87
+ ```
@@ -0,0 +1,26 @@
1
+ [project]
2
+ name = "oldstyle-pdf-toc"
3
+ version = "0.1.0"
4
+ description = "Add Table of Content in scanned German old-style Textbooks"
5
+ authors = [
6
+ {name="Hong-Phuc Bui", email="hong-phuc.bui@htwsaar.de"}
7
+ ]
8
+ readme = "README.md"
9
+ license = "MIT"
10
+ keywords = ["pdf"]
11
+
12
+ requires-python = ">=3.12"
13
+ dependencies = []
14
+
15
+
16
+ [project.urls]
17
+ Repository = "https://codeberg.org/hpb/oldstyle-pdf-toc"
18
+ Homepage = "https://codeberg.org/hpb/oldstyle-pdf-toc"
19
+
20
+
21
+ [project.scripts]
22
+ oldstyle-pdf-toc = "oldstyle_pdf_toc:main"
23
+
24
+ [build-system]
25
+ requires = ["uv_build>=0.9.21,<0.10.0"]
26
+ build-backend = "uv_build"
@@ -0,0 +1,5 @@
1
+ from .oldstyle_pdf_toc import *
2
+
3
def main() -> None:
    """Console-script entry point.

    Parses the file ``toc.txt`` (resolved relative to the current working
    directory) and lets ``parse_toc_file`` write the resulting records to its
    default output stream.
    """
    toc_path = "toc.txt"
    parse_toc_file(toc_path)
5
+
@@ -0,0 +1,188 @@
1
+ import re
2
+ import sys
3
+ from dataclasses import dataclass
4
+
5
+ # TODO:
6
+ # * make page-offset for begin
7
+ # * print pdftk command to stderr
8
+
9
@dataclass
class Chapter:
    """A table-of-contents entry that renders itself as a pdftk bookmark record."""

    # Chapter index as printed in the book: I, II, III, IV, etc. May be empty.
    idx: str
    # Title of the chapter, e.g. "Grundlage".
    title: str
    # Physical page number; counting begins with 1.
    page: int
    # ToC level; 1 = chapter.
    level: int = 1

    def __str__(self):
        """Return the bookmark record: a ``BookmarkBegin`` line plus three fields.

        Index and title are stripped of surrounding whitespace; a non-empty
        index is separated from the title by a single space.
        """
        prefix = self.idx.strip()
        heading = self.title.strip()
        if prefix:
            heading = f"{prefix} {heading}"
        record = (
            "BookmarkBegin",
            f"BookmarkTitle: {heading}",
            f"BookmarkLevel: {self.level}",
            f"BookmarkPageNumber: {self.page}",
        )
        # Every line, including the last one, is newline-terminated.
        return "\n".join(record) + "\n"
41
+
42
@dataclass
class Section(Chapter):
    """A ToC entry that differs from ``Chapter`` only in its default level."""

    # Sections default to bookmark level 2 instead of 1.
    level: int = 2
45
+
46
+
47
@dataclass
class PageLabel:
    """A page-numbering directive that renders itself as a pdftk page-label record."""

    # Page number counted from cover to cover.
    physical_page_counter: int
    # Page number within one part of the book: Vorwort, Hauptteil, ...
    logical_page_counter: int
    # Numbering style name understood by pdftk, e.g. "NoNumber",
    # "LowercaseRomanNumerals", "UppercaseRomanNumerals".
    num_style: str

    def __str__(self):
        """Return the record: a ``PageLabelBegin`` line plus three fields."""
        fields = (
            ("PageLabelStart", self.logical_page_counter),
            ("PageLabelNewIndex", self.physical_page_counter),
            ("PageLabelNumStyle", self.num_style),
        )
        body = "".join(f"{key}: {value}\n" for key, value in fields)
        return "PageLabelBegin\n" + body
68
+
69
+
70
+
71
+
72
+
73
# Compiled once at import time instead of on every call.
# Each pattern ends with the page number; the character directly before the
# page digits must be neither a digit nor a literal "$".
_CHAPTER_ENTRY = re.compile(r"""(^[IVX]+\.)(\s+)(.+[^\d$])(\d+)$""", re.MULTILINE | re.UNICODE)
_SECTION_ENTRY = re.compile(r"""(^[\d]+\.)(\s+)(.+[^\d$])(\d+)$""", re.MULTILINE | re.UNICODE)
_UNINDEXED_ENTRY = re.compile(r"(.+[^\d$])(\d+)$", re.MULTILINE | re.UNICODE)


def parse_toc_entry(idx, line, offset) -> "Chapter":
    """Parse one table-of-contents line into a ``Chapter`` or ``Section``.

    Three shapes are recognised, tried in this order:

    * ``I. Title 12``  - index made of the letters I, V, X plus a dot: ``Chapter``
    * ``3. Title 12``  - decimal index plus a dot: ``Section``
    * ``Title 12``     - no index: ``Chapter`` with an empty index

    Args:
        idx: Line number of the entry in the ToC file. Currently unused by
            this function; kept for interface compatibility.
        line: The raw ToC line; surrounding whitespace is stripped first.
        offset: Value added to the page number found at the end of the line.

    Returns:
        The parsed entry, with ``page`` set to the trailing number plus
        ``offset``.

    Raises:
        ValueError: If the line matches none of the three shapes.
    """
    line = line.strip()
    if found := _CHAPTER_ENTRY.match(line):
        return Chapter(found[1], found[3], int(found[4]) + offset)
    if found := _SECTION_ENTRY.match(line):
        return Section(found[1], found[3], int(found[4]) + offset)
    if found := _UNINDEXED_ENTRY.match(line):
        return Chapter("", found[1], int(found[2]) + offset)
    # Raise rather than return the exception object: a returned ValueError is
    # truthy and printable, so it would silently end up in the generated output.
    raise ValueError(f">{line}< is not a ToC-entry")
96
+
97
+
98
def parse_manual_info(idx, line, page_offset):
    """Parse a ``\\pageLabel{physical}{logical}{style}`` directive line.

    The directive keyword and the style are matched case-insensitively by the
    regular expression, but the style must then equal one of the allowed
    names exactly.

    Args:
        idx: Line number of the directive; not used by this function.
        line: The directive text.
        page_offset: Current page offset; not used by this function.

    Returns:
        A ``PageLabel`` built from the two numbers and the style name.

    Raises:
        ValueError: If the line is not a pageLabel directive, or if its style
            is not one of the allowed names.
    """
    directive = re.compile(r"(^\\pageLabel){(\d+)}{(\d+)}{([a-z]+)}", re.IGNORECASE | re.UNICODE)
    known_styles = frozenset((
        "UppercaseRomanNumerals",
        "DecimalArabicNumerals",
        "LowercaseRomanNumerals",
        "NoNumber",
    ))

    found = directive.match(line)
    if found is None:
        raise ValueError(f">{line}< is not a pageLabel")

    # Group 1 is the keyword itself; only the three arguments are needed.
    _, physical, logical, style = found.groups()
    if style not in known_styles:
        raise ValueError(f">{style}< is not allowed")

    return PageLabel(int(physical), int(logical), style)
118
+
119
+
120
def is_informative(line):
    """Return True when *line* is neither empty nor starts with ``#``."""
    if len(line) == 0:
        return False
    return not line.startswith("#")
122
+
123
+
124
def parse_toc_file(tocfilename, out=None):
    """Convert a ToC description file into records and print them to *out*.

    Each line is stripped. Empty lines and lines starting with ``#`` are
    skipped. A line starting with a backslash is parsed as a page-label
    directive and also updates the page offset applied to the ToC entries
    that follow it; every other line is parsed as a ToC entry.

    Args:
        tocfilename: Path of the ToC description file. It is read as UTF-8;
            a leading byte-order mark is tolerated.
        out: Text stream that receives the records. ``None`` (the default)
            means ``sys.stdout`` as it is at call time.

    Raises:
        ValueError: If a line cannot be parsed; the message is prefixed with
            the 1-based line number and the original error is chained.
    """
    if out is None:
        # Resolve at call time so a redirected sys.stdout is honoured.
        out = sys.stdout
    page_offset = 0
    # Explicit encoding: the locale default is not UTF-8 on every platform and
    # would corrupt non-ASCII titles. "utf-8-sig" also drops a leading BOM,
    # which would otherwise hide a "#" or "\" at the start of the first line.
    with open(tocfilename, "r", encoding="utf-8-sig") as toc:
        for toc_line_nr, raw_line in enumerate(toc, start=1):
            line = raw_line.strip()
            if not is_informative(line):
                continue
            try:
                if line.startswith("\\"):
                    record = parse_manual_info(toc_line_nr, line, page_offset)
                    # Printed page `logical_page_counter` sits on physical page
                    # `physical_page_counter`, so the distance between the two
                    # is the offset. For a logical start of 1 this equals the
                    # former `physical_page_counter - 1`.
                    page_offset = record.physical_page_counter - record.logical_page_counter
                else:
                    record = parse_toc_entry(toc_line_nr, line, page_offset)
            except ValueError as ex:
                raise ValueError(f"{toc_line_nr} : {repr(ex)}") from ex
            print(record, file=out, end="")
142
+
143
+
144
if __name__ == "__main__":
    # Run the same pipeline as the package's `main` entry point. The previous
    # loop passed every raw line (comments and \pageLabel directives included)
    # to parse_toc_entry with a hard-coded offset, so its output disagreed
    # with parse_toc_file.
    parse_toc_file("toc.txt")
153
+
154
+
155
+
156
+
157
+
158
+
159
+
160
+
161
+
162
+
163
+
164
+
165
+
166
+
167
+
168
+
169
+
170
+
171
+
172
+
173
+
174
+
175
+
176
+
177
+
178
+
179
+
180
+
181
+
182
+
183
+
184
+
185
+
186
+
187
+
188
+