weasyprint 65.1__py3-none-any.whl → 66.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- weasyprint/__init__.py +4 -1
- weasyprint/__main__.py +2 -0
- weasyprint/css/__init__.py +12 -4
- weasyprint/css/computed_values.py +8 -2
- weasyprint/css/html5_ua.css +2 -7
- weasyprint/css/html5_ua_form.css +1 -1
- weasyprint/css/utils.py +1 -1
- weasyprint/document.py +2 -10
- weasyprint/draw/__init__.py +51 -57
- weasyprint/draw/border.py +120 -66
- weasyprint/draw/text.py +1 -2
- weasyprint/formatting_structure/boxes.py +3 -2
- weasyprint/formatting_structure/build.py +32 -42
- weasyprint/images.py +8 -15
- weasyprint/layout/__init__.py +5 -2
- weasyprint/layout/absolute.py +4 -1
- weasyprint/layout/block.py +60 -29
- weasyprint/layout/column.py +1 -0
- weasyprint/layout/flex.py +41 -21
- weasyprint/layout/float.py +8 -1
- weasyprint/layout/grid.py +1 -1
- weasyprint/layout/inline.py +7 -8
- weasyprint/layout/page.py +23 -1
- weasyprint/layout/preferred.py +59 -32
- weasyprint/layout/table.py +8 -4
- weasyprint/pdf/__init__.py +13 -6
- weasyprint/pdf/anchors.py +2 -2
- weasyprint/pdf/pdfua.py +7 -115
- weasyprint/pdf/stream.py +40 -49
- weasyprint/pdf/tags.py +305 -0
- weasyprint/stacking.py +14 -15
- weasyprint/svg/__init__.py +22 -11
- weasyprint/svg/bounding_box.py +4 -2
- weasyprint/svg/defs.py +4 -9
- weasyprint/svg/utils.py +9 -5
- weasyprint/text/fonts.py +1 -1
- weasyprint/text/line_break.py +45 -26
- weasyprint/urls.py +21 -10
- {weasyprint-65.1.dist-info → weasyprint-66.0.dist-info}/METADATA +1 -1
- weasyprint-66.0.dist-info/RECORD +74 -0
- weasyprint/draw/stack.py +0 -13
- weasyprint-65.1.dist-info/RECORD +0 -74
- {weasyprint-65.1.dist-info → weasyprint-66.0.dist-info}/WHEEL +0 -0
- {weasyprint-65.1.dist-info → weasyprint-66.0.dist-info}/entry_points.txt +0 -0
- {weasyprint-65.1.dist-info → weasyprint-66.0.dist-info}/licenses/LICENSE +0 -0
weasyprint/pdf/__init__.py
CHANGED
|
@@ -12,6 +12,7 @@ from ..matrix import Matrix
|
|
|
12
12
|
from . import debug, pdfa, pdfua
|
|
13
13
|
from .fonts import build_fonts_dictionary
|
|
14
14
|
from .stream import Stream
|
|
15
|
+
from .tags import add_tags
|
|
15
16
|
|
|
16
17
|
from .anchors import ( # isort:skip
|
|
17
18
|
add_annotations, add_forms, add_links, add_outlines, resolve_links,
|
|
@@ -118,15 +119,15 @@ def generate_pdf(document, target, zoom, **options):
|
|
|
118
119
|
PROGRESS_LOGGER.info('Step 6 - Creating PDF')
|
|
119
120
|
|
|
120
121
|
# Set properties according to PDF variants
|
|
121
|
-
mark = False
|
|
122
122
|
srgb = options['srgb']
|
|
123
|
+
pdf_tags = options['pdf_tags']
|
|
123
124
|
variant = options['pdf_variant']
|
|
124
125
|
if variant:
|
|
125
126
|
variant_function, properties = VARIANTS[variant]
|
|
126
|
-
if 'mark' in properties:
|
|
127
|
-
mark = properties['mark']
|
|
128
127
|
if 'srgb' in properties:
|
|
129
128
|
srgb = properties['srgb']
|
|
129
|
+
if 'pdf_tags' in properties:
|
|
130
|
+
pdf_tags = properties['pdf_tags']
|
|
130
131
|
|
|
131
132
|
pdf = pydyf.PDF()
|
|
132
133
|
images = {}
|
|
@@ -159,6 +160,8 @@ def generate_pdf(document, target, zoom, **options):
|
|
|
159
160
|
compress = not options['uncompressed_pdf']
|
|
160
161
|
for page_number, (page, links_and_anchors) in enumerate(
|
|
161
162
|
zip(document.pages, page_links_and_anchors)):
|
|
163
|
+
tags = {} if pdf_tags else None
|
|
164
|
+
|
|
162
165
|
# Draw from the top-left corner
|
|
163
166
|
matrix = Matrix(scale, 0, 0, -scale, 0, page.height * scale)
|
|
164
167
|
|
|
@@ -175,7 +178,7 @@ def generate_pdf(document, target, zoom, **options):
|
|
|
175
178
|
left / scale, top / scale,
|
|
176
179
|
(right - left) / scale, (bottom - top) / scale)
|
|
177
180
|
stream = Stream(
|
|
178
|
-
document.fonts, page_rectangle, resources, images, mark, compress=compress)
|
|
181
|
+
document.fonts, page_rectangle, resources, images, tags, compress=compress)
|
|
179
182
|
stream.transform(d=-1, f=(page.height * scale))
|
|
180
183
|
pdf.add_object(stream)
|
|
181
184
|
page_streams.append(stream)
|
|
@@ -187,13 +190,13 @@ def generate_pdf(document, target, zoom, **options):
|
|
|
187
190
|
'Contents': stream.reference,
|
|
188
191
|
'Resources': resources.reference,
|
|
189
192
|
})
|
|
190
|
-
if mark:
|
|
193
|
+
if pdf_tags:
|
|
191
194
|
pdf_page['Tabs'] = '/S'
|
|
192
195
|
pdf_page['StructParents'] = page_number
|
|
193
196
|
pdf.add_page(pdf_page)
|
|
194
197
|
pdf_pages.append(pdf_page)
|
|
195
198
|
|
|
196
|
-
add_links(links_and_anchors, matrix, pdf, pdf_page, pdf_names, mark)
|
|
199
|
+
add_links(links_and_anchors, matrix, pdf, pdf_page, pdf_names, tags)
|
|
197
200
|
add_annotations(
|
|
198
201
|
links_and_anchors[0], matrix, document, pdf, pdf_page, annot_files,
|
|
199
202
|
compress)
|
|
@@ -323,6 +326,10 @@ def generate_pdf(document, target, zoom, **options):
|
|
|
323
326
|
}),
|
|
324
327
|
])
|
|
325
328
|
|
|
329
|
+
# Add tags
|
|
330
|
+
if pdf_tags:
|
|
331
|
+
add_tags(pdf, document, page_streams)
|
|
332
|
+
|
|
326
333
|
# Apply PDF variants functions
|
|
327
334
|
if variant:
|
|
328
335
|
variant_function(
|
weasyprint/pdf/anchors.py
CHANGED
|
@@ -16,7 +16,7 @@ from ..text.fonts import get_font_description
|
|
|
16
16
|
from ..urls import URLFetchingError
|
|
17
17
|
|
|
18
18
|
|
|
19
|
-
def add_links(links_and_anchors, matrix, pdf, page, names, mark):
|
|
19
|
+
def add_links(links_and_anchors, matrix, pdf, page, names, tags):
|
|
20
20
|
"""Include hyperlinks in given PDF page."""
|
|
21
21
|
links, anchors = links_and_anchors
|
|
22
22
|
|
|
@@ -30,7 +30,7 @@ def add_links(links_and_anchors, matrix, pdf, page, names, mark):
|
|
|
30
30
|
'Rect': pydyf.Array([x1, y1, x2, y2]),
|
|
31
31
|
'BS': pydyf.Dictionary({'W': 0}),
|
|
32
32
|
})
|
|
33
|
-
if mark:
|
|
33
|
+
if tags is not None:
|
|
34
34
|
box.link_annotation['Contents'] = pydyf.String(link_target)
|
|
35
35
|
if link_type == 'internal':
|
|
36
36
|
box.link_annotation['Dest'] = pydyf.String(link_target)
|
weasyprint/pdf/pdfua.py
CHANGED
|
@@ -1,125 +1,17 @@
|
|
|
1
1
|
"""PDF/UA generation."""
|
|
2
2
|
|
|
3
|
-
import pydyf
|
|
3
|
+
from functools import partial
|
|
4
4
|
|
|
5
5
|
from .metadata import add_metadata
|
|
6
6
|
|
|
7
7
|
|
|
8
|
-
def pdfua(pdf, metadata, document, page_streams, attachments, compress):
|
|
8
|
+
def pdfua(pdf, metadata, document, page_streams, attachments, compress, version):
|
|
9
9
|
"""Set metadata for PDF/UA documents."""
|
|
10
|
-
# Structure for PDF tagging
|
|
11
|
-
content_mapping = pydyf.Dictionary({})
|
|
12
|
-
pdf.add_object(content_mapping)
|
|
13
|
-
structure_root = pydyf.Dictionary({
|
|
14
|
-
'Type': '/StructTreeRoot',
|
|
15
|
-
'ParentTree': content_mapping.reference,
|
|
16
|
-
})
|
|
17
|
-
pdf.add_object(structure_root)
|
|
18
|
-
structure_document = pydyf.Dictionary({
|
|
19
|
-
'Type': '/StructElem',
|
|
20
|
-
'S': '/Document',
|
|
21
|
-
'P': structure_root.reference,
|
|
22
|
-
})
|
|
23
|
-
pdf.add_object(structure_document)
|
|
24
|
-
structure_root['K'] = pydyf.Array([structure_document.reference])
|
|
25
|
-
pdf.catalog['StructTreeRoot'] = structure_root.reference
|
|
26
|
-
|
|
27
|
-
document_children = []
|
|
28
|
-
content_mapping['Nums'] = pydyf.Array()
|
|
29
|
-
links = []
|
|
30
|
-
for page_number, page_stream in enumerate(page_streams):
|
|
31
|
-
structure = {}
|
|
32
|
-
document.build_element_structure(structure)
|
|
33
|
-
parents = [None] * len(page_stream.marked)
|
|
34
|
-
for mcid, (key, box) in enumerate(page_stream.marked):
|
|
35
|
-
# Build structure elements
|
|
36
|
-
kids = [mcid]
|
|
37
|
-
if key == 'Link':
|
|
38
|
-
object_reference = pydyf.Dictionary({
|
|
39
|
-
'Type': '/OBJR',
|
|
40
|
-
'Obj': box.link_annotation.reference,
|
|
41
|
-
'Pg': pdf.page_references[page_number],
|
|
42
|
-
})
|
|
43
|
-
pdf.add_object(object_reference)
|
|
44
|
-
links.append((object_reference.reference, box.link_annotation))
|
|
45
|
-
etree_element = box.element
|
|
46
|
-
child_structure_data_element = None
|
|
47
|
-
while True:
|
|
48
|
-
if etree_element is None:
|
|
49
|
-
structure_data = structure.setdefault(
|
|
50
|
-
box, {'parent': None})
|
|
51
|
-
else:
|
|
52
|
-
structure_data = structure[etree_element]
|
|
53
|
-
new_element = 'element' not in structure_data
|
|
54
|
-
if new_element:
|
|
55
|
-
child = structure_data['element'] = pydyf.Dictionary({
|
|
56
|
-
'Type': '/StructElem',
|
|
57
|
-
'S': f'/{key}',
|
|
58
|
-
'K': pydyf.Array(kids),
|
|
59
|
-
'Pg': pdf.page_references[page_number],
|
|
60
|
-
})
|
|
61
|
-
pdf.add_object(child)
|
|
62
|
-
if key == 'LI':
|
|
63
|
-
if etree_element.tag == 'dt':
|
|
64
|
-
sub_key = 'Lbl'
|
|
65
|
-
else:
|
|
66
|
-
sub_key = 'LBody'
|
|
67
|
-
real_child = pydyf.Dictionary({
|
|
68
|
-
'Type': '/StructElem',
|
|
69
|
-
'S': f'/{sub_key}',
|
|
70
|
-
'K': pydyf.Array(kids),
|
|
71
|
-
'Pg': pdf.page_references[page_number],
|
|
72
|
-
'P': child.reference,
|
|
73
|
-
})
|
|
74
|
-
pdf.add_object(real_child)
|
|
75
|
-
for kid in kids:
|
|
76
|
-
if isinstance(kid, int):
|
|
77
|
-
parents[kid] = real_child.reference
|
|
78
|
-
child['K'] = pydyf.Array([real_child.reference])
|
|
79
|
-
structure_data['element'] = real_child
|
|
80
|
-
else:
|
|
81
|
-
for kid in kids:
|
|
82
|
-
if isinstance(kid, int):
|
|
83
|
-
parents[kid] = child.reference
|
|
84
|
-
else:
|
|
85
|
-
child = structure_data['element']
|
|
86
|
-
child['K'].extend(kids)
|
|
87
|
-
for kid in kids:
|
|
88
|
-
if isinstance(kid, int):
|
|
89
|
-
parents[kid] = child.reference
|
|
90
|
-
kid = child.reference
|
|
91
|
-
if child_structure_data_element is not None:
|
|
92
|
-
child_structure_data_element['P'] = kid
|
|
93
|
-
if not new_element:
|
|
94
|
-
break
|
|
95
|
-
kids = [kid]
|
|
96
|
-
child_structure_data_element = child
|
|
97
|
-
if structure_data['parent'] is None:
|
|
98
|
-
child['P'] = structure_document.reference
|
|
99
|
-
document_children.append(child.reference)
|
|
100
|
-
break
|
|
101
|
-
else:
|
|
102
|
-
etree_element = structure_data['parent']
|
|
103
|
-
key = page_stream.get_marked_content_tag(etree_element.tag)
|
|
104
|
-
content_mapping['Nums'].append(page_number)
|
|
105
|
-
content_mapping['Nums'].append(pydyf.Array(parents))
|
|
106
|
-
structure_document['K'] = pydyf.Array(document_children)
|
|
107
|
-
for i, (link, annotation) in enumerate(links, start=page_number + 1):
|
|
108
|
-
content_mapping['Nums'].append(i)
|
|
109
|
-
content_mapping['Nums'].append(link)
|
|
110
|
-
annotation['StructParent'] = i
|
|
111
|
-
annotation['F'] = 2 ** (2 - 1)
|
|
112
|
-
|
|
113
10
|
# Common PDF metadata stream
|
|
114
|
-
add_metadata(pdf, metadata, 'ua',
|
|
115
|
-
|
|
116
|
-
# PDF document extra metadata
|
|
117
|
-
if 'Lang' not in pdf.catalog:
|
|
118
|
-
pdf.catalog['Lang'] = pydyf.String()
|
|
119
|
-
pdf.catalog['ViewerPreferences'] = pydyf.Dictionary({
|
|
120
|
-
'DisplayDocTitle': 'true',
|
|
121
|
-
})
|
|
122
|
-
pdf.catalog['MarkInfo'] = pydyf.Dictionary({'Marked': 'true'})
|
|
11
|
+
add_metadata(pdf, metadata, 'ua', version, conformance=None, compress=compress)
|
|
123
12
|
|
|
124
13
|
|
|
125
|
-
VARIANTS = {
|
|
14
|
+
VARIANTS = {
|
|
15
|
+
'pdf/ua-1': (partial(pdfua, version=1), {'version': '1.7', 'pdf_tags': True}),
|
|
16
|
+
'pdf/ua-2': (partial(pdfua, version=2), {'version': '2.0', 'pdf_tags': True}),
|
|
17
|
+
}
|
weasyprint/pdf/stream.py
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
"""PDF stream."""
|
|
2
2
|
|
|
3
|
+
from contextlib import contextmanager
|
|
4
|
+
|
|
3
5
|
import pydyf
|
|
4
6
|
|
|
5
7
|
from ..logger import LOGGER
|
|
@@ -11,14 +13,13 @@ from .fonts import Font
|
|
|
11
13
|
|
|
12
14
|
class Stream(pydyf.Stream):
|
|
13
15
|
"""PDF stream object with extra features."""
|
|
14
|
-
def __init__(self, fonts, page_rectangle, resources, images, mark, *args, **kwargs):
|
|
16
|
+
def __init__(self, fonts, page_rectangle, resources, images, tags, *args, **kwargs):
|
|
15
17
|
super().__init__(*args, **kwargs)
|
|
16
18
|
self.page_rectangle = page_rectangle
|
|
17
|
-
self.marked = []
|
|
18
19
|
self._fonts = fonts
|
|
19
20
|
self._resources = resources
|
|
20
21
|
self._images = images
|
|
21
|
-
self._mark = mark
|
|
22
|
+
self._tags = tags
|
|
22
23
|
self._current_color = self._current_color_stroke = None
|
|
23
24
|
self._current_alpha = self._current_alpha_stroke = None
|
|
24
25
|
self._current_font = self._current_font_size = None
|
|
@@ -39,8 +40,8 @@ class Stream(pydyf.Stream):
|
|
|
39
40
|
kwargs['resources'] = self._resources
|
|
40
41
|
if 'images' not in kwargs:
|
|
41
42
|
kwargs['images'] = self._images
|
|
42
|
-
if 'mark' not in kwargs:
|
|
43
|
-
kwargs['mark'] = self._mark
|
|
43
|
+
if 'tags' not in kwargs:
|
|
44
|
+
kwargs['tags'] = self._tags
|
|
44
45
|
if 'compress' not in kwargs:
|
|
45
46
|
kwargs['compress'] = self.compress
|
|
46
47
|
return Stream(**kwargs)
|
|
@@ -105,7 +106,7 @@ class Stream(pydyf.Stream):
|
|
|
105
106
|
lightness, a, b = color.to('lab').coordinates
|
|
106
107
|
self.set_color_special(None, stroke, lightness, a, b)
|
|
107
108
|
else:
|
|
108
|
-
LOGGER.warn('Unsupported color space %s, use sRGB instead', color.space)
|
|
109
|
+
LOGGER.warning('Unsupported color space %s, use sRGB instead', color.space)
|
|
109
110
|
self.set_color_rgb(*channels, stroke)
|
|
110
111
|
|
|
111
112
|
def set_font_size(self, font, size):
|
|
@@ -248,21 +249,39 @@ class Stream(pydyf.Stream):
|
|
|
248
249
|
self._resources['Shading'][shading.id] = shading
|
|
249
250
|
return shading
|
|
250
251
|
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
252
|
+
@contextmanager
|
|
253
|
+
def stacked(self):
|
|
254
|
+
"""Save and restore stream context when used with the ``with`` keyword."""
|
|
255
|
+
self.push_state()
|
|
256
|
+
try:
|
|
257
|
+
yield
|
|
258
|
+
finally:
|
|
259
|
+
self.pop_state()
|
|
260
|
+
|
|
261
|
+
@contextmanager
|
|
262
|
+
def marked(self, box, tag):
|
|
263
|
+
if self._tags is not None:
|
|
264
|
+
property_list = None
|
|
265
|
+
mcid = len(self._tags)
|
|
266
|
+
assert box not in self._tags
|
|
267
|
+
self._tags[box] = {'tag': tag, 'mcid': mcid}
|
|
268
|
+
property_list = pydyf.Dictionary({'MCID': mcid})
|
|
269
|
+
super().begin_marked_content(tag, property_list)
|
|
270
|
+
try:
|
|
271
|
+
yield
|
|
272
|
+
finally:
|
|
273
|
+
if self._tags is not None:
|
|
274
|
+
super().end_marked_content()
|
|
275
|
+
|
|
276
|
+
@contextmanager
|
|
277
|
+
def artifact(self):
|
|
278
|
+
if self._tags is not None:
|
|
279
|
+
super().begin_marked_content('Artifact')
|
|
280
|
+
try:
|
|
281
|
+
yield
|
|
282
|
+
finally:
|
|
283
|
+
if self._tags is not None:
|
|
284
|
+
super().end_marked_content()
|
|
266
285
|
|
|
267
286
|
@staticmethod
|
|
268
287
|
def create_interpolation_function(domain, c0, c1, n):
|
|
@@ -283,31 +302,3 @@ class Stream(pydyf.Stream):
|
|
|
283
302
|
'Bounds': pydyf.Array(bounds),
|
|
284
303
|
'Functions': pydyf.Array(sub_functions),
|
|
285
304
|
})
|
|
286
|
-
|
|
287
|
-
def get_marked_content_tag(self, element_tag):
|
|
288
|
-
if element_tag == 'div':
|
|
289
|
-
return 'Div'
|
|
290
|
-
elif element_tag == 'span':
|
|
291
|
-
return 'Span'
|
|
292
|
-
elif element_tag == 'article':
|
|
293
|
-
return 'Art'
|
|
294
|
-
elif element_tag == 'section':
|
|
295
|
-
return 'Sect'
|
|
296
|
-
elif element_tag == 'blockquote':
|
|
297
|
-
return 'BlockQuote'
|
|
298
|
-
elif element_tag == 'p':
|
|
299
|
-
return 'P'
|
|
300
|
-
elif element_tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
|
|
301
|
-
return element_tag.upper()
|
|
302
|
-
elif element_tag in ('dl', 'ul', 'ol'):
|
|
303
|
-
return 'L'
|
|
304
|
-
elif element_tag in ('li', 'dt', 'dd'):
|
|
305
|
-
return 'LI'
|
|
306
|
-
elif element_tag == 'table':
|
|
307
|
-
return 'Table'
|
|
308
|
-
elif element_tag in ('tr', 'th', 'td'):
|
|
309
|
-
return element_tag.upper()
|
|
310
|
-
elif element_tag in ('thead', 'tbody', 'tfoot'):
|
|
311
|
-
return element_tag[:2].upper() + element_tag[2:]
|
|
312
|
-
else:
|
|
313
|
-
return 'NonStruct'
|
weasyprint/pdf/tags.py
ADDED
|
@@ -0,0 +1,305 @@
|
|
|
1
|
+
"""PDF tagging."""
|
|
2
|
+
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
|
|
5
|
+
import pydyf
|
|
6
|
+
|
|
7
|
+
from ..formatting_structure import boxes
|
|
8
|
+
from ..layout.absolute import AbsolutePlaceholder
|
|
9
|
+
from ..logger import LOGGER
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def add_tags(pdf, document, page_streams):
|
|
13
|
+
"""Add tag tree to the document."""
|
|
14
|
+
|
|
15
|
+
# Add root structure.
|
|
16
|
+
content_mapping = pydyf.Dictionary({})
|
|
17
|
+
pdf.add_object(content_mapping)
|
|
18
|
+
structure_root = pydyf.Dictionary({
|
|
19
|
+
'Type': '/StructTreeRoot',
|
|
20
|
+
'ParentTree': content_mapping.reference,
|
|
21
|
+
})
|
|
22
|
+
pdf.add_object(structure_root)
|
|
23
|
+
structure_document = pydyf.Dictionary({
|
|
24
|
+
'Type': '/StructElem',
|
|
25
|
+
'S': '/Document',
|
|
26
|
+
'K': pydyf.Array(),
|
|
27
|
+
'P': structure_root.reference,
|
|
28
|
+
})
|
|
29
|
+
pdf.add_object(structure_document)
|
|
30
|
+
structure_root['K'] = pydyf.Array([structure_document.reference])
|
|
31
|
+
pdf.catalog['StructTreeRoot'] = structure_root.reference
|
|
32
|
+
|
|
33
|
+
# Map content.
|
|
34
|
+
content_mapping['Nums'] = pydyf.Array()
|
|
35
|
+
links = []
|
|
36
|
+
for page_number, (page, stream) in enumerate(zip(document.pages, page_streams)):
|
|
37
|
+
tags = stream._tags
|
|
38
|
+
page_box = page._page_box
|
|
39
|
+
|
|
40
|
+
# Prepare array for this page’s MCID-to-StructElem mapping.
|
|
41
|
+
content_mapping['Nums'].append(page_number)
|
|
42
|
+
content_mapping['Nums'].append(pydyf.Array())
|
|
43
|
+
page_nums = {}
|
|
44
|
+
|
|
45
|
+
# Map page box content.
|
|
46
|
+
elements = _build_box_tree(
|
|
47
|
+
page_box, structure_document, pdf, page_number, page_nums, links, tags)
|
|
48
|
+
for element in elements:
|
|
49
|
+
structure_document['K'].append(element.reference)
|
|
50
|
+
assert not tags
|
|
51
|
+
|
|
52
|
+
# Flatten page-local nums into global mapping.
|
|
53
|
+
sorted_refs = [ref for _, ref in sorted(page_nums.items())]
|
|
54
|
+
content_mapping['Nums'][-1].extend(sorted_refs)
|
|
55
|
+
|
|
56
|
+
# Add annotations for links.
|
|
57
|
+
for i, (link_reference, annotation) in enumerate(links, start=len(document.pages)):
|
|
58
|
+
content_mapping['Nums'].append(i)
|
|
59
|
+
content_mapping['Nums'].append(link_reference)
|
|
60
|
+
annotation['StructParent'] = i
|
|
61
|
+
|
|
62
|
+
# Add required metadata.
|
|
63
|
+
pdf.catalog['ViewerPreferences'] = pydyf.Dictionary({'DisplayDocTitle': 'true'})
|
|
64
|
+
pdf.catalog['MarkInfo'] = pydyf.Dictionary({'Marked': 'true'})
|
|
65
|
+
if 'Lang' not in pdf.catalog:
|
|
66
|
+
LOGGER.error('Missing required "lang" attribute at the root of the document')
|
|
67
|
+
pdf.catalog['Lang'] = pydyf.String()
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _get_pdf_tag(tag):
|
|
71
|
+
"""Get PDF tag corresponding to HTML tag."""
|
|
72
|
+
if tag is None:
|
|
73
|
+
return 'NonStruct'
|
|
74
|
+
elif tag == 'div':
|
|
75
|
+
return 'Div'
|
|
76
|
+
elif tag.split(':')[0] == 'a':
|
|
77
|
+
# Links and link pseudo elements create link annotations.
|
|
78
|
+
return 'Link'
|
|
79
|
+
elif tag == 'span':
|
|
80
|
+
return 'Span'
|
|
81
|
+
elif tag == 'main':
|
|
82
|
+
return 'Part'
|
|
83
|
+
elif tag == 'article':
|
|
84
|
+
return 'Art'
|
|
85
|
+
elif tag == 'section':
|
|
86
|
+
return 'Sect'
|
|
87
|
+
elif tag == 'blockquote':
|
|
88
|
+
return 'BlockQuote'
|
|
89
|
+
elif tag == 'p':
|
|
90
|
+
return 'P'
|
|
91
|
+
elif tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
|
|
92
|
+
return tag.upper()
|
|
93
|
+
elif tag in ('dl', 'ul', 'ol'):
|
|
94
|
+
return 'L'
|
|
95
|
+
elif tag in ('li', 'dt', 'dd'):
|
|
96
|
+
# TODO: dt should be different.
|
|
97
|
+
return 'LI'
|
|
98
|
+
elif tag == 'li::marker':
|
|
99
|
+
return 'Lbl'
|
|
100
|
+
elif tag == 'table':
|
|
101
|
+
return 'Table'
|
|
102
|
+
elif tag in ('tr', 'th', 'td'):
|
|
103
|
+
return tag.upper()
|
|
104
|
+
elif tag in ('thead', 'tbody', 'tfoot'):
|
|
105
|
+
return tag[:2].upper() + tag[2:]
|
|
106
|
+
elif tag == 'img':
|
|
107
|
+
return 'Figure'
|
|
108
|
+
elif tag in ('caption', 'figcaption'):
|
|
109
|
+
return 'Caption'
|
|
110
|
+
else:
|
|
111
|
+
return 'NonStruct'
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _build_box_tree(box, parent, pdf, page_number, nums, links, tags):
|
|
115
|
+
"""Recursively build tag tree for given box and yield children."""
|
|
116
|
+
|
|
117
|
+
# Special case for absolute elements.
|
|
118
|
+
if isinstance(box, AbsolutePlaceholder):
|
|
119
|
+
box = box._box
|
|
120
|
+
|
|
121
|
+
element_tag = None if box.element is None else box.element_tag
|
|
122
|
+
tag = _get_pdf_tag(element_tag)
|
|
123
|
+
|
|
124
|
+
# Special case for html, body, page boxes and margin boxes.
|
|
125
|
+
if element_tag in ('html', 'body') or isinstance(box, boxes.PageBox):
|
|
126
|
+
# Avoid generate page, html and body boxes as a semantic node, yield children.
|
|
127
|
+
if isinstance(box, boxes.ParentBox) and not isinstance(box, boxes.LineBox):
|
|
128
|
+
for child in box.children:
|
|
129
|
+
yield from _build_box_tree(
|
|
130
|
+
child, parent, pdf, page_number, nums, links, tags)
|
|
131
|
+
return
|
|
132
|
+
elif isinstance(box, boxes.MarginBox):
|
|
133
|
+
# Build tree for margin boxes but don’t link it to main tree. It ensures that
|
|
134
|
+
# marked content is mapped in document and removed from list. It could be
|
|
135
|
+
# included in tree as Artifact, but that’s only allowed in PDF 2.0.
|
|
136
|
+
for child in box.children:
|
|
137
|
+
tuple(_build_box_tree(child, parent, pdf, page_number, nums, links, tags))
|
|
138
|
+
return
|
|
139
|
+
|
|
140
|
+
# Create box element.
|
|
141
|
+
if tag == 'LI':
|
|
142
|
+
anonymous_list_element = parent['S'] == '/LI'
|
|
143
|
+
anonymous_li_child = parent['S'] == '/LBody'
|
|
144
|
+
dl_item = box.element_tag in ('dt', 'dd')
|
|
145
|
+
no_bullet_li = box.element_tag == 'li' and (
|
|
146
|
+
'list-item' not in box.style['display'] or
|
|
147
|
+
box.style['list_style_type'] == 'none')
|
|
148
|
+
if anonymous_list_element:
|
|
149
|
+
# Store as list item body.
|
|
150
|
+
tag = 'LBody'
|
|
151
|
+
elif anonymous_li_child:
|
|
152
|
+
# Store as non struct list item body child.
|
|
153
|
+
tag = 'NonStruct'
|
|
154
|
+
elif dl_item or no_bullet_li:
|
|
155
|
+
# Wrap in list item.
|
|
156
|
+
tag = 'LBody'
|
|
157
|
+
parent = pydyf.Dictionary({
|
|
158
|
+
'Type': '/StructElem',
|
|
159
|
+
'S': '/LI',
|
|
160
|
+
'K': pydyf.Array([]),
|
|
161
|
+
'Pg': pdf.page_references[page_number],
|
|
162
|
+
'P': parent.reference,
|
|
163
|
+
})
|
|
164
|
+
pdf.add_object(parent)
|
|
165
|
+
children = _build_box_tree(box, parent, pdf, page_number, nums, links, tags)
|
|
166
|
+
for child in children:
|
|
167
|
+
parent['K'].append(child.reference)
|
|
168
|
+
yield parent
|
|
169
|
+
return
|
|
170
|
+
|
|
171
|
+
element = pydyf.Dictionary({
|
|
172
|
+
'Type': '/StructElem',
|
|
173
|
+
'S': f'/{tag}',
|
|
174
|
+
'K': pydyf.Array([]),
|
|
175
|
+
'Pg': pdf.page_references[page_number],
|
|
176
|
+
'P': parent.reference,
|
|
177
|
+
})
|
|
178
|
+
pdf.add_object(element)
|
|
179
|
+
|
|
180
|
+
# Handle special cases.
|
|
181
|
+
if tag == 'Figure':
|
|
182
|
+
# Add extra data for images.
|
|
183
|
+
x1, y1 = box.content_box_x(), box.content_box_y()
|
|
184
|
+
x2, y2 = x1 + box.width, y1 + box.height
|
|
185
|
+
element['A'] = pydyf.Dictionary({
|
|
186
|
+
'O': '/Layout',
|
|
187
|
+
'BBox': pydyf.Array((x1, y1, x2, y2)),
|
|
188
|
+
})
|
|
189
|
+
if alt := box.element.attrib.get('alt'):
|
|
190
|
+
element['Alt'] = pydyf.String(alt)
|
|
191
|
+
else:
|
|
192
|
+
source = box.element.attrib.get('src', 'unknown')
|
|
193
|
+
LOGGER.error(f'Image "{source}" has no required alt description')
|
|
194
|
+
elif tag == 'Table':
|
|
195
|
+
# Use wrapped table as tagged box, and put captions in it.
|
|
196
|
+
wrapper, table = box, box.get_wrapped_table()
|
|
197
|
+
box = table.copy_with_children([])
|
|
198
|
+
for child in wrapper.children:
|
|
199
|
+
box.children.extend(child.children if child is table else [child])
|
|
200
|
+
elif tag == 'TH':
|
|
201
|
+
# Set identifier for table headers to reference them in cells.
|
|
202
|
+
element['ID'] = pydyf.String(id(box))
|
|
203
|
+
elif tag == 'TD':
|
|
204
|
+
# Store table cell element to map it to headers later.
|
|
205
|
+
# TODO: don’t use the box to store this.
|
|
206
|
+
box.mark = element
|
|
207
|
+
|
|
208
|
+
# Include link annotations.
|
|
209
|
+
if box.link_annotation:
|
|
210
|
+
annotation = box.link_annotation
|
|
211
|
+
object_reference = pydyf.Dictionary({
|
|
212
|
+
'Type': '/OBJR',
|
|
213
|
+
'Obj': annotation.reference,
|
|
214
|
+
'Pg': pdf.page_references[page_number],
|
|
215
|
+
})
|
|
216
|
+
pdf.add_object(object_reference)
|
|
217
|
+
links.append((element.reference, annotation))
|
|
218
|
+
element['K'].append(object_reference.reference)
|
|
219
|
+
|
|
220
|
+
if isinstance(box, boxes.ParentBox):
|
|
221
|
+
# Build tree for box children.
|
|
222
|
+
for child in box.children:
|
|
223
|
+
children = child.children if isinstance(child, boxes.LineBox) else [child]
|
|
224
|
+
for child in children:
|
|
225
|
+
if isinstance(child, boxes.TextBox):
|
|
226
|
+
# Add marked element from the stream.
|
|
227
|
+
kid = tags.pop(child)
|
|
228
|
+
assert kid['mcid'] not in nums
|
|
229
|
+
if tag == 'Link':
|
|
230
|
+
# Associate MCID directly with link reference.
|
|
231
|
+
element['K'].append(kid['mcid'])
|
|
232
|
+
nums[kid['mcid']] = element.reference
|
|
233
|
+
else:
|
|
234
|
+
kid_element = pydyf.Dictionary({
|
|
235
|
+
'Type': '/StructElem',
|
|
236
|
+
'S': f'/{kid["tag"]}',
|
|
237
|
+
'K': pydyf.Array([kid['mcid']]),
|
|
238
|
+
'Pg': pdf.page_references[page_number],
|
|
239
|
+
'P': element.reference,
|
|
240
|
+
})
|
|
241
|
+
pdf.add_object(kid_element)
|
|
242
|
+
element['K'].append(kid_element.reference)
|
|
243
|
+
nums[kid['mcid']] = kid_element.reference
|
|
244
|
+
else:
|
|
245
|
+
# Recursively build tree for child.
|
|
246
|
+
if child.element_tag in ('ul', 'ol') and element['S'] == '/LI':
|
|
247
|
+
# In PDFs, nested lists are linked to the parent list, but in
|
|
248
|
+
# HTML, nested lists are linked to a parent’s list item.
|
|
249
|
+
child_parent = parent
|
|
250
|
+
else:
|
|
251
|
+
child_parent = element
|
|
252
|
+
child_elements = _build_box_tree(
|
|
253
|
+
child, child_parent, pdf, page_number, nums, links, tags)
|
|
254
|
+
|
|
255
|
+
# Check if it is already been referenced before.
|
|
256
|
+
for child_element in child_elements:
|
|
257
|
+
child_parent['K'].append(child_element.reference)
|
|
258
|
+
|
|
259
|
+
else:
|
|
260
|
+
# Add replaced box.
|
|
261
|
+
assert isinstance(box, boxes.ReplacedBox)
|
|
262
|
+
kid = tags.pop(box)
|
|
263
|
+
element['K'].append(kid['mcid'])
|
|
264
|
+
assert kid['mcid'] not in nums
|
|
265
|
+
nums[kid['mcid']] = element.reference
|
|
266
|
+
|
|
267
|
+
# Link table cells to related headers.
|
|
268
|
+
if tag == 'Table':
|
|
269
|
+
def _get_rows(table_box):
|
|
270
|
+
for child in table_box.children:
|
|
271
|
+
if child.element_tag == 'tr':
|
|
272
|
+
yield child
|
|
273
|
+
else:
|
|
274
|
+
yield from _get_rows(child)
|
|
275
|
+
|
|
276
|
+
# Get headers and rows.
|
|
277
|
+
column_headers = defaultdict(list)
|
|
278
|
+
row_headers = defaultdict(list)
|
|
279
|
+
rows = tuple(_get_rows(box))
|
|
280
|
+
|
|
281
|
+
# Find column and row headers.
|
|
282
|
+
# TODO: handle rowspan and colspan values.
|
|
283
|
+
for i, row in enumerate(rows):
|
|
284
|
+
for j, cell in enumerate(row.children):
|
|
285
|
+
if cell.element is None:
|
|
286
|
+
continue
|
|
287
|
+
if cell.element_tag == 'th':
|
|
288
|
+
# TODO: handle rowgroup and colgroup values.
|
|
289
|
+
if cell.element.attrib.get('scope') == 'row':
|
|
290
|
+
row_headers[i].append(pydyf.String(id(cell)))
|
|
291
|
+
else:
|
|
292
|
+
column_headers[j].append(pydyf.String(id(cell)))
|
|
293
|
+
|
|
294
|
+
# Map headers to cells.
|
|
295
|
+
for i, row in enumerate(rows):
|
|
296
|
+
for j, cell in enumerate(row.children):
|
|
297
|
+
if cell.element is None:
|
|
298
|
+
continue
|
|
299
|
+
if cell.element_tag == 'td':
|
|
300
|
+
cell.mark['A'] = pydyf.Dictionary({
|
|
301
|
+
'O': '/Table',
|
|
302
|
+
'Headers': pydyf.Array(row_headers[i] + column_headers[j]),
|
|
303
|
+
})
|
|
304
|
+
|
|
305
|
+
yield element
|