weasyprint 65.1__py3-none-any.whl → 67.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- weasyprint/__init__.py +17 -7
- weasyprint/__main__.py +21 -10
- weasyprint/anchors.py +4 -4
- weasyprint/css/__init__.py +732 -67
- weasyprint/css/computed_values.py +65 -170
- weasyprint/css/counters.py +1 -1
- weasyprint/css/functions.py +206 -0
- weasyprint/css/html5_ua.css +3 -7
- weasyprint/css/html5_ua_form.css +2 -2
- weasyprint/css/media_queries.py +3 -1
- weasyprint/css/properties.py +6 -2
- weasyprint/css/{utils.py → tokens.py} +306 -397
- weasyprint/css/units.py +91 -0
- weasyprint/css/validation/__init__.py +1 -1
- weasyprint/css/validation/descriptors.py +47 -19
- weasyprint/css/validation/expanders.py +7 -8
- weasyprint/css/validation/properties.py +341 -357
- weasyprint/document.py +20 -19
- weasyprint/draw/__init__.py +56 -63
- weasyprint/draw/border.py +121 -69
- weasyprint/draw/color.py +1 -1
- weasyprint/draw/text.py +60 -41
- weasyprint/formatting_structure/boxes.py +24 -5
- weasyprint/formatting_structure/build.py +33 -45
- weasyprint/images.py +76 -62
- weasyprint/layout/__init__.py +32 -26
- weasyprint/layout/absolute.py +7 -6
- weasyprint/layout/background.py +7 -7
- weasyprint/layout/block.py +195 -152
- weasyprint/layout/column.py +19 -24
- weasyprint/layout/flex.py +54 -26
- weasyprint/layout/float.py +12 -7
- weasyprint/layout/grid.py +284 -90
- weasyprint/layout/inline.py +121 -68
- weasyprint/layout/page.py +45 -12
- weasyprint/layout/percent.py +14 -10
- weasyprint/layout/preferred.py +105 -63
- weasyprint/layout/replaced.py +9 -6
- weasyprint/layout/table.py +16 -9
- weasyprint/pdf/__init__.py +58 -18
- weasyprint/pdf/anchors.py +3 -4
- weasyprint/pdf/fonts.py +126 -69
- weasyprint/pdf/metadata.py +36 -4
- weasyprint/pdf/pdfa.py +19 -3
- weasyprint/pdf/pdfua.py +7 -115
- weasyprint/pdf/pdfx.py +83 -0
- weasyprint/pdf/stream.py +57 -49
- weasyprint/pdf/tags.py +307 -0
- weasyprint/stacking.py +14 -15
- weasyprint/svg/__init__.py +59 -32
- weasyprint/svg/bounding_box.py +4 -2
- weasyprint/svg/defs.py +4 -9
- weasyprint/svg/images.py +11 -3
- weasyprint/svg/text.py +11 -2
- weasyprint/svg/utils.py +15 -8
- weasyprint/text/constants.py +1 -1
- weasyprint/text/ffi.py +4 -3
- weasyprint/text/fonts.py +13 -5
- weasyprint/text/line_break.py +146 -43
- weasyprint/urls.py +41 -13
- {weasyprint-65.1.dist-info → weasyprint-67.0.dist-info}/METADATA +5 -6
- weasyprint-67.0.dist-info/RECORD +77 -0
- weasyprint/draw/stack.py +0 -13
- weasyprint-65.1.dist-info/RECORD +0 -74
- {weasyprint-65.1.dist-info → weasyprint-67.0.dist-info}/WHEEL +0 -0
- {weasyprint-65.1.dist-info → weasyprint-67.0.dist-info}/entry_points.txt +0 -0
- {weasyprint-65.1.dist-info → weasyprint-67.0.dist-info}/licenses/LICENSE +0 -0
weasyprint/pdf/pdfua.py
CHANGED
|
@@ -1,125 +1,17 @@
|
|
|
1
1
|
"""PDF/UA generation."""
|
|
2
2
|
|
|
3
|
-
import
|
|
3
|
+
from functools import partial
|
|
4
4
|
|
|
5
5
|
from .metadata import add_metadata
|
|
6
6
|
|
|
7
7
|
|
|
8
|
-
def pdfua(pdf, metadata, document, page_streams, attachments, compress):
|
|
8
|
+
def pdfua(pdf, metadata, document, page_streams, attachments, compress, version):
|
|
9
9
|
"""Set metadata for PDF/UA documents."""
|
|
10
|
-
# Structure for PDF tagging
|
|
11
|
-
content_mapping = pydyf.Dictionary({})
|
|
12
|
-
pdf.add_object(content_mapping)
|
|
13
|
-
structure_root = pydyf.Dictionary({
|
|
14
|
-
'Type': '/StructTreeRoot',
|
|
15
|
-
'ParentTree': content_mapping.reference,
|
|
16
|
-
})
|
|
17
|
-
pdf.add_object(structure_root)
|
|
18
|
-
structure_document = pydyf.Dictionary({
|
|
19
|
-
'Type': '/StructElem',
|
|
20
|
-
'S': '/Document',
|
|
21
|
-
'P': structure_root.reference,
|
|
22
|
-
})
|
|
23
|
-
pdf.add_object(structure_document)
|
|
24
|
-
structure_root['K'] = pydyf.Array([structure_document.reference])
|
|
25
|
-
pdf.catalog['StructTreeRoot'] = structure_root.reference
|
|
26
|
-
|
|
27
|
-
document_children = []
|
|
28
|
-
content_mapping['Nums'] = pydyf.Array()
|
|
29
|
-
links = []
|
|
30
|
-
for page_number, page_stream in enumerate(page_streams):
|
|
31
|
-
structure = {}
|
|
32
|
-
document.build_element_structure(structure)
|
|
33
|
-
parents = [None] * len(page_stream.marked)
|
|
34
|
-
for mcid, (key, box) in enumerate(page_stream.marked):
|
|
35
|
-
# Build structure elements
|
|
36
|
-
kids = [mcid]
|
|
37
|
-
if key == 'Link':
|
|
38
|
-
object_reference = pydyf.Dictionary({
|
|
39
|
-
'Type': '/OBJR',
|
|
40
|
-
'Obj': box.link_annotation.reference,
|
|
41
|
-
'Pg': pdf.page_references[page_number],
|
|
42
|
-
})
|
|
43
|
-
pdf.add_object(object_reference)
|
|
44
|
-
links.append((object_reference.reference, box.link_annotation))
|
|
45
|
-
etree_element = box.element
|
|
46
|
-
child_structure_data_element = None
|
|
47
|
-
while True:
|
|
48
|
-
if etree_element is None:
|
|
49
|
-
structure_data = structure.setdefault(
|
|
50
|
-
box, {'parent': None})
|
|
51
|
-
else:
|
|
52
|
-
structure_data = structure[etree_element]
|
|
53
|
-
new_element = 'element' not in structure_data
|
|
54
|
-
if new_element:
|
|
55
|
-
child = structure_data['element'] = pydyf.Dictionary({
|
|
56
|
-
'Type': '/StructElem',
|
|
57
|
-
'S': f'/{key}',
|
|
58
|
-
'K': pydyf.Array(kids),
|
|
59
|
-
'Pg': pdf.page_references[page_number],
|
|
60
|
-
})
|
|
61
|
-
pdf.add_object(child)
|
|
62
|
-
if key == 'LI':
|
|
63
|
-
if etree_element.tag == 'dt':
|
|
64
|
-
sub_key = 'Lbl'
|
|
65
|
-
else:
|
|
66
|
-
sub_key = 'LBody'
|
|
67
|
-
real_child = pydyf.Dictionary({
|
|
68
|
-
'Type': '/StructElem',
|
|
69
|
-
'S': f'/{sub_key}',
|
|
70
|
-
'K': pydyf.Array(kids),
|
|
71
|
-
'Pg': pdf.page_references[page_number],
|
|
72
|
-
'P': child.reference,
|
|
73
|
-
})
|
|
74
|
-
pdf.add_object(real_child)
|
|
75
|
-
for kid in kids:
|
|
76
|
-
if isinstance(kid, int):
|
|
77
|
-
parents[kid] = real_child.reference
|
|
78
|
-
child['K'] = pydyf.Array([real_child.reference])
|
|
79
|
-
structure_data['element'] = real_child
|
|
80
|
-
else:
|
|
81
|
-
for kid in kids:
|
|
82
|
-
if isinstance(kid, int):
|
|
83
|
-
parents[kid] = child.reference
|
|
84
|
-
else:
|
|
85
|
-
child = structure_data['element']
|
|
86
|
-
child['K'].extend(kids)
|
|
87
|
-
for kid in kids:
|
|
88
|
-
if isinstance(kid, int):
|
|
89
|
-
parents[kid] = child.reference
|
|
90
|
-
kid = child.reference
|
|
91
|
-
if child_structure_data_element is not None:
|
|
92
|
-
child_structure_data_element['P'] = kid
|
|
93
|
-
if not new_element:
|
|
94
|
-
break
|
|
95
|
-
kids = [kid]
|
|
96
|
-
child_structure_data_element = child
|
|
97
|
-
if structure_data['parent'] is None:
|
|
98
|
-
child['P'] = structure_document.reference
|
|
99
|
-
document_children.append(child.reference)
|
|
100
|
-
break
|
|
101
|
-
else:
|
|
102
|
-
etree_element = structure_data['parent']
|
|
103
|
-
key = page_stream.get_marked_content_tag(etree_element.tag)
|
|
104
|
-
content_mapping['Nums'].append(page_number)
|
|
105
|
-
content_mapping['Nums'].append(pydyf.Array(parents))
|
|
106
|
-
structure_document['K'] = pydyf.Array(document_children)
|
|
107
|
-
for i, (link, annotation) in enumerate(links, start=page_number + 1):
|
|
108
|
-
content_mapping['Nums'].append(i)
|
|
109
|
-
content_mapping['Nums'].append(link)
|
|
110
|
-
annotation['StructParent'] = i
|
|
111
|
-
annotation['F'] = 2 ** (2 - 1)
|
|
112
|
-
|
|
113
10
|
# Common PDF metadata stream
|
|
114
|
-
add_metadata(pdf, metadata, 'ua',
|
|
115
|
-
|
|
116
|
-
# PDF document extra metadata
|
|
117
|
-
if 'Lang' not in pdf.catalog:
|
|
118
|
-
pdf.catalog['Lang'] = pydyf.String()
|
|
119
|
-
pdf.catalog['ViewerPreferences'] = pydyf.Dictionary({
|
|
120
|
-
'DisplayDocTitle': 'true',
|
|
121
|
-
})
|
|
122
|
-
pdf.catalog['MarkInfo'] = pydyf.Dictionary({'Marked': 'true'})
|
|
11
|
+
add_metadata(pdf, metadata, 'ua', version, conformance=None, compress=compress)
|
|
123
12
|
|
|
124
13
|
|
|
125
|
-
VARIANTS = {
|
|
14
|
+
VARIANTS = {
|
|
15
|
+
'pdf/ua-1': (partial(pdfua, version=1), {'version': '1.7', 'pdf_tags': True}),
|
|
16
|
+
'pdf/ua-2': (partial(pdfua, version=2), {'version': '2.0', 'pdf_tags': True}),
|
|
17
|
+
}
|
weasyprint/pdf/pdfx.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""PDF/X generation."""
|
|
2
|
+
|
|
3
|
+
from functools import partial
|
|
4
|
+
from time import localtime
|
|
5
|
+
|
|
6
|
+
import pydyf
|
|
7
|
+
|
|
8
|
+
from .metadata import add_metadata
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def pdfx(pdf, metadata, document, page_streams, attachments, compress, version,
         variant):
    """Set metadata for PDF/X documents.

    :param pdf: PDF object whose ``info`` and ``catalog`` mappings are updated.
    :param metadata: Document metadata; its ``modified`` and ``created``
        attributes are filled with the current time when they are empty.
    :param document: Rendered document, only read for ``color_profiles``.
    :param page_streams: Unused here, kept for the common variant signature.
    :param attachments: Unused here, kept for the common variant signature.
    :param compress: Forwarded to ``add_metadata``.
    :param version: PDF/X version number, used in the conformance string.
    :param variant: Suffix appended to the version in the conformance string.

    """

    # Add conformance metadata.
    conformance = f'PDF/X-{version}{variant}'
    if version < 4:
        pdf.info['GTS_PDFXVersion'] = pydyf.String(conformance)
        pdf.info['GTS_PDFXConformance'] = pydyf.String(conformance)
    pdf.info['Trapped'] = '/False'

    # Build the current local time in ISO 8601 and PDF date formats.
    now = localtime()
    year, month, day, hour, minute, second = now[:6]
    # tm_gmtoff is a signed offset in seconds. Split its absolute value into
    # hours and minutes and print the sign separately: dividing the signed
    # value by 3600 leaves a remainder in seconds and floors negative
    # half-hour offsets to the wrong hour.
    tz_sign = '-' if now.tm_gmtoff < 0 else '+'
    tz_hour, tz_minute = divmod(abs(now.tm_gmtoff) // 60, 60)
    now_iso = (
        f'{year:04}-{month:02}-{day:02}T{hour:02}:{minute:02}:{second:02}'
        f'{tz_sign}{tz_hour:02}:{tz_minute:02}')
    now_pdf = (
        f'(D:{year:04}{month:02}{day:02}{hour:02}{minute:02}{second:02}'
        f"{tz_sign}{tz_hour:02}'{tz_minute:02}')")

    # Only provide dates that are not already set in the metadata.
    if not metadata.modified:
        metadata.modified = now_iso
        pdf.info['ModDate'] = now_pdf
    if not metadata.created:
        metadata.created = now_iso
        pdf.info['CreationDate'] = now_pdf

    # Add output intents.
    if 'device-cmyk' not in document.color_profiles:
        # Add standard CMYK profile.
        pdf.catalog['OutputIntents'] = pydyf.Array([
            pydyf.Dictionary({
                'Type': '/OutputIntent',
                'S': '/GTS_PDFX',
                'OutputConditionIdentifier': pydyf.String('CGATS TR 001'),
                'RegistryName': pydyf.String('http://www.color.org'),
            }),
        ])

    # Common PDF metadata stream.
    add_metadata(pdf, metadata, 'x', version, conformance, compress=compress)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# Map each supported PDF/X variant name to a ``(function, options)`` pair. The
# function is ``pdfx`` with its ``version`` and ``variant`` arguments already
# bound. The options dictionary carries a ``version`` string and an
# ``identifier`` flag (presumably the PDF version to write and whether a file
# identifier is required; confirm against the code consuming this table).
VARIANTS = {
    'pdf/x-1a': (
        partial(pdfx, version=1, variant='a:2003'),
        {'version': '1.4', 'identifier': True},
    ),
    'pdf/x-3': (
        partial(pdfx, version=3, variant=':2003'),
        {'version': '1.4', 'identifier': True},
    ),
    'pdf/x-4': (
        partial(pdfx, version=4, variant=''),
        {'version': '1.6', 'identifier': True},
    ),
    'pdf/x-5g': (
        partial(pdfx, version=5, variant='g'),
        {'version': '1.6', 'identifier': True},
    ),
    # TODO: these variants forbid OutputIntent to include ICC file.
    # 'pdf/x-4p': (
    #     partial(pdfx, version=4, variant='p'),
    #     {'version': '1.6', 'identifier': True},
    # ),
    # 'pdf/x-5pg': (
    #     partial(pdfx, version=5, variant='pg'),
    #     {'version': '1.6', 'identifier': True},
    # ),
    # 'pdf/x-5n': (
    #     partial(pdfx, version=5, variant='n'),
    #     {'version': '1.6', 'identifier': True},
    # ),
}
|
weasyprint/pdf/stream.py
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
"""PDF stream."""
|
|
2
2
|
|
|
3
|
+
from contextlib import contextmanager
|
|
4
|
+
|
|
3
5
|
import pydyf
|
|
4
6
|
|
|
5
7
|
from ..logger import LOGGER
|
|
@@ -11,14 +13,15 @@ from .fonts import Font
|
|
|
11
13
|
|
|
12
14
|
class Stream(pydyf.Stream):
|
|
13
15
|
"""PDF stream object with extra features."""
|
|
14
|
-
def __init__(self, fonts, page_rectangle, resources, images,
|
|
16
|
+
def __init__(self, fonts, page_rectangle, resources, images, tags, color_profiles,
|
|
17
|
+
*args, **kwargs):
|
|
15
18
|
super().__init__(*args, **kwargs)
|
|
16
19
|
self.page_rectangle = page_rectangle
|
|
17
|
-
self.marked = []
|
|
18
20
|
self._fonts = fonts
|
|
19
21
|
self._resources = resources
|
|
20
22
|
self._images = images
|
|
21
|
-
self.
|
|
23
|
+
self._tags = tags
|
|
24
|
+
self._color_profiles = color_profiles
|
|
22
25
|
self._current_color = self._current_color_stroke = None
|
|
23
26
|
self._current_alpha = self._current_alpha_stroke = None
|
|
24
27
|
self._current_font = self._current_font_size = None
|
|
@@ -39,8 +42,10 @@ class Stream(pydyf.Stream):
|
|
|
39
42
|
kwargs['resources'] = self._resources
|
|
40
43
|
if 'images' not in kwargs:
|
|
41
44
|
kwargs['images'] = self._images
|
|
42
|
-
if '
|
|
43
|
-
kwargs['
|
|
45
|
+
if 'tags' not in kwargs:
|
|
46
|
+
kwargs['tags'] = self._tags
|
|
47
|
+
if 'color_profiles' not in kwargs:
|
|
48
|
+
kwargs['color_profiles'] = self._color_profiles
|
|
44
49
|
if 'compress' not in kwargs:
|
|
45
50
|
kwargs['compress'] = self.compress
|
|
46
51
|
return Stream(**kwargs)
|
|
@@ -104,8 +109,21 @@ class Stream(pydyf.Stream):
|
|
|
104
109
|
self.set_color_space('lab-d50', stroke)
|
|
105
110
|
lightness, a, b = color.to('lab').coordinates
|
|
106
111
|
self.set_color_special(None, stroke, lightness, a, b)
|
|
112
|
+
elif color.space == 'device-cmyk':
|
|
113
|
+
self.set_color_space('DeviceCMYK', stroke)
|
|
114
|
+
c, m, y, k = color.coordinates
|
|
115
|
+
self.set_color_special(None, stroke, c, m, y, k)
|
|
116
|
+
elif color.space.startswith('--') and color.space in self._color_profiles:
|
|
117
|
+
self.set_color_space(color.space, stroke)
|
|
118
|
+
self.set_color_special(None, stroke, *color.coordinates)
|
|
107
119
|
else:
|
|
108
|
-
LOGGER.
|
|
120
|
+
LOGGER.warning('Unsupported color space %s, use sRGB instead', color.space)
|
|
121
|
+
if len(channels) > 3:
|
|
122
|
+
channels = channels[:3]
|
|
123
|
+
elif len(channels) == 2:
|
|
124
|
+
channels = *channels, 0
|
|
125
|
+
elif len(channels) == 1:
|
|
126
|
+
channels = *channels, 0, 0
|
|
109
127
|
self.set_color_rgb(*channels, stroke)
|
|
110
128
|
|
|
111
129
|
def set_font_size(self, font, size):
|
|
@@ -248,21 +266,39 @@ class Stream(pydyf.Stream):
|
|
|
248
266
|
self._resources['Shading'][shading.id] = shading
|
|
249
267
|
return shading
|
|
250
268
|
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
269
|
+
@contextmanager
def stacked(self):
    """Save and restore stream context when used with the ``with`` keyword.

    The state is pushed before the ``with`` body runs and popped when the
    body exits, including when it exits with an exception.

    """
    self.push_state()
    try:
        yield
    finally:
        # Always balance the push, even if the body raised.
        self.pop_state()
|
|
277
|
+
|
|
278
|
+
@contextmanager
def marked(self, box, tag):
    """Wrap the ``with`` body in a marked-content sequence for ``box``.

    When tagging is enabled (``self._tags`` is not ``None``), ``box`` is
    registered in ``self._tags`` with ``tag`` and a marked-content identifier
    equal to the number of entries already registered, and the body is
    enclosed between ``begin_marked_content`` and ``end_marked_content``.
    When tagging is disabled, the body runs unchanged.

    :param box: Key under which the marked content is stored in the tags.
    :param tag: Tag name given to the marked-content sequence.

    """
    if self._tags is not None:
        mcid = len(self._tags)
        # Each box must be marked only once per stream.
        assert box not in self._tags
        self._tags[box] = {'tag': tag, 'mcid': mcid}
        property_list = pydyf.Dictionary({'MCID': mcid})
        super().begin_marked_content(tag, property_list)
    try:
        yield
    finally:
        # Close the sequence even if the body raised.
        if self._tags is not None:
            super().end_marked_content()
|
|
292
|
+
|
|
293
|
+
@contextmanager
def artifact(self):
    """Wrap the ``with`` body in an ``Artifact`` marked-content sequence.

    Nothing is emitted when tagging is disabled (``self._tags`` is ``None``);
    the body then runs unchanged.

    """
    if self._tags is not None:
        super().begin_marked_content('Artifact')
    try:
        yield
    finally:
        # Close the sequence even if the body raised.
        if self._tags is not None:
            super().end_marked_content()
|
|
266
302
|
|
|
267
303
|
@staticmethod
|
|
268
304
|
def create_interpolation_function(domain, c0, c1, n):
|
|
@@ -283,31 +319,3 @@ class Stream(pydyf.Stream):
|
|
|
283
319
|
'Bounds': pydyf.Array(bounds),
|
|
284
320
|
'Functions': pydyf.Array(sub_functions),
|
|
285
321
|
})
|
|
286
|
-
|
|
287
|
-
def get_marked_content_tag(self, element_tag):
|
|
288
|
-
if element_tag == 'div':
|
|
289
|
-
return 'Div'
|
|
290
|
-
elif element_tag == 'span':
|
|
291
|
-
return 'Span'
|
|
292
|
-
elif element_tag == 'article':
|
|
293
|
-
return 'Art'
|
|
294
|
-
elif element_tag == 'section':
|
|
295
|
-
return 'Sect'
|
|
296
|
-
elif element_tag == 'blockquote':
|
|
297
|
-
return 'BlockQuote'
|
|
298
|
-
elif element_tag == 'p':
|
|
299
|
-
return 'P'
|
|
300
|
-
elif element_tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
|
|
301
|
-
return element_tag.upper()
|
|
302
|
-
elif element_tag in ('dl', 'ul', 'ol'):
|
|
303
|
-
return 'L'
|
|
304
|
-
elif element_tag in ('li', 'dt', 'dd'):
|
|
305
|
-
return 'LI'
|
|
306
|
-
elif element_tag == 'table':
|
|
307
|
-
return 'Table'
|
|
308
|
-
elif element_tag in ('tr', 'th', 'td'):
|
|
309
|
-
return element_tag.upper()
|
|
310
|
-
elif element_tag in ('thead', 'tbody', 'tfoot'):
|
|
311
|
-
return element_tag[:2].upper() + element_tag[2:]
|
|
312
|
-
else:
|
|
313
|
-
return 'NonStruct'
|
weasyprint/pdf/tags.py
ADDED
|
@@ -0,0 +1,307 @@
|
|
|
1
|
+
"""PDF tagging."""
|
|
2
|
+
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
|
|
5
|
+
import pydyf
|
|
6
|
+
|
|
7
|
+
from ..formatting_structure import boxes
|
|
8
|
+
from ..layout.absolute import AbsolutePlaceholder
|
|
9
|
+
from ..logger import LOGGER
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def add_tags(pdf, document, page_streams):
    """Add tag tree to the document.

    Create the structure tree root and a single ``Document`` structure
    element, attach the elements built from each page box to it, fill the
    parent tree mapping marked content and link annotations back to structure
    elements, and set the related catalog entries.

    :param pdf: PDF object receiving the new objects and catalog entries.
    :param document: Rendered document whose ``pages`` are walked.
    :param page_streams: Page streams, paired with ``document.pages`` in
        order; each stream's ``_tags`` mapping is consumed.

    """

    # Add root structure.
    content_mapping = pydyf.Dictionary({})
    pdf.add_object(content_mapping)
    structure_root = pydyf.Dictionary({
        'Type': '/StructTreeRoot',
        'ParentTree': content_mapping.reference,
    })
    pdf.add_object(structure_root)
    structure_document = pydyf.Dictionary({
        'Type': '/StructElem',
        'S': '/Document',
        'K': pydyf.Array(),
        'P': structure_root.reference,
    })
    pdf.add_object(structure_document)
    structure_root['K'] = pydyf.Array([structure_document.reference])
    pdf.catalog['StructTreeRoot'] = structure_root.reference

    # Map content.
    content_mapping['Nums'] = pydyf.Array()
    # Filled by _build_box_tree with (structure element reference, annotation).
    links = []
    for page_number, (page, stream) in enumerate(zip(document.pages, page_streams)):
        tags = stream._tags
        page_box = page._page_box

        # Prepare array for this page’s MCID-to-StructElem mapping.
        content_mapping['Nums'].append(page_number)
        content_mapping['Nums'].append(pydyf.Array())
        # Filled by _build_box_tree: MCID -> structure element reference.
        page_nums = {}

        # Map page box content.
        elements = _build_box_tree(
            page_box, structure_document, pdf, page_number, page_nums, links, tags)
        for element in elements:
            structure_document['K'].append(element.reference)
        # Building the tree pops each tag it uses; none should be left over.
        assert not tags

        # Flatten page-local nums into global mapping.
        sorted_refs = [ref for _, ref in sorted(page_nums.items())]
        content_mapping['Nums'][-1].extend(sorted_refs)

    # Add annotations for links.
    # Link keys start after the page keys, which are 0..len(pages) - 1.
    for i, (link_reference, annotation) in enumerate(links, start=len(document.pages)):
        content_mapping['Nums'].append(i)
        content_mapping['Nums'].append(link_reference)
        annotation['StructParent'] = i

    # Add required metadata.
    pdf.catalog['ViewerPreferences'] = pydyf.Dictionary({'DisplayDocTitle': 'true'})
    pdf.catalog['MarkInfo'] = pydyf.Dictionary({'Marked': 'true'})
    if 'Lang' not in pdf.catalog:
        # Report the missing language, then fall back to an empty string.
        LOGGER.error('Missing required "lang" attribute at the root of the document')
        pdf.catalog['Lang'] = pydyf.String()
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# HTML tag names that map to one fixed PDF structure type.
_FIXED_PDF_TAGS = {
    'div': 'Div',
    'span': 'Span',
    'main': 'Part',
    'article': 'Art',
    'section': 'Sect',
    'blockquote': 'BlockQuote',
    'p': 'P',
    'dl': 'L',
    'ul': 'L',
    'ol': 'L',
    # TODO: dt should be different.
    'li': 'LI',
    'dt': 'LI',
    'dd': 'LI',
    'li::marker': 'Lbl',
    'table': 'Table',
    'img': 'Figure',
    'caption': 'Caption',
    'figcaption': 'Caption',
}

# HTML tag names whose PDF structure type is the same name in upper case.
_UPPERCASED_PDF_TAGS = frozenset(
    ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'tr', 'th', 'td'))

# HTML tag names whose PDF structure type upper-cases the first two letters.
_TABLE_GROUP_PDF_TAGS = frozenset(('thead', 'tbody', 'tfoot'))


def _get_pdf_tag(tag):
    """Get PDF tag corresponding to HTML tag.

    ``None`` and any tag without a known equivalent give ``'NonStruct'``.

    """
    if tag is None:
        return 'NonStruct'

    # Links and link pseudo elements create link annotations.
    name, _, _ = tag.partition(':')
    if name == 'a':
        return 'Link'

    if tag in _UPPERCASED_PDF_TAGS:
        return tag.upper()
    if tag in _TABLE_GROUP_PDF_TAGS:
        return f'{tag[:2].upper()}{tag[2:]}'
    return _FIXED_PDF_TAGS.get(tag, 'NonStruct')
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _build_box_tree(box, parent, pdf, page_number, nums, links, tags):
|
|
115
|
+
"""Recursively build tag tree for given box and yield children."""
|
|
116
|
+
|
|
117
|
+
# Special case for absolute elements.
|
|
118
|
+
if isinstance(box, AbsolutePlaceholder):
|
|
119
|
+
box = box._box
|
|
120
|
+
|
|
121
|
+
element_tag = None if box.element is None else box.element_tag
|
|
122
|
+
tag = _get_pdf_tag(element_tag)
|
|
123
|
+
|
|
124
|
+
# Special case for html, body, page boxes and margin boxes.
|
|
125
|
+
if element_tag in ('html', 'body') or isinstance(box, boxes.PageBox):
|
|
126
|
+
# Avoid generating page, html and body boxes as semantic nodes; yield their children instead.
|
|
127
|
+
if isinstance(box, boxes.ParentBox) and not isinstance(box, boxes.LineBox):
|
|
128
|
+
for child in box.children:
|
|
129
|
+
yield from _build_box_tree(
|
|
130
|
+
child, parent, pdf, page_number, nums, links, tags)
|
|
131
|
+
return
|
|
132
|
+
elif isinstance(box, boxes.MarginBox):
|
|
133
|
+
# Build tree for margin boxes but don’t link it to main tree. It ensures that
|
|
134
|
+
# marked content is mapped in document and removed from list. It could be
|
|
135
|
+
# included in tree as Artifact, but that’s only allowed in PDF 2.0.
|
|
136
|
+
for child in box.children:
|
|
137
|
+
tuple(_build_box_tree(child, parent, pdf, page_number, nums, links, tags))
|
|
138
|
+
return
|
|
139
|
+
|
|
140
|
+
# Create box element.
|
|
141
|
+
if tag == 'LI':
|
|
142
|
+
anonymous_list_element = parent['S'] == '/LI'
|
|
143
|
+
anonymous_li_child = parent['S'] == '/LBody'
|
|
144
|
+
dl_item = box.element_tag in ('dt', 'dd')
|
|
145
|
+
no_bullet_li = box.element_tag == 'li' and (
|
|
146
|
+
'list-item' not in box.style['display'] or
|
|
147
|
+
box.style['list_style_type'] == 'none')
|
|
148
|
+
if anonymous_list_element:
|
|
149
|
+
# Store as list item body.
|
|
150
|
+
tag = 'LBody'
|
|
151
|
+
elif anonymous_li_child:
|
|
152
|
+
# Store as non struct list item body child.
|
|
153
|
+
tag = 'NonStruct'
|
|
154
|
+
elif dl_item or no_bullet_li:
|
|
155
|
+
# Wrap in list item.
|
|
156
|
+
tag = 'LBody'
|
|
157
|
+
parent = pydyf.Dictionary({
|
|
158
|
+
'Type': '/StructElem',
|
|
159
|
+
'S': '/LI',
|
|
160
|
+
'K': pydyf.Array([]),
|
|
161
|
+
'Pg': pdf.page_references[page_number],
|
|
162
|
+
'P': parent.reference,
|
|
163
|
+
})
|
|
164
|
+
pdf.add_object(parent)
|
|
165
|
+
children = _build_box_tree(box, parent, pdf, page_number, nums, links, tags)
|
|
166
|
+
for child in children:
|
|
167
|
+
parent['K'].append(child.reference)
|
|
168
|
+
yield parent
|
|
169
|
+
return
|
|
170
|
+
|
|
171
|
+
element = pydyf.Dictionary({
|
|
172
|
+
'Type': '/StructElem',
|
|
173
|
+
'S': f'/{tag}',
|
|
174
|
+
'K': pydyf.Array([]),
|
|
175
|
+
'Pg': pdf.page_references[page_number],
|
|
176
|
+
'P': parent.reference,
|
|
177
|
+
})
|
|
178
|
+
pdf.add_object(element)
|
|
179
|
+
|
|
180
|
+
# Handle special cases.
|
|
181
|
+
if tag == 'Figure':
|
|
182
|
+
# Add extra data for images.
|
|
183
|
+
x1, y1 = box.content_box_x(), box.content_box_y()
|
|
184
|
+
x2, y2 = x1 + box.width, y1 + box.height
|
|
185
|
+
element['A'] = pydyf.Dictionary({
|
|
186
|
+
'O': '/Layout',
|
|
187
|
+
'BBox': pydyf.Array((x1, y1, x2, y2)),
|
|
188
|
+
})
|
|
189
|
+
if alt := box.element.attrib.get('alt'):
|
|
190
|
+
element['Alt'] = pydyf.String(alt)
|
|
191
|
+
else:
|
|
192
|
+
source = box.element.attrib.get('src', 'unknown')
|
|
193
|
+
LOGGER.error(f'Image "{source}" has no required alt description')
|
|
194
|
+
elif tag == 'Table':
|
|
195
|
+
# Use wrapped table as tagged box, and put captions in it.
|
|
196
|
+
if box.is_table_wrapper:
|
|
197
|
+
# Can be false if table has another display type.
|
|
198
|
+
wrapper, table = box, box.get_wrapped_table()
|
|
199
|
+
box = table.copy_with_children([])
|
|
200
|
+
for child in wrapper.children:
|
|
201
|
+
box.children.extend(child.children if child is table else [child])
|
|
202
|
+
elif tag == 'TH':
|
|
203
|
+
# Set identifier for table headers to reference them in cells.
|
|
204
|
+
element['ID'] = pydyf.String(id(box))
|
|
205
|
+
elif tag == 'TD':
|
|
206
|
+
# Store table cell element to map it to headers later.
|
|
207
|
+
# TODO: don’t use the box to store this.
|
|
208
|
+
box.mark = element
|
|
209
|
+
|
|
210
|
+
# Include link annotations.
|
|
211
|
+
if box.link_annotation:
|
|
212
|
+
annotation = box.link_annotation
|
|
213
|
+
object_reference = pydyf.Dictionary({
|
|
214
|
+
'Type': '/OBJR',
|
|
215
|
+
'Obj': annotation.reference,
|
|
216
|
+
'Pg': pdf.page_references[page_number],
|
|
217
|
+
})
|
|
218
|
+
pdf.add_object(object_reference)
|
|
219
|
+
links.append((element.reference, annotation))
|
|
220
|
+
element['K'].append(object_reference.reference)
|
|
221
|
+
|
|
222
|
+
if isinstance(box, boxes.ParentBox):
|
|
223
|
+
# Build tree for box children.
|
|
224
|
+
for child in box.children:
|
|
225
|
+
children = child.children if isinstance(child, boxes.LineBox) else [child]
|
|
226
|
+
for child in children:
|
|
227
|
+
if isinstance(child, boxes.TextBox):
|
|
228
|
+
# Add marked element from the stream.
|
|
229
|
+
kid = tags.pop(child)
|
|
230
|
+
assert kid['mcid'] not in nums
|
|
231
|
+
if tag == 'Link':
|
|
232
|
+
# Associate MCID directly with link reference.
|
|
233
|
+
element['K'].append(kid['mcid'])
|
|
234
|
+
nums[kid['mcid']] = element.reference
|
|
235
|
+
else:
|
|
236
|
+
kid_element = pydyf.Dictionary({
|
|
237
|
+
'Type': '/StructElem',
|
|
238
|
+
'S': f'/{kid["tag"]}',
|
|
239
|
+
'K': pydyf.Array([kid['mcid']]),
|
|
240
|
+
'Pg': pdf.page_references[page_number],
|
|
241
|
+
'P': element.reference,
|
|
242
|
+
})
|
|
243
|
+
pdf.add_object(kid_element)
|
|
244
|
+
element['K'].append(kid_element.reference)
|
|
245
|
+
nums[kid['mcid']] = kid_element.reference
|
|
246
|
+
else:
|
|
247
|
+
# Recursively build tree for child.
|
|
248
|
+
if child.element_tag in ('ul', 'ol') and element['S'] == '/LI':
|
|
249
|
+
# In PDFs, nested lists are linked to the parent list, but in
|
|
250
|
+
# HTML, nested lists are linked to a parent’s list item.
|
|
251
|
+
child_parent = parent
|
|
252
|
+
else:
|
|
253
|
+
child_parent = element
|
|
254
|
+
child_elements = _build_box_tree(
|
|
255
|
+
child, child_parent, pdf, page_number, nums, links, tags)
|
|
256
|
+
|
|
257
|
+
# Check if it has already been referenced before.
|
|
258
|
+
for child_element in child_elements:
|
|
259
|
+
child_parent['K'].append(child_element.reference)
|
|
260
|
+
|
|
261
|
+
else:
|
|
262
|
+
# Add replaced box.
|
|
263
|
+
assert isinstance(box, boxes.ReplacedBox)
|
|
264
|
+
kid = tags.pop(box)
|
|
265
|
+
element['K'].append(kid['mcid'])
|
|
266
|
+
assert kid['mcid'] not in nums
|
|
267
|
+
nums[kid['mcid']] = element.reference
|
|
268
|
+
|
|
269
|
+
# Link table cells to related headers.
|
|
270
|
+
if tag == 'Table':
|
|
271
|
+
def _get_rows(table_box):
|
|
272
|
+
for child in table_box.children:
|
|
273
|
+
if child.element_tag == 'tr':
|
|
274
|
+
yield child
|
|
275
|
+
else:
|
|
276
|
+
yield from _get_rows(child)
|
|
277
|
+
|
|
278
|
+
# Get headers and rows.
|
|
279
|
+
column_headers = defaultdict(list)
|
|
280
|
+
row_headers = defaultdict(list)
|
|
281
|
+
rows = tuple(_get_rows(box))
|
|
282
|
+
|
|
283
|
+
# Find column and row headers.
|
|
284
|
+
# TODO: handle rowspan and colspan values.
|
|
285
|
+
for i, row in enumerate(rows):
|
|
286
|
+
for j, cell in enumerate(row.children):
|
|
287
|
+
if cell.element is None:
|
|
288
|
+
continue
|
|
289
|
+
if cell.element_tag == 'th':
|
|
290
|
+
# TODO: handle rowgroup and colgroup values.
|
|
291
|
+
if cell.element.attrib.get('scope') == 'row':
|
|
292
|
+
row_headers[i].append(pydyf.String(id(cell)))
|
|
293
|
+
else:
|
|
294
|
+
column_headers[j].append(pydyf.String(id(cell)))
|
|
295
|
+
|
|
296
|
+
# Map headers to cells.
|
|
297
|
+
for i, row in enumerate(rows):
|
|
298
|
+
for j, cell in enumerate(row.children):
|
|
299
|
+
if cell.element is None:
|
|
300
|
+
continue
|
|
301
|
+
if cell.element_tag == 'td':
|
|
302
|
+
cell.mark['A'] = pydyf.Dictionary({
|
|
303
|
+
'O': '/Table',
|
|
304
|
+
'Headers': pydyf.Array(row_headers[i] + column_headers[j]),
|
|
305
|
+
})
|
|
306
|
+
|
|
307
|
+
yield element
|