weasyprint 65.1__py3-none-any.whl → 66.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. weasyprint/__init__.py +4 -1
  2. weasyprint/__main__.py +2 -0
  3. weasyprint/css/__init__.py +12 -4
  4. weasyprint/css/computed_values.py +8 -2
  5. weasyprint/css/html5_ua.css +2 -7
  6. weasyprint/css/html5_ua_form.css +1 -1
  7. weasyprint/css/utils.py +1 -1
  8. weasyprint/document.py +2 -10
  9. weasyprint/draw/__init__.py +51 -57
  10. weasyprint/draw/border.py +120 -66
  11. weasyprint/draw/text.py +1 -2
  12. weasyprint/formatting_structure/boxes.py +3 -2
  13. weasyprint/formatting_structure/build.py +32 -42
  14. weasyprint/images.py +8 -15
  15. weasyprint/layout/__init__.py +5 -2
  16. weasyprint/layout/absolute.py +4 -1
  17. weasyprint/layout/block.py +60 -29
  18. weasyprint/layout/column.py +1 -0
  19. weasyprint/layout/flex.py +41 -21
  20. weasyprint/layout/float.py +8 -1
  21. weasyprint/layout/grid.py +1 -1
  22. weasyprint/layout/inline.py +7 -8
  23. weasyprint/layout/page.py +23 -1
  24. weasyprint/layout/preferred.py +59 -32
  25. weasyprint/layout/table.py +8 -4
  26. weasyprint/pdf/__init__.py +13 -6
  27. weasyprint/pdf/anchors.py +2 -2
  28. weasyprint/pdf/pdfua.py +7 -115
  29. weasyprint/pdf/stream.py +40 -49
  30. weasyprint/pdf/tags.py +305 -0
  31. weasyprint/stacking.py +14 -15
  32. weasyprint/svg/__init__.py +22 -11
  33. weasyprint/svg/bounding_box.py +4 -2
  34. weasyprint/svg/defs.py +4 -9
  35. weasyprint/svg/utils.py +9 -5
  36. weasyprint/text/fonts.py +1 -1
  37. weasyprint/text/line_break.py +45 -26
  38. weasyprint/urls.py +21 -10
  39. {weasyprint-65.1.dist-info → weasyprint-66.0.dist-info}/METADATA +1 -1
  40. weasyprint-66.0.dist-info/RECORD +74 -0
  41. weasyprint/draw/stack.py +0 -13
  42. weasyprint-65.1.dist-info/RECORD +0 -74
  43. {weasyprint-65.1.dist-info → weasyprint-66.0.dist-info}/WHEEL +0 -0
  44. {weasyprint-65.1.dist-info → weasyprint-66.0.dist-info}/entry_points.txt +0 -0
  45. {weasyprint-65.1.dist-info → weasyprint-66.0.dist-info}/licenses/LICENSE +0 -0
weasyprint/pdf/__init__.py CHANGED
@@ -12,6 +12,7 @@ from ..matrix import Matrix
12
12
  from . import debug, pdfa, pdfua
13
13
  from .fonts import build_fonts_dictionary
14
14
  from .stream import Stream
15
+ from .tags import add_tags
15
16
 
16
17
  from .anchors import ( # isort:skip
17
18
  add_annotations, add_forms, add_links, add_outlines, resolve_links,
@@ -118,15 +119,15 @@ def generate_pdf(document, target, zoom, **options):
118
119
  PROGRESS_LOGGER.info('Step 6 - Creating PDF')
119
120
 
120
121
  # Set properties according to PDF variants
121
- mark = False
122
122
  srgb = options['srgb']
123
+ pdf_tags = options['pdf_tags']
123
124
  variant = options['pdf_variant']
124
125
  if variant:
125
126
  variant_function, properties = VARIANTS[variant]
126
- if 'mark' in properties:
127
- mark = properties['mark']
128
127
  if 'srgb' in properties:
129
128
  srgb = properties['srgb']
129
+ if 'pdf_tags' in properties:
130
+ pdf_tags = properties['pdf_tags']
130
131
 
131
132
  pdf = pydyf.PDF()
132
133
  images = {}
@@ -159,6 +160,8 @@ def generate_pdf(document, target, zoom, **options):
159
160
  compress = not options['uncompressed_pdf']
160
161
  for page_number, (page, links_and_anchors) in enumerate(
161
162
  zip(document.pages, page_links_and_anchors)):
163
+ tags = {} if pdf_tags else None
164
+
162
165
  # Draw from the top-left corner
163
166
  matrix = Matrix(scale, 0, 0, -scale, 0, page.height * scale)
164
167
 
@@ -175,7 +178,7 @@ def generate_pdf(document, target, zoom, **options):
175
178
  left / scale, top / scale,
176
179
  (right - left) / scale, (bottom - top) / scale)
177
180
  stream = Stream(
178
- document.fonts, page_rectangle, resources, images, mark, compress=compress)
181
+ document.fonts, page_rectangle, resources, images, tags, compress=compress)
179
182
  stream.transform(d=-1, f=(page.height * scale))
180
183
  pdf.add_object(stream)
181
184
  page_streams.append(stream)
@@ -187,13 +190,13 @@ def generate_pdf(document, target, zoom, **options):
187
190
  'Contents': stream.reference,
188
191
  'Resources': resources.reference,
189
192
  })
190
- if mark:
193
+ if pdf_tags:
191
194
  pdf_page['Tabs'] = '/S'
192
195
  pdf_page['StructParents'] = page_number
193
196
  pdf.add_page(pdf_page)
194
197
  pdf_pages.append(pdf_page)
195
198
 
196
- add_links(links_and_anchors, matrix, pdf, pdf_page, pdf_names, mark)
199
+ add_links(links_and_anchors, matrix, pdf, pdf_page, pdf_names, tags)
197
200
  add_annotations(
198
201
  links_and_anchors[0], matrix, document, pdf, pdf_page, annot_files,
199
202
  compress)
@@ -323,6 +326,10 @@ def generate_pdf(document, target, zoom, **options):
323
326
  }),
324
327
  ])
325
328
 
329
+ # Add tags
330
+ if pdf_tags:
331
+ add_tags(pdf, document, page_streams)
332
+
326
333
  # Apply PDF variants functions
327
334
  if variant:
328
335
  variant_function(
weasyprint/pdf/anchors.py CHANGED
@@ -16,7 +16,7 @@ from ..text.fonts import get_font_description
16
16
  from ..urls import URLFetchingError
17
17
 
18
18
 
19
- def add_links(links_and_anchors, matrix, pdf, page, names, mark):
19
+ def add_links(links_and_anchors, matrix, pdf, page, names, tags):
20
20
  """Include hyperlinks in given PDF page."""
21
21
  links, anchors = links_and_anchors
22
22
 
@@ -30,7 +30,7 @@ def add_links(links_and_anchors, matrix, pdf, page, names, mark):
30
30
  'Rect': pydyf.Array([x1, y1, x2, y2]),
31
31
  'BS': pydyf.Dictionary({'W': 0}),
32
32
  })
33
- if mark:
33
+ if tags is not None:
34
34
  box.link_annotation['Contents'] = pydyf.String(link_target)
35
35
  if link_type == 'internal':
36
36
  box.link_annotation['Dest'] = pydyf.String(link_target)
weasyprint/pdf/pdfua.py CHANGED
@@ -1,125 +1,17 @@
1
1
  """PDF/UA generation."""
2
2
 
3
- import pydyf
3
+ from functools import partial
4
4
 
5
5
  from .metadata import add_metadata
6
6
 
7
7
 
8
- def pdfua(pdf, metadata, document, page_streams, attachments, compress):
8
+ def pdfua(pdf, metadata, document, page_streams, attachments, compress, version):
9
9
  """Set metadata for PDF/UA documents."""
10
- # Structure for PDF tagging
11
- content_mapping = pydyf.Dictionary({})
12
- pdf.add_object(content_mapping)
13
- structure_root = pydyf.Dictionary({
14
- 'Type': '/StructTreeRoot',
15
- 'ParentTree': content_mapping.reference,
16
- })
17
- pdf.add_object(structure_root)
18
- structure_document = pydyf.Dictionary({
19
- 'Type': '/StructElem',
20
- 'S': '/Document',
21
- 'P': structure_root.reference,
22
- })
23
- pdf.add_object(structure_document)
24
- structure_root['K'] = pydyf.Array([structure_document.reference])
25
- pdf.catalog['StructTreeRoot'] = structure_root.reference
26
-
27
- document_children = []
28
- content_mapping['Nums'] = pydyf.Array()
29
- links = []
30
- for page_number, page_stream in enumerate(page_streams):
31
- structure = {}
32
- document.build_element_structure(structure)
33
- parents = [None] * len(page_stream.marked)
34
- for mcid, (key, box) in enumerate(page_stream.marked):
35
- # Build structure elements
36
- kids = [mcid]
37
- if key == 'Link':
38
- object_reference = pydyf.Dictionary({
39
- 'Type': '/OBJR',
40
- 'Obj': box.link_annotation.reference,
41
- 'Pg': pdf.page_references[page_number],
42
- })
43
- pdf.add_object(object_reference)
44
- links.append((object_reference.reference, box.link_annotation))
45
- etree_element = box.element
46
- child_structure_data_element = None
47
- while True:
48
- if etree_element is None:
49
- structure_data = structure.setdefault(
50
- box, {'parent': None})
51
- else:
52
- structure_data = structure[etree_element]
53
- new_element = 'element' not in structure_data
54
- if new_element:
55
- child = structure_data['element'] = pydyf.Dictionary({
56
- 'Type': '/StructElem',
57
- 'S': f'/{key}',
58
- 'K': pydyf.Array(kids),
59
- 'Pg': pdf.page_references[page_number],
60
- })
61
- pdf.add_object(child)
62
- if key == 'LI':
63
- if etree_element.tag == 'dt':
64
- sub_key = 'Lbl'
65
- else:
66
- sub_key = 'LBody'
67
- real_child = pydyf.Dictionary({
68
- 'Type': '/StructElem',
69
- 'S': f'/{sub_key}',
70
- 'K': pydyf.Array(kids),
71
- 'Pg': pdf.page_references[page_number],
72
- 'P': child.reference,
73
- })
74
- pdf.add_object(real_child)
75
- for kid in kids:
76
- if isinstance(kid, int):
77
- parents[kid] = real_child.reference
78
- child['K'] = pydyf.Array([real_child.reference])
79
- structure_data['element'] = real_child
80
- else:
81
- for kid in kids:
82
- if isinstance(kid, int):
83
- parents[kid] = child.reference
84
- else:
85
- child = structure_data['element']
86
- child['K'].extend(kids)
87
- for kid in kids:
88
- if isinstance(kid, int):
89
- parents[kid] = child.reference
90
- kid = child.reference
91
- if child_structure_data_element is not None:
92
- child_structure_data_element['P'] = kid
93
- if not new_element:
94
- break
95
- kids = [kid]
96
- child_structure_data_element = child
97
- if structure_data['parent'] is None:
98
- child['P'] = structure_document.reference
99
- document_children.append(child.reference)
100
- break
101
- else:
102
- etree_element = structure_data['parent']
103
- key = page_stream.get_marked_content_tag(etree_element.tag)
104
- content_mapping['Nums'].append(page_number)
105
- content_mapping['Nums'].append(pydyf.Array(parents))
106
- structure_document['K'] = pydyf.Array(document_children)
107
- for i, (link, annotation) in enumerate(links, start=page_number + 1):
108
- content_mapping['Nums'].append(i)
109
- content_mapping['Nums'].append(link)
110
- annotation['StructParent'] = i
111
- annotation['F'] = 2 ** (2 - 1)
112
-
113
10
  # Common PDF metadata stream
114
- add_metadata(pdf, metadata, 'ua', 1, conformance=None, compress=compress)
115
-
116
- # PDF document extra metadata
117
- if 'Lang' not in pdf.catalog:
118
- pdf.catalog['Lang'] = pydyf.String()
119
- pdf.catalog['ViewerPreferences'] = pydyf.Dictionary({
120
- 'DisplayDocTitle': 'true',
121
- })
122
- pdf.catalog['MarkInfo'] = pydyf.Dictionary({'Marked': 'true'})
11
+ add_metadata(pdf, metadata, 'ua', version, conformance=None, compress=compress)
123
12
 
124
13
 
125
- VARIANTS = {'pdf/ua-1': (pdfua, {'mark': True})}
14
+ VARIANTS = {
15
+ 'pdf/ua-1': (partial(pdfua, version=1), {'version': '1.7', 'pdf_tags': True}),
16
+ 'pdf/ua-2': (partial(pdfua, version=2), {'version': '2.0', 'pdf_tags': True}),
17
+ }
weasyprint/pdf/stream.py CHANGED
@@ -1,5 +1,7 @@
1
1
  """PDF stream."""
2
2
 
3
+ from contextlib import contextmanager
4
+
3
5
  import pydyf
4
6
 
5
7
  from ..logger import LOGGER
@@ -11,14 +13,13 @@ from .fonts import Font
11
13
 
12
14
  class Stream(pydyf.Stream):
13
15
  """PDF stream object with extra features."""
14
- def __init__(self, fonts, page_rectangle, resources, images, mark, *args, **kwargs):
16
+ def __init__(self, fonts, page_rectangle, resources, images, tags, *args, **kwargs):
15
17
  super().__init__(*args, **kwargs)
16
18
  self.page_rectangle = page_rectangle
17
- self.marked = []
18
19
  self._fonts = fonts
19
20
  self._resources = resources
20
21
  self._images = images
21
- self._mark = mark
22
+ self._tags = tags
22
23
  self._current_color = self._current_color_stroke = None
23
24
  self._current_alpha = self._current_alpha_stroke = None
24
25
  self._current_font = self._current_font_size = None
@@ -39,8 +40,8 @@ class Stream(pydyf.Stream):
39
40
  kwargs['resources'] = self._resources
40
41
  if 'images' not in kwargs:
41
42
  kwargs['images'] = self._images
42
- if 'mark' not in kwargs:
43
- kwargs['mark'] = self._mark
43
+ if 'tags' not in kwargs:
44
+ kwargs['tags'] = self._tags
44
45
  if 'compress' not in kwargs:
45
46
  kwargs['compress'] = self.compress
46
47
  return Stream(**kwargs)
@@ -105,7 +106,7 @@ class Stream(pydyf.Stream):
105
106
  lightness, a, b = color.to('lab').coordinates
106
107
  self.set_color_special(None, stroke, lightness, a, b)
107
108
  else:
108
- LOGGER.warn('Unsupported color space %s, use sRGB instead', color.space)
109
+ LOGGER.warning('Unsupported color space %s, use sRGB instead', color.space)
109
110
  self.set_color_rgb(*channels, stroke)
110
111
 
111
112
  def set_font_size(self, font, size):
@@ -248,21 +249,39 @@ class Stream(pydyf.Stream):
248
249
  self._resources['Shading'][shading.id] = shading
249
250
  return shading
250
251
 
251
- def begin_marked_content(self, box, mcid=False, tag=None):
252
- if not self._mark:
253
- return
254
- property_list = None
255
- if tag is None:
256
- tag = self.get_marked_content_tag(box.element_tag)
257
- if mcid:
258
- property_list = pydyf.Dictionary({'MCID': len(self.marked)})
259
- self.marked.append((tag, box))
260
- super().begin_marked_content(tag, property_list)
261
-
262
- def end_marked_content(self):
263
- if not self._mark:
264
- return
265
- super().end_marked_content()
252
+ @contextmanager
253
+ def stacked(self):
254
+ """Save and restore stream context when used with the ``with`` keyword."""
255
+ self.push_state()
256
+ try:
257
+ yield
258
+ finally:
259
+ self.pop_state()
260
+
261
+ @contextmanager
262
+ def marked(self, box, tag):
263
+ if self._tags is not None:
264
+ property_list = None
265
+ mcid = len(self._tags)
266
+ assert box not in self._tags
267
+ self._tags[box] = {'tag': tag, 'mcid': mcid}
268
+ property_list = pydyf.Dictionary({'MCID': mcid})
269
+ super().begin_marked_content(tag, property_list)
270
+ try:
271
+ yield
272
+ finally:
273
+ if self._tags is not None:
274
+ super().end_marked_content()
275
+
276
+ @contextmanager
277
+ def artifact(self):
278
+ if self._tags is not None:
279
+ super().begin_marked_content('Artifact')
280
+ try:
281
+ yield
282
+ finally:
283
+ if self._tags is not None:
284
+ super().end_marked_content()
266
285
 
267
286
  @staticmethod
268
287
  def create_interpolation_function(domain, c0, c1, n):
@@ -283,31 +302,3 @@ class Stream(pydyf.Stream):
283
302
  'Bounds': pydyf.Array(bounds),
284
303
  'Functions': pydyf.Array(sub_functions),
285
304
  })
286
-
287
- def get_marked_content_tag(self, element_tag):
288
- if element_tag == 'div':
289
- return 'Div'
290
- elif element_tag == 'span':
291
- return 'Span'
292
- elif element_tag == 'article':
293
- return 'Art'
294
- elif element_tag == 'section':
295
- return 'Sect'
296
- elif element_tag == 'blockquote':
297
- return 'BlockQuote'
298
- elif element_tag == 'p':
299
- return 'P'
300
- elif element_tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
301
- return element_tag.upper()
302
- elif element_tag in ('dl', 'ul', 'ol'):
303
- return 'L'
304
- elif element_tag in ('li', 'dt', 'dd'):
305
- return 'LI'
306
- elif element_tag == 'table':
307
- return 'Table'
308
- elif element_tag in ('tr', 'th', 'td'):
309
- return element_tag.upper()
310
- elif element_tag in ('thead', 'tbody', 'tfoot'):
311
- return element_tag[:2].upper() + element_tag[2:]
312
- else:
313
- return 'NonStruct'
weasyprint/pdf/tags.py ADDED
@@ -0,0 +1,305 @@
1
+ """PDF tagging."""
2
+
3
+ from collections import defaultdict
4
+
5
+ import pydyf
6
+
7
+ from ..formatting_structure import boxes
8
+ from ..layout.absolute import AbsolutePlaceholder
9
+ from ..logger import LOGGER
10
+
11
+
12
+ def add_tags(pdf, document, page_streams):
13
+ """Add tag tree to the document."""
14
+
15
+ # Add root structure.
16
+ content_mapping = pydyf.Dictionary({})
17
+ pdf.add_object(content_mapping)
18
+ structure_root = pydyf.Dictionary({
19
+ 'Type': '/StructTreeRoot',
20
+ 'ParentTree': content_mapping.reference,
21
+ })
22
+ pdf.add_object(structure_root)
23
+ structure_document = pydyf.Dictionary({
24
+ 'Type': '/StructElem',
25
+ 'S': '/Document',
26
+ 'K': pydyf.Array(),
27
+ 'P': structure_root.reference,
28
+ })
29
+ pdf.add_object(structure_document)
30
+ structure_root['K'] = pydyf.Array([structure_document.reference])
31
+ pdf.catalog['StructTreeRoot'] = structure_root.reference
32
+
33
+ # Map content.
34
+ content_mapping['Nums'] = pydyf.Array()
35
+ links = []
36
+ for page_number, (page, stream) in enumerate(zip(document.pages, page_streams)):
37
+ tags = stream._tags
38
+ page_box = page._page_box
39
+
40
+ # Prepare array for this page’s MCID-to-StructElem mapping.
41
+ content_mapping['Nums'].append(page_number)
42
+ content_mapping['Nums'].append(pydyf.Array())
43
+ page_nums = {}
44
+
45
+ # Map page box content.
46
+ elements = _build_box_tree(
47
+ page_box, structure_document, pdf, page_number, page_nums, links, tags)
48
+ for element in elements:
49
+ structure_document['K'].append(element.reference)
50
+ assert not tags
51
+
52
+ # Flatten page-local nums into global mapping.
53
+ sorted_refs = [ref for _, ref in sorted(page_nums.items())]
54
+ content_mapping['Nums'][-1].extend(sorted_refs)
55
+
56
+ # Add annotations for links.
57
+ for i, (link_reference, annotation) in enumerate(links, start=len(document.pages)):
58
+ content_mapping['Nums'].append(i)
59
+ content_mapping['Nums'].append(link_reference)
60
+ annotation['StructParent'] = i
61
+
62
+ # Add required metadata.
63
+ pdf.catalog['ViewerPreferences'] = pydyf.Dictionary({'DisplayDocTitle': 'true'})
64
+ pdf.catalog['MarkInfo'] = pydyf.Dictionary({'Marked': 'true'})
65
+ if 'Lang' not in pdf.catalog:
66
+ LOGGER.error('Missing required "lang" attribute at the root of the document')
67
+ pdf.catalog['Lang'] = pydyf.String()
68
+
69
+
70
+ def _get_pdf_tag(tag):
71
+ """Get PDF tag corresponding to HTML tag."""
72
+ if tag is None:
73
+ return 'NonStruct'
74
+ elif tag == 'div':
75
+ return 'Div'
76
+ elif tag.split(':')[0] == 'a':
77
+ # Links and link pseudo elements create link annotations.
78
+ return 'Link'
79
+ elif tag == 'span':
80
+ return 'Span'
81
+ elif tag == 'main':
82
+ return 'Part'
83
+ elif tag == 'article':
84
+ return 'Art'
85
+ elif tag == 'section':
86
+ return 'Sect'
87
+ elif tag == 'blockquote':
88
+ return 'BlockQuote'
89
+ elif tag == 'p':
90
+ return 'P'
91
+ elif tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
92
+ return tag.upper()
93
+ elif tag in ('dl', 'ul', 'ol'):
94
+ return 'L'
95
+ elif tag in ('li', 'dt', 'dd'):
96
+ # TODO: dt should be different.
97
+ return 'LI'
98
+ elif tag == 'li::marker':
99
+ return 'Lbl'
100
+ elif tag == 'table':
101
+ return 'Table'
102
+ elif tag in ('tr', 'th', 'td'):
103
+ return tag.upper()
104
+ elif tag in ('thead', 'tbody', 'tfoot'):
105
+ return tag[:2].upper() + tag[2:]
106
+ elif tag == 'img':
107
+ return 'Figure'
108
+ elif tag in ('caption', 'figcaption'):
109
+ return 'Caption'
110
+ else:
111
+ return 'NonStruct'
112
+
113
+
114
+ def _build_box_tree(box, parent, pdf, page_number, nums, links, tags):
115
+ """Recursively build tag tree for given box and yield children."""
116
+
117
+ # Special case for absolute elements.
118
+ if isinstance(box, AbsolutePlaceholder):
119
+ box = box._box
120
+
121
+ element_tag = None if box.element is None else box.element_tag
122
+ tag = _get_pdf_tag(element_tag)
123
+
124
+ # Special case for html, body, page boxes and margin boxes.
125
+ if element_tag in ('html', 'body') or isinstance(box, boxes.PageBox):
126
+ # Avoid generate page, html and body boxes as a semantic node, yield children.
127
+ if isinstance(box, boxes.ParentBox) and not isinstance(box, boxes.LineBox):
128
+ for child in box.children:
129
+ yield from _build_box_tree(
130
+ child, parent, pdf, page_number, nums, links, tags)
131
+ return
132
+ elif isinstance(box, boxes.MarginBox):
133
+ # Build tree for margin boxes but don’t link it to main tree. It ensures that
134
+ # marked content is mapped in document and removed from list. It could be
135
+ # included in tree as Artifact, but that’s only allowed in PDF 2.0.
136
+ for child in box.children:
137
+ tuple(_build_box_tree(child, parent, pdf, page_number, nums, links, tags))
138
+ return
139
+
140
+ # Create box element.
141
+ if tag == 'LI':
142
+ anonymous_list_element = parent['S'] == '/LI'
143
+ anonymous_li_child = parent['S'] == '/LBody'
144
+ dl_item = box.element_tag in ('dt', 'dd')
145
+ no_bullet_li = box.element_tag == 'li' and (
146
+ 'list-item' not in box.style['display'] or
147
+ box.style['list_style_type'] == 'none')
148
+ if anonymous_list_element:
149
+ # Store as list item body.
150
+ tag = 'LBody'
151
+ elif anonymous_li_child:
152
+ # Store as non struct list item body child.
153
+ tag = 'NonStruct'
154
+ elif dl_item or no_bullet_li:
155
+ # Wrap in list item.
156
+ tag = 'LBody'
157
+ parent = pydyf.Dictionary({
158
+ 'Type': '/StructElem',
159
+ 'S': '/LI',
160
+ 'K': pydyf.Array([]),
161
+ 'Pg': pdf.page_references[page_number],
162
+ 'P': parent.reference,
163
+ })
164
+ pdf.add_object(parent)
165
+ children = _build_box_tree(box, parent, pdf, page_number, nums, links, tags)
166
+ for child in children:
167
+ parent['K'].append(child.reference)
168
+ yield parent
169
+ return
170
+
171
+ element = pydyf.Dictionary({
172
+ 'Type': '/StructElem',
173
+ 'S': f'/{tag}',
174
+ 'K': pydyf.Array([]),
175
+ 'Pg': pdf.page_references[page_number],
176
+ 'P': parent.reference,
177
+ })
178
+ pdf.add_object(element)
179
+
180
+ # Handle special cases.
181
+ if tag == 'Figure':
182
+ # Add extra data for images.
183
+ x1, y1 = box.content_box_x(), box.content_box_y()
184
+ x2, y2 = x1 + box.width, y1 + box.height
185
+ element['A'] = pydyf.Dictionary({
186
+ 'O': '/Layout',
187
+ 'BBox': pydyf.Array((x1, y1, x2, y2)),
188
+ })
189
+ if alt := box.element.attrib.get('alt'):
190
+ element['Alt'] = pydyf.String(alt)
191
+ else:
192
+ source = box.element.attrib.get('src', 'unknown')
193
+ LOGGER.error(f'Image "{source}" has no required alt description')
194
+ elif tag == 'Table':
195
+ # Use wrapped table as tagged box, and put captions in it.
196
+ wrapper, table = box, box.get_wrapped_table()
197
+ box = table.copy_with_children([])
198
+ for child in wrapper.children:
199
+ box.children.extend(child.children if child is table else [child])
200
+ elif tag == 'TH':
201
+ # Set identifier for table headers to reference them in cells.
202
+ element['ID'] = pydyf.String(id(box))
203
+ elif tag == 'TD':
204
+ # Store table cell element to map it to headers later.
205
+ # TODO: don’t use the box to store this.
206
+ box.mark = element
207
+
208
+ # Include link annotations.
209
+ if box.link_annotation:
210
+ annotation = box.link_annotation
211
+ object_reference = pydyf.Dictionary({
212
+ 'Type': '/OBJR',
213
+ 'Obj': annotation.reference,
214
+ 'Pg': pdf.page_references[page_number],
215
+ })
216
+ pdf.add_object(object_reference)
217
+ links.append((element.reference, annotation))
218
+ element['K'].append(object_reference.reference)
219
+
220
+ if isinstance(box, boxes.ParentBox):
221
+ # Build tree for box children.
222
+ for child in box.children:
223
+ children = child.children if isinstance(child, boxes.LineBox) else [child]
224
+ for child in children:
225
+ if isinstance(child, boxes.TextBox):
226
+ # Add marked element from the stream.
227
+ kid = tags.pop(child)
228
+ assert kid['mcid'] not in nums
229
+ if tag == 'Link':
230
+ # Associate MCID directly with link reference.
231
+ element['K'].append(kid['mcid'])
232
+ nums[kid['mcid']] = element.reference
233
+ else:
234
+ kid_element = pydyf.Dictionary({
235
+ 'Type': '/StructElem',
236
+ 'S': f'/{kid["tag"]}',
237
+ 'K': pydyf.Array([kid['mcid']]),
238
+ 'Pg': pdf.page_references[page_number],
239
+ 'P': element.reference,
240
+ })
241
+ pdf.add_object(kid_element)
242
+ element['K'].append(kid_element.reference)
243
+ nums[kid['mcid']] = kid_element.reference
244
+ else:
245
+ # Recursively build tree for child.
246
+ if child.element_tag in ('ul', 'ol') and element['S'] == '/LI':
247
+ # In PDFs, nested lists are linked to the parent list, but in
248
+ # HTML, nested lists are linked to a parent’s list item.
249
+ child_parent = parent
250
+ else:
251
+ child_parent = element
252
+ child_elements = _build_box_tree(
253
+ child, child_parent, pdf, page_number, nums, links, tags)
254
+
255
+ # Check if it is already been referenced before.
256
+ for child_element in child_elements:
257
+ child_parent['K'].append(child_element.reference)
258
+
259
+ else:
260
+ # Add replaced box.
261
+ assert isinstance(box, boxes.ReplacedBox)
262
+ kid = tags.pop(box)
263
+ element['K'].append(kid['mcid'])
264
+ assert kid['mcid'] not in nums
265
+ nums[kid['mcid']] = element.reference
266
+
267
+ # Link table cells to related headers.
268
+ if tag == 'Table':
269
+ def _get_rows(table_box):
270
+ for child in table_box.children:
271
+ if child.element_tag == 'tr':
272
+ yield child
273
+ else:
274
+ yield from _get_rows(child)
275
+
276
+ # Get headers and rows.
277
+ column_headers = defaultdict(list)
278
+ row_headers = defaultdict(list)
279
+ rows = tuple(_get_rows(box))
280
+
281
+ # Find column and row headers.
282
+ # TODO: handle rowspan and colspan values.
283
+ for i, row in enumerate(rows):
284
+ for j, cell in enumerate(row.children):
285
+ if cell.element is None:
286
+ continue
287
+ if cell.element_tag == 'th':
288
+ # TODO: handle rowgroup and colgroup values.
289
+ if cell.element.attrib.get('scope') == 'row':
290
+ row_headers[i].append(pydyf.String(id(cell)))
291
+ else:
292
+ column_headers[j].append(pydyf.String(id(cell)))
293
+
294
+ # Map headers to cells.
295
+ for i, row in enumerate(rows):
296
+ for j, cell in enumerate(row.children):
297
+ if cell.element is None:
298
+ continue
299
+ if cell.element_tag == 'td':
300
+ cell.mark['A'] = pydyf.Dictionary({
301
+ 'O': '/Table',
302
+ 'Headers': pydyf.Array(row_headers[i] + column_headers[j]),
303
+ })
304
+
305
+ yield element