weasyprint-65.1-py3-none-any.whl → weasyprint-67.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. weasyprint/__init__.py +17 -7
  2. weasyprint/__main__.py +21 -10
  3. weasyprint/anchors.py +4 -4
  4. weasyprint/css/__init__.py +732 -67
  5. weasyprint/css/computed_values.py +65 -170
  6. weasyprint/css/counters.py +1 -1
  7. weasyprint/css/functions.py +206 -0
  8. weasyprint/css/html5_ua.css +3 -7
  9. weasyprint/css/html5_ua_form.css +2 -2
  10. weasyprint/css/media_queries.py +3 -1
  11. weasyprint/css/properties.py +6 -2
  12. weasyprint/css/{utils.py → tokens.py} +306 -397
  13. weasyprint/css/units.py +91 -0
  14. weasyprint/css/validation/__init__.py +1 -1
  15. weasyprint/css/validation/descriptors.py +47 -19
  16. weasyprint/css/validation/expanders.py +7 -8
  17. weasyprint/css/validation/properties.py +341 -357
  18. weasyprint/document.py +20 -19
  19. weasyprint/draw/__init__.py +56 -63
  20. weasyprint/draw/border.py +121 -69
  21. weasyprint/draw/color.py +1 -1
  22. weasyprint/draw/text.py +60 -41
  23. weasyprint/formatting_structure/boxes.py +24 -5
  24. weasyprint/formatting_structure/build.py +33 -45
  25. weasyprint/images.py +76 -62
  26. weasyprint/layout/__init__.py +32 -26
  27. weasyprint/layout/absolute.py +7 -6
  28. weasyprint/layout/background.py +7 -7
  29. weasyprint/layout/block.py +195 -152
  30. weasyprint/layout/column.py +19 -24
  31. weasyprint/layout/flex.py +54 -26
  32. weasyprint/layout/float.py +12 -7
  33. weasyprint/layout/grid.py +284 -90
  34. weasyprint/layout/inline.py +121 -68
  35. weasyprint/layout/page.py +45 -12
  36. weasyprint/layout/percent.py +14 -10
  37. weasyprint/layout/preferred.py +105 -63
  38. weasyprint/layout/replaced.py +9 -6
  39. weasyprint/layout/table.py +16 -9
  40. weasyprint/pdf/__init__.py +58 -18
  41. weasyprint/pdf/anchors.py +3 -4
  42. weasyprint/pdf/fonts.py +126 -69
  43. weasyprint/pdf/metadata.py +36 -4
  44. weasyprint/pdf/pdfa.py +19 -3
  45. weasyprint/pdf/pdfua.py +7 -115
  46. weasyprint/pdf/pdfx.py +83 -0
  47. weasyprint/pdf/stream.py +57 -49
  48. weasyprint/pdf/tags.py +307 -0
  49. weasyprint/stacking.py +14 -15
  50. weasyprint/svg/__init__.py +59 -32
  51. weasyprint/svg/bounding_box.py +4 -2
  52. weasyprint/svg/defs.py +4 -9
  53. weasyprint/svg/images.py +11 -3
  54. weasyprint/svg/text.py +11 -2
  55. weasyprint/svg/utils.py +15 -8
  56. weasyprint/text/constants.py +1 -1
  57. weasyprint/text/ffi.py +4 -3
  58. weasyprint/text/fonts.py +13 -5
  59. weasyprint/text/line_break.py +146 -43
  60. weasyprint/urls.py +41 -13
  61. {weasyprint-65.1.dist-info → weasyprint-67.0.dist-info}/METADATA +5 -6
  62. weasyprint-67.0.dist-info/RECORD +77 -0
  63. weasyprint/draw/stack.py +0 -13
  64. weasyprint-65.1.dist-info/RECORD +0 -74
  65. {weasyprint-65.1.dist-info → weasyprint-67.0.dist-info}/WHEEL +0 -0
  66. {weasyprint-65.1.dist-info → weasyprint-67.0.dist-info}/entry_points.txt +0 -0
  67. {weasyprint-65.1.dist-info → weasyprint-67.0.dist-info}/licenses/LICENSE +0 -0
weasyprint/pdf/pdfua.py CHANGED
@@ -1,125 +1,17 @@
1
1
  """PDF/UA generation."""
2
2
 
3
- import pydyf
3
+ from functools import partial
4
4
 
5
5
  from .metadata import add_metadata
6
6
 
7
7
 
8
- def pdfua(pdf, metadata, document, page_streams, attachments, compress):
8
+ def pdfua(pdf, metadata, document, page_streams, attachments, compress, version):
9
9
  """Set metadata for PDF/UA documents."""
10
- # Structure for PDF tagging
11
- content_mapping = pydyf.Dictionary({})
12
- pdf.add_object(content_mapping)
13
- structure_root = pydyf.Dictionary({
14
- 'Type': '/StructTreeRoot',
15
- 'ParentTree': content_mapping.reference,
16
- })
17
- pdf.add_object(structure_root)
18
- structure_document = pydyf.Dictionary({
19
- 'Type': '/StructElem',
20
- 'S': '/Document',
21
- 'P': structure_root.reference,
22
- })
23
- pdf.add_object(structure_document)
24
- structure_root['K'] = pydyf.Array([structure_document.reference])
25
- pdf.catalog['StructTreeRoot'] = structure_root.reference
26
-
27
- document_children = []
28
- content_mapping['Nums'] = pydyf.Array()
29
- links = []
30
- for page_number, page_stream in enumerate(page_streams):
31
- structure = {}
32
- document.build_element_structure(structure)
33
- parents = [None] * len(page_stream.marked)
34
- for mcid, (key, box) in enumerate(page_stream.marked):
35
- # Build structure elements
36
- kids = [mcid]
37
- if key == 'Link':
38
- object_reference = pydyf.Dictionary({
39
- 'Type': '/OBJR',
40
- 'Obj': box.link_annotation.reference,
41
- 'Pg': pdf.page_references[page_number],
42
- })
43
- pdf.add_object(object_reference)
44
- links.append((object_reference.reference, box.link_annotation))
45
- etree_element = box.element
46
- child_structure_data_element = None
47
- while True:
48
- if etree_element is None:
49
- structure_data = structure.setdefault(
50
- box, {'parent': None})
51
- else:
52
- structure_data = structure[etree_element]
53
- new_element = 'element' not in structure_data
54
- if new_element:
55
- child = structure_data['element'] = pydyf.Dictionary({
56
- 'Type': '/StructElem',
57
- 'S': f'/{key}',
58
- 'K': pydyf.Array(kids),
59
- 'Pg': pdf.page_references[page_number],
60
- })
61
- pdf.add_object(child)
62
- if key == 'LI':
63
- if etree_element.tag == 'dt':
64
- sub_key = 'Lbl'
65
- else:
66
- sub_key = 'LBody'
67
- real_child = pydyf.Dictionary({
68
- 'Type': '/StructElem',
69
- 'S': f'/{sub_key}',
70
- 'K': pydyf.Array(kids),
71
- 'Pg': pdf.page_references[page_number],
72
- 'P': child.reference,
73
- })
74
- pdf.add_object(real_child)
75
- for kid in kids:
76
- if isinstance(kid, int):
77
- parents[kid] = real_child.reference
78
- child['K'] = pydyf.Array([real_child.reference])
79
- structure_data['element'] = real_child
80
- else:
81
- for kid in kids:
82
- if isinstance(kid, int):
83
- parents[kid] = child.reference
84
- else:
85
- child = structure_data['element']
86
- child['K'].extend(kids)
87
- for kid in kids:
88
- if isinstance(kid, int):
89
- parents[kid] = child.reference
90
- kid = child.reference
91
- if child_structure_data_element is not None:
92
- child_structure_data_element['P'] = kid
93
- if not new_element:
94
- break
95
- kids = [kid]
96
- child_structure_data_element = child
97
- if structure_data['parent'] is None:
98
- child['P'] = structure_document.reference
99
- document_children.append(child.reference)
100
- break
101
- else:
102
- etree_element = structure_data['parent']
103
- key = page_stream.get_marked_content_tag(etree_element.tag)
104
- content_mapping['Nums'].append(page_number)
105
- content_mapping['Nums'].append(pydyf.Array(parents))
106
- structure_document['K'] = pydyf.Array(document_children)
107
- for i, (link, annotation) in enumerate(links, start=page_number + 1):
108
- content_mapping['Nums'].append(i)
109
- content_mapping['Nums'].append(link)
110
- annotation['StructParent'] = i
111
- annotation['F'] = 2 ** (2 - 1)
112
-
113
10
  # Common PDF metadata stream
114
- add_metadata(pdf, metadata, 'ua', 1, conformance=None, compress=compress)
115
-
116
- # PDF document extra metadata
117
- if 'Lang' not in pdf.catalog:
118
- pdf.catalog['Lang'] = pydyf.String()
119
- pdf.catalog['ViewerPreferences'] = pydyf.Dictionary({
120
- 'DisplayDocTitle': 'true',
121
- })
122
- pdf.catalog['MarkInfo'] = pydyf.Dictionary({'Marked': 'true'})
11
+ add_metadata(pdf, metadata, 'ua', version, conformance=None, compress=compress)
123
12
 
124
13
 
125
- VARIANTS = {'pdf/ua-1': (pdfua, {'mark': True})}
14
+ VARIANTS = {
15
+ 'pdf/ua-1': (partial(pdfua, version=1), {'version': '1.7', 'pdf_tags': True}),
16
+ 'pdf/ua-2': (partial(pdfua, version=2), {'version': '2.0', 'pdf_tags': True}),
17
+ }
weasyprint/pdf/pdfx.py ADDED
@@ -0,0 +1,83 @@
1
+ """PDF/X generation."""
2
+
3
+ from functools import partial
4
+ from time import localtime
5
+
6
+ import pydyf
7
+
8
+ from .metadata import add_metadata
9
+
10
+
11
+ def pdfx(pdf, metadata, document, page_streams, attachments, compress, version,
12
+ variant):
13
+ """Set metadata for PDF/X documents."""
14
+
15
+ # Add conformance metadata.
16
+ conformance = f'PDF/X-{version}{variant}'
17
+ if version < 4:
18
+ pdf.info['GTS_PDFXVersion'] = pydyf.String(conformance)
19
+ pdf.info['GTS_PDFXConformance'] = pydyf.String(conformance)
20
+ pdf.info['Trapped'] = '/False'
21
+ now = localtime()
22
+ year, month, day, hour, minute, second = now[:6]
23
+ tz_hour, tz_minute = divmod(now.tm_gmtoff, 3600)
24
+ now_iso = (
25
+ f'{year:04}-{month:02}-{day:02}T{hour:02}:{minute:02}:{second:02}'
26
+ f'{tz_hour:+03}:{tz_minute:02}')
27
+ now_pdf = (
28
+ f'(D:{year:04}{month:02}{day:02}{hour:02}{minute:02}{second:02}'
29
+ f"{tz_hour:+03}'{tz_minute:02}')")
30
+ if not metadata.modified:
31
+ metadata.modified = now_iso
32
+ pdf.info['ModDate'] = now_pdf
33
+ if not metadata.created:
34
+ metadata.created = now_iso
35
+ pdf.info['CreationDate'] = now_pdf
36
+
37
+ # Add output intents.
38
+ if 'device-cmyk' not in document.color_profiles:
39
+ # Add standard CMYK profile.
40
+ pdf.catalog['OutputIntents'] = pydyf.Array([
41
+ pydyf.Dictionary({
42
+ 'Type': '/OutputIntent',
43
+ 'S': '/GTS_PDFX',
44
+ 'OutputConditionIdentifier': pydyf.String('CGATS TR 001'),
45
+ 'RegistryName': pydyf.String('http://www.color.org'),
46
+ }),
47
+ ])
48
+
49
+ # Common PDF metadata stream.
50
+ add_metadata(pdf, metadata, 'x', version, conformance, compress=compress)
51
+
52
+
53
+ VARIANTS = {
54
+ 'pdf/x-1a': (
55
+ partial(pdfx, version=1, variant='a:2003'),
56
+ {'version': '1.4', 'identifier': True},
57
+ ),
58
+ 'pdf/x-3': (
59
+ partial(pdfx, version=3, variant=':2003'),
60
+ {'version': '1.4', 'identifier': True},
61
+ ),
62
+ 'pdf/x-4': (
63
+ partial(pdfx, version=4, variant=''),
64
+ {'version': '1.6', 'identifier': True},
65
+ ),
66
+ 'pdf/x-5g': (
67
+ partial(pdfx, version=5, variant='g'),
68
+ {'version': '1.6', 'identifier': True},
69
+ ),
70
+ # TODO: these variants forbid OutputIntent to include ICC file.
71
+ # 'pdf/x-4p': (
72
+ # partial(pdfx, version=4, variant='p'),
73
+ # {'version': '1.6', 'identifier': True},
74
+ # ),
75
+ # 'pdf/x-5pg': (
76
+ # partial(pdfx, version=5, variant='pg'),
77
+ # {'version': '1.6', 'identifier': True},
78
+ # ),
79
+ # 'pdf/x-5n': (
80
+ # partial(pdfx, version=5, variant='n'),
81
+ # {'version': '1.6', 'identifier': True},
82
+ # ),
83
+ }
weasyprint/pdf/stream.py CHANGED
@@ -1,5 +1,7 @@
1
1
  """PDF stream."""
2
2
 
3
+ from contextlib import contextmanager
4
+
3
5
  import pydyf
4
6
 
5
7
  from ..logger import LOGGER
@@ -11,14 +13,15 @@ from .fonts import Font
11
13
 
12
14
  class Stream(pydyf.Stream):
13
15
  """PDF stream object with extra features."""
14
- def __init__(self, fonts, page_rectangle, resources, images, mark, *args, **kwargs):
16
+ def __init__(self, fonts, page_rectangle, resources, images, tags, color_profiles,
17
+ *args, **kwargs):
15
18
  super().__init__(*args, **kwargs)
16
19
  self.page_rectangle = page_rectangle
17
- self.marked = []
18
20
  self._fonts = fonts
19
21
  self._resources = resources
20
22
  self._images = images
21
- self._mark = mark
23
+ self._tags = tags
24
+ self._color_profiles = color_profiles
22
25
  self._current_color = self._current_color_stroke = None
23
26
  self._current_alpha = self._current_alpha_stroke = None
24
27
  self._current_font = self._current_font_size = None
@@ -39,8 +42,10 @@ class Stream(pydyf.Stream):
39
42
  kwargs['resources'] = self._resources
40
43
  if 'images' not in kwargs:
41
44
  kwargs['images'] = self._images
42
- if 'mark' not in kwargs:
43
- kwargs['mark'] = self._mark
45
+ if 'tags' not in kwargs:
46
+ kwargs['tags'] = self._tags
47
+ if 'color_profiles' not in kwargs:
48
+ kwargs['color_profiles'] = self._color_profiles
44
49
  if 'compress' not in kwargs:
45
50
  kwargs['compress'] = self.compress
46
51
  return Stream(**kwargs)
@@ -104,8 +109,21 @@ class Stream(pydyf.Stream):
104
109
  self.set_color_space('lab-d50', stroke)
105
110
  lightness, a, b = color.to('lab').coordinates
106
111
  self.set_color_special(None, stroke, lightness, a, b)
112
+ elif color.space == 'device-cmyk':
113
+ self.set_color_space('DeviceCMYK', stroke)
114
+ c, m, y, k = color.coordinates
115
+ self.set_color_special(None, stroke, c, m, y, k)
116
+ elif color.space.startswith('--') and color.space in self._color_profiles:
117
+ self.set_color_space(color.space, stroke)
118
+ self.set_color_special(None, stroke, *color.coordinates)
107
119
  else:
108
- LOGGER.warn('Unsupported color space %s, use sRGB instead', color.space)
120
+ LOGGER.warning('Unsupported color space %s, use sRGB instead', color.space)
121
+ if len(channels) > 3:
122
+ channels = channels[:3]
123
+ elif len(channels) == 2:
124
+ channels = *channels, 0
125
+ elif len(channels) == 1:
126
+ channels = *channels, 0, 0
109
127
  self.set_color_rgb(*channels, stroke)
110
128
 
111
129
  def set_font_size(self, font, size):
@@ -248,21 +266,39 @@ class Stream(pydyf.Stream):
248
266
  self._resources['Shading'][shading.id] = shading
249
267
  return shading
250
268
 
251
- def begin_marked_content(self, box, mcid=False, tag=None):
252
- if not self._mark:
253
- return
254
- property_list = None
255
- if tag is None:
256
- tag = self.get_marked_content_tag(box.element_tag)
257
- if mcid:
258
- property_list = pydyf.Dictionary({'MCID': len(self.marked)})
259
- self.marked.append((tag, box))
260
- super().begin_marked_content(tag, property_list)
261
-
262
- def end_marked_content(self):
263
- if not self._mark:
264
- return
265
- super().end_marked_content()
269
+ @contextmanager
270
+ def stacked(self):
271
+ """Save and restore stream context when used with the ``with`` keyword."""
272
+ self.push_state()
273
+ try:
274
+ yield
275
+ finally:
276
+ self.pop_state()
277
+
278
+ @contextmanager
279
+ def marked(self, box, tag):
280
+ if self._tags is not None:
281
+ property_list = None
282
+ mcid = len(self._tags)
283
+ assert box not in self._tags
284
+ self._tags[box] = {'tag': tag, 'mcid': mcid}
285
+ property_list = pydyf.Dictionary({'MCID': mcid})
286
+ super().begin_marked_content(tag, property_list)
287
+ try:
288
+ yield
289
+ finally:
290
+ if self._tags is not None:
291
+ super().end_marked_content()
292
+
293
+ @contextmanager
294
+ def artifact(self):
295
+ if self._tags is not None:
296
+ super().begin_marked_content('Artifact')
297
+ try:
298
+ yield
299
+ finally:
300
+ if self._tags is not None:
301
+ super().end_marked_content()
266
302
 
267
303
  @staticmethod
268
304
  def create_interpolation_function(domain, c0, c1, n):
@@ -283,31 +319,3 @@ class Stream(pydyf.Stream):
283
319
  'Bounds': pydyf.Array(bounds),
284
320
  'Functions': pydyf.Array(sub_functions),
285
321
  })
286
-
287
- def get_marked_content_tag(self, element_tag):
288
- if element_tag == 'div':
289
- return 'Div'
290
- elif element_tag == 'span':
291
- return 'Span'
292
- elif element_tag == 'article':
293
- return 'Art'
294
- elif element_tag == 'section':
295
- return 'Sect'
296
- elif element_tag == 'blockquote':
297
- return 'BlockQuote'
298
- elif element_tag == 'p':
299
- return 'P'
300
- elif element_tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
301
- return element_tag.upper()
302
- elif element_tag in ('dl', 'ul', 'ol'):
303
- return 'L'
304
- elif element_tag in ('li', 'dt', 'dd'):
305
- return 'LI'
306
- elif element_tag == 'table':
307
- return 'Table'
308
- elif element_tag in ('tr', 'th', 'td'):
309
- return element_tag.upper()
310
- elif element_tag in ('thead', 'tbody', 'tfoot'):
311
- return element_tag[:2].upper() + element_tag[2:]
312
- else:
313
- return 'NonStruct'
weasyprint/pdf/tags.py ADDED
@@ -0,0 +1,307 @@
1
+ """PDF tagging."""
2
+
3
+ from collections import defaultdict
4
+
5
+ import pydyf
6
+
7
+ from ..formatting_structure import boxes
8
+ from ..layout.absolute import AbsolutePlaceholder
9
+ from ..logger import LOGGER
10
+
11
+
12
+ def add_tags(pdf, document, page_streams):
13
+ """Add tag tree to the document."""
14
+
15
+ # Add root structure.
16
+ content_mapping = pydyf.Dictionary({})
17
+ pdf.add_object(content_mapping)
18
+ structure_root = pydyf.Dictionary({
19
+ 'Type': '/StructTreeRoot',
20
+ 'ParentTree': content_mapping.reference,
21
+ })
22
+ pdf.add_object(structure_root)
23
+ structure_document = pydyf.Dictionary({
24
+ 'Type': '/StructElem',
25
+ 'S': '/Document',
26
+ 'K': pydyf.Array(),
27
+ 'P': structure_root.reference,
28
+ })
29
+ pdf.add_object(structure_document)
30
+ structure_root['K'] = pydyf.Array([structure_document.reference])
31
+ pdf.catalog['StructTreeRoot'] = structure_root.reference
32
+
33
+ # Map content.
34
+ content_mapping['Nums'] = pydyf.Array()
35
+ links = []
36
+ for page_number, (page, stream) in enumerate(zip(document.pages, page_streams)):
37
+ tags = stream._tags
38
+ page_box = page._page_box
39
+
40
+ # Prepare array for this page’s MCID-to-StructElem mapping.
41
+ content_mapping['Nums'].append(page_number)
42
+ content_mapping['Nums'].append(pydyf.Array())
43
+ page_nums = {}
44
+
45
+ # Map page box content.
46
+ elements = _build_box_tree(
47
+ page_box, structure_document, pdf, page_number, page_nums, links, tags)
48
+ for element in elements:
49
+ structure_document['K'].append(element.reference)
50
+ assert not tags
51
+
52
+ # Flatten page-local nums into global mapping.
53
+ sorted_refs = [ref for _, ref in sorted(page_nums.items())]
54
+ content_mapping['Nums'][-1].extend(sorted_refs)
55
+
56
+ # Add annotations for links.
57
+ for i, (link_reference, annotation) in enumerate(links, start=len(document.pages)):
58
+ content_mapping['Nums'].append(i)
59
+ content_mapping['Nums'].append(link_reference)
60
+ annotation['StructParent'] = i
61
+
62
+ # Add required metadata.
63
+ pdf.catalog['ViewerPreferences'] = pydyf.Dictionary({'DisplayDocTitle': 'true'})
64
+ pdf.catalog['MarkInfo'] = pydyf.Dictionary({'Marked': 'true'})
65
+ if 'Lang' not in pdf.catalog:
66
+ LOGGER.error('Missing required "lang" attribute at the root of the document')
67
+ pdf.catalog['Lang'] = pydyf.String()
68
+
69
+
70
+ def _get_pdf_tag(tag):
71
+ """Get PDF tag corresponding to HTML tag."""
72
+ if tag is None:
73
+ return 'NonStruct'
74
+ elif tag == 'div':
75
+ return 'Div'
76
+ elif tag.split(':')[0] == 'a':
77
+ # Links and link pseudo elements create link annotations.
78
+ return 'Link'
79
+ elif tag == 'span':
80
+ return 'Span'
81
+ elif tag == 'main':
82
+ return 'Part'
83
+ elif tag == 'article':
84
+ return 'Art'
85
+ elif tag == 'section':
86
+ return 'Sect'
87
+ elif tag == 'blockquote':
88
+ return 'BlockQuote'
89
+ elif tag == 'p':
90
+ return 'P'
91
+ elif tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
92
+ return tag.upper()
93
+ elif tag in ('dl', 'ul', 'ol'):
94
+ return 'L'
95
+ elif tag in ('li', 'dt', 'dd'):
96
+ # TODO: dt should be different.
97
+ return 'LI'
98
+ elif tag == 'li::marker':
99
+ return 'Lbl'
100
+ elif tag == 'table':
101
+ return 'Table'
102
+ elif tag in ('tr', 'th', 'td'):
103
+ return tag.upper()
104
+ elif tag in ('thead', 'tbody', 'tfoot'):
105
+ return tag[:2].upper() + tag[2:]
106
+ elif tag == 'img':
107
+ return 'Figure'
108
+ elif tag in ('caption', 'figcaption'):
109
+ return 'Caption'
110
+ else:
111
+ return 'NonStruct'
112
+
113
+
114
+ def _build_box_tree(box, parent, pdf, page_number, nums, links, tags):
115
+ """Recursively build tag tree for given box and yield children."""
116
+
117
+ # Special case for absolute elements.
118
+ if isinstance(box, AbsolutePlaceholder):
119
+ box = box._box
120
+
121
+ element_tag = None if box.element is None else box.element_tag
122
+ tag = _get_pdf_tag(element_tag)
123
+
124
+ # Special case for html, body, page boxes and margin boxes.
125
+ if element_tag in ('html', 'body') or isinstance(box, boxes.PageBox):
126
+ # Avoid generate page, html and body boxes as a semantic node, yield children.
127
+ if isinstance(box, boxes.ParentBox) and not isinstance(box, boxes.LineBox):
128
+ for child in box.children:
129
+ yield from _build_box_tree(
130
+ child, parent, pdf, page_number, nums, links, tags)
131
+ return
132
+ elif isinstance(box, boxes.MarginBox):
133
+ # Build tree for margin boxes but don’t link it to main tree. It ensures that
134
+ # marked content is mapped in document and removed from list. It could be
135
+ # included in tree as Artifact, but that’s only allowed in PDF 2.0.
136
+ for child in box.children:
137
+ tuple(_build_box_tree(child, parent, pdf, page_number, nums, links, tags))
138
+ return
139
+
140
+ # Create box element.
141
+ if tag == 'LI':
142
+ anonymous_list_element = parent['S'] == '/LI'
143
+ anonymous_li_child = parent['S'] == '/LBody'
144
+ dl_item = box.element_tag in ('dt', 'dd')
145
+ no_bullet_li = box.element_tag == 'li' and (
146
+ 'list-item' not in box.style['display'] or
147
+ box.style['list_style_type'] == 'none')
148
+ if anonymous_list_element:
149
+ # Store as list item body.
150
+ tag = 'LBody'
151
+ elif anonymous_li_child:
152
+ # Store as non struct list item body child.
153
+ tag = 'NonStruct'
154
+ elif dl_item or no_bullet_li:
155
+ # Wrap in list item.
156
+ tag = 'LBody'
157
+ parent = pydyf.Dictionary({
158
+ 'Type': '/StructElem',
159
+ 'S': '/LI',
160
+ 'K': pydyf.Array([]),
161
+ 'Pg': pdf.page_references[page_number],
162
+ 'P': parent.reference,
163
+ })
164
+ pdf.add_object(parent)
165
+ children = _build_box_tree(box, parent, pdf, page_number, nums, links, tags)
166
+ for child in children:
167
+ parent['K'].append(child.reference)
168
+ yield parent
169
+ return
170
+
171
+ element = pydyf.Dictionary({
172
+ 'Type': '/StructElem',
173
+ 'S': f'/{tag}',
174
+ 'K': pydyf.Array([]),
175
+ 'Pg': pdf.page_references[page_number],
176
+ 'P': parent.reference,
177
+ })
178
+ pdf.add_object(element)
179
+
180
+ # Handle special cases.
181
+ if tag == 'Figure':
182
+ # Add extra data for images.
183
+ x1, y1 = box.content_box_x(), box.content_box_y()
184
+ x2, y2 = x1 + box.width, y1 + box.height
185
+ element['A'] = pydyf.Dictionary({
186
+ 'O': '/Layout',
187
+ 'BBox': pydyf.Array((x1, y1, x2, y2)),
188
+ })
189
+ if alt := box.element.attrib.get('alt'):
190
+ element['Alt'] = pydyf.String(alt)
191
+ else:
192
+ source = box.element.attrib.get('src', 'unknown')
193
+ LOGGER.error(f'Image "{source}" has no required alt description')
194
+ elif tag == 'Table':
195
+ # Use wrapped table as tagged box, and put captions in it.
196
+ if box.is_table_wrapper:
197
+ # Can be false if table has another display type.
198
+ wrapper, table = box, box.get_wrapped_table()
199
+ box = table.copy_with_children([])
200
+ for child in wrapper.children:
201
+ box.children.extend(child.children if child is table else [child])
202
+ elif tag == 'TH':
203
+ # Set identifier for table headers to reference them in cells.
204
+ element['ID'] = pydyf.String(id(box))
205
+ elif tag == 'TD':
206
+ # Store table cell element to map it to headers later.
207
+ # TODO: don’t use the box to store this.
208
+ box.mark = element
209
+
210
+ # Include link annotations.
211
+ if box.link_annotation:
212
+ annotation = box.link_annotation
213
+ object_reference = pydyf.Dictionary({
214
+ 'Type': '/OBJR',
215
+ 'Obj': annotation.reference,
216
+ 'Pg': pdf.page_references[page_number],
217
+ })
218
+ pdf.add_object(object_reference)
219
+ links.append((element.reference, annotation))
220
+ element['K'].append(object_reference.reference)
221
+
222
+ if isinstance(box, boxes.ParentBox):
223
+ # Build tree for box children.
224
+ for child in box.children:
225
+ children = child.children if isinstance(child, boxes.LineBox) else [child]
226
+ for child in children:
227
+ if isinstance(child, boxes.TextBox):
228
+ # Add marked element from the stream.
229
+ kid = tags.pop(child)
230
+ assert kid['mcid'] not in nums
231
+ if tag == 'Link':
232
+ # Associate MCID directly with link reference.
233
+ element['K'].append(kid['mcid'])
234
+ nums[kid['mcid']] = element.reference
235
+ else:
236
+ kid_element = pydyf.Dictionary({
237
+ 'Type': '/StructElem',
238
+ 'S': f'/{kid["tag"]}',
239
+ 'K': pydyf.Array([kid['mcid']]),
240
+ 'Pg': pdf.page_references[page_number],
241
+ 'P': element.reference,
242
+ })
243
+ pdf.add_object(kid_element)
244
+ element['K'].append(kid_element.reference)
245
+ nums[kid['mcid']] = kid_element.reference
246
+ else:
247
+ # Recursively build tree for child.
248
+ if child.element_tag in ('ul', 'ol') and element['S'] == '/LI':
249
+ # In PDFs, nested lists are linked to the parent list, but in
250
+ # HTML, nested lists are linked to a parent’s list item.
251
+ child_parent = parent
252
+ else:
253
+ child_parent = element
254
+ child_elements = _build_box_tree(
255
+ child, child_parent, pdf, page_number, nums, links, tags)
256
+
257
+ # Check if it is already been referenced before.
258
+ for child_element in child_elements:
259
+ child_parent['K'].append(child_element.reference)
260
+
261
+ else:
262
+ # Add replaced box.
263
+ assert isinstance(box, boxes.ReplacedBox)
264
+ kid = tags.pop(box)
265
+ element['K'].append(kid['mcid'])
266
+ assert kid['mcid'] not in nums
267
+ nums[kid['mcid']] = element.reference
268
+
269
+ # Link table cells to related headers.
270
+ if tag == 'Table':
271
+ def _get_rows(table_box):
272
+ for child in table_box.children:
273
+ if child.element_tag == 'tr':
274
+ yield child
275
+ else:
276
+ yield from _get_rows(child)
277
+
278
+ # Get headers and rows.
279
+ column_headers = defaultdict(list)
280
+ row_headers = defaultdict(list)
281
+ rows = tuple(_get_rows(box))
282
+
283
+ # Find column and row headers.
284
+ # TODO: handle rowspan and colspan values.
285
+ for i, row in enumerate(rows):
286
+ for j, cell in enumerate(row.children):
287
+ if cell.element is None:
288
+ continue
289
+ if cell.element_tag == 'th':
290
+ # TODO: handle rowgroup and colgroup values.
291
+ if cell.element.attrib.get('scope') == 'row':
292
+ row_headers[i].append(pydyf.String(id(cell)))
293
+ else:
294
+ column_headers[j].append(pydyf.String(id(cell)))
295
+
296
+ # Map headers to cells.
297
+ for i, row in enumerate(rows):
298
+ for j, cell in enumerate(row.children):
299
+ if cell.element is None:
300
+ continue
301
+ if cell.element_tag == 'td':
302
+ cell.mark['A'] = pydyf.Dictionary({
303
+ 'O': '/Table',
304
+ 'Headers': pydyf.Array(row_headers[i] + column_headers[j]),
305
+ })
306
+
307
+ yield element