docling 2.69.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (138) hide show
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1905 @@
1
+ """Backend to parse patents from the United States Patent Office (USPTO).
2
+
3
+ The parsers included in this module can handle patent grants published since 1976 and
4
+ patent applications since 2001.
5
+ The original files can be found in https://bulkdata.uspto.gov.
6
+ """
7
+
8
+ import html
9
+ import logging
10
+ import re
11
+ import xml.sax
12
+ import xml.sax.xmlreader
13
+ from abc import ABC, abstractmethod
14
+ from enum import Enum, unique
15
+ from io import BytesIO
16
+ from pathlib import Path
17
+ from typing import Final, Optional, Union
18
+
19
+ from bs4 import BeautifulSoup, Tag
20
+ from docling_core.types.doc import (
21
+ DocItem,
22
+ DocItemLabel,
23
+ DoclingDocument,
24
+ DocumentOrigin,
25
+ TableCell,
26
+ TableData,
27
+ TextItem,
28
+ )
29
+ from docling_core.types.doc.document import LevelNumber
30
+ from pydantic import NonNegativeInt
31
+ from typing_extensions import Self, TypedDict, override
32
+
33
+ from docling.backend.abstract_backend import DeclarativeDocumentBackend
34
+ from docling.datamodel.base_models import InputFormat
35
+ from docling.datamodel.document import InputDocument
36
+
37
# Module-level logger, named after this module.
_log = logging.getLogger(__name__)

# XML prolog that is prepended to a standalone XML fragment (such as a single
# extracted table) before the fragment is handed to a parser on its own.
XML_DECLARATION: Final = '<?xml version="1.0" encoding="UTF-8"?>'
40
+
41
+
42
@unique
class PatentHeading(Enum):
    """Heading text that docling emits for tagged sections of USPTO patents.

    Every member is declared as a ``(text, level)`` pair. The text becomes the
    enum value and the level is stored on the member as ``level``.
    """

    ABSTRACT = ("ABSTRACT", 2)
    CLAIMS = ("CLAIMS", 2)

    @override
    def __new__(cls, value: str, _) -> Self:
        # Only the heading text is used as the canonical enum value; the
        # second item of the declaration is handled by __init__.
        member = object.__new__(cls)
        member._value_ = value
        return member

    @override
    def __init__(self, _, level: LevelNumber) -> None:
        # The heading text was already consumed by __new__.
        self.level: LevelNumber = level
58
+
59
+
60
class PatentUsptoDocumentBackend(DeclarativeDocumentBackend):
    """Backend that converts a single USPTO patent file into a docling document.

    The whole input is read at construction time. While reading, the line that
    identifies the file flavour (an ``<!DOCTYPE`` line or a bare ``PATN`` line)
    is used to pick the matching parser.
    """

    @override
    def __init__(
        self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]
    ) -> None:
        """Read the patent content and select a parser for it.

        Args:
            in_doc: The input document this backend is attached to.
            path_or_stream: The patent file, as an in-memory stream or a path.

        Raises:
            RuntimeError: If the content cannot be read or decoded.
        """
        super().__init__(in_doc, path_or_stream)

        self.patent_content: str = ""
        self.parser: Optional[PatentUspto] = None

        try:
            lines = self._read_lines()
            for line in lines:
                if line.startswith("<!DOCTYPE") or line == "PATN\n":
                    self._set_parser(line)
            # join once instead of growing the string line by line
            self.patent_content = "".join(lines)
        except Exception as exc:
            raise RuntimeError(
                f"Could not initialize USPTO backend for file with hash {self.document_hash}."
            ) from exc

    def _read_lines(self) -> list[str]:
        """Return the input as a list of UTF-8 decoded lines, newlines kept."""
        source = self.path_or_stream
        if isinstance(source, BytesIO):
            return [raw_line.decode("utf-8") for raw_line in source]
        if isinstance(source, Path):
            with open(source, encoding="utf-8") as file_obj:
                return list(file_obj)
        # any other kind of source is ignored and leaves the backend empty
        return []

    def _set_parser(self, doctype: str) -> None:
        """Select the parser that matches a document type line.

        Args:
            doctype: The ``<!DOCTYPE`` line of an XML patent, or the ``PATN``
                line of a plain-text patent. Unknown types reset the parser to
                None.
        """
        doctype_line = doctype.lower()
        if doctype == "PATN\n":
            self.parser = PatentUsptoGrantAps()
        elif (
            "us-patent-application-v4" in doctype_line
            or "us-patent-grant-v4" in doctype_line
        ):
            # applications and grants in v4 share the same parser
            self.parser = PatentUsptoIce()
        elif "us-grant-025" in doctype_line:
            self.parser = PatentUsptoGrantV2()
        elif all(
            item in doctype_line
            for item in ("patent-application-publication", "pap-v1")
        ):
            self.parser = PatentUsptoAppV1()
        else:
            self.parser = None

    @override
    def is_valid(self) -> bool:
        """Tell whether content was read and a parser was selected for it."""
        return bool(self.patent_content) and bool(self.parser)

    @classmethod
    @override
    def supports_pagination(cls) -> bool:
        """Report that this backend does not support pagination."""
        return False

    @override
    def unload(self) -> None:
        """Do nothing; the method performs no cleanup."""
        return

    @classmethod
    @override
    def supported_formats(cls) -> set[InputFormat]:
        """Return the input formats handled by this backend."""
        return {InputFormat.XML_USPTO}

    @override
    def convert(self) -> DoclingDocument:
        """Parse the patent content into a docling document.

        Returns:
            The parsed document, with its name and origin filled in.

        Raises:
            RuntimeError: If no parser was selected at construction time or
                if the parser could not produce a document.
        """
        if self.parser is None:
            raise RuntimeError(
                f"Cannot convert doc (hash={self.document_hash}, "
                f"name={self.file.name}) because the backend failed to init."
            )

        doc = self.parser.parse(self.patent_content)
        if doc is None:
            raise RuntimeError(
                f"Failed to convert doc (hash={self.document_hash}, "
                f"name={self.file.name})."
            )

        doc.name = self.file.name or "file"
        # the APS parser reads plain text, every other parser reads XML
        mime_type = (
            "text/plain"
            if isinstance(self.parser, PatentUsptoGrantAps)
            else "application/xml"
        )
        doc.origin = DocumentOrigin(
            mimetype=mime_type,
            binary_hash=self.document_hash,
            filename=self.file.name or "file",
        )

        return doc
150
+
151
+
152
class PatentUspto(ABC):
    """Common interface of the parsers for US Patent Office documents."""

    @abstractmethod
    def parse(self, patent_content: str) -> Optional[DoclingDocument]:
        """Parse a USPTO patent.

        Args:
            patent_content: The content of a single patent in a USPTO file.

        Returns:
            The patent parsed as a docling document. The return type also
            allows None for implementations that cannot produce a document.
        """
165
+
166
+
167
class PatentUsptoIce(PatentUspto):
    """Parser of patent documents from the US Patent Office (ICE).

    The compatible formats are:
    - Patent Grant Full Text Data/XML Version 4.x ICE (from January 2005)
    - Patent Application Full Text Data/XML Version 4.x ICE (from January 2005)
    """

    def __init__(self) -> None:
        """Build an instance of PatentUsptoIce class."""
        self.handler = PatentUsptoIce.PatentHandler()
        # a table block starts at the beginning of a line and may span lines
        self.pattern = re.compile(r"^(<table .*?</table>)", re.MULTILINE | re.DOTALL)

    def parse(self, patent_content: str) -> Optional[DoclingDocument]:
        """Parse a USPTO patent in ICE format.

        The SAX handler builds the document and leaves an empty placeholder for
        every table; the tables themselves are parsed separately afterwards.

        Args:
            patent_content: The content of a single patent in a USPTO file.

        Returns:
            The patent parsed as a docling document, or None if the content is
            not well-formed XML.
        """
        try:
            xml.sax.parseString(patent_content, self.handler)
        except xml.sax.SAXParseException as exc_sax:
            _log.error(f"Error in parsing USPTO document: {exc_sax}")

            return None

        doc = self.handler.doc
        if doc:
            self._fill_tables(doc, patent_content)

        return doc

    def _fill_tables(self, doc: DoclingDocument, patent_content: str) -> None:
        """Replace the table placeholders of `doc` with the parsed tables."""
        raw_tables = self.pattern.findall(patent_content)
        parsed_tables: list[TableData] = []
        _log.debug(f"Found {len(raw_tables)} tables to be parsed with XmlTable.")
        for table in raw_tables:
            table_parser = XmlTable(XML_DECLARATION + "\n" + table)
            try:
                table_data = table_parser.parse()
                if table_data:
                    parsed_tables.append(table_data)
            except Exception as exc_table:
                _log.error(f"Error in parsing USPTO tables: {exc_table}")

        # placeholders and parsed tables are matched by position only, so they
        # are left untouched unless both counts agree
        if len(parsed_tables) != len(doc.tables):
            _log.error(
                f"Number of referenced ({len(doc.tables)}) and parsed "
                f"({len(parsed_tables)}) tables differ."
            )
            return
        for idx, item in enumerate(parsed_tables):
            doc.tables[idx].data = item

    class PatentHandler(xml.sax.handler.ContentHandler):
        """SAX ContentHandler for patent documents."""

        APP_DOC_ELEMENT: Final = "us-patent-application"
        GRANT_DOC_ELEMENT: Final = "us-patent-grant"

        @unique
        class Element(Enum):
            """Represents an element of interest in the patent application document.

            Each member is declared as ``(tag name, is_text)``; the tag name is
            the enum value and ``is_text`` tells whether character data found
            directly inside the element is collected.
            """

            ABSTRACT = "abstract", True
            TITLE = "invention-title", True
            CLAIMS = "claims", False
            CLAIM = "claim", False
            CLAIM_TEXT = "claim-text", True
            PARAGRAPH = "p", True
            HEADING = "heading", True
            DESCRIPTION = "description", False
            TABLE = "table", False  # to track its position, without text
            DRAWINGS = "description-of-drawings", True
            STYLE_SUPERSCRIPT = "sup", True
            STYLE_SUBSCRIPT = "sub", True
            MATHS = "maths", False  # to avoid keeping formulas

            @override
            def __new__(cls, value: str, _) -> Self:
                obj = object.__new__(cls)
                obj._value_ = value
                return obj

            @override
            def __init__(self, _, is_text: bool) -> None:
                self.is_text: bool = is_text

        @override
        def __init__(self) -> None:
            """Build an instance of the patent handler."""
            super().__init__()
            # Current patent being parsed
            self.doc: Optional[DoclingDocument] = None
            # Keep track of docling hierarchy level
            self.level: LevelNumber = 1
            # Keep track of docling parents by level
            self.parents: dict[LevelNumber, Optional[DocItem]] = {1: None}
            # Content to retain for the current patent
            self.property: list[str]
            self.claim: str
            self.claims: list[str]
            self.abstract: str
            # Text buffer of the element being read
            self.text: str = ""
            self._clean_data()
            # To handle mathematical styling
            self.style_html = HtmlEntity()

        @override
        def startElement(self, tag, attributes):
            """Signal the start of an element.

            Args:
                tag: The element tag.
                attributes: The element attributes.
            """
            if tag in (
                self.APP_DOC_ELEMENT,
                self.GRANT_DOC_ELEMENT,
            ):
                # a new patent begins: fresh document and text buffer
                self.doc = DoclingDocument(name="file")
                self.text = ""
            self._start_registered_elements(tag, attributes)

        @override
        def skippedEntity(self, name):
            """Receive notification of a skipped entity.

            HTML entities will be skipped by the parser. This method will unescape them
            and add them to the text.

            Args:
                name: Entity name.
            """
            if not self.property:
                return
            element = self.Element(self.property[-1])
            if not element.is_text:
                return

            escaped = self.style_html.get_greek_from_iso8879(f"&{name};")
            unescaped = html.unescape(escaped)
            if unescaped == escaped:
                _log.debug(f"Unrecognized HTML entity: {name}")
                return

            self._append_text(element, unescaped)

        @override
        def endElement(self, tag):
            """Signal the end of an element.

            Args:
                tag: The element tag.
            """
            if tag in (
                self.APP_DOC_ELEMENT,
                self.GRANT_DOC_ELEMENT,
            ):
                self._clean_data()
            self._end_registered_element(tag)

        @override
        def characters(self, content):
            """Receive notification of character data.

            Args:
                content: Data reported by the handler.
            """
            if not self.property:
                return
            element = self.Element(self.property[-1])
            if element.is_text:
                self._append_text(element, content)

        def _append_text(self, element: "Element", text: str) -> None:
            """Add text found in a text element to the current text buffer.

            Text inside a superscript or subscript is styled, and it is kept
            only when the style element is nested in another text element.
            """
            if element not in (
                self.Element.STYLE_SUPERSCRIPT,
                self.Element.STYLE_SUBSCRIPT,
            ):
                self.text += text
                return

            # superscripts and subscripts need to be under text elements
            if len(self.property) < 2:
                return
            parent = self.Element(self.property[-2])
            if parent.is_text:
                self.text += self._apply_style(text, element.value)

        def _start_registered_elements(
            self, tag: str, attributes: xml.sax.xmlreader.AttributesImpl
        ) -> None:
            """Push a registered element on the stack of open elements."""
            if tag not in [member.value for member in self.Element]:
                return

            # special case for claims: claim lines may start before the
            # previous one is closed
            if (
                tag == self.Element.CLAIM_TEXT.value
                and self.property
                and self.property[-1] == tag
                and self.text.strip()
            ):
                self.claim += " " + self.text.strip()
                self.text = ""
            elif tag == self.Element.HEADING.value:
                level_attr: str = attributes.get("level", "")
                # isdecimal() only accepts what int() can convert; isnumeric()
                # would also accept characters like "²" and make int() fail
                new_level: int = int(level_attr) if level_attr.isdecimal() else 1
                max_level = min(self.parents.keys())
                # increase heading level with 1 for title, if any
                self.level = (
                    new_level + 1 if (new_level + 1) in self.parents else max_level
                )
            self.property.append(tag)

        def _end_registered_element(self, tag: str) -> None:
            """Pop the innermost open element and process its collected text."""
            if tag in [item.value for item in self.Element] and self.property:
                current_tag = self.property.pop()
                self._add_property(current_tag, self.text.strip())

        def _add_section(
            self, doc: DoclingDocument, heading: PatentHeading, paragraphs: list[str]
        ) -> None:
            """Add a fixed heading followed by its paragraphs to the document."""
            heading_level = heading.level if heading.level in self.parents else 1
            section_item = doc.add_heading(
                heading.value,
                level=heading_level,
                parent=self.parents[heading_level],
            )
            for paragraph in paragraphs:
                doc.add_text(
                    label=DocItemLabel.PARAGRAPH,
                    text=paragraph,
                    parent=section_item,
                )

        def _add_property(self, name: str, text: str) -> None:
            """Turn a closed element into document content.

            Args:
                name: The tag name of the element that was closed.
                text: The stripped text collected so far.
            """
            if not name or not self.doc:
                return

            if name == self.Element.TITLE.value:
                if text:
                    self.parents[self.level + 1] = self.doc.add_title(
                        parent=self.parents[self.level],
                        text=text,
                    )
                    self.level += 1
                self.text = ""

            elif name == self.Element.ABSTRACT.value:
                if self.abstract:
                    self._add_section(
                        self.doc, PatentHeading.ABSTRACT, [self.abstract]
                    )

            elif name == self.Element.CLAIM_TEXT.value:
                text = re.sub("\\s+", " ", text).strip()
                if text:
                    self.claim += " " + text
                self.text = ""

            elif name == self.Element.CLAIM.value and self.claim:
                self.claims.append(self.claim.strip())
                self.claim = ""

            elif name == self.Element.CLAIMS.value and self.claims:
                self._add_section(self.doc, PatentHeading.CLAIMS, self.claims)

            elif name == self.Element.PARAGRAPH.value and text:
                # remove blank spaces added in paragraphs
                text = re.sub("\\s+", " ", text)
                if self.Element.ABSTRACT.value in self.property:
                    # abstract paragraphs are merged and emitted when the
                    # abstract element is closed
                    self.abstract = (
                        (self.abstract + " " + text) if self.abstract else text
                    )
                else:
                    self.doc.add_text(
                        label=DocItemLabel.PARAGRAPH,
                        text=text,
                        parent=self.parents[self.level],
                    )
                self.text = ""

            elif name == self.Element.HEADING.value and text:
                self.parents[self.level + 1] = self.doc.add_heading(
                    text=text,
                    level=self.level,
                    parent=self.parents[self.level],
                )
                self.level += 1
                self.text = ""

            elif name == self.Element.TABLE.value:
                # set an empty table as placeholder
                empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
                self.doc.add_table(
                    data=empty_table,
                    parent=self.parents[self.level],
                )

        def _apply_style(self, text: str, style_tag: str) -> str:
            """Apply an HTML style to text.

            Args:
                text: A string containing plain text.
                style_tag: An HTML tag name for styling text. If the tag name is not
                    recognized as one of the supported styles, the method will return
                    the original `text`.

            Returns:
                A string after applying the style.
            """
            formatted = text

            if style_tag == self.Element.STYLE_SUPERSCRIPT.value:
                formatted = html.unescape(self.style_html.get_superscript(text))
            elif style_tag == self.Element.STYLE_SUBSCRIPT.value:
                formatted = html.unescape(self.style_html.get_subscript(text))

            return formatted

        def _clean_data(self) -> None:
            """Reset the variables from stream data."""
            self.property = []
            self.claim = ""
            self.claims = []
            self.abstract = ""
501
+
502
+
503
+ class PatentUsptoGrantV2(PatentUspto):
504
+ """Parser of patent documents from the US Patent Office (grants v2.5).
505
+
506
+ The compatible format is:
507
+ - Patent Grant Full Text Data/XML Version 2.5 (from January 2002 till December 2004)
508
+ """
509
+
510
@override
def __init__(self) -> None:
    """Create the table pattern and the SAX handler used by this parser."""
    # a table block starts at the beginning of a line and may span lines
    table_flags = re.MULTILINE | re.DOTALL
    self.pattern = re.compile(r"^(<table .*?</table>)", table_flags)
    self.handler = PatentUsptoGrantV2.PatentHandler()
515
+
516
@override
def parse(self, patent_content: str) -> Optional[DoclingDocument]:
    """Parse a USPTO patent grant in XML version 2.5.

    The SAX handler builds the document and leaves an empty placeholder for
    every table; the tables themselves are parsed separately afterwards.

    Args:
        patent_content: The content of a single patent in a USPTO file.

    Returns:
        The patent parsed as a docling document, or None if the content is
        not well-formed XML.
    """
    try:
        xml.sax.parseString(patent_content, self.handler)
    except xml.sax.SAXParseException as exc_sax:
        _log.error(f"Error in parsing USPTO document: {exc_sax}")

        return None

    doc = self.handler.doc
    if not doc:
        return doc

    raw_tables = self.pattern.findall(patent_content)
    parsed_tables: list[TableData] = []
    _log.debug(f"Found {len(raw_tables)} tables to be parsed with XmlTable.")
    for table in raw_tables:
        table_parser = XmlTable(XML_DECLARATION + "\n" + table)
        try:
            table_data = table_parser.parse()
            if table_data:
                parsed_tables.append(table_data)
        except Exception as exc_table:
            _log.error(f"Error in parsing USPTO tables: {exc_table}")

    # placeholders and parsed tables are matched by position only, so they
    # are left untouched unless both counts agree
    if len(parsed_tables) != len(doc.tables):
        _log.error(
            f"Number of referenced ({len(doc.tables)}) and parsed "
            f"({len(parsed_tables)}) tables differ."
        )
    else:
        for idx, item in enumerate(parsed_tables):
            doc.tables[idx].data = item

    return doc
548
+
549
+ class PatentHandler(xml.sax.handler.ContentHandler):
550
+ """SAX ContentHandler for patent documents."""
551
+
552
+ GRANT_DOC_ELEMENT: Final = "PATDOC"
553
+ CLAIM_STATEMENT: Final = "What is claimed is:"
554
+
555
@unique
class Element(Enum):
    """Elements of interest in a patent grant document.

    Each member is declared as ``(tag name, is_text)``; the tag name is the
    enum value and the flag is stored on the member as ``is_text``.
    """

    PDAT = ("PDAT", True)  # any type of data
    ABSTRACT = ("SDOAB", False)
    SDOCL = ("SDOCL", False)
    TITLE = ("B540", False)
    CLAIMS = ("CL", False)
    CLAIM = ("CLM", False)
    PARAGRAPH = ("PARA", True)
    HEADING = ("H", True)
    DRAWINGS = ("DRWDESC", False)
    STYLE_SUPERSCRIPT = ("SP", False)
    STYLE_SUBSCRIPT = ("SB", False)
    STYLE_ITALIC = ("ITALIC", False)
    CWU = ("CWU", False)  # avoid tables, chemicals, formulas
    TABLE = ("table", False)  # to keep track of table positions

    @override
    def __new__(cls, value: str, _) -> Self:
        # only the tag name is used as the canonical enum value
        member = object.__new__(cls)
        member._value_ = value
        return member

    @override
    def __init__(self, _, is_text: bool) -> None:
        # the tag name was already consumed by __new__
        self.is_text: bool = is_text
583
+
584
@override
def __init__(self) -> None:
    """Build an instance of the patent handler."""
    super().__init__()
    # Current patent being parsed
    self.doc: Optional[DoclingDocument] = None
    # Keep track of docling hierarchy level
    self.level: LevelNumber = 1
    # Keep track of docling parents by level
    self.parents: dict[LevelNumber, Optional[DocItem]] = {1: None}
    # Content to retain for the current patent
    self.property: list[str]
    self.claim: str
    self.claims: list[str]
    self.paragraph: str
    self.abstract: str
    # Text buffer of the element being read; defined here so that it exists
    # before the first document element is seen
    self.text: str = ""
    self._clean_data()
    # To handle mathematical styling
    self.style_html = HtmlEntity()
602
+
603
@override
def startElement(self, tag, attributes):
    """Handle an opening tag.

    Args:
        tag: The element tag.
        attributes: The element attributes.
    """
    opens_document = tag == self.GRANT_DOC_ELEMENT
    if opens_document:
        # a new patent begins: fresh document and text buffer
        self.doc, self.text = DoclingDocument(name="file"), ""
    self._start_registered_elements(tag, attributes)
615
+
616
@override
def skippedEntity(self, name):
    """Receive notification of a skipped entity.

    HTML entities will be skipped by the parser. This method will unescape them
    and add them to the text.

    Args:
        name: Entity name.
    """
    if not self.property:
        return
    element = self.Element(self.property[-1])
    if not element.is_text:
        return

    escaped = self.style_html.get_greek_from_iso8879(f"&{name};")
    unescaped = html.unescape(escaped)
    if unescaped == escaped:
        _log.debug("Unrecognized HTML entity: " + name)
        return

    # No special case for superscripts and subscripts here: in this handler
    # those elements are not flagged as text, so they never reach this point.
    # Their style is applied when the wrapped PDAT element is closed.
    self.text += unescaped
649
+
650
@override
def endElement(self, tag):
    """Handle a closing tag.

    Args:
        tag: The element tag.
    """
    closes_document = tag == self.GRANT_DOC_ELEMENT
    if closes_document:
        # the patent is complete: drop the data collected for it
        self._clean_data()
    self._end_registered_element(tag)
660
+
661
@override
def characters(self, content):
    """Receive notification of character data.

    Character data is appended to the text buffer when the innermost open
    element is flagged as text.

    Args:
        content: Data reported by the handler.
    """
    if not self.property:
        return
    # No special case for superscripts and subscripts here: in this handler
    # those elements are not flagged as text, so they never pass this check.
    # Their style is applied when the wrapped PDAT element is closed.
    if self.Element(self.property[-1]).is_text:
        self.text += content
685
+
686
+ def _start_registered_elements(
687
+ self, tag: str, attributes: xml.sax.xmlreader.AttributesImpl
688
+ ) -> None:
689
+ if tag in [member.value for member in self.Element]:
690
+ if (
691
+ tag == self.Element.HEADING.value
692
+ and self.Element.SDOCL.value not in self.property
693
+ ):
694
+ level_attr: str = attributes.get("LVL", "")
695
+ new_level: int = int(level_attr) if level_attr.isnumeric() else 1
696
+ max_level = min(self.parents.keys())
697
+ # increase heading level with 1 for title, if any
698
+ self.level = (
699
+ new_level + 1 if (new_level + 1) in self.parents else max_level
700
+ )
701
+ self.property.append(tag)
702
+
703
+ def _end_registered_element(self, tag: str) -> None:
704
+ if tag in [elm.value for elm in self.Element] and self.property:
705
+ current_tag = self.property.pop()
706
+ self._add_property(current_tag, self.text)
707
+
708
def _add_property(self, name: str, text: str) -> None:
    """Turn a closed element into document content.

    Args:
        name: The tag name of the element that was closed.
        text: The text collected so far.
    """
    if not (name and self.doc):
        return

    tags = self.Element
    open_tags = self.property

    if name == tags.PDAT.value and text:
        if not open_tags:
            self.text = ""
            return

        # the innermost open element wraps this data and may carry a style
        styled = self._apply_style(text, open_tags[-1])
        stripped = styled.strip()

        if tags.TITLE.value in open_tags and stripped:
            self.parents[self.level + 1] = self.doc.add_title(
                parent=self.parents[self.level],
                text=stripped,
            )
            self.level += 1

        elif tags.ABSTRACT.value in open_tags:
            self.abstract += styled

        elif tags.CLAIM.value in open_tags:
            self.claim += styled

        # Paragraph text not in claims or abstract
        elif (
            tags.PARAGRAPH.value in open_tags
            and tags.CLAIM.value not in open_tags
            and tags.ABSTRACT.value not in open_tags
        ):
            self.paragraph += styled

        # headers except claims statement
        elif (
            tags.HEADING.value in open_tags
            and tags.SDOCL.value not in open_tags
            and stripped
        ):
            self.parents[self.level + 1] = self.doc.add_heading(
                text=stripped,
                level=self.level,
                parent=self.parents[self.level],
            )
            self.level += 1

        self.text = ""
        return

    if name == tags.CLAIM.value and self.claim.strip():
        self.claims.append(self.claim.strip())
        self.claim = ""

    elif name == tags.CLAIMS.value and self.claims:
        claims_heading = PatentHeading.CLAIMS
        claims_level = (
            claims_heading.level if claims_heading.level in self.parents else 1
        )
        claims_item = self.doc.add_heading(
            claims_heading.value,
            level=claims_level,
            parent=self.parents[claims_level],
        )
        for claim_text in self.claims:
            self.doc.add_text(
                label=DocItemLabel.PARAGRAPH, text=claim_text, parent=claims_item
            )

    elif name == tags.ABSTRACT.value and self.abstract.strip():
        abstract_heading = PatentHeading.ABSTRACT
        abstract_level = (
            abstract_heading.level if abstract_heading.level in self.parents else 1
        )
        abstract_item = self.doc.add_heading(
            abstract_heading.value,
            level=abstract_level,
            parent=self.parents[abstract_level],
        )
        self.doc.add_text(
            label=DocItemLabel.PARAGRAPH,
            text=self.abstract.strip(),
            parent=abstract_item,
        )

    elif name == tags.PARAGRAPH.value:
        paragraph = self.paragraph.strip()
        inside_claim = tags.CLAIM.value in open_tags
        if paragraph and not inside_claim:
            self.doc.add_text(
                label=DocItemLabel.PARAGRAPH,
                text=paragraph,
                parent=self.parents[self.level],
            )
        elif inside_claim:
            # we may need a space after a paragraph in claim text
            self.claim += " "
        self.paragraph = ""

    elif name == tags.TABLE.value:
        # set an empty table as placeholder
        placeholder = TableData(num_rows=0, num_cols=0, table_cells=[])
        self.doc.add_table(
            data=placeholder,
            parent=self.parents[self.level],
        )
814
+
815
def _apply_style(self, text: str, style_tag: str) -> str:
    """Render a text with the style requested by an element name.

    Args:
        text: A string containing plain text.
        style_tag: The name of a style element. Superscript, subscript and
            italic are supported; for any other name the original `text`
            is returned as is.

    Returns:
        The styled text with its HTML entities resolved to characters, or
        the untouched `text` when the style is not supported.
    """
    styles = self.Element

    if style_tag == styles.STYLE_SUPERSCRIPT.value:
        to_entities = self.style_html.get_superscript
    elif style_tag == styles.STYLE_SUBSCRIPT.value:
        to_entities = self.style_html.get_subscript
    elif style_tag == styles.STYLE_ITALIC.value:
        to_entities = self.style_html.get_math_italic
    else:
        return text

    return html.unescape(to_entities(text))
837
+
838
def _clean_data(self) -> None:
    """Clear every buffer that holds content of the patent being parsed."""
    # text buffers
    self.text = self.claim = self.paragraph = self.abstract = ""
    # stack of open elements and list of collected claims
    self.property, self.claims = [], []
846
+
847
+
848
class PatentUsptoGrantAps(PatentUspto):
    """Parser of patents documents from the US Patent Office (grants APS).

    The compatible format is:
    - Patent Grant Full Text Data/APS (from January 1976 till December 2001)
    """

    @unique
    class Section(Enum):
        """Represent a section in a patent APS document."""

        ABSTRACT = "ABST"
        SUMMARY = "BSUM"
        DETAILS = "DETD"
        CLAIMS = "CLMS"
        DRAWINGS = "DRWD"

    @unique
    class Field(Enum):
        """Represent a field in a patent APS document."""

        DOC_NUMBER = "WKU"
        TITLE = "TTL"
        PARAGRAPH = "PAR"
        PARAGRAPH_1 = "PA1"
        PARAGRAPH_2 = "PA2"
        PARAGRAPH_3 = "PA3"
        TEXT = "PAL"
        CAPTION = "PAC"
        NUMBER = "NUM"
        NAME = "NAM"
        IPC = "ICL"
        ISSUED = "ISD"
        FILED = "APD"
        PATENT_NUMBER = "PNO"
        APPLICATION_NUMBER = "APN"
        APPLICATION_TYPE = "APT"
        COUNTRY = "CNT"

    # Lookup tables and patterns, built once instead of on every field/line.
    _KNOWN_FIELDS = frozenset(item.value for item in Field)
    _PARAGRAPH_FIELDS = frozenset(
        {
            Field.PARAGRAPH.value,
            Field.PARAGRAPH_1.value,
            Field.PARAGRAPH_2.value,
            Field.PARAGRAPH_3.value,
        }
    )
    _DESCRIPTION_SECTIONS = frozenset(
        {Section.SUMMARY.value, Section.DETAILS.value, Section.DRAWINGS.value}
    )
    # a key and its value are separated by at least 2 whitespace characters
    _COLUMN_SEPARATOR = re.compile(r"\s{2,}")
    # continuation lines made only of a `##STR<n>##` marker are dropped
    _STR_MARKER = re.compile(r"^##STR\d+##$")

    @override
    def __init__(self) -> None:
        """Build an instance of PatentUsptoGrantAps class."""
        self.doc: Optional[DoclingDocument] = None
        # Keep track of docling hierarchy level
        self.level: LevelNumber = 1
        # Keep track of docling parents by level
        self.parents: dict[LevelNumber, Optional[DocItem]] = {1: None}

    def get_last_text_item(self) -> Optional[TextItem]:
        """Get the last text item at the current document level.

        Returns:
            The text item or None, if there is no document or the current level
            parent has no text children.
        """
        if not self.doc:
            return None

        parent = self.parents[self.level]
        if parent is None:
            return None

        children = parent.children
        # scan backwards and stop at the first hit instead of collecting
        # every matching item just to keep the last one
        for item in reversed(self.doc.texts):
            if isinstance(item, TextItem) and item.get_ref() in children:
                return item

        return None

    def store_section(self, section: str) -> None:
        """Store the section heading in the docling document.

        Only the predefined sections from PatentHeading will be handled.
        The other sections are created by the Field.CAPTION field.

        Args:
            section: A patent section name.
        """
        if self.doc is None:
            return

        heading: PatentHeading
        if section == self.Section.ABSTRACT.value:
            heading = PatentHeading.ABSTRACT
        elif section == self.Section.CLAIMS.value:
            heading = PatentHeading.CLAIMS
        else:
            return

        self.level = heading.level if heading.level in self.parents else 1
        self.parents[self.level + 1] = self.doc.add_heading(
            heading.value,
            level=self.level,
            parent=self.parents[self.level],
        )
        self.level += 1

    def store_content(self, section: str, field: str, value: str) -> None:
        """Store the key value within a document section in the docling document.

        Fields that are not members of `Field`, and combinations of section
        and field that are not handled below, are ignored.

        Args:
            section: A patent section name.
            field: A field name.
            value: A field value name.
        """
        if not self.doc or not field or field not in self._KNOWN_FIELDS:
            return

        is_paragraph = field in self._PARAGRAPH_FIELDS

        if field == self.Field.TITLE.value:
            self.parents[self.level + 1] = self.doc.add_title(
                parent=self.parents[self.level], text=value
            )
            self.level += 1

        elif field == self.Field.TEXT.value and section == self.Section.ABSTRACT.value:
            # consecutive abstract lines are merged into a single text item
            abst_item = self.get_last_text_item()
            if abst_item:
                abst_item.text += " " + value
            else:
                self.doc.add_text(
                    label=DocItemLabel.PARAGRAPH,
                    text=value,
                    parent=self.parents[self.level],
                )

        elif field == self.Field.NUMBER.value and section == self.Section.CLAIMS.value:
            # a claim number opens a new, still empty, claim item that the
            # following paragraph fields will fill in
            self.doc.add_text(
                label=DocItemLabel.PARAGRAPH,
                text="",
                parent=self.parents[self.level],
            )

        elif is_paragraph and section == self.Section.CLAIMS.value:
            last_claim = self.get_last_text_item()
            if last_claim is None:
                last_claim = self.doc.add_text(
                    label=DocItemLabel.PARAGRAPH,
                    text="",
                    parent=self.parents[self.level],
                )

            claim_part = value.strip()
            last_claim.text += f" {claim_part}" if last_claim.text else claim_part

        elif (
            field == self.Field.CAPTION.value
            and section in self._DESCRIPTION_SECTIONS
        ):
            # captions are siblings of abstract since no level info is provided
            head_item = PatentHeading.ABSTRACT
            self.level = head_item.level if head_item.level in self.parents else 1
            self.parents[self.level + 1] = self.doc.add_heading(
                value,
                level=self.level,
                parent=self.parents[self.level],
            )
            self.level += 1

        elif is_paragraph and section in self._DESCRIPTION_SECTIONS:
            self.doc.add_text(
                label=DocItemLabel.PARAGRAPH,
                text=value,
                parent=self.parents[self.level],
            )

    def parse(self, patent_content: str) -> Optional[DoclingDocument]:
        """Parse a patent in APS format into a docling document.

        A line without a column separator is read as a section name. A line
        with a separator is either a new `key  value` pair or, when the key
        part is empty, the continuation of the previous value.

        Args:
            patent_content: The text content of the patent file.

        Returns:
            The docling document built from the patent content.
        """
        self.doc = DoclingDocument(name="file")
        # start from a clean hierarchy, so that an instance can be reused
        # without attaching items to parents of a previous document
        self.level = 1
        self.parents = {1: None}

        section: str = ""
        key: str = ""
        value: str = ""
        for line in patent_content.splitlines():
            cols = self._COLUMN_SEPARATOR.split(line, maxsplit=1)
            starts_new_entry = len(cols) == 1 or bool(cols[0])
            if key and value and starts_new_entry:
                # the pending key-value pair is complete
                self.store_content(section, key, value)
                key = ""
                value = ""
            if len(cols) == 1:  # section title
                section = cols[0]
                self.store_section(section)
                _log.debug(f"Parsing section {section}")
            elif cols[0]:  # key present
                key, value = cols
            elif not self._STR_MARKER.match(cols[1]):  # line continues
                value += " " + cols[1]
        if key and value:
            self.store_content(section, key, value)

        # TODO: parse tables
        return self.doc
1061
+
1062
+
1063
class PatentUsptoAppV1(PatentUspto):
    """Parser of patent documents from the US Patent Office (applications v1.x)

    The compatible format is:
    - Patent Application Full Text Data/XML Version 1.x (from March 2001 till December
        2004)
    """

    @override
    def __init__(self) -> None:
        """Build an instance of PatentUsptoAppV1 class."""
        self.handler = PatentUsptoAppV1.PatentHandler()
        self.pattern = re.compile(r"^(<table .*?</table>)", re.MULTILINE | re.DOTALL)

    @override
    def parse(self, patent_content: str) -> Optional[DoclingDocument]:
        """Parse a patent application in XML v1.x format into a docling document.

        The text is parsed with a SAX handler, which stores an empty placeholder
        for each table. The tables are then extracted from the raw content with
        a regular expression, parsed with XmlTable and assigned to the
        placeholders, but only if both counts match.

        Args:
            patent_content: The XML content of the patent file.

        Returns:
            The docling document, or None if the XML could not be parsed or it
            has no patent application element.
        """
        # a fresh handler per call, so that no document or hierarchy state
        # leaks from a previous (possibly failed) parsing
        self.handler = PatentUsptoAppV1.PatentHandler()
        try:
            xml.sax.parseString(patent_content, self.handler)
        except xml.sax.SAXParseException as exc_sax:
            _log.error(f"Error in parsing USPTO document: {exc_sax}")

            return None

        doc = self.handler.doc
        if doc:
            raw_tables = re.findall(self.pattern, patent_content)
            parsed_tables: list[TableData] = []
            _log.debug(f"Found {len(raw_tables)} tables to be parsed with XmlTable.")
            for table in raw_tables:
                table_parser = XmlTable(XML_DECLARATION + "\n" + table)
                # best effort: a table that fails is logged and skipped
                try:
                    table_data = table_parser.parse()
                    if table_data:
                        parsed_tables.append(table_data)
                except Exception as exc_table:
                    _log.error(f"Error in parsing USPTO tables: {exc_table}")
            if len(parsed_tables) != len(doc.tables):
                # placeholders and tables cannot be paired reliably
                _log.error(
                    f"Number of referenced ({len(doc.tables)}) and parsed "
                    f"({len(parsed_tables)}) tables differ."
                )
            else:
                for idx, item in enumerate(parsed_tables):
                    doc.tables[idx].data = item

        return doc

    class PatentHandler(xml.sax.handler.ContentHandler):
        """SAX ContentHandler for patent documents."""

        APP_DOC_ELEMENT: Final = "patent-application-publication"

        @unique
        class Element(Enum):
            """Represents an element of interest in the patent application document."""

            DRAWINGS = "brief-description-of-drawings", False
            ABSTRACT = "subdoc-abstract", False
            TITLE = "title-of-invention", True
            CLAIMS = "subdoc-claims", False
            CLAIM = "claim", False
            CLAIM_TEXT = "claim-text", True
            NUMBER = "number", False
            PARAGRAPH = "paragraph", True
            HEADING = "heading", True
            STYLE_SUPERSCRIPT = "superscript", True
            STYLE_SUBSCRIPT = "subscript", True
            # do not store text of a table, since it can be within paragraph
            TABLE = "table", False
            # do not store text of a formula, since it can be within paragraph
            MATH = "math-cwu", False

            @override
            def __new__(cls, value: str, _) -> Self:
                # only the first item of the tuple is the enum value
                obj = object.__new__(cls)
                obj._value_ = value
                return obj

            @override
            def __init__(self, _, is_text: bool) -> None:
                # whether the character data of the element must be retained
                self.is_text: bool = is_text

        # element names of interest, built once for the per-event lookups
        _REGISTERED_TAGS = frozenset(member.value for member in Element)

        @override
        def __init__(self) -> None:
            """Build an instance of the patent handler."""
            # Current patent being parsed
            self.doc: Optional[DoclingDocument] = None
            # Keep track of docling hierarchy level
            self.level: LevelNumber = 1
            # Keep track of docling parents by level
            self.parents: dict[LevelNumber, Optional[DocItem]] = {1: None}
            # Content to retain for the current patent
            self.property: list[str]
            self.claim: str
            self.claims: list[str]
            self.abstract: str
            self.text: str
            self._clean_data()
            # To handle mathematical styling
            self.style_html = HtmlEntity()

        @override
        def startElement(self, tag, attributes):
            """Signal the start of an element.

            Args:
                tag: The element tag.
                attributes: The element attributes.
            """
            if tag == self.APP_DOC_ELEMENT:
                # a new patent starts: drop any state of a previous one
                self.doc = DoclingDocument(name="file")
                self.level = 1
                self.parents = {1: None}
                self._clean_data()
            self._start_registered_elements(tag, attributes)

        @override
        def skippedEntity(self, name):
            """Receive notification of a skipped entity.

            HTML entities will be skipped by the parser. This method will unescape them
            and add them to the text.

            Args:
                name: Entity name.
            """
            element = self._open_text_element()
            if element is None:
                return

            escaped = self.style_html.get_greek_from_iso8879(f"&{name};")
            unescaped = html.unescape(escaped)
            if unescaped == escaped:
                _log.debug("Unrecognized HTML entity: " + name)
                return

            self._buffer_text(element, unescaped)

        @override
        def endElement(self, tag):
            """Signal the end of an element.

            Args:
                tag: The element tag.
            """
            if tag == self.APP_DOC_ELEMENT:
                self._clean_data()
            self._end_registered_element(tag)

        @override
        def characters(self, content):
            """Receive notification of character data.

            Args:
                content: Data reported by the handler.
            """
            element = self._open_text_element()
            if element is not None:
                self._buffer_text(element, content)

        def _open_text_element(
            self,
        ) -> Optional["PatentUsptoAppV1.PatentHandler.Element"]:
            """Get the innermost open element, if its text must be retained.

            Returns:
                The element, or None if no element is open or the innermost one
                does not retain text.
            """
            if not self.property:
                return None
            element = self.Element(self.property[-1])
            return element if element.is_text else None

        def _buffer_text(
            self, element: "PatentUsptoAppV1.PatentHandler.Element", content: str
        ) -> None:
            """Append content found within a text element to the text buffer.

            Args:
                element: The innermost open element.
                content: The text to append.
            """
            if element not in (
                self.Element.STYLE_SUPERSCRIPT,
                self.Element.STYLE_SUBSCRIPT,
            ):
                self.text += content
                return

            # superscripts and subscripts need to be under text elements
            if len(self.property) < 2:
                return
            parent = self.Element(self.property[-2])
            if parent.is_text:
                self.text += self._apply_style(content, element.value)

        def _start_registered_elements(
            self, tag: str, attributes: xml.sax.xmlreader.AttributesImpl
        ) -> None:
            """Push an element of interest onto the stack of open elements.

            Args:
                tag: The element tag.
                attributes: The element attributes.
            """
            if tag not in self._REGISTERED_TAGS:
                return

            # special case for claims: claim lines may start before the
            # previous one is closed
            if (
                tag == self.Element.CLAIM_TEXT.value
                and self.property
                and self.property[-1] == tag
                and self.text.strip()
            ):
                self.claim += " " + self.text.strip("\n")
                self.text = ""
            elif tag == self.Element.HEADING.value:
                level_attr: str = attributes.get("lvl", "")
                # isdecimal() only accepts what int() can actually convert
                new_level: int = int(level_attr) if level_attr.isdecimal() else 1
                top_level = min(self.parents)
                # increase heading level with 1 for title, if any
                self.level = (
                    new_level + 1 if (new_level + 1) in self.parents else top_level
                )
            self.property.append(tag)

        def _end_registered_element(self, tag: str) -> None:
            """Pop the innermost open element and store its content.

            Args:
                tag: The element tag.
            """
            if tag in self._REGISTERED_TAGS and self.property:
                current_tag = self.property.pop()
                self._add_property(current_tag, self.text)

        def _add_property(self, name: str, text: str) -> None:
            """Store the content of a closed element in the docling document.

            Args:
                name: The name of the element that has just been closed.
                text: The text buffered while the element was open.
            """
            if not name or not self.doc:
                return

            if name == self.Element.TITLE.value:
                title = text.strip()
                if title:
                    self.parents[self.level + 1] = self.doc.add_text(
                        parent=self.parents[self.level],
                        label=DocItemLabel.TITLE,
                        text=title,
                    )
                    self.level += 1
                self.text = ""

            elif name == self.Element.ABSTRACT.value:
                abstract = self.abstract.strip()
                if abstract:
                    heading_level = (
                        PatentHeading.ABSTRACT.level
                        if PatentHeading.ABSTRACT.level in self.parents
                        else 1
                    )
                    abstract_item = self.doc.add_heading(
                        PatentHeading.ABSTRACT.value,
                        level=heading_level,
                        parent=self.parents[heading_level],
                    )
                    # store the stripped text, which is the one tested above
                    self.doc.add_text(
                        label=DocItemLabel.PARAGRAPH,
                        text=abstract,
                        parent=abstract_item,
                    )
                self.abstract = ""
                self.text = ""

            elif name == self.Element.CLAIM_TEXT.value:
                if text:
                    self.claim += text.strip("\n")
                self.text = ""

            elif name == self.Element.CLAIM.value:
                claim = self.claim.strip()
                if claim:
                    self.claims.append(claim)
                self.claim = ""

            elif name == self.Element.CLAIMS.value and self.claims:
                heading_level = (
                    PatentHeading.CLAIMS.level
                    if PatentHeading.CLAIMS.level in self.parents
                    else 1
                )
                claims_item = self.doc.add_heading(
                    PatentHeading.CLAIMS.value,
                    level=heading_level,
                    parent=self.parents[heading_level],
                )
                for claim_text in self.claims:
                    self.doc.add_text(
                        label=DocItemLabel.PARAGRAPH,
                        text=claim_text,
                        parent=claims_item,
                    )

            elif name in (
                self.Element.PARAGRAPH.value,
                self.Element.HEADING.value,
            ):
                if text and self.Element.ABSTRACT.value in self.property:
                    # paragraphs of the abstract are stored when it is closed
                    self.abstract += text
                elif text.strip():
                    clean_text = re.sub(r"\s+", " ", text).strip()
                    if name == self.Element.HEADING.value:
                        self.parents[self.level + 1] = self.doc.add_heading(
                            text=clean_text,
                            level=self.level,
                            parent=self.parents[self.level],
                        )
                        self.level += 1
                    else:
                        self.doc.add_text(
                            label=DocItemLabel.PARAGRAPH,
                            text=clean_text,
                            parent=self.parents[self.level],
                        )
                self.text = ""

            elif name == self.Element.TABLE.value:
                # set an empty table as placeholder
                empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
                self.doc.add_table(
                    data=empty_table,
                    parent=self.parents[self.level],
                )

        def _apply_style(self, text: str, style_tag: str) -> str:
            """Apply an HTML style to text.

            Args:
                text: A string containing plain text.
                style_tag: An HTML tag name for styling text. If the tag name is not
                    recognized as one of the supported styles, the method will return
                    the original `text` with its HTML entities unescaped.

            Returns:
                A string after applying the style.
            """
            formatted = html.unescape(text)

            if style_tag == self.Element.STYLE_SUPERSCRIPT.value:
                formatted = html.unescape(self.style_html.get_superscript(formatted))
            elif style_tag == self.Element.STYLE_SUBSCRIPT.value:
                formatted = html.unescape(self.style_html.get_subscript(formatted))

            return formatted

        def _clean_data(self):
            """Reset the variables from stream data."""
            self.property = []
            self.abstract = ""
            self.claim = ""
            self.claims = []
            self.text = ""
1398
+
1399
+
1400
class XmlTable:
    """Provide a table parser for xml tables in USPTO patent documents.

    The OASIS Open XML Exchange Table Model can be downloaded from:
    http://oasis-open.org/specs/soextblx.dtd
    """

    class ColInfo(TypedDict):
        ncols: int
        colinfo: list[dict]

    class MinColInfoType(TypedDict):
        offset: list[int]
        colwidth: list[int]

    class ColInfoType(MinColInfoType):
        cell_range: list[int]
        cell_offst: list[int]

    def __init__(self, input: str) -> None:
        """Initialize the table parser with the xml content.

        Args:
            input: The xml content.
        """
        # only the first messages about wrong number of columns are logged
        self.max_nbr_messages = 2
        self.nbr_messages = 0
        self.empty_text = ""
        self._soup = BeautifulSoup(input, features="xml")

    @staticmethod
    def _parse_colwidth(raw: object) -> Optional[float]:
        """Convert a column width specification to a number.

        Args:
            raw: The column width, optionally with the unit `pt` or `mm`.

        Returns:
            The width as an integer if possible, otherwise as a float, or None
            if the specification is not a string holding a number.
        """
        if not isinstance(raw, str):
            return None
        width = re.sub("mm", "", re.sub("pt", "", raw, flags=re.I), flags=re.I)
        for convert in (int, float):
            try:
                return convert(width)
            except ValueError:
                continue
        return None

    def _create_tg_range(self, tgs: list[ColInfo]) -> dict[int, ColInfoType]:
        """Create a unified range along the table groups.

        Args:
            tgs: Table group column specifications.

        Returns:
            Unified group column specifications, or an empty dictionary if there
            are no table groups or their column specifications are broken.
        """
        colinfo: dict[int, XmlTable.ColInfoType] = {}

        if not tgs:
            return colinfo

        # offsets of the column borders within each table group
        for itg, tg in enumerate(tgs):
            group: XmlTable.ColInfoType = {
                "offset": [],
                "colwidth": [],
                "cell_range": [],
                "cell_offst": [0],
            }
            offst = 0
            for info in tg["colinfo"]:
                cw = self._parse_colwidth(info.get("colwidth"))
                if cw is None:
                    _log.debug("A USPTO XML table has an invalid column width.")
                    return {}
                group["colwidth"].append(cw)
                group["offset"].append(offst)
                offst += cw
            group["offset"].append(offst)
            colinfo[itg] = group

        # merge the column borders of all the groups into a unified grid
        grid = colinfo[0]["offset"]
        offset_w0 = set()
        for col in colinfo.values():
            # keep track of col with 0 width
            for ic, cw in enumerate(col["colwidth"]):
                if cw == 0:
                    offset_w0.add(col["offset"][ic])

            grid = sorted(set(col["offset"] + grid))

        # add back the 0 width cols to offset list
        grid = sorted(grid + list(offset_w0))

        # number of grid columns spanned by each column of each group
        for col in colinfo.values():
            i = 1
            range_ = 1
            for min_i in range(1, len(grid)):
                if i >= len(col["offset"]):
                    # group without columns within a grid that has columns
                    _log.debug("A USPTO XML table has wrong offsets.")
                    return {}
                min_offst = grid[min_i]
                offst = col["offset"][i]
                if min_offst == offst:
                    if len(col["offset"]) == i + 1 and len(grid) > min_i + 1:
                        range_ += 1
                    else:
                        col["cell_range"].append(range_)
                        col["cell_offst"].append(col["cell_offst"][-1] + range_)
                        range_ = 1
                        i += 1
                elif min_offst < offst:
                    range_ += 1
                else:
                    _log.debug("A USPTO XML table has wrong offsets.")
                    return {}

        return colinfo

    def _get_max_ncols(self, tgs_info: dict[int, ColInfoType]) -> NonNegativeInt:
        """Get the maximum number of columns across table groups.

        Args:
            tgs_info: Unified group column specifications.

        Return:
            The maximum number of columns.
        """
        return max((len(info["colwidth"]) for info in tgs_info.values()), default=0)

    def _parse_table(self, table: Tag) -> TableData:
        """Parse the content of a table tag.

        Args:
            table: The table element.

        Returns:
            A docling table object, which is empty if the column specifications
            of the table are broken.
        """
        # the same list feeds both passes, so that group indexes stay aligned
        tg_secs = [sec for sec in table("tgroup") if isinstance(sec, Tag)]

        tgs_align: list[XmlTable.ColInfo] = []
        for tg_sec in tg_secs:
            col_val = tg_sec.get("cols")
            ncols = (
                int(col_val)
                if isinstance(col_val, str) and col_val.isdecimal()
                else 1
            )
            tg_align: XmlTable.ColInfo = {
                "ncols": ncols,
                "colinfo": [
                    {
                        "colname": cs_sec.get("colname"),
                        "colwidth": cs_sec.get("colwidth"),
                    }
                    for cs_sec in tg_sec("colspec")
                    if isinstance(cs_sec, Tag)
                ],
            }
            tgs_align.append(tg_align)

        # create unified range along the table groups
        tgs_range = self._create_tg_range(tgs_align)

        # if the structure is broken, return an empty table
        if not tgs_range:
            return TableData(num_rows=0, num_cols=0, table_cells=[])

        ncols_max = self._get_max_ncols(tgs_range)

        # extract table data
        table_data: list[TableCell] = []
        i_row_global = 0
        for itg, tg_sec in enumerate(tg_secs):
            tg_range = tgs_range[itg]

            for row_sec in tg_sec(["row", "tr"]):
                if not isinstance(row_sec, Tag):
                    continue
                entry_secs = row_sec(["entry", "td"])
                is_header: bool = (
                    row_sec.parent is not None and row_sec.parent.name == "thead"
                )

                ncols = 0
                local_row: list[TableCell] = []
                is_row_empty = True
                if entry_secs:
                    wrong_nbr_cols = False
                    for ientry, entry_sec in enumerate(entry_secs):
                        if not isinstance(entry_sec, Tag):
                            continue
                        text = entry_sec.get_text().strip()

                        # start-end
                        namest = entry_sec.get("namest")
                        nameend = entry_sec.get("nameend")
                        start = (
                            int(namest)
                            if isinstance(namest, str) and namest.isdecimal()
                            else ientry + 1
                        )
                        if isinstance(nameend, str) and nameend.isdecimal():
                            end = int(nameend)
                            shift = 0
                        else:
                            end = ientry + 2
                            shift = 1

                        if end > len(tg_range["cell_offst"]):
                            wrong_nbr_cols = True
                            self.nbr_messages += 1
                            if self.nbr_messages <= self.max_nbr_messages:
                                _log.debug("USPTO table has # entries != # columns")
                            break

                        range_ = [
                            tg_range["cell_offst"][start - 1],
                            tg_range["cell_offst"][end - 1] - shift,
                        ]

                        # add row and replicate cell if needed
                        cell_text = text if text else self.empty_text
                        if cell_text != self.empty_text:
                            is_row_empty = False
                        for _ in range(range_[0], range_[1] + 1):
                            ncols += 1
                            local_row.append(
                                TableCell(
                                    column_header=is_header,
                                    text=cell_text,
                                    start_row_offset_idx=i_row_global,
                                    end_row_offset_idx=i_row_global + 1,
                                    row_span=1,
                                    start_col_offset_idx=range_[0],
                                    end_col_offset_idx=range_[1] + 1,
                                    col_span=range_[1] - range_[0] + 1,
                                )
                            )

                    if wrong_nbr_cols:
                        # keep empty text, not to introduce noise
                        local_row = []
                        ncols = 0

                    # add empty cell up to ncols_max
                    for icol in range(ncols, ncols_max):
                        local_row.append(
                            TableCell(
                                column_header=is_header,
                                text=self.empty_text,
                                start_row_offset_idx=i_row_global,
                                end_row_offset_idx=i_row_global + 1,
                                row_span=1,
                                start_col_offset_idx=icol,
                                end_col_offset_idx=icol + 1,
                                col_span=1,
                            )
                        )
                # do not add empty rows
                if not is_row_empty:
                    table_data.extend(local_row)
                    i_row_global += 1

        return TableData(
            num_rows=i_row_global, num_cols=ncols_max, table_cells=table_data
        )

    def parse(self) -> Optional[TableData]:
        """Parse the first table from an xml content.

        Returns:
            A docling table data, or None if the content has no table element.
        """
        section = self._soup.find("table")
        if not isinstance(section, Tag):
            return None

        table = self._parse_table(section)
        if table.num_rows == 0 or table.num_cols == 0:
            _log.warning("The parsed USPTO table is empty")
        return table
1695
+
1696
+
1697
+ class HtmlEntity:
1698
+ """Provide utility functions to get the HTML entities of styled characters.
1699
+
1700
+ This class has been developed from:
1701
+ https://unicode-table.com/en/html-entities/
1702
+ https://www.w3.org/TR/WD-math-970515/table03.html
1703
+ """
1704
+
1705
+ def __init__(self):
1706
+ """Initialize this class by loading the HTML entity dictionaries."""
1707
+ self.superscript = str.maketrans(
1708
+ {
1709
+ "1": "&sup1;",
1710
+ "2": "&sup2;",
1711
+ "3": "&sup3;",
1712
+ "4": "&#8308;",
1713
+ "5": "&#8309;",
1714
+ "6": "&#8310;",
1715
+ "7": "&#8311;",
1716
+ "8": "&#8312;",
1717
+ "9": "&#8313;",
1718
+ "0": "&#8304;",
1719
+ "+": "&#8314;",
1720
+ "-": "&#8315;",
1721
+ "−": "&#8315;", # noqa: RUF001
1722
+ "=": "&#8316;",
1723
+ "(": "&#8317;",
1724
+ ")": "&#8318;",
1725
+ "a": "&#170;",
1726
+ "o": "&#186;",
1727
+ "i": "&#8305;",
1728
+ "n": "&#8319;",
1729
+ }
1730
+ )
1731
+ self.subscript = str.maketrans(
1732
+ {
1733
+ "1": "&#8321;",
1734
+ "2": "&#8322;",
1735
+ "3": "&#8323;",
1736
+ "4": "&#8324;",
1737
+ "5": "&#8325;",
1738
+ "6": "&#8326;",
1739
+ "7": "&#8327;",
1740
+ "8": "&#8328;",
1741
+ "9": "&#8329;",
1742
+ "0": "&#8320;",
1743
+ "+": "&#8330;",
1744
+ "-": "&#8331;",
1745
+ "−": "&#8331;", # noqa: RUF001
1746
+ "=": "&#8332;",
1747
+ "(": "&#8333;",
1748
+ ")": "&#8334;",
1749
+ "a": "&#8336;",
1750
+ "e": "&#8337;",
1751
+ "o": "&#8338;",
1752
+ "x": "&#8339;",
1753
+ }
1754
+ )
1755
+ self.mathematical_italic = str.maketrans(
1756
+ {
1757
+ "A": "&#119860;",
1758
+ "B": "&#119861;",
1759
+ "C": "&#119862;",
1760
+ "D": "&#119863;",
1761
+ "E": "&#119864;",
1762
+ "F": "&#119865;",
1763
+ "G": "&#119866;",
1764
+ "H": "&#119867;",
1765
+ "I": "&#119868;",
1766
+ "J": "&#119869;",
1767
+ "K": "&#119870;",
1768
+ "L": "&#119871;",
1769
+ "M": "&#119872;",
1770
+ "N": "&#119873;",
1771
+ "O": "&#119874;",
1772
+ "P": "&#119875;",
1773
+ "Q": "&#119876;",
1774
+ "R": "&#119877;",
1775
+ "S": "&#119878;",
1776
+ "T": "&#119879;",
1777
+ "U": "&#119880;",
1778
+ "V": "&#119881;",
1779
+ "W": "&#119882;",
1780
+ "Y": "&#119884;",
1781
+ "Z": "&#119885;",
1782
+ "a": "&#119886;",
1783
+ "b": "&#119887;",
1784
+ "c": "&#119888;",
1785
+ "d": "&#119889;",
1786
+ "e": "&#119890;",
1787
+ "f": "&#119891;",
1788
+ "g": "&#119892;",
1789
+ "h": "&#119893;",
1790
+ "i": "&#119894;",
1791
+ "j": "&#119895;",
1792
+ "k": "&#119896;",
1793
+ "l": "&#119897;",
1794
+ "m": "&#119898;",
1795
+ "n": "&#119899;",
1796
+ "o": "&#119900;",
1797
+ "p": "&#119901;",
1798
+ "q": "&#119902;",
1799
+ "r": "&#119903;",
1800
+ "s": "&#119904;",
1801
+ "t": "&#119905;",
1802
+ "u": "&#119906;",
1803
+ "v": "&#119907;",
1804
+ "w": "&#119908;",
1805
+ "x": "&#119909;",
1806
+ "y": "&#119910;",
1807
+ "z": "&#119911;",
1808
+ }
1809
+ )
1810
+
1811
+ self.lookup_iso8879 = {
1812
+ "&Agr;": "&Alpha;",
1813
+ "&Bgr;": "&Beta;",
1814
+ "&Ggr;": "&Gamma;",
1815
+ "&Dgr;": "&Delta;",
1816
+ "&Egr;": "&Epsilon;",
1817
+ "&Zgr;": "&Zeta;",
1818
+ "&EEgr;": "&Eta;",
1819
+ "&THgr;": "&Theta;",
1820
+ "&Igr;": "&Iota;",
1821
+ "&Kgr;": "&Kappa;",
1822
+ "&Lgr;": "&Lambda;",
1823
+ "&Mgr;": "&Mu;",
1824
+ "&Ngr;": "&Nu;",
1825
+ "&Xgr;": "&Xi;",
1826
+ "&Ogr;": "&Omicron;",
1827
+ "&Pgr;": "&Pi;",
1828
+ "&Rgr;": "&Rho;",
1829
+ "&Sgr;": "&Sigma;",
1830
+ "&Tgr;": "&Tau;",
1831
+ "&Ugr;": "&Upsilon;",
1832
+ "&PHgr;": "&Phi;",
1833
+ "&KHgr;": "&Chi;",
1834
+ "&PSgr;": "&Psi;",
1835
+ "&OHgr;": "&Omega;",
1836
+ "&agr;": "&alpha;",
1837
+ "&bgr;": "&beta;",
1838
+ "&ggr;": "&gamma;",
1839
+ "&dgr;": "&delta;",
1840
+ "&egr;": "&epsilon;",
1841
+ "&zgr;": "&zeta;",
1842
+ "&eegr;": "&eta;",
1843
+ "&thgr;": "&theta;",
1844
+ "&igr;": "&iota;",
1845
+ "&kgr;": "&kappa;",
1846
+ "&lgr;": "&lambda;",
1847
+ "&mgr;": "&mu;",
1848
+ "&ngr;": "&nu;",
1849
+ "&xgr;": "&xi;",
1850
+ "&ogr;": "&omicron;",
1851
+ "&pgr;": "&pi;",
1852
+ "&rgr;": "&rho;",
1853
+ "&sgr;": "&sigmaf;",
1854
+ "&tgr;": "&tau;",
1855
+ "&ugr;": "&upsilon;",
1856
+ "&phgr;": "&phi;",
1857
+ "&khgr;": "&chi;",
1858
+ "&psgr;": "&psi;",
1859
+ "&ohgr;": "&omega;",
1860
+ }
1861
+
1862
def get_superscript(self, text: str) -> str:
    """Convert a string to its superscript form, expressed as HTML entities.

    Args:
        text: The string to convert.

    Returns:
        The result of translating ``text`` through the ``superscript``
        translation table held by this instance.
    """
    table = self.superscript
    return text.translate(table)
1872
+
1873
def get_subscript(self, text: str) -> str:
    """Convert a string to its subscript form, expressed as HTML entities.

    Args:
        text: The string to convert.

    Returns:
        The result of translating ``text`` through the ``subscript``
        translation table; characters without an entry in that table are
        left as they are.
    """
    table = self.subscript
    return text.translate(table)
1883
+
1884
def get_math_italic(self, text: str) -> str:
    """Convert a string to mathematical italic, expressed as HTML entities.

    Args:
        text: The string to convert.

    Returns:
        The result of translating ``text`` through the
        ``mathematical_italic`` translation table; characters without an
        entry in that table are left as they are.
    """
    table = self.mathematical_italic
    return text.translate(table)
1894
+
1895
def get_greek_from_iso8879(self, text: str) -> str:
    """Map an ISO 8879 Greek-letter entity to the matching HTML entity.

    Args:
        text: The ISO 8879 entity to convert (for example ``&agr;``).

    Returns:
        The HTML entity stored for ``text`` in the ``lookup_iso8879``
        table, or ``text`` itself when the table has no entry for it.
    """
    try:
        return self.lookup_iso8879[text]
    except KeyError:
        # Unknown entity: hand the input back untouched.
        return text