docling 2.11.0__py3-none-any.whl → 2.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1888 @@
1
+ """Backend to parse patents from the United States Patent Office (USPTO).
2
+
3
+ The parsers included in this module can handle patent grants published since 1976 and
4
+ patent applications since 2001.
5
+ The original files can be found in https://bulkdata.uspto.gov.
6
+ """
7
+
8
+ import html
9
+ import logging
10
+ import re
11
+ import xml.sax
12
+ import xml.sax.xmlreader
13
+ from abc import ABC, abstractmethod
14
+ from enum import Enum, unique
15
+ from io import BytesIO
16
+ from pathlib import Path
17
+ from typing import Any, Final, Optional, Union
18
+
19
+ from bs4 import BeautifulSoup, Tag
20
+ from docling_core.types.doc import (
21
+ DocItem,
22
+ DocItemLabel,
23
+ DoclingDocument,
24
+ DocumentOrigin,
25
+ TableCell,
26
+ TableData,
27
+ TextItem,
28
+ )
29
+ from docling_core.types.doc.document import LevelNumber
30
+ from pydantic import NonNegativeInt
31
+ from typing_extensions import Self, TypedDict, override
32
+
33
+ from docling.backend.abstract_backend import DeclarativeDocumentBackend
34
+ from docling.datamodel.base_models import InputFormat
35
+ from docling.datamodel.document import InputDocument
36
+
37
+ _log = logging.getLogger(__name__)
38
+
39
+ XML_DECLARATION: Final = '<?xml version="1.0" encoding="UTF-8"?>'
40
+
41
+
42
@unique
class PatentHeading(Enum):
    """Headings that docling adds for tagged sections of USPTO patent documents.

    Every member is declared as a ``(text, level)`` pair. The text becomes the
    enum value and the level is exposed through the ``level`` attribute.
    """

    ABSTRACT = ("ABSTRACT", 2)
    CLAIMS = ("CLAIMS", 2)

    @override
    def __new__(cls, value: str, _) -> Self:
        # Only the heading text is stored as the enum value; the second item of
        # the declaration is consumed by __init__.
        member = object.__new__(cls)
        member._value_ = value
        return member

    @override
    def __init__(self, _, level: LevelNumber) -> None:
        # Heading level to use when the section heading is added to a document.
        self.level: LevelNumber = level
58
+
59
+
60
class PatentUsptoDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend that converts USPTO patent files to docling documents.

    The whole input is read at construction time. While reading, the first
    recognized format marker (an XML ``<!DOCTYPE`` line or the APS ``PATN``
    line) selects the parser that `convert` will use.
    """

    @override
    def __init__(
        self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]
    ) -> None:
        """Read the patent content and select the matching parser.

        Args:
            in_doc: The input document handled by this backend.
            path_or_stream: The patent file, as a path or as an in-memory stream.

        Raises:
            RuntimeError: If the content cannot be read or decoded as UTF-8.
        """
        super().__init__(in_doc, path_or_stream)

        self.patent_content: str = ""
        self.parser: Optional[PatentUspto] = None

        try:
            lines: list[str] = []
            if isinstance(self.path_or_stream, BytesIO):
                lines = [raw.decode("utf-8") for raw in self.path_or_stream]
            elif isinstance(self.path_or_stream, Path):
                with open(self.path_or_stream, encoding="utf-8") as file_obj:
                    lines = file_obj.readlines()

            for line in lines:
                # Text-mode files get newline translation but byte streams do
                # not, so compare the APS marker without its line terminator.
                if line.startswith("<!DOCTYPE") or line.rstrip("\r\n") == "PATN":
                    self._set_parser(line)
            self.patent_content = "".join(lines)
        except Exception as exc:
            raise RuntimeError(
                f"Could not initialize USPTO backend for file with hash {self.document_hash}."
            ) from exc

    def _set_parser(self, doctype: str) -> None:
        """Select the parser that matches a format marker line.

        Args:
            doctype: A ``<!DOCTYPE`` line of an XML patent or the ``PATN`` line
                of an APS patent. Unrecognized markers unset the parser.
        """
        doctype_line = doctype.lower()
        if doctype.rstrip("\r\n") == "PATN":
            self.parser = PatentUsptoGrantAps()
        elif (
            "us-patent-application-v4" in doctype_line
            or "us-patent-grant-v4" in doctype_line
        ):
            self.parser = PatentUsptoIce()
        elif "us-grant-025" in doctype_line:
            self.parser = PatentUsptoGrantV2()
        elif (
            "patent-application-publication" in doctype_line
            and "pap-v1" in doctype_line
        ):
            self.parser = PatentUsptoAppV1()
        else:
            self.parser = None

    @override
    def is_valid(self) -> bool:
        """Tell whether some content was read and a parser was selected."""
        return bool(self.patent_content) and bool(self.parser)

    @classmethod
    @override
    def supports_pagination(cls) -> bool:
        return False

    @override
    def unload(self) -> None:
        return

    @classmethod
    @override
    def supported_formats(cls) -> set[InputFormat]:
        return {InputFormat.XML_USPTO}

    @override
    def convert(self) -> DoclingDocument:
        """Parse the patent content into a docling document.

        Returns:
            The parsed document, named after the input file and with its origin set.

        Raises:
            RuntimeError: If no parser was selected at construction time or if
                the parser did not produce a document.
        """
        if self.parser is None:
            raise RuntimeError(
                f"Cannot convert doc (hash={self.document_hash}, "
                f"name={self.file.name}) because the backend failed to init."
            )

        doc = self.parser.parse(self.patent_content)
        if doc is None:
            raise RuntimeError(
                f"Failed to convert doc (hash={self.document_hash}, "
                f"name={self.file.name})."
            )

        doc.name = self.file.name or "file"
        # APS grants are plain text files; every other supported format is XML.
        mime_type = (
            "text/plain"
            if isinstance(self.parser, PatentUsptoGrantAps)
            else "application/xml"
        )
        doc.origin = DocumentOrigin(
            mimetype=mime_type,
            binary_hash=self.document_hash,
            filename=self.file.name or "file",
        )

        return doc
151
+
152
+
153
class PatentUspto(ABC):
    """Common interface of the parsers for US Patent Office documents."""

    @abstractmethod
    def parse(self, patent_content: str) -> Optional[DoclingDocument]:
        """Convert the raw content of one USPTO patent into a docling document.

        Args:
            patent_content: The content of a single patent in a USPTO file.

        Returns:
            The patent as a docling document, or None when no document could
            be produced.
        """
        ...
167
+
168
+
169
class PatentUsptoIce(PatentUspto):
    """Parser of patent documents from the US Patent Office (ICE).

    The compatible formats are:
    - Patent Grant Full Text Data/XML Version 4.x ICE (from January 2005)
    - Patent Application Full Text Data/XML Version 4.x ICE (from January 2005)
    """

    def __init__(self) -> None:
        """Build an instance of PatentUsptoIce class."""
        self.handler = PatentUsptoIce.PatentHandler()
        # Tables are extracted from the raw content and parsed separately; only
        # tables whose opening tag starts a line are matched.
        self.pattern = re.compile(r"^(<table .*?</table>)", re.MULTILINE | re.DOTALL)

    @override
    def parse(self, patent_content: str) -> Optional[DoclingDocument]:
        """Parse a USPTO patent in XML v4.x (ICE) format.

        Args:
            patent_content: The content of a single patent in a USPTO file.

        Returns:
            The patent parsed as a docling document, or None if the content is
            not well-formed XML or has no patent root element.
        """
        # Do not hand out the document of a previous call when this content
        # turns out to have no patent root element.
        self.handler.doc = None
        try:
            xml.sax.parseString(patent_content, self.handler)
        except xml.sax.SAXParseException as exc_sax:
            _log.error("Error in parsing USPTO document: %s", exc_sax)

            return None

        doc = self.handler.doc
        if doc:
            raw_tables = re.findall(self.pattern, patent_content)
            parsed_tables: list[TableData] = []
            _log.debug(
                "Found %s tables to be parsed with XmlTable.", len(raw_tables)
            )
            for table in raw_tables:
                table_parser = XmlTable(XML_DECLARATION + "\n" + table)
                try:
                    table_data = table_parser.parse()
                    if table_data:
                        parsed_tables.append(table_data)
                except Exception as exc_table:
                    _log.error("Error in parsing USPTO tables: %s", exc_table)
            # The SAX handler only stores empty placeholders. They are filled
            # by position, which is only safe when both counts agree.
            if len(parsed_tables) != len(doc.tables):
                _log.error(
                    f"Number of referenced ({len(doc.tables)}) and parsed "
                    f"({len(parsed_tables)}) tables differ."
                )
            else:
                for idx, item in enumerate(parsed_tables):
                    doc.tables[idx].data = item

        return doc

    class PatentHandler(xml.sax.handler.ContentHandler):
        """SAX ContentHandler for patent documents."""

        APP_DOC_ELEMENT: Final = "us-patent-application"
        GRANT_DOC_ELEMENT: Final = "us-patent-grant"

        @unique
        class Element(Enum):
            """Represents an element of interest in the patent application document.

            Members are declared as ``(tag, is_text)`` pairs: the tag is the
            enum value and ``is_text`` tells whether character data found
            directly in the element is collected.
            """

            ABSTRACT = "abstract", True
            TITLE = "invention-title", True
            CLAIMS = "claims", False
            CLAIM = "claim", False
            CLAIM_TEXT = "claim-text", True
            PARAGRAPH = "p", True
            HEADING = "heading", True
            DESCRIPTION = "description", False
            TABLE = "table", False  # to track its position, without text
            DRAWINGS = "description-of-drawings", True
            STYLE_SUPERSCRIPT = "sup", True
            STYLE_SUBSCRIPT = "sub", True
            MATHS = "maths", False  # to avoid keeping formulas

            @override
            def __new__(cls, value: str, _) -> Self:
                obj = object.__new__(cls)
                obj._value_ = value
                return obj

            @override
            def __init__(self, _, is_text: bool) -> None:
                self.is_text: bool = is_text

        # Tags of the registered elements, computed once instead of on every
        # SAX event.
        _ELEMENT_TAGS: Final = frozenset(member.value for member in Element)

        @override
        def __init__(self) -> None:
            """Build an instance of the patent handler."""
            super().__init__()
            # Current patent being parsed
            self.doc: Optional[DoclingDocument] = None
            # Keep track of docling hierarchy level
            self.level: LevelNumber = 1
            # Keep track of docling parents by level
            self.parents: dict[LevelNumber, Optional[DocItem]] = {1: None}
            # Content to retain for the current patent
            self.property: list[str]
            self.claim: str
            self.claims: list[str]
            self.abstract: str
            self.text: str
            self._clean_data()
            # To handle mathematical styling
            self.style_html = HtmlEntity()

        @override
        def startElement(self, tag, attributes):  # noqa: N802
            """Signal the start of an element.

            Args:
                tag: The element tag.
                attributes: The element attributes.
            """
            if tag in (
                self.APP_DOC_ELEMENT,
                self.GRANT_DOC_ELEMENT,
            ):
                # Start every patent from a clean state, so that nothing leaks
                # from a previous (possibly aborted) parsing run.
                self.doc = DoclingDocument(name="file")
                self.level = 1
                self.parents = {1: None}
                self._clean_data()
            self._start_registered_elements(tag, attributes)

        @override
        def skippedEntity(self, name):  # noqa: N802
            """Receive notification of a skipped entity.

            HTML entities will be skipped by the parser. This method will unescape them
            and add them to the text.

            Args:
                name: Entity name.
            """
            if not self.property or not self.Element(self.property[-1]).is_text:
                return

            escaped = self.style_html.get_greek_from_iso8879(f"&{name};")
            unescaped = html.unescape(escaped)
            if unescaped == escaped:
                _log.debug("Unrecognized HTML entity: %s", name)
                return

            self._append_text(unescaped)

        @override
        def endElement(self, tag):  # noqa: N802
            """Signal the end of an element.

            Args:
                tag: The element tag.
            """
            if tag in (
                self.APP_DOC_ELEMENT,
                self.GRANT_DOC_ELEMENT,
            ):
                self._clean_data()
            self._end_registered_element(tag)

        @override
        def characters(self, content):
            """Receive notification of character data.

            Args:
                content: Data reported by the handler.
            """
            self._append_text(content)

        def _append_text(self, content: str) -> None:
            """Add content to the text buffer if the current element holds text.

            Args:
                content: Character data or an already unescaped entity.
            """
            if not self.property:
                return

            elm_val = self.property[-1]
            element = self.Element(elm_val)
            if not element.is_text:
                return

            if element in (
                self.Element.STYLE_SUPERSCRIPT,
                self.Element.STYLE_SUBSCRIPT,
            ):
                # superscripts and subscripts need to be under text elements
                if len(self.property) < 2:
                    return
                parent = self.Element(self.property[-2])
                if parent.is_text:
                    self.text += self._apply_style(content, elm_val)
            else:
                self.text += content

        def _start_registered_elements(
            self, tag: str, attributes: xml.sax.xmlreader.AttributesImpl
        ) -> None:
            """Push a registered element on the stack of open elements.

            Args:
                tag: The element tag.
                attributes: The element attributes.
            """
            if tag not in self._ELEMENT_TAGS:
                return

            # special case for claims: claim lines may start before the
            # previous one is closed
            if (
                tag == self.Element.CLAIM_TEXT.value
                and self.property
                and self.property[-1] == tag
                and self.text.strip()
            ):
                self.claim += " " + self.text.strip()
                self.text = ""
            elif tag == self.Element.HEADING.value:
                level_attr: str = attributes.get("level", "")
                # isdecimal() only accepts what int() can convert, whereas
                # isnumeric() also accepts characters such as "²"
                new_level: int = int(level_attr) if level_attr.isdecimal() else 1
                top_level = min(self.parents.keys())
                # increase heading level with 1 for title, if any
                self.level = (
                    new_level + 1 if (new_level + 1) in self.parents else top_level
                )
            self.property.append(tag)

        def _end_registered_element(self, tag: str) -> None:
            """Pop the innermost open element and store the collected text.

            Args:
                tag: The tag of the element being closed.
            """
            if tag in self._ELEMENT_TAGS and self.property:
                current_tag = self.property.pop()
                self._add_property(current_tag, self.text.strip())

        def _add_property(self, name: str, text: str) -> None:
            """Store the content of a closed element in the docling document.

            Args:
                name: The tag of the closed element.
                text: The stripped text collected so far.
            """
            if not name or not self.doc:
                return

            if name == self.Element.TITLE.value:
                if text:
                    self.parents[self.level + 1] = self.doc.add_title(
                        parent=self.parents[self.level],  # type: ignore[arg-type]
                        text=text,
                    )
                    self.level += 1
                self.text = ""

            elif name == self.Element.ABSTRACT.value:
                if self.abstract:
                    heading_text = PatentHeading.ABSTRACT.value
                    heading_level = (
                        PatentHeading.ABSTRACT.level
                        if PatentHeading.ABSTRACT.level in self.parents
                        else 1
                    )
                    abstract_item = self.doc.add_heading(
                        heading_text,
                        level=heading_level,
                        parent=self.parents[heading_level],  # type: ignore[arg-type]
                    )
                    self.doc.add_text(
                        label=DocItemLabel.PARAGRAPH,
                        text=self.abstract,
                        parent=abstract_item,
                    )

            elif name == self.Element.CLAIM_TEXT.value:
                text = re.sub("\\s+", " ", text).strip()
                if text:
                    self.claim += " " + text
                self.text = ""

            elif name == self.Element.CLAIM.value and self.claim:
                self.claims.append(self.claim.strip())
                self.claim = ""

            elif name == self.Element.CLAIMS.value and self.claims:
                heading_text = PatentHeading.CLAIMS.value
                heading_level = (
                    PatentHeading.CLAIMS.level
                    if PatentHeading.CLAIMS.level in self.parents
                    else 1
                )
                claims_item = self.doc.add_heading(
                    heading_text,
                    level=heading_level,
                    parent=self.parents[heading_level],  # type: ignore[arg-type]
                )
                for claim_text in self.claims:
                    self.doc.add_text(
                        label=DocItemLabel.PARAGRAPH,
                        text=claim_text,
                        parent=claims_item,
                    )

            elif name == self.Element.PARAGRAPH.value and text:
                # remove blank spaces added in paragraphs
                text = re.sub("\\s+", " ", text)
                if self.Element.ABSTRACT.value in self.property:
                    # paragraphs of the abstract are joined and stored when the
                    # abstract element is closed
                    self.abstract = (
                        (self.abstract + " " + text) if self.abstract else text
                    )
                else:
                    self.doc.add_text(
                        label=DocItemLabel.PARAGRAPH,
                        text=text,
                        parent=self.parents[self.level],  # type: ignore[arg-type]
                    )
                self.text = ""

            elif name == self.Element.HEADING.value and text:
                self.parents[self.level + 1] = self.doc.add_heading(
                    text=text,
                    level=self.level,
                    parent=self.parents[self.level],  # type: ignore[arg-type]
                )
                self.level += 1
                self.text = ""

            elif name == self.Element.TABLE.value:
                # set an empty table as placeholder
                empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
                self.doc.add_table(
                    data=empty_table,
                    parent=self.parents[self.level],  # type: ignore[arg-type]
                )

        def _apply_style(self, text: str, style_tag: str) -> str:
            """Apply an HTML style to text.

            Args:
                text: A string containing plain text.
                style_tag: An HTML tag name for styling text. If the tag name is not
                    recognized as one of the supported styles, the method will return
                    the original `text`.

            Returns:
                A string after applying the style.
            """
            formatted = text

            if style_tag == self.Element.STYLE_SUPERSCRIPT.value:
                formatted = html.unescape(self.style_html.get_superscript(text))
            elif style_tag == self.Element.STYLE_SUBSCRIPT.value:
                formatted = html.unescape(self.style_html.get_subscript(text))

            return formatted

        def _clean_data(self) -> None:
            """Reset the variables from stream data."""
            # The text buffer is reset here as well, so that it always exists
            # even if character data is reported before the patent root element.
            self.text = ""
            self.property = []
            self.claim = ""
            self.claims = []
            self.abstract = ""
503
+
504
+
505
class PatentUsptoGrantV2(PatentUspto):
    """Parser of patent documents from the US Patent Office (grants v2.5).

    The compatible format is:
    - Patent Grant Full Text Data/XML Version 2.5 (from January 2002 till December 2004)
    """

    @override
    def __init__(self) -> None:
        """Build an instance of PatentUsptoGrantV2 class."""
        self.handler = PatentUsptoGrantV2.PatentHandler()
        # Tables are extracted from the raw content and parsed separately; only
        # tables whose opening tag starts a line are matched.
        self.pattern = re.compile(r"^(<table .*?</table>)", re.MULTILINE | re.DOTALL)

    @override
    def parse(self, patent_content: str) -> Optional[DoclingDocument]:
        """Parse a USPTO patent grant in XML v2.5 format.

        Args:
            patent_content: The content of a single patent in a USPTO file.

        Returns:
            The patent parsed as a docling document, or None if the content is
            not well-formed XML or has no patent root element.
        """
        # Do not hand out the document of a previous call when this content
        # turns out to have no patent root element.
        self.handler.doc = None
        try:
            xml.sax.parseString(patent_content, self.handler)
        except xml.sax.SAXParseException as exc_sax:
            _log.error("Error in parsing USPTO document: %s", exc_sax)

            return None

        doc = self.handler.doc
        if doc:
            raw_tables = re.findall(self.pattern, patent_content)
            parsed_tables: list[TableData] = []
            _log.debug(
                "Found %s tables to be parsed with XmlTable.", len(raw_tables)
            )
            for table in raw_tables:
                table_parser = XmlTable(XML_DECLARATION + "\n" + table)
                try:
                    table_data = table_parser.parse()
                    if table_data:
                        parsed_tables.append(table_data)
                except Exception as exc_table:
                    _log.error("Error in parsing USPTO tables: %s", exc_table)
            # The SAX handler only stores empty placeholders. They are filled
            # by position, which is only safe when both counts agree.
            if len(parsed_tables) != len(doc.tables):
                _log.error(
                    f"Number of referenced ({len(doc.tables)}) and parsed "
                    f"({len(parsed_tables)}) tables differ."
                )
            else:
                for idx, item in enumerate(parsed_tables):
                    doc.tables[idx].data = item

        return doc

    class PatentHandler(xml.sax.handler.ContentHandler):
        """SAX ContentHandler for patent documents."""

        GRANT_DOC_ELEMENT: Final = "PATDOC"
        CLAIM_STATEMENT: Final = "What is claimed is:"

        @unique
        class Element(Enum):
            """Represents an element of interest in the patent application document.

            Members are declared as ``(tag, is_text)`` pairs: the tag is the
            enum value and ``is_text`` tells whether character data found
            directly in the element is collected.
            """

            PDAT = "PDAT", True  # any type of data
            ABSTRACT = ("SDOAB", False)
            SDOCL = ("SDOCL", False)
            TITLE = ("B540", False)
            CLAIMS = ("CL", False)
            CLAIM = ("CLM", False)
            PARAGRAPH = ("PARA", True)
            HEADING = ("H", True)
            DRAWINGS = ("DRWDESC", False)
            STYLE_SUPERSCRIPT = ("SP", False)
            STYLE_SUBSCRIPT = ("SB", False)
            STYLE_ITALIC = ("ITALIC", False)
            CWU = ("CWU", False)  # avoid tables, chemicals, formulas
            TABLE = ("table", False)  # to keep track of table positions

            @override
            def __new__(cls, value: str, _) -> Self:
                obj = object.__new__(cls)
                obj._value_ = value
                return obj

            @override
            def __init__(self, _, is_text: bool) -> None:
                self.is_text: bool = is_text

        # Tags of the registered elements, computed once instead of on every
        # SAX event.
        _ELEMENT_TAGS: Final = frozenset(member.value for member in Element)

        @override
        def __init__(self) -> None:
            """Build an instance of the patent handler."""
            super().__init__()
            # Current patent being parsed
            self.doc: Optional[DoclingDocument] = None
            # Keep track of docling hierarchy level
            self.level: LevelNumber = 1
            # Keep track of docling parents by level
            self.parents: dict[LevelNumber, Optional[DocItem]] = {1: None}
            # Content to retain for the current patent
            self.text: str
            self.property: list[str]
            self.claim: str
            self.claims: list[str]
            self.paragraph: str
            self.abstract: str
            self._clean_data()
            # To handle mathematical styling
            self.style_html = HtmlEntity()

        @override
        def startElement(self, tag, attributes):  # noqa: N802
            """Signal the start of an element.

            Args:
                tag: The element tag.
                attributes: The element attributes.
            """
            if tag == self.GRANT_DOC_ELEMENT:
                # Start every patent from a clean state, so that nothing leaks
                # from a previous (possibly aborted) parsing run.
                self.doc = DoclingDocument(name="file")
                self.level = 1
                self.parents = {1: None}
                self._clean_data()
            self._start_registered_elements(tag, attributes)

        @override
        def skippedEntity(self, name):  # noqa: N802
            """Receive notification of a skipped entity.

            HTML entities will be skipped by the parser. This method will unescape them
            and add them to the text.

            Args:
                name: Entity name.
            """
            if not self.property or not self.Element(self.property[-1]).is_text:
                return

            escaped = self.style_html.get_greek_from_iso8879(f"&{name};")
            unescaped = html.unescape(escaped)
            if unescaped == escaped:
                _log.debug("Unrecognized HTML entity: %s", name)
                return

            # Style elements never collect text in this format (is_text is
            # False for them); styles are applied when a PDAT element is closed.
            self.text += unescaped

        @override
        def endElement(self, tag):  # noqa: N802
            """Signal the end of an element.

            Args:
                tag: The element tag.
            """
            if tag == self.GRANT_DOC_ELEMENT:
                self._clean_data()
            self._end_registered_element(tag)

        @override
        def characters(self, content):
            """Receive notification of character data.

            Args:
                content: Data reported by the handler.
            """
            # Style elements never collect text in this format (is_text is
            # False for them); styles are applied when a PDAT element is closed.
            if self.property and self.Element(self.property[-1]).is_text:
                self.text += content

        def _start_registered_elements(
            self, tag: str, attributes: xml.sax.xmlreader.AttributesImpl
        ) -> None:
            """Push a registered element on the stack of open elements.

            Args:
                tag: The element tag.
                attributes: The element attributes.
            """
            if tag not in self._ELEMENT_TAGS:
                return

            if (
                tag == self.Element.HEADING.value
                and self.Element.SDOCL.value not in self.property
            ):
                level_attr: str = attributes.get("LVL", "")
                # isdecimal() only accepts what int() can convert, whereas
                # isnumeric() also accepts characters such as "²"
                new_level: int = int(level_attr) if level_attr.isdecimal() else 1
                top_level = min(self.parents.keys())
                # increase heading level with 1 for title, if any
                self.level = (
                    new_level + 1 if (new_level + 1) in self.parents else top_level
                )
            self.property.append(tag)

        def _end_registered_element(self, tag: str) -> None:
            """Pop the innermost open element and store the collected text.

            Args:
                tag: The tag of the element being closed.
            """
            if tag in self._ELEMENT_TAGS and self.property:
                current_tag = self.property.pop()
                self._add_property(current_tag, self.text)

        def _add_property(self, name: str, text: str) -> None:
            """Store the content of a closed element in the docling document.

            Args:
                name: The tag of the closed element.
                text: The text collected so far.
            """
            if not name or not self.doc:
                return
            if name == self.Element.PDAT.value and text:
                if not self.property:
                    self.text = ""
                    return

                # the element enclosing PDAT may be a style to apply
                wrapper = self.property[-1]
                text = self._apply_style(text, wrapper)

                if self.Element.TITLE.value in self.property and text.strip():
                    title = text.strip()
                    self.parents[self.level + 1] = self.doc.add_title(
                        parent=self.parents[self.level],  # type: ignore[arg-type]
                        text=title,
                    )
                    self.level += 1

                elif self.Element.ABSTRACT.value in self.property:
                    self.abstract += text

                elif self.Element.CLAIM.value in self.property:
                    self.claim += text

                # Paragraph text not in claims or abstract
                elif (
                    self.Element.PARAGRAPH.value in self.property
                    and self.Element.CLAIM.value not in self.property
                    and self.Element.ABSTRACT.value not in self.property
                ):
                    self.paragraph += text

                # headers except claims statement
                elif (
                    self.Element.HEADING.value in self.property
                    and self.Element.SDOCL.value not in self.property
                    and text.strip()
                ):
                    self.parents[self.level + 1] = self.doc.add_heading(
                        text=text.strip(),
                        level=self.level,
                        parent=self.parents[self.level],  # type: ignore[arg-type]
                    )
                    self.level += 1

                self.text = ""

            elif name == self.Element.CLAIM.value and self.claim.strip():
                self.claims.append(self.claim.strip())
                self.claim = ""

            elif name == self.Element.CLAIMS.value and self.claims:
                heading_text = PatentHeading.CLAIMS.value
                heading_level = (
                    PatentHeading.CLAIMS.level
                    if PatentHeading.CLAIMS.level in self.parents
                    else 1
                )
                claims_item = self.doc.add_heading(
                    heading_text,
                    level=heading_level,
                    parent=self.parents[heading_level],  # type: ignore[arg-type]
                )
                for claim_text in self.claims:
                    self.doc.add_text(
                        label=DocItemLabel.PARAGRAPH,
                        text=claim_text,
                        parent=claims_item,
                    )

            elif name == self.Element.ABSTRACT.value and self.abstract.strip():
                abstract = self.abstract.strip()
                heading_text = PatentHeading.ABSTRACT.value
                heading_level = (
                    PatentHeading.ABSTRACT.level
                    if PatentHeading.ABSTRACT.level in self.parents
                    else 1
                )
                abstract_item = self.doc.add_heading(
                    heading_text,
                    level=heading_level,
                    parent=self.parents[heading_level],  # type: ignore[arg-type]
                )
                self.doc.add_text(
                    label=DocItemLabel.PARAGRAPH, text=abstract, parent=abstract_item
                )

            elif name == self.Element.PARAGRAPH.value:
                paragraph = self.paragraph.strip()
                if paragraph and self.Element.CLAIM.value not in self.property:
                    self.doc.add_text(
                        label=DocItemLabel.PARAGRAPH,
                        text=paragraph,
                        parent=self.parents[self.level],  # type: ignore[arg-type]
                    )
                elif self.Element.CLAIM.value in self.property:
                    # we may need a space after a paragraph in claim text
                    self.claim += " "
                self.paragraph = ""

            elif name == self.Element.TABLE.value:
                # set an empty table as placeholder
                empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
                self.doc.add_table(
                    data=empty_table,
                    parent=self.parents[self.level],  # type: ignore[arg-type]
                )

        def _apply_style(self, text: str, style_tag: str) -> str:
            """Apply an HTML style to text.

            Args:
                text: A string containing plain text.
                style_tag: An HTML tag name for styling text. If the tag name is not
                    recognized as one of the supported styles, the method will return
                    the original `text`.

            Returns:
                A string after applying the style.
            """
            formatted = text

            if style_tag == self.Element.STYLE_SUPERSCRIPT.value:
                formatted = html.unescape(self.style_html.get_superscript(text))
            elif style_tag == self.Element.STYLE_SUBSCRIPT.value:
                formatted = html.unescape(self.style_html.get_subscript(text))
            elif style_tag == self.Element.STYLE_ITALIC.value:
                formatted = html.unescape(self.style_html.get_math_italic(text))

            return formatted

        def _clean_data(self) -> None:
            """Reset the variables from stream data."""
            self.text = ""
            self.property = []
            self.claim = ""
            self.claims = []
            self.paragraph = ""
            self.abstract = ""
848
+
849
+
850
+ class PatentUsptoGrantAps(PatentUspto):
851
+ """Parser of patents documents from the US Patent Office (grants APS).
852
+
853
+ The compatible format is:
854
+ - Patent Grant Full Text Data/APS (from January 1976 till December 2001)
855
+ """
856
+
857
@unique
class Section(Enum):
    """Section markers of a patent APS document that the parser handles."""

    # Each value is the section name as written in the APS file.
    ABSTRACT = "ABST"
    SUMMARY = "BSUM"
    DETAILS = "DETD"
    CLAIMS = "CLMS"
    DRAWINGS = "DRWD"
866
+
867
@unique
class Field(Enum):
    """Field keys of a patent APS document that the parser recognizes."""

    # Each value is the field key as written in the APS file.
    DOC_NUMBER = "WKU"
    TITLE = "TTL"
    PARAGRAPH = "PAR"
    PARAGRAPH_1 = "PA1"
    PARAGRAPH_2 = "PA2"
    PARAGRAPH_3 = "PA3"
    TEXT = "PAL"
    CAPTION = "PAC"
    NUMBER = "NUM"
    NAME = "NAM"
    IPC = "ICL"
    ISSUED = "ISD"
    FILED = "APD"
    PATENT_NUMBER = "PNO"
    APPLICATION_NUMBER = "APN"
    APPLICATION_TYPE = "APT"
    COUNTRY = "CNT"
888
+
889
@override
def __init__(self) -> None:
    """Create a parser with no document and an empty docling hierarchy."""
    # Parents of the docling items, by hierarchy level (None is the root)
    self.parents: dict[LevelNumber, Optional[DocItem]] = {1: None}
    # Hierarchy level where the next items are added
    self.level: LevelNumber = 1
    # Document being built, set when parsing starts
    self.doc: Optional[DoclingDocument] = None
897
+
898
def get_last_text_item(self) -> Optional[TextItem]:
    """Find the most recent text item under the parent of the current level.

    Returns:
        The last text item of the document that is a child of the current
        level parent, or None if there is no document or no such item."""
    if not self.doc:
        return None

    current_parent = self.parents[self.level]
    child_refs = [] if current_parent is None else current_parent.children

    # Keep overwriting the candidate so that the last match wins.
    last_item: Optional[TextItem] = None
    for candidate in self.doc.texts:
        if isinstance(candidate, TextItem) and candidate.get_ref() in child_refs:
            last_item = candidate

    return last_item
918
+
919
def store_section(self, section: str) -> None:
    """Add the heading of a predefined section to the docling document.

    Only the abstract and claims sections get a heading from PatentHeading.
    Any other section name is ignored here.

    Args:
        section: A patent section name."""
    if self.doc is None:
        return

    known_sections = (
        (self.Section.ABSTRACT.value, PatentHeading.ABSTRACT),
        (self.Section.CLAIMS.value, PatentHeading.CLAIMS),
    )
    for section_name, candidate in known_sections:
        if section == section_name:
            heading: PatentHeading = candidate
            break
    else:
        return None

    # Fall back to the top level when the heading level has no parent yet.
    self.level = heading.level if heading.level in self.parents else 1
    heading_level = self.level
    self.parents[heading_level + 1] = self.doc.add_heading(
        heading.value,
        level=heading_level,
        parent=self.parents[heading_level],  # type: ignore[arg-type]
    )
    # Content that follows is stored under the new heading.
    self.level = heading_level + 1
944
+
945
def store_content(self, section: str, field: str, value: str) -> None:
    """Add the value of a field to the docling document.

    How the value is stored depends on the field and on the section where the
    field appears. Fields that are not members of Field are ignored, and so
    are field and section combinations that are not handled below.

    Args:
        section: A patent section name.
        field: A field name.
        value: A field value name.
    """
    if not self.doc or not field:
        return
    if all(field != item.value for item in PatentUsptoGrantAps.Field):
        return

    is_paragraph = field in (
        self.Field.PARAGRAPH.value,
        self.Field.PARAGRAPH_1.value,
        self.Field.PARAGRAPH_2.value,
        self.Field.PARAGRAPH_3.value,
    )
    in_claims = section == self.Section.CLAIMS.value
    in_body = section in (
        self.Section.SUMMARY.value,
        self.Section.DETAILS.value,
        self.Section.DRAWINGS.value,
    )

    # Title: becomes the parent of the next level
    if field == self.Field.TITLE.value:
        self.parents[self.level + 1] = self.doc.add_title(
            parent=self.parents[self.level], text=value  # type: ignore[arg-type]
        )
        self.level += 1
        return

    # Abstract text: extend the last text item, or create the first one
    if field == self.Field.TEXT.value and section == self.Section.ABSTRACT.value:
        abstract_item = self.get_last_text_item()
        if abstract_item:
            abstract_item.text += " " + value
        else:
            self.doc.add_text(
                label=DocItemLabel.PARAGRAPH,
                text=value,
                parent=self.parents[self.level],  # type: ignore[arg-type]
            )
        return

    # Claim number: open an empty claim, filled by the paragraphs that follow
    if field == self.Field.NUMBER.value and in_claims:
        self.doc.add_text(
            label=DocItemLabel.PARAGRAPH,
            text="",
            parent=self.parents[self.level],  # type: ignore[arg-type]
        )
        return

    # Claim paragraph: append to the last claim, creating it if needed
    if is_paragraph and in_claims:
        current_claim = self.get_last_text_item()
        if current_claim is None:
            current_claim = self.doc.add_text(
                label=DocItemLabel.PARAGRAPH,
                text="",
                parent=self.parents[self.level],  # type: ignore[arg-type]
            )

        if current_claim.text:
            current_claim.text += f" {value}"
        else:
            current_claim.text += value
        return

    # Caption in the summary, details or drawings: stored as a heading
    if field == self.Field.CAPTION.value and in_body:
        # captions are siblings of abstract since no level info is provided
        reference = PatentHeading.ABSTRACT
        self.level = reference.level if reference.level in self.parents else 1
        caption_level = self.level
        self.parents[caption_level + 1] = self.doc.add_heading(
            value,
            level=caption_level,
            parent=self.parents[caption_level],  # type: ignore[arg-type]
        )
        self.level = caption_level + 1
        return

    # Paragraph in the summary, details or drawings: stored as plain text
    if is_paragraph and in_body:
        self.doc.add_text(
            label=DocItemLabel.PARAGRAPH,
            text=value,
            parent=self.parents[self.level],  # type: ignore[arg-type]
        )
1034
+
1035
def parse(self, patent_content: str) -> Optional[DoclingDocument]:
    """Parse the text of a patent grant into a docling document.

    Every line is split once at the first run of two or more whitespace
    characters. A line without such a run is taken as a section title. A line
    with a non-empty first column starts a new key-value pair. A line with an
    empty first column continues the value of the current key, unless its
    content matches ``##STR<digits>##``, in which case it is dropped.
    A pending key-value pair is stored as soon as the next section title or
    key is found, and once more after the last line.

    Args:
        patent_content: The content of the patent file.

    Returns:
        The docling document built from the content.
    """
    self.doc = DoclingDocument(name="file")
    section: str = ""
    key: str = ""
    value: str = ""
    for line in patent_content.splitlines():
        cols = re.split("\\s{2,}", line, maxsplit=1)
        # flush the pending pair before a new section title or a new key
        if key and value and (len(cols) == 1 or (len(cols) == 2 and cols[0])):
            self.store_content(section, key, value)
            key = ""
            value = ""
        if len(cols) == 1:  # section title
            section = cols[0]
            self.store_section(section)
            _log.debug(f"Parsing section {section}")
        elif len(cols) == 2:  # key value
            if cols[0]:  # key present
                key = cols[0]
                value = cols[1]
            elif not re.match(r"^##STR\d+##$", cols[1]):  # line continues
                value += " " + cols[1]
    if key and value:
        self.store_content(section, key, value)

    # TODO: parse tables
    return self.doc
1063
+
1064
+
1065
class PatentUsptoAppV1(PatentUspto):
    """Parser of patent documents from the US Patent Office (applications v1.x).

    The compatible format is:
    - Patent Application Full Text Data/XML Version 1.x (from March 2001 till December
    2004)
    """

    @override
    def __init__(self) -> None:
        """Build an instance of PatentUsptoAppV1 class."""
        self.handler = PatentUsptoAppV1.PatentHandler()
        self.pattern = re.compile(r"^(<table .*?</table>)", re.MULTILINE | re.DOTALL)

    @override
    def parse(self, patent_content: str) -> Optional[DoclingDocument]:
        """Parse an XML patent application into a docling document.

        The text is extracted with a SAX handler, which registers an empty
        placeholder for every table. The raw ``<table>`` fragments are then
        parsed with ``XmlTable`` and assigned to the placeholders, but only
        when the number of parsed tables equals the number of placeholders.

        Args:
            patent_content: The XML content of the patent application.

        Returns:
            The docling document, or None if the XML could not be parsed or no
            document was created by the handler.
        """
        try:
            xml.sax.parseString(patent_content, self.handler)
        except xml.sax.SAXParseException as exc_sax:
            _log.error(f"Error in parsing USPTO document: {exc_sax}")

            return None

        doc = self.handler.doc
        if doc:
            raw_tables = self.pattern.findall(patent_content)
            parsed_tables: list[TableData] = []
            _log.debug(f"Found {len(raw_tables)} tables to be parsed with XmlTable.")
            for table in raw_tables:
                table_parser = XmlTable(XML_DECLARATION + "\n" + table)
                try:
                    table_data = table_parser.parse()
                    if table_data:
                        parsed_tables.append(table_data)
                except Exception as exc_table:
                    # best effort: a broken table must not discard the document
                    _log.error(f"Error in parsing USPTO tables: {exc_table}")
            if len(parsed_tables) != len(doc.tables):
                _log.error(
                    f"Number of referenced ({len(doc.tables)}) and parsed "
                    f"({len(parsed_tables)}) tables differ."
                )
            else:
                for idx, item in enumerate(parsed_tables):
                    doc.tables[idx].data = item

        return doc

    class PatentHandler(xml.sax.handler.ContentHandler):
        """SAX ContentHandler for patent documents."""

        APP_DOC_ELEMENT: Final = "patent-application-publication"

        @unique
        class Element(Enum):
            """Represents an element of interest in the patent application document.

            Each member is defined by the tag name, used as the enum value, and a
            flag telling whether the character data of the element is retained.
            """

            DRAWINGS = "brief-description-of-drawings", False
            ABSTRACT = "subdoc-abstract", False
            TITLE = "title-of-invention", True
            CLAIMS = "subdoc-claims", False
            CLAIM = "claim", False
            CLAIM_TEXT = "claim-text", True
            NUMBER = ("number", False)
            PARAGRAPH = "paragraph", True
            HEADING = "heading", True
            STYLE_SUPERSCRIPT = "superscript", True
            STYLE_SUBSCRIPT = "subscript", True
            # do not store text of a table, since it can be within paragraph
            TABLE = "table", False
            # do not store text of a formula, since it can be within paragraph
            MATH = "math-cwu", False

            @override
            def __new__(cls, value: str, _) -> Self:
                """Create the member using only the tag name as its value."""
                obj = object.__new__(cls)
                obj._value_ = value
                return obj

            @override
            def __init__(self, _, is_text: bool) -> None:
                """Store whether the text of the element is retained."""
                self.is_text: bool = is_text

        @override
        def __init__(self) -> None:
            """Build an instance of the patent handler."""
            # Current patent being parsed
            self.doc: Optional[DoclingDocument] = None
            # Keep track of docling hierarchy level
            self.level: LevelNumber = 1
            # Keep track of docling parents by level
            self.parents: dict[LevelNumber, Optional[DocItem]] = {1: None}
            # Content to retain for the current patent
            self.property: list[str]
            self.claim: str
            self.claims: list[str]
            self.abstract: str
            self.text: str
            self._clean_data()
            # To handle mathematical styling
            self.style_html = HtmlEntity()

        @override
        def startElement(self, tag, attributes):  # noqa: N802
            """Signal the start of an element.

            Args:
                tag: The element tag.
                attributes: The element attributes.
            """
            if tag == self.APP_DOC_ELEMENT:
                self.doc = DoclingDocument(name="file")
                self.text = ""
            self._start_registered_elements(tag, attributes)

        @override
        def skippedEntity(self, name):  # noqa: N802
            """Receive notification of a skipped entity.

            HTML entities will be skipped by the parser. This method will unescape them
            and add them to the text.

            Args:
                name: Entity name.
            """
            if not self.property:
                return

            elm_val = self.property[-1]
            element = self.Element(elm_val)
            if not element.is_text:
                return

            escaped = self.style_html.get_greek_from_iso8879(f"&{name};")
            unescaped = html.unescape(escaped)
            if unescaped == escaped:
                _log.debug("Unrecognized HTML entity: " + name)
                return

            if element in (
                self.Element.STYLE_SUPERSCRIPT,
                self.Element.STYLE_SUBSCRIPT,
            ):
                # superscripts and subscripts need to be under text elements
                if len(self.property) < 2:
                    return
                parent_val = self.property[-2]
                parent = self.Element(parent_val)
                if parent.is_text:
                    self.text += self._apply_style(unescaped, elm_val)
            else:
                self.text += unescaped

        @override
        def endElement(self, tag):  # noqa: N802
            """Signal the end of an element.

            Args:
                tag: The element tag.
            """
            if tag == self.APP_DOC_ELEMENT:
                self._clean_data()
            self._end_registered_element(tag)

        @override
        def characters(self, content):
            """Receive notification of character data.

            Args:
                content: Data reported by the handler.
            """
            if not self.property:
                return

            elm_val = self.property[-1]
            element = self.Element(elm_val)
            if not element.is_text:
                return

            if element in (
                self.Element.STYLE_SUPERSCRIPT,
                self.Element.STYLE_SUBSCRIPT,
            ):
                # superscripts and subscripts need to be under text elements
                if len(self.property) < 2:
                    return
                parent_val = self.property[-2]
                parent = self.Element(parent_val)
                if parent.is_text:
                    self.text += self._apply_style(content, elm_val)
            else:
                self.text += content

        def _start_registered_elements(
            self, tag: str, attributes: xml.sax.xmlreader.AttributesImpl
        ) -> None:
            """Push an element of interest onto the stack of open elements.

            Args:
                tag: The element tag.
                attributes: The element attributes.
            """
            if tag in [member.value for member in self.Element]:
                # special case for claims: claim lines may start before the
                # previous one is closed
                if (
                    tag == self.Element.CLAIM_TEXT.value
                    and self.property
                    and self.property[-1] == tag
                    and self.text.strip()
                ):
                    self.claim += " " + self.text.strip("\n")
                    self.text = ""
                elif tag == self.Element.HEADING.value:
                    level_attr: str = attributes.get("lvl", "")
                    new_level: int = int(level_attr) if level_attr.isnumeric() else 1
                    top_level = min(self.parents.keys())
                    # increase heading level with 1 for title, if any
                    self.level = (
                        new_level + 1 if (new_level + 1) in self.parents else top_level
                    )
                self.property.append(tag)

        def _end_registered_element(self, tag: str) -> None:
            """Pop the last open element of interest and store its content.

            Args:
                tag: The element tag.
            """
            if tag in [elm.value for elm in self.Element] and self.property:
                current_tag = self.property.pop()
                self._add_property(current_tag, self.text)

        def _add_property(self, name: str, text: str) -> None:
            """Store the content of a closed element in the docling document.

            Args:
                name: The tag name of the closed element.
                text: The text collected for the element.
            """
            if not name or not self.doc:
                return

            if name == self.Element.TITLE.value:
                title = text.strip()
                if title:
                    self.parents[self.level + 1] = self.doc.add_text(
                        parent=self.parents[self.level],  # type: ignore[arg-type]
                        label=DocItemLabel.TITLE,
                        text=title,
                    )
                    self.level += 1
                self.text = ""
            elif name == self.Element.ABSTRACT.value:
                abstract = self.abstract.strip()
                if abstract:
                    heading_text = PatentHeading.ABSTRACT.value
                    heading_level = (
                        PatentHeading.ABSTRACT.level
                        if PatentHeading.ABSTRACT.level in self.parents
                        else 1
                    )
                    abstract_item = self.doc.add_heading(
                        heading_text,
                        level=heading_level,
                        parent=self.parents[heading_level],  # type: ignore[arg-type]
                    )
                    self.doc.add_text(
                        label=DocItemLabel.PARAGRAPH,
                        text=self.abstract,
                        parent=abstract_item,
                    )
                self.abstract = ""
                self.text = ""
            elif name == self.Element.CLAIM_TEXT.value:
                if text:
                    self.claim += self.text.strip("\n")
                    self.text = ""

            elif name == self.Element.CLAIM.value:
                claim = self.claim.strip()
                if claim:
                    self.claims.append(claim)
                self.claim = ""

            elif name == self.Element.CLAIMS.value and self.claims:
                heading_text = PatentHeading.CLAIMS.value
                heading_level = (
                    PatentHeading.CLAIMS.level
                    if PatentHeading.CLAIMS.level in self.parents
                    else 1
                )
                claims_item = self.doc.add_heading(
                    heading_text,
                    level=heading_level,
                    parent=self.parents[heading_level],  # type: ignore[arg-type]
                )
                for claim_text in self.claims:
                    self.doc.add_text(
                        label=DocItemLabel.PARAGRAPH,
                        text=claim_text,
                        parent=claims_item,
                    )

            elif name in (
                self.Element.PARAGRAPH.value,
                self.Element.HEADING.value,
            ):
                if text and self.Element.ABSTRACT.value in self.property:
                    # text within the abstract is stored once the abstract closes
                    self.abstract = (self.abstract + text) if self.abstract else text
                elif text.strip():
                    text = re.sub("\\s+", " ", text).strip()
                    if name == self.Element.HEADING.value:
                        self.parents[self.level + 1] = self.doc.add_heading(
                            text=text,
                            level=self.level,
                            parent=self.parents[self.level],  # type: ignore[arg-type]
                        )
                        self.level += 1
                    else:
                        self.doc.add_text(
                            label=DocItemLabel.PARAGRAPH,
                            text=text,
                            parent=self.parents[self.level],  # type: ignore[arg-type]
                        )
                self.text = ""

            elif name == self.Element.TABLE.value:
                # set an empty table as placeholder
                empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
                self.doc.add_table(
                    data=empty_table,
                    parent=self.parents[self.level],  # type: ignore[arg-type]
                )

        def _apply_style(self, text: str, style_tag: str) -> str:
            """Apply an HTML style to text.

            Args:
                text: A string containing plain text.
                style_tag: An HTML tag name for styling text. If the tag name is not
                    recognized as one of the supported styles, the method will return
                    the original `text`.

            Returns:
                A string after applying the style.
            """
            formatted = html.unescape(text)

            if style_tag == self.Element.STYLE_SUPERSCRIPT.value:
                formatted = html.unescape(self.style_html.get_superscript(formatted))
            elif style_tag == self.Element.STYLE_SUBSCRIPT.value:
                formatted = html.unescape(self.style_html.get_subscript(formatted))

            return formatted

        def _clean_data(self) -> None:
            """Reset the variables from stream data."""
            self.property = []
            self.abstract = ""
            self.claim = ""
            self.claims = []
            self.text = ""
1400
+
1401
+
1402
class XmlTable:
    """Provide a table parser for xml tables in USPTO patent documents.

    The OASIS Open XML Exchange Table Model can be downloaded from:
    http://oasis-open.org/specs/soextblx.dtd
    """

    class MinColInfoType(TypedDict):
        offset: list[int]
        colwidth: list[int]

    class ColInfoType(MinColInfoType):
        cell_range: list[int]
        cell_offst: list[int]

    def __init__(self, input: str) -> None:
        """Initialize the table parser with the xml content.

        Args:
            input: The xml content.
        """
        # upper bound on the number of column mismatch messages that are logged
        self.max_nbr_messages = 2
        self.nbr_messages = 0
        self.empty_text = ""
        self._soup = BeautifulSoup(input, features="xml")

    def _create_tg_range(self, tgs: list[dict[str, Any]]) -> dict[int, ColInfoType]:
        """Create a unified range along the table groups.

        Args:
            tgs: Table group column specifications.

        Returns:
            Unified group column specifications. An empty dictionary is returned
            if there are no table groups or if the column offsets are inconsistent.
        """
        colinfo: dict[int, XmlTable.ColInfoType] = {}

        if len(tgs) == 0:
            return colinfo

        # accumulate the column widths of each group into offsets
        for itg, tg in enumerate(tgs):
            colinfo[itg] = {
                "offset": [],
                "colwidth": [],
                "cell_range": [],
                "cell_offst": [0],
            }
            offst = 0
            for info in tg["colinfo"]:
                cw = info["colwidth"]
                cw = re.sub("pt", "", cw, flags=re.I)
                cw = re.sub("mm", "", cw, flags=re.I)
                try:
                    cw = int(cw)
                except ValueError:
                    # not an integer literal, e.g. a decimal width
                    cw = float(cw)
                colinfo[itg]["colwidth"].append(cw)
                colinfo[itg]["offset"].append(offst)
                offst += cw
            colinfo[itg]["offset"].append(offst)

        # merge the offsets of all the groups into a single sorted grid
        min_colinfo: XmlTable.MinColInfoType = {"offset": [], "colwidth": []}

        min_colinfo["offset"] = colinfo[0]["offset"]
        offset_w0 = []
        for col in colinfo.values():
            # keep track of col with 0 width
            for ic, cw in enumerate(col["colwidth"]):
                if cw == 0:
                    offset_w0.append(col["offset"][ic])

            min_colinfo["offset"] = sorted(set(col["offset"] + min_colinfo["offset"]))

        # add back the 0 width cols to offset list
        offset_w0 = list(set(offset_w0))
        min_colinfo["offset"] = sorted(min_colinfo["offset"] + offset_w0)

        for i in range(len(min_colinfo["offset"]) - 1):
            min_colinfo["colwidth"].append(
                min_colinfo["offset"][i + 1] - min_colinfo["offset"][i]
            )

        # express the columns of each group as ranges of cells of the merged grid
        for col in colinfo.values():
            i = 1
            range_ = 1
            for min_i in range(1, len(min_colinfo["offset"])):
                min_offst = min_colinfo["offset"][min_i]
                offst = col["offset"][i]
                if min_offst == offst:
                    if (
                        len(col["offset"]) == i + 1
                        and len(min_colinfo["offset"]) > min_i + 1
                    ):
                        range_ += 1
                    else:
                        col["cell_range"].append(range_)
                        col["cell_offst"].append(col["cell_offst"][-1] + range_)
                        range_ = 1
                        i += 1
                elif min_offst < offst:
                    range_ += 1
                else:
                    _log.debug("A USPTO XML table has wrong offsets.")
                    return {}

        return colinfo

    def _get_max_ncols(self, tgs_info: dict[int, ColInfoType]) -> NonNegativeInt:
        """Get the maximum number of columns across table groups.

        Args:
            tgs_info: Unified group column specifications.

        Return:
            The maximum number of columns.
        """
        ncols_max = 0
        for rowinfo in tgs_info.values():
            ncols_max = max(ncols_max, len(rowinfo["colwidth"]))

        return ncols_max

    def _parse_table(self, table: Tag) -> TableData:
        """Parse the content of a table tag.

        Args:
            table: The table element.

        Returns:
            A docling table object. The table is empty if the column
            specifications of its groups could not be unified.
        """
        tg_secs = table.find_all("tgroup")

        # collect the column specifications of each table group
        tgs_align = []
        for tg_sec in tg_secs:
            ncols = tg_sec.get("cols", None)
            if ncols:
                ncols = int(ncols)
            tg_align = {"ncols": ncols, "colinfo": []}
            for cs_sec in tg_sec.find_all("colspec"):
                colname = cs_sec.get("colname", None)
                colwidth = cs_sec.get("colwidth", None)
                tg_align["colinfo"].append({"colname": colname, "colwidth": colwidth})

            tgs_align.append(tg_align)

        # create unified range along the table groups
        tgs_range = self._create_tg_range(tgs_align)

        # if the structure is broken, return an empty table
        if not tgs_range:
            return TableData(num_rows=0, num_cols=0, table_cells=[])

        ncols_max = self._get_max_ncols(tgs_range)

        # extract table data
        table_data: list[TableCell] = []
        i_row_global = 0
        is_row_empty: bool = True
        for itg, tg_sec in enumerate(tg_secs):
            tg_range = tgs_range[itg]

            for row_sec in tg_sec.find_all(["row", "tr"]):
                entry_secs = row_sec.find_all(["entry", "td"])
                is_header: bool = row_sec.parent.name in ["thead"]

                ncols = 0
                local_row: list[TableCell] = []
                is_row_empty = True
                wrong_nbr_cols = False
                for ientry, entry_sec in enumerate(entry_secs):
                    text = entry_sec.get_text().strip()

                    # start-end
                    namest = entry_sec.attrs.get("namest", None)
                    nameend = entry_sec.attrs.get("nameend", None)
                    if isinstance(namest, str) and namest.isnumeric():
                        namest = int(namest)
                    else:
                        namest = ientry + 1
                    if isinstance(nameend, str) and nameend.isnumeric():
                        nameend = int(nameend)
                        shift = 0
                    else:
                        nameend = ientry + 2
                        shift = 1

                    if nameend > len(tg_range["cell_offst"]):
                        wrong_nbr_cols = True
                        self.nbr_messages += 1
                        if self.nbr_messages <= self.max_nbr_messages:
                            _log.debug("USPTO table has # entries != # columns")
                        break

                    range_ = [
                        tg_range["cell_offst"][namest - 1],
                        tg_range["cell_offst"][nameend - 1] - shift,
                    ]

                    # add row and replicate cell if needed
                    cell_text = text if text else self.empty_text
                    if cell_text != self.empty_text:
                        is_row_empty = False
                    for _ in range(range_[0], range_[1] + 1):
                        ncols += 1
                        local_row.append(
                            TableCell(
                                column_header=is_header,
                                text=cell_text,
                                start_row_offset_idx=i_row_global,
                                end_row_offset_idx=i_row_global + 1,
                                row_span=1,
                                start_col_offset_idx=range_[0],
                                end_col_offset_idx=range_[1] + 1,
                                col_span=range_[1] - range_[0] + 1,
                            )
                        )

                if wrong_nbr_cols:
                    # keep empty text, not to introduce noise
                    local_row = []
                    ncols = 0

                # add empty cell up to ncols_max
                for irep in range(ncols, ncols_max):
                    local_row.append(
                        TableCell(
                            column_header=is_header,
                            text=self.empty_text,
                            start_row_offset_idx=i_row_global,
                            end_row_offset_idx=i_row_global + 1,
                            row_span=1,
                            start_col_offset_idx=irep,
                            end_col_offset_idx=irep + 1,
                            col_span=1,
                        )
                    )

                # do not add empty rows
                if not is_row_empty:
                    table_data.extend(local_row)
                    i_row_global += 1

        return TableData(
            num_rows=i_row_global, num_cols=ncols_max, table_cells=table_data
        )

    def parse(self) -> Optional[TableData]:
        """Parse the first table from an xml content.

        Returns:
            A docling table data, or None if the content has no table element.
        """
        section = self._soup.find("table")
        if section is None:
            return None

        table = self._parse_table(section)
        if table.num_rows == 0 or table.num_cols == 0:
            _log.warning("The parsed USPTO table is empty")
        return table
1678
+
1679
+
1680
class HtmlEntity:
    """Provide utility functions to get the HTML entities of styled characters.

    This class has been developed from:
    https://unicode-table.com/en/html-entities/
    https://www.w3.org/TR/WD-math-970515/table03.html
    """

    def __init__(self) -> None:
        """Initialize this class by loading the HTML entity dictionaries."""
        self.superscript = str.maketrans(
            {
                "1": "&sup1;",
                "2": "&sup2;",
                "3": "&sup3;",
                "4": "&#8308;",
                "5": "&#8309;",
                "6": "&#8310;",
                "7": "&#8311;",
                "8": "&#8312;",
                "9": "&#8313;",
                "0": "&#8304;",
                "+": "&#8314;",
                "-": "&#8315;",
                "−": "&#8315;",
                "=": "&#8316;",
                "(": "&#8317;",
                ")": "&#8318;",
                "a": "&#170;",
                "o": "&#186;",
                "i": "&#8305;",
                "n": "&#8319;",
            }
        )
        self.subscript = str.maketrans(
            {
                "1": "&#8321;",
                "2": "&#8322;",
                "3": "&#8323;",
                "4": "&#8324;",
                "5": "&#8325;",
                "6": "&#8326;",
                "7": "&#8327;",
                "8": "&#8328;",
                "9": "&#8329;",
                "0": "&#8320;",
                "+": "&#8330;",
                "-": "&#8331;",
                "−": "&#8331;",
                "=": "&#8332;",
                "(": "&#8333;",
                ")": "&#8334;",
                "a": "&#8336;",
                "e": "&#8337;",
                "o": "&#8338;",
                "x": "&#8339;",
            }
        )

        # The mathematical italic letters are contiguous in Unicode: capitals
        # start at 119860 (U+1D434) and small letters at 119886 (U+1D44E).
        # Generating the table keeps every letter covered, including "X".
        italic = {chr(ord("A") + i): f"&#{119860 + i};" for i in range(26)}
        italic.update({chr(ord("a") + i): f"&#{119886 + i};" for i in range(26)})
        # 119893 (U+1D455) is a reserved code point: the italic small h is
        # encoded as the Planck constant, 8462 (U+210E).
        italic["h"] = "&#8462;"
        self.mathematical_italic = str.maketrans(italic)

        self.lookup_iso8879 = {
            "&Agr;": "&Alpha;",
            "&Bgr;": "&Beta;",
            "&Ggr;": "&Gamma;",
            "&Dgr;": "&Delta;",
            "&Egr;": "&Epsilon;",
            "&Zgr;": "&Zeta;",
            "&EEgr;": "&Eta;",
            "&THgr;": "&Theta;",
            "&Igr;": "&Iota;",
            "&Kgr;": "&Kappa;",
            "&Lgr;": "&Lambda;",
            "&Mgr;": "&Mu;",
            "&Ngr;": "&Nu;",
            "&Xgr;": "&Xi;",
            "&Ogr;": "&Omicron;",
            "&Pgr;": "&Pi;",
            "&Rgr;": "&Rho;",
            "&Sgr;": "&Sigma;",
            "&Tgr;": "&Tau;",
            "&Ugr;": "&Upsilon;",
            "&PHgr;": "&Phi;",
            "&KHgr;": "&Chi;",
            "&PSgr;": "&Psi;",
            "&OHgr;": "&Omega;",
            "&agr;": "&alpha;",
            "&bgr;": "&beta;",
            "&ggr;": "&gamma;",
            "&dgr;": "&delta;",
            "&egr;": "&epsilon;",
            "&zgr;": "&zeta;",
            "&eegr;": "&eta;",
            "&thgr;": "&theta;",
            "&igr;": "&iota;",
            "&kgr;": "&kappa;",
            "&lgr;": "&lambda;",
            "&mgr;": "&mu;",
            "&ngr;": "&nu;",
            "&xgr;": "&xi;",
            "&ogr;": "&omicron;",
            "&pgr;": "&pi;",
            "&rgr;": "&rho;",
            # in ISO 8879, sgr is the small sigma and sfgr the final small sigma
            "&sgr;": "&sigma;",
            "&sfgr;": "&sigmaf;",
            "&tgr;": "&tau;",
            "&ugr;": "&upsilon;",
            "&phgr;": "&phi;",
            "&khgr;": "&chi;",
            "&psgr;": "&psi;",
            "&ohgr;": "&omega;",
        }

    def get_superscript(self, text: str) -> str:
        """Get a text in superscript as HTML entities.

        Characters without a superscript entity are left unchanged.

        Args:
            text: The text to transform.

        Returns:
            The text in superscript as HTML entities.
        """
        return text.translate(self.superscript)

    def get_subscript(self, text: str) -> str:
        """Get a text in subscript as HTML entities.

        Characters without a subscript entity are left unchanged.

        Args:
            text: The text to transform.

        Returns:
            The text in subscript as HTML entities.
        """
        return text.translate(self.subscript)

    def get_math_italic(self, text: str) -> str:
        """Get a text in italic as HTML entities.

        Characters other than the ASCII letters are left unchanged.

        Args:
            text: The text to transform.

        Returns:
            The text in italics as HTML entities.
        """
        return text.translate(self.mathematical_italic)

    def get_greek_from_iso8879(self, text: str) -> str:
        """Get an HTML entity of a greek letter in ISO 8879.

        Args:
            text: The text to transform, as an ISO 8879 entity.

        Returns:
            The HTML entity representing a greek letter. If the input text is not
            supported, the original text is returned.
        """
        return self.lookup_iso8879.get(text, text)