docling 2.16.0__tar.gz → 2.17.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. {docling-2.16.0 → docling-2.17.0}/PKG-INFO +13 -10
  2. {docling-2.16.0 → docling-2.17.0}/README.md +12 -9
  3. {docling-2.16.0 → docling-2.17.0}/docling/backend/html_backend.py +3 -2
  4. {docling-2.16.0 → docling-2.17.0}/docling/backend/md_backend.py +4 -8
  5. {docling-2.16.0 → docling-2.17.0}/docling/backend/xml/uspto_backend.py +25 -25
  6. {docling-2.16.0 → docling-2.17.0}/docling/cli/main.py +18 -3
  7. {docling-2.16.0 → docling-2.17.0}/docling/datamodel/document.py +2 -0
  8. {docling-2.16.0 → docling-2.17.0}/docling/datamodel/pipeline_options.py +1 -0
  9. {docling-2.16.0 → docling-2.17.0}/docling/models/rapid_ocr_model.py +1 -0
  10. {docling-2.16.0 → docling-2.17.0}/docling/models/tesseract_ocr_cli_model.py +72 -4
  11. {docling-2.16.0 → docling-2.17.0}/docling/models/tesseract_ocr_model.py +37 -37
  12. docling-2.17.0/docling/utils/ocr_utils.py +9 -0
  13. {docling-2.16.0 → docling-2.17.0}/pyproject.toml +1 -1
  14. {docling-2.16.0 → docling-2.17.0}/LICENSE +0 -0
  15. {docling-2.16.0 → docling-2.17.0}/docling/__init__.py +0 -0
  16. {docling-2.16.0 → docling-2.17.0}/docling/backend/__init__.py +0 -0
  17. {docling-2.16.0 → docling-2.17.0}/docling/backend/abstract_backend.py +0 -0
  18. {docling-2.16.0 → docling-2.17.0}/docling/backend/asciidoc_backend.py +0 -0
  19. {docling-2.16.0 → docling-2.17.0}/docling/backend/docling_parse_backend.py +0 -0
  20. {docling-2.16.0 → docling-2.17.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  21. {docling-2.16.0 → docling-2.17.0}/docling/backend/json/__init__.py +0 -0
  22. {docling-2.16.0 → docling-2.17.0}/docling/backend/json/docling_json_backend.py +0 -0
  23. {docling-2.16.0 → docling-2.17.0}/docling/backend/msexcel_backend.py +0 -0
  24. {docling-2.16.0 → docling-2.17.0}/docling/backend/mspowerpoint_backend.py +0 -0
  25. {docling-2.16.0 → docling-2.17.0}/docling/backend/msword_backend.py +0 -0
  26. {docling-2.16.0 → docling-2.17.0}/docling/backend/pdf_backend.py +0 -0
  27. {docling-2.16.0 → docling-2.17.0}/docling/backend/pypdfium2_backend.py +0 -0
  28. {docling-2.16.0 → docling-2.17.0}/docling/backend/xml/__init__.py +0 -0
  29. {docling-2.16.0 → docling-2.17.0}/docling/backend/xml/pubmed_backend.py +0 -0
  30. {docling-2.16.0 → docling-2.17.0}/docling/chunking/__init__.py +0 -0
  31. {docling-2.16.0 → docling-2.17.0}/docling/cli/__init__.py +0 -0
  32. {docling-2.16.0 → docling-2.17.0}/docling/datamodel/__init__.py +0 -0
  33. {docling-2.16.0 → docling-2.17.0}/docling/datamodel/base_models.py +0 -0
  34. {docling-2.16.0 → docling-2.17.0}/docling/datamodel/settings.py +0 -0
  35. {docling-2.16.0 → docling-2.17.0}/docling/document_converter.py +0 -0
  36. {docling-2.16.0 → docling-2.17.0}/docling/exceptions.py +0 -0
  37. {docling-2.16.0 → docling-2.17.0}/docling/models/__init__.py +0 -0
  38. {docling-2.16.0 → docling-2.17.0}/docling/models/base_model.py +0 -0
  39. {docling-2.16.0 → docling-2.17.0}/docling/models/base_ocr_model.py +0 -0
  40. {docling-2.16.0 → docling-2.17.0}/docling/models/code_formula_model.py +0 -0
  41. {docling-2.16.0 → docling-2.17.0}/docling/models/document_picture_classifier.py +0 -0
  42. {docling-2.16.0 → docling-2.17.0}/docling/models/ds_glm_model.py +0 -0
  43. {docling-2.16.0 → docling-2.17.0}/docling/models/easyocr_model.py +0 -0
  44. {docling-2.16.0 → docling-2.17.0}/docling/models/layout_model.py +0 -0
  45. {docling-2.16.0 → docling-2.17.0}/docling/models/ocr_mac_model.py +0 -0
  46. {docling-2.16.0 → docling-2.17.0}/docling/models/page_assemble_model.py +0 -0
  47. {docling-2.16.0 → docling-2.17.0}/docling/models/page_preprocessing_model.py +0 -0
  48. {docling-2.16.0 → docling-2.17.0}/docling/models/table_structure_model.py +0 -0
  49. {docling-2.16.0 → docling-2.17.0}/docling/pipeline/__init__.py +0 -0
  50. {docling-2.16.0 → docling-2.17.0}/docling/pipeline/base_pipeline.py +0 -0
  51. {docling-2.16.0 → docling-2.17.0}/docling/pipeline/simple_pipeline.py +0 -0
  52. {docling-2.16.0 → docling-2.17.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  53. {docling-2.16.0 → docling-2.17.0}/docling/py.typed +0 -0
  54. {docling-2.16.0 → docling-2.17.0}/docling/utils/__init__.py +0 -0
  55. {docling-2.16.0 → docling-2.17.0}/docling/utils/accelerator_utils.py +0 -0
  56. {docling-2.16.0 → docling-2.17.0}/docling/utils/export.py +0 -0
  57. {docling-2.16.0 → docling-2.17.0}/docling/utils/glm_utils.py +0 -0
  58. {docling-2.16.0 → docling-2.17.0}/docling/utils/layout_postprocessor.py +0 -0
  59. {docling-2.16.0 → docling-2.17.0}/docling/utils/profiling.py +0 -0
  60. {docling-2.16.0 → docling-2.17.0}/docling/utils/utils.py +0 -0
  61. {docling-2.16.0 → docling-2.17.0}/docling/utils/visualization.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.16.0
3
+ Version: 2.17.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -78,22 +78,21 @@ Description-Content-Type: text/markdown
78
78
  [![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
79
79
  [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
80
80
 
81
- Docling parses documents and exports them to the desired format with ease and speed.
81
+ Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.
82
82
 
83
83
  ## Features
84
84
 
85
- * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
86
- * 📑 Advanced PDF document understanding including page layout, reading order & table structures
87
- * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
88
- * 🤖 Plug-and-play [integrations](https://ds4sd.github.io/docling/integrations/) incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
89
- * 🔍 OCR support for scanned PDFs
85
+ * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, XLSX, HTML, images, and more
86
+ * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
87
+ * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
88
+ * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, and lossless JSON
89
+ * 🔒 Local execution capabilities for sensitive data and air-gapped environments
90
+ * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
91
+ * 🔍 Extensive OCR support for scanned PDFs and images
90
92
  * 💻 Simple and convenient CLI
91
93
 
92
- Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty examples and unlock the full power of Docling!
93
-
94
94
  ### Coming soon
95
95
 
96
- * ♾️ Equation & code extraction
97
96
  * 📝 Metadata extraction, including title, authors, references & language
98
97
 
99
98
  ## Installation
@@ -177,3 +176,7 @@ For individual model usage, please refer to the model licenses found in the orig
177
176
 
178
177
  Docling has been brought to you by IBM.
179
178
 
179
+ [supported_formats]: https://ds4sd.github.io/docling/supported_formats/
180
+ [docling_document]: https://ds4sd.github.io/docling/concepts/docling_document/
181
+ [integrations]: https://ds4sd.github.io/docling/integrations/
182
+
@@ -22,22 +22,21 @@
22
22
  [![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
23
23
  [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
24
24
 
25
- Docling parses documents and exports them to the desired format with ease and speed.
25
+ Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.
26
26
 
27
27
  ## Features
28
28
 
29
- * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
30
- * 📑 Advanced PDF document understanding including page layout, reading order & table structures
31
- * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
32
- * 🤖 Plug-and-play [integrations](https://ds4sd.github.io/docling/integrations/) incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
33
- * 🔍 OCR support for scanned PDFs
29
+ * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, XLSX, HTML, images, and more
30
+ * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
31
+ * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
32
+ * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, and lossless JSON
33
+ * 🔒 Local execution capabilities for sensitive data and air-gapped environments
34
+ * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
35
+ * 🔍 Extensive OCR support for scanned PDFs and images
34
36
  * 💻 Simple and convenient CLI
35
37
 
36
- Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty examples and unlock the full power of Docling!
37
-
38
38
  ### Coming soon
39
39
 
40
- * ♾️ Equation & code extraction
41
40
  * 📝 Metadata extraction, including title, authors, references & language
42
41
 
43
42
  ## Installation
@@ -120,3 +119,7 @@ For individual model usage, please refer to the model licenses found in the orig
120
119
  ## IBM ❤️ Open Source AI
121
120
 
122
121
  Docling has been brought to you by IBM.
122
+
123
+ [supported_formats]: https://ds4sd.github.io/docling/supported_formats/
124
+ [docling_document]: https://ds4sd.github.io/docling/concepts/docling_document/
125
+ [integrations]: https://ds4sd.github.io/docling/integrations/
@@ -78,10 +78,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
78
78
 
79
79
  if self.is_valid():
80
80
  assert self.soup is not None
81
+ content = self.soup.body or self.soup
81
82
  # Replace <br> tags with newline characters
82
- for br in self.soup.body.find_all("br"):
83
+ for br in content.find_all("br"):
83
84
  br.replace_with("\n")
84
- doc = self.walk(self.soup.body, doc)
85
+ doc = self.walk(content, doc)
85
86
  else:
86
87
  raise RuntimeError(
87
88
  f"Cannot convert doc with {self.document_hash} because the backend failed to init."
@@ -65,7 +65,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
65
65
 
66
66
  self.in_table = False
67
67
  self.md_table_buffer: list[str] = []
68
- self.inline_text_buffer = ""
68
+ self.inline_texts: list[str] = []
69
69
 
70
70
  try:
71
71
  if isinstance(self.path_or_stream, BytesIO):
@@ -152,15 +152,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
152
152
  def process_inline_text(
153
153
  self, parent_element: Optional[NodeItem], doc: DoclingDocument
154
154
  ):
155
- # self.inline_text_buffer += str(text_in)
156
- txt = self.inline_text_buffer.strip()
155
+ txt = " ".join(self.inline_texts)
157
156
  if len(txt) > 0:
158
157
  doc.add_text(
159
158
  label=DocItemLabel.PARAGRAPH,
160
159
  parent=parent_element,
161
160
  text=txt,
162
161
  )
163
- self.inline_text_buffer = ""
162
+ self.inline_texts = []
164
163
 
165
164
  def iterate_elements(
166
165
  self,
@@ -266,9 +265,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
266
265
  self.close_table(doc)
267
266
  self.in_table = False
268
267
  # most likely just inline text
269
- self.inline_text_buffer += str(
270
- element.children
271
- ) # do not strip an inline text, as it may contain important spaces
268
+ self.inline_texts.append(str(element.children))
272
269
 
273
270
  elif isinstance(element, marko.inline.CodeSpan):
274
271
  self.close_table(doc)
@@ -292,7 +289,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
292
289
  doc.add_code(parent=parent_element, text=snippet_text)
293
290
 
294
291
  elif isinstance(element, marko.inline.LineBreak):
295
- self.process_inline_text(parent_element, doc)
296
292
  if self.in_table:
297
293
  _log.debug("Line break in a table")
298
294
  self.md_table_buffer.append("")
@@ -389,7 +389,7 @@ class PatentUsptoIce(PatentUspto):
389
389
  if name == self.Element.TITLE.value:
390
390
  if text:
391
391
  self.parents[self.level + 1] = self.doc.add_title(
392
- parent=self.parents[self.level], # type: ignore[arg-type]
392
+ parent=self.parents[self.level],
393
393
  text=text,
394
394
  )
395
395
  self.level += 1
@@ -406,7 +406,7 @@ class PatentUsptoIce(PatentUspto):
406
406
  abstract_item = self.doc.add_heading(
407
407
  heading_text,
408
408
  level=heading_level,
409
- parent=self.parents[heading_level], # type: ignore[arg-type]
409
+ parent=self.parents[heading_level],
410
410
  )
411
411
  self.doc.add_text(
412
412
  label=DocItemLabel.PARAGRAPH,
@@ -434,7 +434,7 @@ class PatentUsptoIce(PatentUspto):
434
434
  claims_item = self.doc.add_heading(
435
435
  heading_text,
436
436
  level=heading_level,
437
- parent=self.parents[heading_level], # type: ignore[arg-type]
437
+ parent=self.parents[heading_level],
438
438
  )
439
439
  for text in self.claims:
440
440
  self.doc.add_text(
@@ -452,7 +452,7 @@ class PatentUsptoIce(PatentUspto):
452
452
  self.doc.add_text(
453
453
  label=DocItemLabel.PARAGRAPH,
454
454
  text=text,
455
- parent=self.parents[self.level], # type: ignore[arg-type]
455
+ parent=self.parents[self.level],
456
456
  )
457
457
  self.text = ""
458
458
 
@@ -460,7 +460,7 @@ class PatentUsptoIce(PatentUspto):
460
460
  self.parents[self.level + 1] = self.doc.add_heading(
461
461
  text=text,
462
462
  level=self.level,
463
- parent=self.parents[self.level], # type: ignore[arg-type]
463
+ parent=self.parents[self.level],
464
464
  )
465
465
  self.level += 1
466
466
  self.text = ""
@@ -470,7 +470,7 @@ class PatentUsptoIce(PatentUspto):
470
470
  empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
471
471
  self.doc.add_table(
472
472
  data=empty_table,
473
- parent=self.parents[self.level], # type: ignore[arg-type]
473
+ parent=self.parents[self.level],
474
474
  )
475
475
 
476
476
  def _apply_style(self, text: str, style_tag: str) -> str:
@@ -721,7 +721,7 @@ class PatentUsptoGrantV2(PatentUspto):
721
721
  if self.Element.TITLE.value in self.property and text.strip():
722
722
  title = text.strip()
723
723
  self.parents[self.level + 1] = self.doc.add_title(
724
- parent=self.parents[self.level], # type: ignore[arg-type]
724
+ parent=self.parents[self.level],
725
725
  text=title,
726
726
  )
727
727
  self.level += 1
@@ -749,7 +749,7 @@ class PatentUsptoGrantV2(PatentUspto):
749
749
  self.parents[self.level + 1] = self.doc.add_heading(
750
750
  text=text.strip(),
751
751
  level=self.level,
752
- parent=self.parents[self.level], # type: ignore[arg-type]
752
+ parent=self.parents[self.level],
753
753
  )
754
754
  self.level += 1
755
755
 
@@ -769,7 +769,7 @@ class PatentUsptoGrantV2(PatentUspto):
769
769
  claims_item = self.doc.add_heading(
770
770
  heading_text,
771
771
  level=heading_level,
772
- parent=self.parents[heading_level], # type: ignore[arg-type]
772
+ parent=self.parents[heading_level],
773
773
  )
774
774
  for text in self.claims:
775
775
  self.doc.add_text(
@@ -787,7 +787,7 @@ class PatentUsptoGrantV2(PatentUspto):
787
787
  abstract_item = self.doc.add_heading(
788
788
  heading_text,
789
789
  level=heading_level,
790
- parent=self.parents[heading_level], # type: ignore[arg-type]
790
+ parent=self.parents[heading_level],
791
791
  )
792
792
  self.doc.add_text(
793
793
  label=DocItemLabel.PARAGRAPH, text=abstract, parent=abstract_item
@@ -799,7 +799,7 @@ class PatentUsptoGrantV2(PatentUspto):
799
799
  self.doc.add_text(
800
800
  label=DocItemLabel.PARAGRAPH,
801
801
  text=paragraph,
802
- parent=self.parents[self.level], # type: ignore[arg-type]
802
+ parent=self.parents[self.level],
803
803
  )
804
804
  elif self.Element.CLAIM.value in self.property:
805
805
  # we may need a space after a paragraph in claim text
@@ -811,7 +811,7 @@ class PatentUsptoGrantV2(PatentUspto):
811
811
  empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
812
812
  self.doc.add_table(
813
813
  data=empty_table,
814
- parent=self.parents[self.level], # type: ignore[arg-type]
814
+ parent=self.parents[self.level],
815
815
  )
816
816
 
817
817
  def _apply_style(self, text: str, style_tag: str) -> str:
@@ -938,7 +938,7 @@ class PatentUsptoGrantAps(PatentUspto):
938
938
  self.parents[self.level + 1] = self.doc.add_heading(
939
939
  heading.value,
940
940
  level=self.level,
941
- parent=self.parents[self.level], # type: ignore[arg-type]
941
+ parent=self.parents[self.level],
942
942
  )
943
943
  self.level += 1
944
944
 
@@ -959,7 +959,7 @@ class PatentUsptoGrantAps(PatentUspto):
959
959
 
960
960
  if field == self.Field.TITLE.value:
961
961
  self.parents[self.level + 1] = self.doc.add_title(
962
- parent=self.parents[self.level], text=value # type: ignore[arg-type]
962
+ parent=self.parents[self.level], text=value
963
963
  )
964
964
  self.level += 1
965
965
 
@@ -971,14 +971,14 @@ class PatentUsptoGrantAps(PatentUspto):
971
971
  self.doc.add_text(
972
972
  label=DocItemLabel.PARAGRAPH,
973
973
  text=value,
974
- parent=self.parents[self.level], # type: ignore[arg-type]
974
+ parent=self.parents[self.level],
975
975
  )
976
976
 
977
977
  elif field == self.Field.NUMBER.value and section == self.Section.CLAIMS.value:
978
978
  self.doc.add_text(
979
979
  label=DocItemLabel.PARAGRAPH,
980
980
  text="",
981
- parent=self.parents[self.level], # type: ignore[arg-type]
981
+ parent=self.parents[self.level],
982
982
  )
983
983
 
984
984
  elif (
@@ -996,7 +996,7 @@ class PatentUsptoGrantAps(PatentUspto):
996
996
  last_claim = self.doc.add_text(
997
997
  label=DocItemLabel.PARAGRAPH,
998
998
  text="",
999
- parent=self.parents[self.level], # type: ignore[arg-type]
999
+ parent=self.parents[self.level],
1000
1000
  )
1001
1001
 
1002
1002
  last_claim.text += f" {value}" if last_claim.text else value
@@ -1012,7 +1012,7 @@ class PatentUsptoGrantAps(PatentUspto):
1012
1012
  self.parents[self.level + 1] = self.doc.add_heading(
1013
1013
  value,
1014
1014
  level=self.level,
1015
- parent=self.parents[self.level], # type: ignore[arg-type]
1015
+ parent=self.parents[self.level],
1016
1016
  )
1017
1017
  self.level += 1
1018
1018
 
@@ -1029,7 +1029,7 @@ class PatentUsptoGrantAps(PatentUspto):
1029
1029
  self.doc.add_text(
1030
1030
  label=DocItemLabel.PARAGRAPH,
1031
1031
  text=value,
1032
- parent=self.parents[self.level], # type: ignore[arg-type]
1032
+ parent=self.parents[self.level],
1033
1033
  )
1034
1034
 
1035
1035
  def parse(self, patent_content: str) -> Optional[DoclingDocument]:
@@ -1283,7 +1283,7 @@ class PatentUsptoAppV1(PatentUspto):
1283
1283
  title = text.strip()
1284
1284
  if title:
1285
1285
  self.parents[self.level + 1] = self.doc.add_text(
1286
- parent=self.parents[self.level], # type: ignore[arg-type]
1286
+ parent=self.parents[self.level],
1287
1287
  label=DocItemLabel.TITLE,
1288
1288
  text=title,
1289
1289
  )
@@ -1301,7 +1301,7 @@ class PatentUsptoAppV1(PatentUspto):
1301
1301
  abstract_item = self.doc.add_heading(
1302
1302
  heading_text,
1303
1303
  level=heading_level,
1304
- parent=self.parents[heading_level], # type: ignore[arg-type]
1304
+ parent=self.parents[heading_level],
1305
1305
  )
1306
1306
  self.doc.add_text(
1307
1307
  label=DocItemLabel.PARAGRAPH,
@@ -1331,7 +1331,7 @@ class PatentUsptoAppV1(PatentUspto):
1331
1331
  claims_item = self.doc.add_heading(
1332
1332
  heading_text,
1333
1333
  level=heading_level,
1334
- parent=self.parents[heading_level], # type: ignore[arg-type]
1334
+ parent=self.parents[heading_level],
1335
1335
  )
1336
1336
  for text in self.claims:
1337
1337
  self.doc.add_text(
@@ -1350,14 +1350,14 @@ class PatentUsptoAppV1(PatentUspto):
1350
1350
  self.parents[self.level + 1] = self.doc.add_heading(
1351
1351
  text=text,
1352
1352
  level=self.level,
1353
- parent=self.parents[self.level], # type: ignore[arg-type]
1353
+ parent=self.parents[self.level],
1354
1354
  )
1355
1355
  self.level += 1
1356
1356
  else:
1357
1357
  self.doc.add_text(
1358
1358
  label=DocItemLabel.PARAGRAPH,
1359
1359
  text=text,
1360
- parent=self.parents[self.level], # type: ignore[arg-type]
1360
+ parent=self.parents[self.level],
1361
1361
  )
1362
1362
  self.text = ""
1363
1363
 
@@ -1366,7 +1366,7 @@ class PatentUsptoAppV1(PatentUspto):
1366
1366
  empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
1367
1367
  self.doc.add_table(
1368
1368
  data=empty_table,
1369
- parent=self.parents[self.level], # type: ignore[arg-type]
1369
+ parent=self.parents[self.level],
1370
1370
  )
1371
1371
 
1372
1372
  def _apply_style(self, text: str, style_tag: str) -> str:
@@ -1,18 +1,18 @@
1
1
  import importlib
2
- import json
3
2
  import logging
3
+ import platform
4
4
  import re
5
+ import sys
5
6
  import tempfile
6
7
  import time
7
8
  import warnings
8
- from enum import Enum
9
9
  from pathlib import Path
10
10
  from typing import Annotated, Dict, Iterable, List, Optional, Type
11
11
 
12
12
  import typer
13
13
  from docling_core.types.doc import ImageRefMode
14
14
  from docling_core.utils.file import resolve_source_to_path
15
- from pydantic import TypeAdapter, ValidationError
15
+ from pydantic import TypeAdapter
16
16
 
17
17
  from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
18
18
  from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
@@ -65,10 +65,15 @@ def version_callback(value: bool):
65
65
  docling_core_version = importlib.metadata.version("docling-core")
66
66
  docling_ibm_models_version = importlib.metadata.version("docling-ibm-models")
67
67
  docling_parse_version = importlib.metadata.version("docling-parse")
68
+ platform_str = platform.platform()
69
+ py_impl_version = sys.implementation.cache_tag
70
+ py_lang_version = platform.python_version()
68
71
  print(f"Docling version: {docling_version}")
69
72
  print(f"Docling Core version: {docling_core_version}")
70
73
  print(f"Docling IBM Models version: {docling_ibm_models_version}")
71
74
  print(f"Docling Parse version: {docling_parse_version}")
75
+ print(f"Python: {py_impl_version} ({py_lang_version})")
76
+ print(f"Platform: {platform_str}")
72
77
  raise typer.Exit()
73
78
 
74
79
 
@@ -206,6 +211,14 @@ def convert(
206
211
  TableFormerMode,
207
212
  typer.Option(..., help="The mode to use in the table structure model."),
208
213
  ] = TableFormerMode.FAST,
214
+ enrich_code: Annotated[
215
+ bool,
216
+ typer.Option(..., help="Enable the code enrichment model in the pipeline."),
217
+ ] = False,
218
+ enrich_formula: Annotated[
219
+ bool,
220
+ typer.Option(..., help="Enable the formula enrichment model in the pipeline."),
221
+ ] = False,
209
222
  artifacts_path: Annotated[
210
223
  Optional[Path],
211
224
  typer.Option(..., help="If provided, the location of the model artifacts."),
@@ -360,6 +373,8 @@ def convert(
360
373
  do_ocr=ocr,
361
374
  ocr_options=ocr_options,
362
375
  do_table_structure=True,
376
+ do_code_enrichment=enrich_code,
377
+ do_formula_enrichment=enrich_formula,
363
378
  document_timeout=document_timeout,
364
379
  )
365
380
  pipeline_options.table_structure_options.do_cell_matching = (
@@ -352,6 +352,8 @@ class _DocumentConversionInput(BaseModel):
352
352
  mime = FormatToMimeType[InputFormat.MD][0]
353
353
  elif ext in FormatToExtensions[InputFormat.JSON_DOCLING]:
354
354
  mime = FormatToMimeType[InputFormat.JSON_DOCLING][0]
355
+ elif ext in FormatToExtensions[InputFormat.PDF]:
356
+ mime = FormatToMimeType[InputFormat.PDF][0]
355
357
  return mime
356
358
 
357
359
  @staticmethod
@@ -119,6 +119,7 @@ class RapidOcrOptions(OcrOptions):
119
119
  det_model_path: Optional[str] = None # same default as rapidocr
120
120
  cls_model_path: Optional[str] = None # same default as rapidocr
121
121
  rec_model_path: Optional[str] = None # same default as rapidocr
122
+ rec_keys_path: Optional[str] = None # same default as rapidocr
122
123
 
123
124
  model_config = ConfigDict(
124
125
  extra="forbid",
@@ -59,6 +59,7 @@ class RapidOcrModel(BaseOcrModel):
59
59
  det_model_path=self.options.det_model_path,
60
60
  cls_model_path=self.options.cls_model_path,
61
61
  rec_model_path=self.options.rec_model_path,
62
+ rec_keys_path=self.options.rec_keys_path,
62
63
  )
63
64
 
64
65
  def __call__(
@@ -4,7 +4,7 @@ import logging
4
4
  import os
5
5
  import tempfile
6
6
  from subprocess import DEVNULL, PIPE, Popen
7
- from typing import Iterable, Optional, Tuple
7
+ from typing import Iterable, List, Optional, Tuple
8
8
 
9
9
  import pandas as pd
10
10
  from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -14,6 +14,7 @@ from docling.datamodel.document import ConversionResult
14
14
  from docling.datamodel.pipeline_options import TesseractCliOcrOptions
15
15
  from docling.datamodel.settings import settings
16
16
  from docling.models.base_ocr_model import BaseOcrModel
17
+ from docling.utils.ocr_utils import map_tesseract_script
17
18
  from docling.utils.profiling import TimeRecorder
18
19
 
19
20
  _log = logging.getLogger(__name__)
@@ -28,10 +29,13 @@ class TesseractOcrCliModel(BaseOcrModel):
28
29
 
29
30
  self._name: Optional[str] = None
30
31
  self._version: Optional[str] = None
32
+ self._tesseract_languages: Optional[List[str]] = None
33
+ self._script_prefix: Optional[str] = None
31
34
 
32
35
  if self.enabled:
33
36
  try:
34
37
  self._get_name_and_version()
38
+ self._set_languages_and_prefix()
35
39
 
36
40
  except Exception as exc:
37
41
  raise RuntimeError(
@@ -73,12 +77,20 @@ class TesseractOcrCliModel(BaseOcrModel):
73
77
  return name, version
74
78
 
75
79
  def _run_tesseract(self, ifilename: str):
76
-
80
+ r"""
81
+ Run tesseract CLI
82
+ """
77
83
  cmd = [self.options.tesseract_cmd]
78
84
 
79
- if self.options.lang is not None and len(self.options.lang) > 0:
85
+ if "auto" in self.options.lang:
86
+ lang = self._detect_language(ifilename)
87
+ if lang is not None:
88
+ cmd.append("-l")
89
+ cmd.append(lang)
90
+ elif self.options.lang is not None and len(self.options.lang) > 0:
80
91
  cmd.append("-l")
81
92
  cmd.append("+".join(self.options.lang))
93
+
82
94
  if self.options.path is not None:
83
95
  cmd.append("--tessdata-dir")
84
96
  cmd.append(self.options.path)
@@ -106,6 +118,63 @@ class TesseractOcrCliModel(BaseOcrModel):
106
118
 
107
119
  return df_filtered
108
120
 
121
+ def _detect_language(self, ifilename: str):
122
+ r"""
123
+ Run tesseract in PSM 0 mode to detect the language
124
+ """
125
+ assert self._tesseract_languages is not None
126
+
127
+ cmd = [self.options.tesseract_cmd]
128
+ cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
129
+ _log.info("command: {}".format(" ".join(cmd)))
130
+ proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
131
+ output, _ = proc.communicate()
132
+ decoded_data = output.decode("utf-8")
133
+ df = pd.read_csv(
134
+ io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
135
+ )
136
+ scripts = df.loc[df["key"] == "Script"].value.tolist()
137
+ if len(scripts) == 0:
138
+ _log.warning("Tesseract cannot detect the script of the page")
139
+ return None
140
+
141
+ script = map_tesseract_script(scripts[0].strip())
142
+ lang = f"{self._script_prefix}{script}"
143
+
144
+ # Check if the detected language has been installed
145
+ if lang not in self._tesseract_languages:
146
+ msg = f"Tesseract detected the script '{script}' and language '{lang}'."
147
+ msg += " However this language is not installed in your system and will be ignored."
148
+ _log.warning(msg)
149
+ return None
150
+
151
+ _log.debug(
152
+ f"Using tesseract model for the detected script '{script}' and language '{lang}'"
153
+ )
154
+ return lang
155
+
156
+ def _set_languages_and_prefix(self):
157
+ r"""
158
+ Read and set the languages installed in tesseract and decide the script prefix
159
+ """
160
+ # Get all languages
161
+ cmd = [self.options.tesseract_cmd]
162
+ cmd.append("--list-langs")
163
+ _log.info("command: {}".format(" ".join(cmd)))
164
+ proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
165
+ output, _ = proc.communicate()
166
+ decoded_data = output.decode("utf-8")
167
+ df = pd.read_csv(io.StringIO(decoded_data), header=None)
168
+ self._tesseract_languages = df[0].tolist()[1:]
169
+
170
+ # Decide the script prefix
171
+ if any([l.startswith("script/") for l in self._tesseract_languages]):
172
+ script_prefix = "script/"
173
+ else:
174
+ script_prefix = ""
175
+
176
+ self._script_prefix = script_prefix
177
+
109
178
  def __call__(
110
179
  self, conv_res: ConversionResult, page_batch: Iterable[Page]
111
180
  ) -> Iterable[Page]:
@@ -120,7 +189,6 @@ class TesseractOcrCliModel(BaseOcrModel):
120
189
  yield page
121
190
  else:
122
191
  with TimeRecorder(conv_res, "ocr"):
123
-
124
192
  ocr_rects = self.get_ocr_rects(page)
125
193
 
126
194
  all_ocr_cells = []
@@ -8,6 +8,7 @@ from docling.datamodel.document import ConversionResult
8
8
  from docling.datamodel.pipeline_options import TesseractOcrOptions
9
9
  from docling.datamodel.settings import settings
10
10
  from docling.models.base_ocr_model import BaseOcrModel
11
+ from docling.utils.ocr_utils import map_tesseract_script
11
12
  from docling.utils.profiling import TimeRecorder
12
13
 
13
14
  _log = logging.getLogger(__name__)
@@ -20,6 +21,7 @@ class TesseractOcrModel(BaseOcrModel):
20
21
 
21
22
  self.scale = 3 # multiplier for 72 dpi == 216 dpi.
22
23
  self.reader = None
24
+ self.osd_reader = None
23
25
 
24
26
  if self.enabled:
25
27
  install_errmsg = (
@@ -47,8 +49,8 @@ class TesseractOcrModel(BaseOcrModel):
47
49
  except:
48
50
  raise ImportError(install_errmsg)
49
51
 
50
- _, tesserocr_languages = tesserocr.get_languages()
51
- if not tesserocr_languages:
52
+ _, self._tesserocr_languages = tesserocr.get_languages()
53
+ if not self._tesserocr_languages:
52
54
  raise ImportError(missing_langs_errmsg)
53
55
 
54
56
  # Initialize the tesseractAPI
@@ -57,7 +59,7 @@ class TesseractOcrModel(BaseOcrModel):
57
59
 
58
60
  self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
59
61
 
60
- if any([l.startswith("script/") for l in tesserocr_languages]):
62
+ if any([l.startswith("script/") for l in self._tesserocr_languages]):
61
63
  self.script_prefix = "script/"
62
64
  else:
63
65
  self.script_prefix = ""
@@ -72,14 +74,14 @@ class TesseractOcrModel(BaseOcrModel):
72
74
  tesserocr_kwargs["path"] = self.options.path
73
75
 
74
76
  if lang == "auto":
75
- self.reader = tesserocr.PyTessBaseAPI(
77
+ self.reader = tesserocr.PyTessBaseAPI(**tesserocr_kwargs)
78
+ self.osd_reader = tesserocr.PyTessBaseAPI(
76
79
  **{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
77
80
  )
78
81
  else:
79
82
  self.reader = tesserocr.PyTessBaseAPI(
80
83
  **{"lang": lang} | tesserocr_kwargs,
81
84
  )
82
-
83
85
  self.reader_RIL = tesserocr.RIL
84
86
 
85
87
  def __del__(self):
@@ -96,8 +98,6 @@ class TesseractOcrModel(BaseOcrModel):
96
98
  yield from page_batch
97
99
  return
98
100
 
99
- import tesserocr
100
-
101
101
  for page in page_batch:
102
102
  assert page._backend is not None
103
103
  if not page._backend.is_valid():
@@ -105,6 +105,7 @@ class TesseractOcrModel(BaseOcrModel):
105
105
  else:
106
106
  with TimeRecorder(conv_res, "ocr"):
107
107
  assert self.reader is not None
108
+ assert self._tesserocr_languages is not None
108
109
 
109
110
  ocr_rects = self.get_ocr_rects(page)
110
111
 
@@ -117,43 +118,42 @@ class TesseractOcrModel(BaseOcrModel):
117
118
  scale=self.scale, cropbox=ocr_rect
118
119
  )
119
120
 
120
- # Retrieve text snippets with their bounding boxes
121
- self.reader.SetImage(high_res_image)
121
+ local_reader = self.reader
122
+ if "auto" in self.options.lang:
123
+ assert self.osd_reader is not None
122
124
 
123
- if self.options.lang == ["auto"]:
124
- osd = self.reader.DetectOrientationScript()
125
+ self.osd_reader.SetImage(high_res_image)
126
+ osd = self.osd_reader.DetectOrientationScript()
125
127
 
126
128
  # No text, probably
127
129
  if osd is None:
128
130
  continue
129
131
 
130
132
  script = osd["script_name"]
131
-
132
- if script == "Katakana" or script == "Hiragana":
133
- script = "Japanese"
134
- elif script == "Han":
135
- script = "HanS"
136
- elif script == "Korean":
137
- script = "Hangul"
138
-
139
- _log.debug(
140
- f'Using model for the detected script "{script}"'
141
- )
142
-
143
- if script not in self.script_readers:
144
- self.script_readers[script] = tesserocr.PyTessBaseAPI(
145
- path=self.reader.GetDatapath(),
146
- lang=f"{self.script_prefix}{script}",
147
- psm=tesserocr.PSM.AUTO,
148
- init=True,
149
- oem=tesserocr.OEM.DEFAULT,
150
- )
151
-
152
- local_reader = self.script_readers[script]
153
- local_reader.SetImage(high_res_image)
154
- else:
155
- local_reader = self.reader
156
-
133
+ script = map_tesseract_script(script)
134
+ lang = f"{self.script_prefix}{script}"
135
+
136
+ # Check if the detected language is present in the system
137
+ if lang not in self._tesserocr_languages:
138
+ msg = f"Tesseract detected the script '{script}' and language '{lang}'."
139
+ msg += " However this language is not installed in your system and will be ignored."
140
+ _log.warning(msg)
141
+ else:
142
+ if script not in self.script_readers:
143
+ import tesserocr
144
+
145
+ self.script_readers[script] = (
146
+ tesserocr.PyTessBaseAPI(
147
+ path=self.reader.GetDatapath(),
148
+ lang=lang,
149
+ psm=tesserocr.PSM.AUTO,
150
+ init=True,
151
+ oem=tesserocr.OEM.DEFAULT,
152
+ )
153
+ )
154
+ local_reader = self.script_readers[script]
155
+
156
+ local_reader.SetImage(high_res_image)
157
157
  boxes = local_reader.GetComponentImages(
158
158
  self.reader_RIL.TEXTLINE, True
159
159
  )
@@ -0,0 +1,9 @@
1
+ def map_tesseract_script(script: str) -> str:
2
+ r""" """
3
+ if script == "Katakana" or script == "Hiragana":
4
+ script = "Japanese"
5
+ elif script == "Han":
6
+ script = "HanS"
7
+ elif script == "Korean":
8
+ script = "Hangul"
9
+ return script
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "2.16.0" # DO NOT EDIT, updated automatically
3
+ version = "2.17.0" # DO NOT EDIT, updated automatically
4
4
  description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
5
5
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
File without changes
File without changes
File without changes
File without changes