docling 2.20.0__tar.gz → 2.21.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. {docling-2.20.0 → docling-2.21.0}/PKG-INFO +2 -2
  2. {docling-2.20.0 → docling-2.21.0}/docling/models/ds_glm_model.py +60 -2
  3. {docling-2.20.0 → docling-2.21.0}/docling/utils/glm_utils.py +10 -0
  4. {docling-2.20.0 → docling-2.21.0}/pyproject.toml +2 -2
  5. {docling-2.20.0 → docling-2.21.0}/LICENSE +0 -0
  6. {docling-2.20.0 → docling-2.21.0}/README.md +0 -0
  7. {docling-2.20.0 → docling-2.21.0}/docling/__init__.py +0 -0
  8. {docling-2.20.0 → docling-2.21.0}/docling/backend/__init__.py +0 -0
  9. {docling-2.20.0 → docling-2.21.0}/docling/backend/abstract_backend.py +0 -0
  10. {docling-2.20.0 → docling-2.21.0}/docling/backend/asciidoc_backend.py +0 -0
  11. {docling-2.20.0 → docling-2.21.0}/docling/backend/docling_parse_backend.py +0 -0
  12. {docling-2.20.0 → docling-2.21.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  13. {docling-2.20.0 → docling-2.21.0}/docling/backend/html_backend.py +0 -0
  14. {docling-2.20.0 → docling-2.21.0}/docling/backend/json/__init__.py +0 -0
  15. {docling-2.20.0 → docling-2.21.0}/docling/backend/json/docling_json_backend.py +0 -0
  16. {docling-2.20.0 → docling-2.21.0}/docling/backend/md_backend.py +0 -0
  17. {docling-2.20.0 → docling-2.21.0}/docling/backend/msexcel_backend.py +0 -0
  18. {docling-2.20.0 → docling-2.21.0}/docling/backend/mspowerpoint_backend.py +0 -0
  19. {docling-2.20.0 → docling-2.21.0}/docling/backend/msword_backend.py +0 -0
  20. {docling-2.20.0 → docling-2.21.0}/docling/backend/pdf_backend.py +0 -0
  21. {docling-2.20.0 → docling-2.21.0}/docling/backend/pypdfium2_backend.py +0 -0
  22. {docling-2.20.0 → docling-2.21.0}/docling/backend/xml/__init__.py +0 -0
  23. {docling-2.20.0 → docling-2.21.0}/docling/backend/xml/pubmed_backend.py +0 -0
  24. {docling-2.20.0 → docling-2.21.0}/docling/backend/xml/uspto_backend.py +0 -0
  25. {docling-2.20.0 → docling-2.21.0}/docling/chunking/__init__.py +0 -0
  26. {docling-2.20.0 → docling-2.21.0}/docling/cli/__init__.py +0 -0
  27. {docling-2.20.0 → docling-2.21.0}/docling/cli/main.py +0 -0
  28. {docling-2.20.0 → docling-2.21.0}/docling/cli/models.py +0 -0
  29. {docling-2.20.0 → docling-2.21.0}/docling/cli/tools.py +0 -0
  30. {docling-2.20.0 → docling-2.21.0}/docling/datamodel/__init__.py +0 -0
  31. {docling-2.20.0 → docling-2.21.0}/docling/datamodel/base_models.py +0 -0
  32. {docling-2.20.0 → docling-2.21.0}/docling/datamodel/document.py +0 -0
  33. {docling-2.20.0 → docling-2.21.0}/docling/datamodel/pipeline_options.py +0 -0
  34. {docling-2.20.0 → docling-2.21.0}/docling/datamodel/settings.py +0 -0
  35. {docling-2.20.0 → docling-2.21.0}/docling/document_converter.py +0 -0
  36. {docling-2.20.0 → docling-2.21.0}/docling/exceptions.py +0 -0
  37. {docling-2.20.0 → docling-2.21.0}/docling/models/__init__.py +0 -0
  38. {docling-2.20.0 → docling-2.21.0}/docling/models/base_model.py +0 -0
  39. {docling-2.20.0 → docling-2.21.0}/docling/models/base_ocr_model.py +0 -0
  40. {docling-2.20.0 → docling-2.21.0}/docling/models/code_formula_model.py +0 -0
  41. {docling-2.20.0 → docling-2.21.0}/docling/models/document_picture_classifier.py +0 -0
  42. {docling-2.20.0 → docling-2.21.0}/docling/models/easyocr_model.py +0 -0
  43. {docling-2.20.0 → docling-2.21.0}/docling/models/layout_model.py +0 -0
  44. {docling-2.20.0 → docling-2.21.0}/docling/models/ocr_mac_model.py +0 -0
  45. {docling-2.20.0 → docling-2.21.0}/docling/models/page_assemble_model.py +0 -0
  46. {docling-2.20.0 → docling-2.21.0}/docling/models/page_preprocessing_model.py +0 -0
  47. {docling-2.20.0 → docling-2.21.0}/docling/models/picture_description_api_model.py +0 -0
  48. {docling-2.20.0 → docling-2.21.0}/docling/models/picture_description_base_model.py +0 -0
  49. {docling-2.20.0 → docling-2.21.0}/docling/models/picture_description_vlm_model.py +0 -0
  50. {docling-2.20.0 → docling-2.21.0}/docling/models/rapid_ocr_model.py +0 -0
  51. {docling-2.20.0 → docling-2.21.0}/docling/models/table_structure_model.py +0 -0
  52. {docling-2.20.0 → docling-2.21.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
  53. {docling-2.20.0 → docling-2.21.0}/docling/models/tesseract_ocr_model.py +0 -0
  54. {docling-2.20.0 → docling-2.21.0}/docling/pipeline/__init__.py +0 -0
  55. {docling-2.20.0 → docling-2.21.0}/docling/pipeline/base_pipeline.py +0 -0
  56. {docling-2.20.0 → docling-2.21.0}/docling/pipeline/simple_pipeline.py +0 -0
  57. {docling-2.20.0 → docling-2.21.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  58. {docling-2.20.0 → docling-2.21.0}/docling/py.typed +0 -0
  59. {docling-2.20.0 → docling-2.21.0}/docling/utils/__init__.py +0 -0
  60. {docling-2.20.0 → docling-2.21.0}/docling/utils/accelerator_utils.py +0 -0
  61. {docling-2.20.0 → docling-2.21.0}/docling/utils/export.py +0 -0
  62. {docling-2.20.0 → docling-2.21.0}/docling/utils/layout_postprocessor.py +0 -0
  63. {docling-2.20.0 → docling-2.21.0}/docling/utils/model_downloader.py +0 -0
  64. {docling-2.20.0 → docling-2.21.0}/docling/utils/ocr_utils.py +0 -0
  65. {docling-2.20.0 → docling-2.21.0}/docling/utils/profiling.py +0 -0
  66. {docling-2.20.0 → docling-2.21.0}/docling/utils/utils.py +0 -0
  67. {docling-2.20.0 → docling-2.21.0}/docling/utils/visualization.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.20.0
3
+ Version: 2.21.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -28,7 +28,7 @@ Provides-Extra: vlm
28
28
  Requires-Dist: beautifulsoup4 (>=4.12.3,<4.13.0)
29
29
  Requires-Dist: certifi (>=2024.7.4)
30
30
  Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
31
- Requires-Dist: docling-core[chunking] (>=2.17.2,<3.0.0)
31
+ Requires-Dist: docling-core[chunking] (>=2.18.0,<3.0.0)
32
32
  Requires-Dist: docling-ibm-models (>=3.3.0,<4.0.0)
33
33
  Requires-Dist: docling-parse (>=3.3.0,<4.0.0)
34
34
  Requires-Dist: easyocr (>=1.7,<2.0)
@@ -4,7 +4,12 @@ from pathlib import Path
4
4
  from typing import List, Union
5
5
 
6
6
  from deepsearch_glm.andromeda_nlp import nlp_model
7
- from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
7
+ from docling_core.types.doc import (
8
+ BoundingBox,
9
+ CoordOrigin,
10
+ DocItemLabel,
11
+ DoclingDocument,
12
+ )
8
13
  from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
9
14
  from docling_core.types.legacy_doc.base import (
10
15
  Figure,
@@ -71,12 +76,15 @@ class GlmModel:
71
76
  )
72
77
 
73
78
  main_text: List[Union[Ref, BaseText]] = []
79
+ page_headers: List[Union[Ref, BaseText]] = []
80
+ page_footers: List[Union[Ref, BaseText]] = []
81
+
74
82
  tables: List[DsSchemaTable] = []
75
83
  figures: List[Figure] = []
76
84
 
77
85
  page_no_to_page = {p.page_no: p for p in conv_res.pages}
78
86
 
79
- for element in conv_res.assembled.elements:
87
+ for element in conv_res.assembled.body:
80
88
  # Convert bboxes to lower-left origin.
81
89
  target_bbox = DsBoundingBox(
82
90
  element.cluster.bbox.to_bottom_left_origin(
@@ -238,6 +246,53 @@ class GlmModel:
238
246
  )
239
247
  )
240
248
 
249
+ # We can throw in headers and footers at the end of the legacy doc
250
+ # since the reading-order will re-sort it later.
251
+ for element in conv_res.assembled.headers:
252
+ # Convert bboxes to lower-left origin.
253
+ target_bbox = DsBoundingBox(
254
+ element.cluster.bbox.to_bottom_left_origin(
255
+ page_no_to_page[element.page_no].size.height
256
+ ).as_tuple()
257
+ )
258
+
259
+ if isinstance(element, TextElement):
260
+
261
+ tel = BaseText(
262
+ text=element.text,
263
+ obj_type=layout_label_to_ds_type.get(element.label),
264
+ name=element.label,
265
+ prov=[
266
+ Prov(
267
+ bbox=target_bbox,
268
+ page=element.page_no + 1,
269
+ span=[0, len(element.text)],
270
+ )
271
+ ],
272
+ )
273
+ if element.label == DocItemLabel.PAGE_HEADER:
274
+ index = len(page_headers)
275
+ ref_str = f"#/page-headers/{index}"
276
+ main_text.append(
277
+ Ref(
278
+ name=element.label,
279
+ obj_type=layout_label_to_ds_type.get(element.label),
280
+ ref=ref_str,
281
+ ),
282
+ )
283
+ page_headers.append(tel)
284
+ elif element.label == DocItemLabel.PAGE_FOOTER:
285
+ index = len(page_footers)
286
+ ref_str = f"#/page-footers/{index}"
287
+ main_text.append(
288
+ Ref(
289
+ name=element.label,
290
+ obj_type=layout_label_to_ds_type.get(element.label),
291
+ ref=ref_str,
292
+ ),
293
+ )
294
+ page_footers.append(tel)
295
+
241
296
  page_dimensions = [
242
297
  PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
243
298
  for p in conv_res.pages
@@ -252,6 +307,8 @@ class GlmModel:
252
307
  tables=tables,
253
308
  figures=figures,
254
309
  page_dimensions=page_dimensions,
310
+ page_headers=page_headers,
311
+ page_footers=page_footers,
255
312
  )
256
313
 
257
314
  return ds_doc
@@ -264,6 +321,7 @@ class GlmModel:
264
321
  glm_doc = self.model.apply_on_doc(ds_doc_dict)
265
322
 
266
323
  docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
324
+ 1 == 1
267
325
 
268
326
  # DEBUG code:
269
327
  def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
@@ -15,6 +15,7 @@ from docling_core.types.doc import (
15
15
  TableCell,
16
16
  TableData,
17
17
  )
18
+ from docling_core.types.doc.document import ContentLayer
18
19
 
19
20
 
20
21
  def resolve_item(paths, obj):
@@ -311,6 +312,15 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
311
312
  current_list = None
312
313
 
313
314
  doc.add_text(label=DocItemLabel.FORMULA, text="", orig=text, prov=prov)
315
+ elif label in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
316
+ current_list = None
317
+
318
+ doc.add_text(
319
+ label=DocItemLabel(name_label),
320
+ text=text,
321
+ prov=prov,
322
+ content_layer=ContentLayer.FURNITURE,
323
+ )
314
324
  else:
315
325
  current_list = None
316
326
 
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "2.20.0" # DO NOT EDIT, updated automatically
3
+ version = "2.21.0" # DO NOT EDIT, updated automatically
4
4
  description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
5
5
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
@@ -26,7 +26,7 @@ packages = [{include = "docling"}]
26
26
  ######################
27
27
  python = "^3.9"
28
28
  pydantic = "^2.0.0"
29
- docling-core = {extras = ["chunking"], version = "^2.17.2"}
29
+ docling-core = {extras = ["chunking"], version = "^2.18.0"}
30
30
  docling-ibm-models = "^3.3.0"
31
31
  deepsearch-glm = "^1.0.0"
32
32
  docling-parse = "^3.3.0"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes