docling 1.8.5__tar.gz → 1.10.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. {docling-1.8.5 → docling-1.10.0}/LICENSE +1 -1
  2. {docling-1.8.5 → docling-1.10.0}/PKG-INFO +15 -8
  3. {docling-1.8.5 → docling-1.10.0}/README.md +10 -4
  4. {docling-1.8.5 → docling-1.10.0}/docling/datamodel/base_models.py +9 -0
  5. {docling-1.8.5 → docling-1.10.0}/docling/models/ds_glm_model.py +1 -1
  6. docling-1.10.0/docling/utils/export.py +193 -0
  7. {docling-1.8.5 → docling-1.10.0}/pyproject.toml +9 -4
  8. {docling-1.8.5 → docling-1.10.0}/docling/__init__.py +0 -0
  9. {docling-1.8.5 → docling-1.10.0}/docling/backend/__init__.py +0 -0
  10. {docling-1.8.5 → docling-1.10.0}/docling/backend/abstract_backend.py +0 -0
  11. {docling-1.8.5 → docling-1.10.0}/docling/backend/docling_parse_backend.py +0 -0
  12. {docling-1.8.5 → docling-1.10.0}/docling/backend/pypdfium2_backend.py +0 -0
  13. {docling-1.8.5 → docling-1.10.0}/docling/datamodel/__init__.py +0 -0
  14. {docling-1.8.5 → docling-1.10.0}/docling/datamodel/document.py +0 -0
  15. {docling-1.8.5 → docling-1.10.0}/docling/datamodel/settings.py +0 -0
  16. {docling-1.8.5 → docling-1.10.0}/docling/document_converter.py +0 -0
  17. {docling-1.8.5 → docling-1.10.0}/docling/models/__init__.py +0 -0
  18. {docling-1.8.5 → docling-1.10.0}/docling/models/base_ocr_model.py +0 -0
  19. {docling-1.8.5 → docling-1.10.0}/docling/models/easyocr_model.py +0 -0
  20. {docling-1.8.5 → docling-1.10.0}/docling/models/layout_model.py +0 -0
  21. {docling-1.8.5 → docling-1.10.0}/docling/models/page_assemble_model.py +0 -0
  22. {docling-1.8.5 → docling-1.10.0}/docling/models/table_structure_model.py +0 -0
  23. {docling-1.8.5 → docling-1.10.0}/docling/pipeline/__init__.py +0 -0
  24. {docling-1.8.5 → docling-1.10.0}/docling/pipeline/base_model_pipeline.py +0 -0
  25. {docling-1.8.5 → docling-1.10.0}/docling/pipeline/standard_model_pipeline.py +0 -0
  26. {docling-1.8.5 → docling-1.10.0}/docling/utils/__init__.py +0 -0
  27. {docling-1.8.5 → docling-1.10.0}/docling/utils/layout_utils.py +0 -0
  28. {docling-1.8.5 → docling-1.10.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  MIT License
2
2
 
3
- Copyright (c) [year] [fullname]
3
+ Copyright (c) 2024 International Business Machines
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 1.8.5
3
+ Version: 1.10.0
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -20,13 +20,14 @@ Classifier: Programming Language :: Python :: 3.11
20
20
  Classifier: Programming Language :: Python :: 3.12
21
21
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
22
  Requires-Dist: certifi (>=2024.7.4)
23
- Requires-Dist: deepsearch-glm (>=0.19.1,<0.20.0)
24
- Requires-Dist: docling-core (>=1.1.2,<2.0.0)
23
+ Requires-Dist: deepsearch-glm (>=0.21.0,<0.22.0)
24
+ Requires-Dist: docling-core (>=1.1.3,<2.0.0)
25
25
  Requires-Dist: docling-ibm-models (>=1.1.3,<2.0.0)
26
- Requires-Dist: docling-parse (>=1.1.3,<2.0.0)
26
+ Requires-Dist: docling-parse (>=1.2.0,<2.0.0)
27
27
  Requires-Dist: easyocr (>=1.7,<2.0)
28
28
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
29
29
  Requires-Dist: huggingface_hub (>=0.23,<1)
30
+ Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
30
31
  Requires-Dist: pydantic (>=2.0.0,<3.0.0)
31
32
  Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
32
33
  Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
@@ -62,6 +63,8 @@ Docling bundles PDF document conversion to JSON and Markdown in an easy, self-co
62
63
  * 📝 Extracts metadata from the document, such as title, authors, references and language
63
64
  * 🔍 Optionally applies OCR (use with scanned PDFs)
64
65
 
66
+ For RAG, check out [Quackling](https://github.com/DS4SD/quackling) to get the most out of your docs, be it using LlamaIndex, LangChain or your pipeline.
67
+
65
68
  ## Installation
66
69
 
67
70
  To use Docling, simply install `docling` from your package manager, e.g. pip:
@@ -180,6 +183,10 @@ results = doc_converter.convert(conv_input)
180
183
  You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads.
181
184
 
182
185
 
186
+ ## Technical report
187
+
188
+ For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
189
+
183
190
  ## Contributing
184
191
 
185
192
  Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
@@ -193,10 +200,10 @@ If you use Docling in your projects, please consider citing the following:
193
200
  @techreport{Docling,
194
201
  author = {Deep Search Team},
195
202
  month = {8},
196
- title = {{Docling Technical Report}},
197
- url={https://arxiv.org/abs/2408.09869},
198
- eprint={2408.09869},
199
- doi = "10.48550/arXiv.2408.09869",
203
+ title = {Docling Technical Report},
204
+ url = {https://arxiv.org/abs/2408.09869},
205
+ eprint = {2408.09869},
206
+ doi = {10.48550/arXiv.2408.09869},
200
207
  version = {1.0.0},
201
208
  year = {2024}
202
209
  }
@@ -24,6 +24,8 @@ Docling bundles PDF document conversion to JSON and Markdown in an easy, self-co
24
24
  * 📝 Extracts metadata from the document, such as title, authors, references and language
25
25
  * 🔍 Optionally applies OCR (use with scanned PDFs)
26
26
 
27
+ For RAG, check out [Quackling](https://github.com/DS4SD/quackling) to get the most out of your docs, be it using LlamaIndex, LangChain or your pipeline.
28
+
27
29
  ## Installation
28
30
 
29
31
  To use Docling, simply install `docling` from your package manager, e.g. pip:
@@ -142,6 +144,10 @@ results = doc_converter.convert(conv_input)
142
144
  You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads.
143
145
 
144
146
 
147
+ ## Technical report
148
+
149
+ For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
150
+
145
151
  ## Contributing
146
152
 
147
153
  Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
@@ -155,10 +161,10 @@ If you use Docling in your projects, please consider citing the following:
155
161
  @techreport{Docling,
156
162
  author = {Deep Search Team},
157
163
  month = {8},
158
- title = {{Docling Technical Report}},
159
- url={https://arxiv.org/abs/2408.09869},
160
- eprint={2408.09869},
161
- doi = "10.48550/arXiv.2408.09869",
164
+ title = {Docling Technical Report},
165
+ url = {https://arxiv.org/abs/2408.09869},
166
+ eprint = {2408.09869},
167
+ doi = {10.48550/arXiv.2408.09869},
162
168
  version = {1.0.0},
163
169
  year = {2024}
164
170
  }
@@ -71,6 +71,15 @@ class BoundingBox(BaseModel):
71
71
 
72
72
  return out_bbox
73
73
 
74
def normalized(self, page_size: PageSize) -> "BoundingBox":
    """Return a copy of this box expressed relative to the page dimensions.

    The horizontal edges (``l``, ``r``) are divided by ``page_size.width``
    and the vertical edges (``t``, ``b``) by ``page_size.height``. The
    receiver itself is not modified; a deep copy is scaled and returned.

    Args:
        page_size: Size of the page the box lies on.

    Returns:
        A new ``BoundingBox`` with page-relative coordinates.
    """
    width, height = page_size.width, page_size.height

    # Work on a deep copy so every other field of the box is carried over
    # unchanged and the original stays intact.
    scaled = copy.deepcopy(self)
    scaled.l, scaled.r = scaled.l / width, scaled.r / width
    scaled.t, scaled.b = scaled.t / height, scaled.b / height

    return scaled
82
+
74
83
  def as_tuple(self):
75
84
  if self.coord_origin == CoordOrigin.TOPLEFT:
76
85
  return (self.l, self.t, self.r, self.b)
@@ -2,7 +2,7 @@ import copy
2
2
  import random
3
3
 
4
4
  from deepsearch_glm.nlp_utils import init_nlp_model
5
- from deepsearch_glm.utils.ds_utils import to_legacy_document_format
5
+ from deepsearch_glm.utils.doc_utils import to_legacy_document_format
6
6
  from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
7
7
  from docling_core.types import BaseText
8
8
  from docling_core.types import Document as DsDocument
@@ -0,0 +1,193 @@
1
+ import logging
2
+ from typing import Any, Dict, Iterable, List, Tuple
3
+
4
+ from docling_core.types.doc.base import BaseCell, Ref, Table, TableCell
5
+
6
+ from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell
7
+ from docling.datamodel.document import ConvertedDocument, Page
8
+
9
+ _log = logging.getLogger(__name__)
10
+
11
+
12
def _export_table_to_html(table: Table):
    """Render ``table`` as an HTML ``<table>`` string.

    The grid ``table.data`` is walked row by row over ``table.num_rows`` x
    ``table.num_cols`` positions. A cell whose spans cover several grid
    positions is emitted once, at the position where its span starts, with
    ``rowspan``/``colspan`` attributes; the remaining positions it covers
    are skipped. Cells typed ``col_header`` or ``col_multi_header`` are
    emitted as ``<th>``, all other cells as ``<td>``.

    Cell text is stripped and HTML-escaped, so characters such as ``<``,
    ``>`` and ``&`` in the cell content cannot break the generated markup.

    Args:
        table: The table to export.

    Returns:
        The HTML markup as one string, with no whitespace between tags.
    """
    # TODO: this is flagged as internal, because we will move it
    # to the docling-core package.

    # Function-scope import keeps this fix self-contained.
    import html

    def _get_tablecell_span(cell: TableCell, ix):
        """Return ``(extent, first, last)`` of the cell's spans on axis ``ix``.

        ``ix`` is 0 for rows and 1 for columns. A cell without spans is
        reported as extent 1 with unknown (``None``) bounds.
        """
        # NOTE(review): `spans` is presumably optional on TableCell; treating
        # a missing value like an empty list avoids iterating over None.
        span = {s[ix] for s in (cell.spans or ())}
        if not span:
            return 1, None, None
        return len(span), min(span), max(span)

    header_cell_types = ("col_header", "col_multi_header")

    rows_html = []
    for i in range(table.num_rows):
        cells_html = []
        for j in range(table.num_cols):
            cell: TableCell = table.data[i][j]

            rowspan, rowstart, _ = _get_tablecell_span(cell, 0)
            colspan, colstart, _ = _get_tablecell_span(cell, 1)

            # A spanning cell occupies several grid positions; emit it only
            # at the position where the span begins.
            if rowstart is not None and rowstart != i:
                continue
            if colstart is not None and colstart != j:
                continue

            content = html.escape(cell.text.strip())
            celltag = "th" if cell.obj_type in header_cell_types else "td"

            opening_tag = celltag
            if rowspan > 1:
                opening_tag += f' rowspan="{rowspan}"'
            if colspan > 1:
                opening_tag += f' colspan="{colspan}"'

            cells_html.append(f"<{opening_tag}>{content}</{celltag}>")

        rows_html.append("<tr>" + "".join(cells_html) + "</tr>")

    return "<table>" + "".join(rows_html) + "</table>"
66
+
67
+
68
def generate_multimodal_pages(
    doc_result: ConvertedDocument,
) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
    """Yield one multimodal record per page of a converted document.

    The items of ``doc_result.output.main_text`` are visited in order and
    grouped by the page of their first provenance entry. A group is emitted
    as soon as an item from a later page is seen, and the last group is
    emitted after the loop. Items that resolve to ``None`` or carry no
    provenance are skipped (and logged at debug level).

    Args:
        doc_result: Conversion result providing ``output`` (the document)
            and ``pages``.

    Yields:
        Tuples ``(content_text, content_md, page_cells, page_segments, page)``:

        * ``content_text``: the non-empty item texts of the page, each
          followed by a single space.
        * ``content_md``: ``export_to_markdown`` called with the index of
          the first and the last item of the page.
        * ``page_cells``: one dict per cell of the page (text, page-relative
          top-left bbox, OCR flag, OCR confidence).
        * ``page_segments``: one dict per item whose type has a DocLayNet
          label (index in document, label, text, bbox, table data).
        * ``page``: the page object taken from ``doc_result.pages``.

    NOTE(review): ``main_text_stop`` receives the index of the last item of
    the page; if ``export_to_markdown`` treats it as an exclusive bound, that
    item is missing from ``content_md`` - confirm against docling-core.
    """
    # Maps the document's item types onto DocLayNet label names; item types
    # that are not listed here produce no segment.
    doclaynet_labels = {
        "title": "title",
        "table-of-contents": "document_index",
        "subtitle-level-1": "section_header",
        "checkbox-selected": "checkbox_selected",
        "checkbox-unselected": "checkbox_unselected",
        "caption": "caption",
        "page-header": "page_header",
        "page-footer": "page_footer",
        "footnote": "footnote",
        "table": "table",
        "formula": "formula",
        "list-item": "list_item",
        "code": "code",
        "figure": "picture",
        "picture": "picture",
        "reference": "text",
        "paragraph": "text",
        "text": "text",
    }

    document = doc_result.output

    def _segments_for(entries: List[Tuple[int, BaseCell]], page: Page):
        """Build the segment dicts for the labelled items of one page."""
        result = []
        for index, entry in entries:
            mapped_label = doclaynet_labels.get(entry.obj_type, None)
            if mapped_label is None:
                continue

            # Provenance boxes are read as bottom-left based; convert them to
            # top-left origin and scale to page-relative coordinates.
            raw_bbox = BoundingBox.from_tuple(
                entry.prov[0].bbox, origin=CoordOrigin.BOTTOMLEFT
            )
            top_left_bbox = raw_bbox.to_top_left_origin(page_height=page.size.height)
            relative_bbox = top_left_bbox.normalized(page_size=page.size)

            segment = {
                "index_in_doc": index,
                "label": mapped_label,
                "text": "" if entry.text is None else entry.text,
                "bbox": relative_bbox.as_tuple(),
                "data": [],
            }

            # Tables additionally carry their HTML rendering.
            if isinstance(entry, Table):
                segment["data"].append(
                    {
                        "html_seq": _export_table_to_html(entry),
                        "otsl_seq": "",
                    }
                )

            result.append(segment)

        return result

    def _cells_for(page: Page):
        """Build the cell dicts for every cell of one page."""
        records = []
        for cell in page.cells:
            relative_bbox = cell.bbox.to_top_left_origin(
                page_height=page.size.height
            ).normalized(page_size=page.size)
            from_ocr = isinstance(cell, OcrCell)
            records.append(
                {
                    "text": cell.text,
                    "bbox": relative_bbox.as_tuple(),
                    "ocr": from_ocr,
                    # Cells that are not OCR cells get confidence 1.0.
                    "ocr_confidence": cell.confidence if from_ocr else 1.0,
                }
            )
        return records

    def _build_page(page_number, entries, first_ix, last_ix, text):
        """Assemble the output tuple for the page collected so far."""
        # Page numbers are 1-based, ``doc_result.pages`` is indexed from 0.
        page = doc_result.pages[page_number - 1]

        cells = _cells_for(page)
        segments = _segments_for(entries, page)
        markdown = document.export_to_markdown(
            main_text_start=first_ix, main_text_stop=last_ix
        )

        return text, markdown, cells, segments, page

    current_page = 0  # 0 means "no item seen yet"
    first_ix = 0
    last_ix = 0
    entries = []
    text_parts = []

    for ix, raw_item in enumerate(document.main_text):
        if isinstance(raw_item, Ref):
            entry = document._resolve_ref(raw_item)
        else:
            entry = raw_item

        if entry is None or not entry.prov:
            _log.debug(f"Skipping item {raw_item}")
            continue

        entry_page = entry.prov[0].page

        # An item from a later page closes the page collected so far.
        if 0 < current_page < entry_page:
            yield _build_page(
                current_page, entries, first_ix, last_ix, "".join(text_parts)
            )

            first_ix = ix
            entries = []
            text_parts = []

        current_page = entry_page
        last_ix = ix
        entries.append((ix, entry))
        if entry.text is not None and entry.text != "":
            text_parts.append(entry.text + " ")

    # Emit whatever was collected for the final page.
    if entries:
        yield _build_page(
            current_page, entries, first_ix, last_ix, "".join(text_parts)
        )
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "1.8.5" # DO NOT EDIT, updated automatically
3
+ version = "1.10.0" # DO NOT EDIT, updated automatically
4
4
  description = "Docling PDF conversion package"
5
5
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
@@ -23,19 +23,20 @@ packages = [{include = "docling"}]
23
23
  [tool.poetry.dependencies]
24
24
  python = "^3.10"
25
25
  pydantic = "^2.0.0"
26
- docling-core = "^1.1.2"
26
+ docling-core = "^1.1.3"
27
27
  docling-ibm-models = "^1.1.3"
28
- deepsearch-glm = "^0.19.1"
28
+ deepsearch-glm = "^0.21.0"
29
29
  filetype = "^1.2.0"
30
30
  pypdfium2 = "^4.30.0"
31
31
  pydantic-settings = "^2.3.0"
32
32
  huggingface_hub = ">=0.23,<1"
33
33
  requests = "^2.32.3"
34
34
  easyocr = "^1.7"
35
- docling-parse = "^1.1.3"
35
+ docling-parse = "^1.2.0"
36
36
  certifi = ">=2024.7.4"
37
37
  rtree = "^1.3.0"
38
38
  scipy = "^1.14.1"
39
+ pyarrow = "^16.1.0"
39
40
 
40
41
  [tool.poetry.group.dev.dependencies]
41
42
  black = {extras = ["jupyter"], version = "^24.4.2"}
@@ -51,6 +52,10 @@ types-requests = "^2.31.0.2"
51
52
  flake8-pyproject = "^1.2.3"
52
53
  pylint = "^2.17.5"
53
54
 
55
+
56
+ [tool.poetry.group.examples.dependencies]
57
+ datasets = "^2.21.0"
58
+
54
59
  [build-system]
55
60
  requires = ["poetry-core"]
56
61
  build-backend = "poetry.core.masonry.api"
File without changes
File without changes