docling 1.8.4__py3-none-any.whl → 1.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -71,6 +71,15 @@ class BoundingBox(BaseModel):
71
71
 
72
72
  return out_bbox
73
73
 
74
def normalized(self, page_size: PageSize) -> "BoundingBox":
    """Return a copy of this box with its coordinates divided by the page size.

    The horizontal edges (``l``, ``r``) are divided by ``page_size.width``
    and the vertical edges (``t``, ``b``) by ``page_size.height``.  The
    receiver is not modified: all changes are made on a deep copy, which
    is what gets returned.
    """
    scaled = copy.deepcopy(self)

    # Horizontal extent first, then vertical -- same order as the edges
    # are listed above.
    scaled.l = scaled.l / page_size.width
    scaled.r = scaled.r / page_size.width
    scaled.t = scaled.t / page_size.height
    scaled.b = scaled.b / page_size.height

    return scaled
82
+
74
83
  def as_tuple(self):
75
84
  if self.coord_origin == CoordOrigin.TOPLEFT:
76
85
  return (self.l, self.t, self.r, self.b)
@@ -238,9 +247,9 @@ class EquationPrediction(BaseModel):
238
247
 
239
248
  class PagePredictions(BaseModel):
240
249
  layout: LayoutPrediction = None
241
- tablestructure: TableStructurePrediction = None
242
- figures_classification: FigureClassificationPrediction = None
243
- equations_prediction: EquationPrediction = None
250
+ tablestructure: Optional[TableStructurePrediction] = None
251
+ figures_classification: Optional[FigureClassificationPrediction] = None
252
+ equations_prediction: Optional[EquationPrediction] = None
244
253
 
245
254
 
246
255
  PageElement = Union[TextElement, TableElement, FigureElement]
@@ -16,8 +16,12 @@ from docling.datamodel.document import ConversionResult
16
16
  class GlmModel:
17
17
  def __init__(self, config):
18
18
  self.config = config
19
+ self.model_names = self.config.get(
20
+ "model_names", ""
21
+ ) # "language;term;reference"
19
22
  load_pretrained_nlp_models()
20
- model = init_nlp_model(model_names="language;term;reference")
23
+ # model = init_nlp_model(model_names="language;term;reference")
24
+ model = init_nlp_model(model_names=self.model_names)
21
25
  self.model = model
22
26
 
23
27
  def __call__(self, conv_res: ConversionResult) -> DsDocument:
@@ -44,7 +44,16 @@ class TableStructureModel:
44
44
 
45
45
  for tc in table_element.table_cells:
46
46
  x0, y0, x1, y1 = tc.bbox.as_tuple()
47
- draw.rectangle([(x0, y0), (x1, y1)], outline="blue")
47
+ if tc.column_header:
48
+ width = 3
49
+ else:
50
+ width = 1
51
+ draw.rectangle([(x0, y0), (x1, y1)], outline="blue", width=width)
52
+ draw.text(
53
+ (x0 + 3, y0 + 3),
54
+ text=f"{tc.start_row_offset_idx}, {tc.start_col_offset_idx}",
55
+ fill="black",
56
+ )
48
57
 
49
58
  image.show()
50
59
 
@@ -0,0 +1,193 @@
1
+ import logging
2
+ from typing import Any, Dict, Iterable, List, Tuple
3
+
4
+ from docling_core.types.doc.base import BaseCell, Ref, Table, TableCell
5
+
6
+ from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell
7
+ from docling.datamodel.document import ConvertedDocument, Page
8
+
9
+ _log = logging.getLogger(__name__)
10
+
11
+
12
+ def _export_table_to_html(table: Table):
13
+
14
+ # TODO: this is flagged as internal, because we will move it
15
+ # to the docling-core package.
16
+
17
+ def _get_tablecell_span(cell: TableCell, ix):
18
+ span = set([s[ix] for s in cell.spans])
19
+ if len(span) == 0:
20
+ return 1, None, None
21
+ return len(span), min(span), max(span)
22
+
23
+ body = ""
24
+ nrows = table.num_rows
25
+ ncols = table.num_cols
26
+
27
+ for i in range(nrows):
28
+ body += "<tr>"
29
+ for j in range(ncols):
30
+ cell: TableCell = table.data[i][j]
31
+
32
+ rowspan, rowstart, rowend = _get_tablecell_span(cell, 0)
33
+ colspan, colstart, colend = _get_tablecell_span(cell, 1)
34
+
35
+ if rowstart is not None and rowstart != i:
36
+ continue
37
+ if colstart is not None and colstart != j:
38
+ continue
39
+
40
+ if rowstart is None:
41
+ rowstart = i
42
+ if colstart is None:
43
+ colstart = j
44
+
45
+ content = cell.text.strip()
46
+ label = cell.obj_type
47
+ label_class = "body"
48
+ celltag = "td"
49
+ if label in ["row_header", "row_multi_header", "row_title"]:
50
+ label_class = "header"
51
+ elif label in ["col_header", "col_multi_header"]:
52
+ label_class = "header"
53
+ celltag = "th"
54
+
55
+ opening_tag = f"{celltag}"
56
+ if rowspan > 1:
57
+ opening_tag += f' rowspan="{rowspan}"'
58
+ if colspan > 1:
59
+ opening_tag += f' colspan="{colspan}"'
60
+
61
+ body += f"<{opening_tag}>{content}</{celltag}>"
62
+ body += "</tr>"
63
+ body = f"<table>{body}</table>"
64
+
65
+ return body
66
+
67
+
68
def generate_multimodal_pages(
    doc_result: ConvertedDocument,
) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
    """Yield one tuple of text, Markdown, cells and segments per document page.

    ``doc_result.output.main_text`` is walked in order and consecutive items
    are grouped by the page of their first provenance entry.  Whenever an
    item on a later page is met, and once more after the last item, a tuple
    is yielded:

    ``(content_text, content_md, page_cells, page_segments, page)``

    * ``content_text`` -- the non-empty item texts, each followed by a space.
    * ``content_md`` -- ``export_to_markdown`` over the page's item indices.
    * ``page_cells`` -- one dict per entry of ``page.cells`` with keys
      ``text``, ``bbox``, ``ocr`` and ``ocr_confidence``.
    * ``page_segments`` -- one dict per item with a known label, with keys
      ``index_in_doc``, ``label``, ``text``, ``bbox`` and ``data``.
    * ``page`` -- ``doc_result.pages[page_no - 1]``.

    Items that resolve to ``None`` or have no provenance are skipped.  All
    bounding boxes are converted to a top-left origin and divided by the
    page size.
    """
    # Maps the item ``obj_type`` to the label written into each segment;
    # items whose type is not listed here produce no segment.
    label_to_doclaynet = {
        "title": "title",
        "table-of-contents": "document_index",
        "subtitle-level-1": "section_header",
        "checkbox-selected": "checkbox_selected",
        "checkbox-unselected": "checkbox_unselected",
        "caption": "caption",
        "page-header": "page_header",
        "page-footer": "page_footer",
        "footnote": "footnote",
        "table": "table",
        "formula": "formula",
        "list-item": "list_item",
        "code": "code",
        "figure": "picture",
        "picture": "picture",
        "reference": "text",
        "paragraph": "text",
        "text": "text",
    }

    # State of the page currently being accumulated.  The helpers below read
    # these names from the enclosing scope at call time.
    text_chunks: List[str] = []
    page_no = 0
    start_ix = 0
    end_ix = 0
    doc_items = []

    doc = doc_result.output

    def _process_page_segments(doc_items: List[Tuple[int, BaseCell]], page: Page):
        """Build the segment dicts for the ``(index, item)`` pairs of one page."""
        segments = []

        for ix, item in doc_items:
            label = label_to_doclaynet.get(item.obj_type)
            if label is None:
                continue

            # The provenance bbox is read as bottom-left based, then flipped
            # to a top-left origin and scaled by the page size.
            bbox = BoundingBox.from_tuple(
                item.prov[0].bbox, origin=CoordOrigin.BOTTOMLEFT
            )
            new_bbox = bbox.to_top_left_origin(page_height=page.size.height).normalized(
                page_size=page.size
            )

            new_segment = {
                "index_in_doc": ix,
                "label": label,
                "text": item.text if item.text is not None else "",
                "bbox": new_bbox.as_tuple(),
                "data": [],
            }

            if isinstance(item, Table):
                new_segment["data"].append(
                    {
                        "html_seq": _export_table_to_html(item),
                        "otsl_seq": "",
                    }
                )

            segments.append(new_segment)

        return segments

    def _process_page_cells(page: Page):
        """Build one dict per cell of ``page`` with text, bbox and OCR info."""
        cells = []
        for cell in page.cells:
            new_bbox = cell.bbox.to_top_left_origin(
                page_height=page.size.height
            ).normalized(page_size=page.size)
            is_ocr = isinstance(cell, OcrCell)
            cells.append(
                {
                    "text": cell.text,
                    "bbox": new_bbox.as_tuple(),
                    "ocr": is_ocr,
                    # Non-OCR cells are reported with a confidence of 1.0.
                    "ocr_confidence": cell.confidence if is_ocr else 1.0,
                }
            )
        return cells

    def _process_page():
        """Assemble the output tuple for the page accumulated so far."""
        page = doc_result.pages[page_no - 1]

        page_cells = _process_page_cells(page=page)
        page_segments = _process_page_segments(doc_items=doc_items, page=page)
        # NOTE(review): ``end_ix`` is the index of the last item on the page.
        # If ``export_to_markdown`` treats ``main_text_stop`` as exclusive,
        # that last item is left out of ``content_md`` -- confirm against
        # docling-core before changing.
        content_md = doc.export_to_markdown(
            main_text_start=start_ix, main_text_stop=end_ix
        )

        return "".join(text_chunks), content_md, page_cells, page_segments, page

    for ix, orig_item in enumerate(doc.main_text):

        item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item
        if item is None or not item.prov:
            _log.debug("Skipping item %s", orig_item)
            continue

        item_page = item.prov[0].page

        # Page is complete: emit it and start accumulating the next one.
        if page_no > 0 and item_page > page_no:
            yield _process_page()

            start_ix = ix
            doc_items = []
            text_chunks = []

        page_no = item_page
        end_ix = ix
        doc_items.append((ix, item))
        if item.text:
            text_chunks.append(item.text + " ")

    # Emit whatever is left for the final page.
    if doc_items:
        yield _process_page()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 1.8.4
3
+ Version: 1.9.0
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -20,13 +20,14 @@ Classifier: Programming Language :: Python :: 3.11
20
20
  Classifier: Programming Language :: Python :: 3.12
21
21
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
22
  Requires-Dist: certifi (>=2024.7.4)
23
- Requires-Dist: deepsearch-glm (>=0.19.0,<1)
24
- Requires-Dist: docling-core (>=1.1.2,<2.0.0)
23
+ Requires-Dist: deepsearch-glm (>=0.19.1,<0.20.0)
24
+ Requires-Dist: docling-core (>=1.1.3,<2.0.0)
25
25
  Requires-Dist: docling-ibm-models (>=1.1.3,<2.0.0)
26
- Requires-Dist: docling-parse (>=1.1.1,<2.0.0)
26
+ Requires-Dist: docling-parse (>=1.1.3,<2.0.0)
27
27
  Requires-Dist: easyocr (>=1.7,<2.0)
28
28
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
29
29
  Requires-Dist: huggingface_hub (>=0.23,<1)
30
+ Requires-Dist: pyarrow (>=17.0.0,<18.0.0)
30
31
  Requires-Dist: pydantic (>=2.0.0,<3.0.0)
31
32
  Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
32
33
  Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
@@ -62,6 +63,8 @@ Docling bundles PDF document conversion to JSON and Markdown in an easy, self-co
62
63
  * 📝 Extracts metadata from the document, such as title, authors, references and language
63
64
  * 🔍 Optionally applies OCR (use with scanned PDFs)
64
65
 
66
+ Doing RAG or Q/A? Also consider [Quackling](https://github.com/DS4SD/quackling) to get the most out of your documents.
67
+
65
68
  ## Installation
66
69
 
67
70
  To use Docling, simply install `docling` from your package manager, e.g. pip:
@@ -4,24 +4,25 @@ docling/backend/abstract_backend.py,sha256=xfNNiZKksPPa9KAiA-fHD86flg0It4n_29ccp
4
4
  docling/backend/docling_parse_backend.py,sha256=r3aJwsWR7qG47ElhOa9iQJJQauHMt950FfCsf6fhlP4,7480
5
5
  docling/backend/pypdfium2_backend.py,sha256=FggVFitmyMMmLar6vk6XQsavGOPQx95TD14opWYRMAY,8837
6
6
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- docling/datamodel/base_models.py,sha256=F3iF7cQRvdO5RaPnrXballaTvnWkPTXnX-n9N4cpCGo,8842
7
+ docling/datamodel/base_models.py,sha256=PSJe_Qlh2VJfijg3kkXOOqZbi_uqRHCmLjX__c5Buck,9155
8
8
  docling/datamodel/document.py,sha256=cG9RuAkFXCCGZqCHmhUtYeOA5PV6gjO3Y4i5lf2IM6I,13649
9
9
  docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
10
10
  docling/document_converter.py,sha256=5OiNafoaVcQhZ8ATF69xRp2KyFyKeSMhmwEFUoCzP-k,10980
11
11
  docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
12
  docling/models/base_ocr_model.py,sha256=Ipl82a3AV2OsgMQSMEMpnWJ6MXcmyIQzmp52PmTaB0g,4465
13
- docling/models/ds_glm_model.py,sha256=BszxBcUZPUFgDqngGLbS5pSRyOCkPxRrCi4zP7Vm8DY,3191
13
+ docling/models/ds_glm_model.py,sha256=inNsmlriiDuqe3Q4LWL2DbqPTScP-3-dFgFoaJprFtQ,3367
14
14
  docling/models/easyocr_model.py,sha256=ABIqALvtNNrDQ47fXaZ0lDFhOwKsYGUUlAPnIsFZgZA,2232
15
15
  docling/models/layout_model.py,sha256=ZFmaLXlRWUfsT1pJCiYVxhQFrBBsiz6Aw0m9GM3UvVM,11249
16
16
  docling/models/page_assemble_model.py,sha256=8eoG2WiFxPxq9TPvM-wkngb2gkr0tdtCRVXg1JcTETo,5550
17
- docling/models/table_structure_model.py,sha256=5jzTlpM-GdCSq4l0vD1W6aSPTJXeTcXEnNuPxnw-DlA,5437
17
+ docling/models/table_structure_model.py,sha256=0wOeiRoma6et7FtoJZw2SA3wBd9-R9ivp5uvXBQqeM4,5768
18
18
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
19
  docling/pipeline/base_model_pipeline.py,sha256=AC5NTR0xLy5JIZqsTINkKEHeCPqpyvJpuE_bcnZhyvI,529
20
20
  docling/pipeline/standard_model_pipeline.py,sha256=UTjyaEXvz9htYZz-IMTkn11cZwNjgvo_Fl2dfBVnRQs,1442
21
21
  docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
+ docling/utils/export.py,sha256=gP8609DtHp6bNGPhYpwe0g3J4qvc2HqQpHZnfl7hQZQ,5899
22
23
  docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
23
24
  docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
24
- docling-1.8.4.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
25
- docling-1.8.4.dist-info/METADATA,sha256=BfG2nwCktriHJ6k_NMw07Q0OmfNBOaY1V2_bFLd_AZA,7883
26
- docling-1.8.4.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
27
- docling-1.8.4.dist-info/RECORD,,
25
+ docling-1.9.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
26
+ docling-1.9.0.dist-info/METADATA,sha256=YV5QVsWcEyeDIYezvMWyFg7csluluDQ2xT7LLT1J6Qg,8051
27
+ docling-1.9.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
28
+ docling-1.9.0.dist-info/RECORD,,