docling 1.12.2__py3-none-any.whl → 1.13.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/datamodel/document.py +18 -8
- docling/utils/export.py +2 -63
- {docling-1.12.2.dist-info → docling-1.13.1.dist-info}/METADATA +5 -3
- {docling-1.12.2.dist-info → docling-1.13.1.dist-info}/RECORD +7 -7
- {docling-1.12.2.dist-info → docling-1.13.1.dist-info}/LICENSE +0 -0
- {docling-1.12.2.dist-info → docling-1.13.1.dist-info}/WHEEL +0 -0
- {docling-1.12.2.dist-info → docling-1.13.1.dist-info}/entry_points.txt +0 -0
docling/datamodel/document.py
CHANGED
@@ -368,20 +368,30 @@ class ConvertedDocument(BaseModel):
|
|
368
368
|
"table",
|
369
369
|
"figure",
|
370
370
|
],
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
371
|
+
xsize: int = 100,
|
372
|
+
ysize: int = 100,
|
373
|
+
add_location: bool = True,
|
374
|
+
add_content: bool = True,
|
375
|
+
add_page_index: bool = True,
|
376
|
+
# table specific flags
|
377
|
+
add_table_cell_location: bool = False,
|
378
|
+
add_table_cell_label: bool = True,
|
379
|
+
add_table_cell_text: bool = True,
|
375
380
|
) -> str:
|
376
381
|
return self.output.export_to_document_tokens(
|
377
382
|
delim=delim,
|
378
383
|
main_text_start=main_text_start,
|
379
384
|
main_text_stop=main_text_stop,
|
380
385
|
main_text_labels=main_text_labels,
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
386
|
+
xsize=xsize,
|
387
|
+
ysize=ysize,
|
388
|
+
add_location=add_location,
|
389
|
+
add_content=add_content,
|
390
|
+
add_page_index=add_page_index,
|
391
|
+
# table specific flags
|
392
|
+
add_table_cell_location=add_table_cell_location,
|
393
|
+
add_table_cell_label=add_table_cell_label,
|
394
|
+
add_table_cell_text=add_table_cell_text,
|
385
395
|
)
|
386
396
|
|
387
397
|
def render_element_images(
|
docling/utils/export.py
CHANGED
@@ -9,67 +9,6 @@ from docling.datamodel.document import ConversionResult, Page
|
|
9
9
|
_log = logging.getLogger(__name__)
|
10
10
|
|
11
11
|
|
12
|
-
def _export_table_to_html(table: Table):
|
13
|
-
|
14
|
-
# TODO: this is flagged as internal, because we will move it
|
15
|
-
# to the docling-core package.
|
16
|
-
|
17
|
-
def _get_tablecell_span(cell: TableCell, ix):
|
18
|
-
if cell.spans is None:
|
19
|
-
span = set()
|
20
|
-
else:
|
21
|
-
span = set([s[ix] for s in cell.spans])
|
22
|
-
if len(span) == 0:
|
23
|
-
return 1, None, None
|
24
|
-
return len(span), min(span), max(span)
|
25
|
-
|
26
|
-
body = ""
|
27
|
-
nrows = table.num_rows
|
28
|
-
ncols = table.num_cols
|
29
|
-
|
30
|
-
if table.data is None:
|
31
|
-
return ""
|
32
|
-
for i in range(nrows):
|
33
|
-
body += "<tr>"
|
34
|
-
for j in range(ncols):
|
35
|
-
cell: TableCell = table.data[i][j]
|
36
|
-
|
37
|
-
rowspan, rowstart, rowend = _get_tablecell_span(cell, 0)
|
38
|
-
colspan, colstart, colend = _get_tablecell_span(cell, 1)
|
39
|
-
|
40
|
-
if rowstart is not None and rowstart != i:
|
41
|
-
continue
|
42
|
-
if colstart is not None and colstart != j:
|
43
|
-
continue
|
44
|
-
|
45
|
-
if rowstart is None:
|
46
|
-
rowstart = i
|
47
|
-
if colstart is None:
|
48
|
-
colstart = j
|
49
|
-
|
50
|
-
content = cell.text.strip()
|
51
|
-
label = cell.obj_type
|
52
|
-
label_class = "body"
|
53
|
-
celltag = "td"
|
54
|
-
if label in ["row_header", "row_multi_header", "row_title"]:
|
55
|
-
label_class = "header"
|
56
|
-
elif label in ["col_header", "col_multi_header"]:
|
57
|
-
label_class = "header"
|
58
|
-
celltag = "th"
|
59
|
-
|
60
|
-
opening_tag = f"{celltag}"
|
61
|
-
if rowspan > 1:
|
62
|
-
opening_tag += f' rowspan="{rowspan}"'
|
63
|
-
if colspan > 1:
|
64
|
-
opening_tag += f' colspan="{colspan}"'
|
65
|
-
|
66
|
-
body += f"<{opening_tag}>{content}</{celltag}>"
|
67
|
-
body += "</tr>"
|
68
|
-
body = f"<table>{body}</table>"
|
69
|
-
|
70
|
-
return body
|
71
|
-
|
72
|
-
|
73
12
|
def generate_multimodal_pages(
|
74
13
|
doc_result: ConversionResult,
|
75
14
|
) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
|
@@ -129,7 +68,7 @@ def generate_multimodal_pages(
|
|
129
68
|
}
|
130
69
|
|
131
70
|
if isinstance(item, Table):
|
132
|
-
table_html = _export_table_to_html(item)
|
71
|
+
table_html = item.export_to_html()
|
133
72
|
new_segment["data"].append(
|
134
73
|
{
|
135
74
|
"html_seq": table_html,
|
@@ -172,7 +111,7 @@ def generate_multimodal_pages(
|
|
172
111
|
)
|
173
112
|
# No page-tagging since we only do 1 page at the time
|
174
113
|
content_dt = doc.export_to_document_tokens(
|
175
|
-
main_text_start=start_ix, main_text_stop=end_ix,
|
114
|
+
main_text_start=start_ix, main_text_stop=end_ix, add_page_index=False
|
176
115
|
)
|
177
116
|
|
178
117
|
return content_text, content_md, content_dt, page_cells, page_segments, page
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 1.12.2
|
3
|
+
Version: 1.13.1
|
4
4
|
Summary: Docling PDF conversion package
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -21,8 +21,8 @@ Classifier: Programming Language :: Python :: 3.12
|
|
21
21
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
22
22
|
Provides-Extra: examples
|
23
23
|
Requires-Dist: certifi (>=2024.7.4)
|
24
|
-
Requires-Dist: deepsearch-glm (>=0.21.
|
25
|
-
Requires-Dist: docling-core (>=1.
|
24
|
+
Requires-Dist: deepsearch-glm (>=0.21.1,<0.22.0)
|
25
|
+
Requires-Dist: docling-core (>=1.5.0,<2.0.0)
|
26
26
|
Requires-Dist: docling-ibm-models (>=1.2.0,<2.0.0)
|
27
27
|
Requires-Dist: docling-parse (>=1.2.0,<2.0.0)
|
28
28
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
@@ -122,7 +122,9 @@ from docling.document_converter import DocumentConverter
|
|
122
122
|
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
|
123
123
|
converter = DocumentConverter()
|
124
124
|
result = converter.convert_single(source)
|
125
|
+
|
125
126
|
print(result.render_as_markdown()) # output: "## Docling Technical Report[...]"
|
127
|
+
print(result.render_as_doctags()) # output: "<document><title><page_1><loc_20>..."
|
126
128
|
```
|
127
129
|
|
128
130
|
### Convert a batch of documents
|
@@ -7,7 +7,7 @@ docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
7
|
docling/cli/main.py,sha256=VUzm4vOijPo2F2Ht20zTnMI5alJLixfC5WK2NJCbyng,8492
|
8
8
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
9
9
|
docling/datamodel/base_models.py,sha256=tE2Sxoe3e_fBZjq3GDo2NCughDMU5xDeAfkQgT72TRI,9168
|
10
|
-
docling/datamodel/document.py,sha256=
|
10
|
+
docling/datamodel/document.py,sha256=7HnPXTin5r_XvIxbqPe7uV6keIr90RhXGGo22uHbTeA,16064
|
11
11
|
docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
|
12
12
|
docling/document_converter.py,sha256=5OiNafoaVcQhZ8ATF69xRp2KyFyKeSMhmwEFUoCzP-k,10980
|
13
13
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -21,11 +21,11 @@ docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
|
|
21
21
|
docling/pipeline/base_model_pipeline.py,sha256=H5XoADpsJEZls8BI3FnppR2ubltkQwf_er4Qr74rdQ8,561
|
22
22
|
docling/pipeline/standard_model_pipeline.py,sha256=UTjyaEXvz9htYZz-IMTkn11cZwNjgvo_Fl2dfBVnRQs,1442
|
23
23
|
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
24
|
-
docling/utils/export.py,sha256=
|
24
|
+
docling/utils/export.py,sha256=bKLdbeUcR-rQsGPV1IqJkCHKMCv7X2QOHyxmjNuH3HE,4655
|
25
25
|
docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
|
26
26
|
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
27
|
-
docling-1.12.2.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
28
|
-
docling-1.
|
29
|
-
docling-1.12.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
30
|
-
docling-1.12.2.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
|
31
|
-
docling-1.12.2.dist-info/RECORD,,
|
27
|
+
docling-1.13.1.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
28
|
+
docling-1.13.1.dist-info/METADATA,sha256=YbOdVls3nn2uE7XZPZeeE_irTAYcOqshA9eqmdom8pM,9629
|
29
|
+
docling-1.13.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
30
|
+
docling-1.13.1.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
|
31
|
+
docling-1.13.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|