docling 2.0.0__py3-none-any.whl → 2.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/abstract_backend.py +1 -0
- docling/backend/asciidoc_backend.py +435 -0
- docling/backend/docling_parse_backend.py +3 -3
- docling/backend/docling_parse_v2_backend.py +11 -3
- docling/backend/html_backend.py +8 -1
- docling/backend/md_backend.py +293 -0
- docling/backend/mspowerpoint_backend.py +62 -39
- docling/backend/msword_backend.py +3 -10
- docling/datamodel/base_models.py +15 -9
- docling/datamodel/document.py +49 -12
- docling/datamodel/pipeline_options.py +3 -0
- docling/document_converter.py +18 -0
- docling/models/base_ocr_model.py +9 -1
- docling/models/ds_glm_model.py +16 -7
- docling/models/easyocr_model.py +42 -40
- docling/models/layout_model.py +63 -59
- docling/models/page_assemble_model.py +105 -97
- docling/models/page_preprocessing_model.py +7 -3
- docling/models/table_structure_model.py +94 -85
- docling/models/tesseract_ocr_cli_model.py +56 -52
- docling/models/tesseract_ocr_model.py +50 -45
- docling/pipeline/standard_pdf_pipeline.py +7 -7
- {docling-2.0.0.dist-info → docling-2.2.0.dist-info}/METADATA +10 -9
- docling-2.2.0.dist-info/RECORD +44 -0
- docling-2.0.0.dist-info/RECORD +0 -42
- {docling-2.0.0.dist-info → docling-2.2.0.dist-info}/LICENSE +0 -0
- {docling-2.0.0.dist-info → docling-2.2.0.dist-info}/WHEEL +0 -0
- {docling-2.0.0.dist-info → docling-2.2.0.dist-info}/entry_points.txt +0 -0
@@ -71,92 +71,101 @@ class TableStructureModel(BasePageModel):
|
|
71
71
|
|
72
72
|
for page in page_batch:
|
73
73
|
assert page._backend is not None
|
74
|
-
|
75
|
-
assert page.size is not None
|
76
|
-
|
77
|
-
page.predictions.tablestructure = TableStructurePrediction() # dummy
|
78
|
-
|
79
|
-
in_tables = [
|
80
|
-
(
|
81
|
-
cluster,
|
82
|
-
[
|
83
|
-
round(cluster.bbox.l) * self.scale,
|
84
|
-
round(cluster.bbox.t) * self.scale,
|
85
|
-
round(cluster.bbox.r) * self.scale,
|
86
|
-
round(cluster.bbox.b) * self.scale,
|
87
|
-
],
|
88
|
-
)
|
89
|
-
for cluster in page.predictions.layout.clusters
|
90
|
-
if cluster.label == DocItemLabel.TABLE
|
91
|
-
]
|
92
|
-
if not len(in_tables):
|
74
|
+
if not page._backend.is_valid():
|
93
75
|
yield page
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
table_cells=table_cells,
|
149
|
-
num_rows=num_rows,
|
150
|
-
num_cols=num_cols,
|
151
|
-
id=table_cluster.id,
|
152
|
-
page_no=page.page_no,
|
153
|
-
cluster=table_cluster,
|
154
|
-
label=DocItemLabel.TABLE,
|
76
|
+
else:
|
77
|
+
|
78
|
+
assert page.predictions.layout is not None
|
79
|
+
assert page.size is not None
|
80
|
+
|
81
|
+
page.predictions.tablestructure = TableStructurePrediction() # dummy
|
82
|
+
|
83
|
+
in_tables = [
|
84
|
+
(
|
85
|
+
cluster,
|
86
|
+
[
|
87
|
+
round(cluster.bbox.l) * self.scale,
|
88
|
+
round(cluster.bbox.t) * self.scale,
|
89
|
+
round(cluster.bbox.r) * self.scale,
|
90
|
+
round(cluster.bbox.b) * self.scale,
|
91
|
+
],
|
92
|
+
)
|
93
|
+
for cluster in page.predictions.layout.clusters
|
94
|
+
if cluster.label == DocItemLabel.TABLE
|
95
|
+
]
|
96
|
+
if not len(in_tables):
|
97
|
+
yield page
|
98
|
+
continue
|
99
|
+
|
100
|
+
tokens = []
|
101
|
+
for c in page.cells:
|
102
|
+
for cluster, _ in in_tables:
|
103
|
+
if c.bbox.area() > 0:
|
104
|
+
if (
|
105
|
+
c.bbox.intersection_area_with(cluster.bbox)
|
106
|
+
/ c.bbox.area()
|
107
|
+
> 0.2
|
108
|
+
):
|
109
|
+
# Only allow non empty stings (spaces) into the cells of a table
|
110
|
+
if len(c.text.strip()) > 0:
|
111
|
+
new_cell = copy.deepcopy(c)
|
112
|
+
new_cell.bbox = new_cell.bbox.scaled(
|
113
|
+
scale=self.scale
|
114
|
+
)
|
115
|
+
|
116
|
+
tokens.append(new_cell.model_dump())
|
117
|
+
|
118
|
+
page_input = {
|
119
|
+
"tokens": tokens,
|
120
|
+
"width": page.size.width * self.scale,
|
121
|
+
"height": page.size.height * self.scale,
|
122
|
+
}
|
123
|
+
page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
|
124
|
+
|
125
|
+
table_clusters, table_bboxes = zip(*in_tables)
|
126
|
+
|
127
|
+
if len(table_bboxes):
|
128
|
+
tf_output = self.tf_predictor.multi_table_predict(
|
129
|
+
page_input, table_bboxes, do_matching=self.do_cell_matching
|
155
130
|
)
|
156
131
|
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
132
|
+
for table_cluster, table_out in zip(table_clusters, tf_output):
|
133
|
+
table_cells = []
|
134
|
+
for element in table_out["tf_responses"]:
|
135
|
+
|
136
|
+
if not self.do_cell_matching:
|
137
|
+
the_bbox = BoundingBox.model_validate(
|
138
|
+
element["bbox"]
|
139
|
+
).scaled(1 / self.scale)
|
140
|
+
text_piece = page._backend.get_text_in_rect(the_bbox)
|
141
|
+
element["bbox"]["token"] = text_piece
|
142
|
+
|
143
|
+
tc = TableCell.model_validate(element)
|
144
|
+
if self.do_cell_matching and tc.bbox is not None:
|
145
|
+
tc.bbox = tc.bbox.scaled(1 / self.scale)
|
146
|
+
table_cells.append(tc)
|
147
|
+
|
148
|
+
# Retrieving cols/rows, after post processing:
|
149
|
+
num_rows = table_out["predict_details"]["num_rows"]
|
150
|
+
num_cols = table_out["predict_details"]["num_cols"]
|
151
|
+
otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]
|
152
|
+
|
153
|
+
tbl = Table(
|
154
|
+
otsl_seq=otsl_seq,
|
155
|
+
table_cells=table_cells,
|
156
|
+
num_rows=num_rows,
|
157
|
+
num_cols=num_cols,
|
158
|
+
id=table_cluster.id,
|
159
|
+
page_no=page.page_no,
|
160
|
+
cluster=table_cluster,
|
161
|
+
label=DocItemLabel.TABLE,
|
162
|
+
)
|
163
|
+
|
164
|
+
page.predictions.tablestructure.table_map[table_cluster.id] = (
|
165
|
+
tbl
|
166
|
+
)
|
167
|
+
|
168
|
+
# For debugging purposes:
|
169
|
+
# self.draw_table_and_cells(page, page.predictions.tablestructure.table_map.values())
|
161
170
|
|
162
|
-
|
171
|
+
yield page
|
@@ -110,61 +110,65 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
110
110
|
|
111
111
|
for page in page_batch:
|
112
112
|
assert page._backend is not None
|
113
|
+
if not page._backend.is_valid():
|
114
|
+
yield page
|
115
|
+
else:
|
116
|
+
ocr_rects = self.get_ocr_rects(page)
|
117
|
+
|
118
|
+
all_ocr_cells = []
|
119
|
+
for ocr_rect in ocr_rects:
|
120
|
+
# Skip zero area boxes
|
121
|
+
if ocr_rect.area() == 0:
|
122
|
+
continue
|
123
|
+
high_res_image = page._backend.get_page_image(
|
124
|
+
scale=self.scale, cropbox=ocr_rect
|
125
|
+
)
|
113
126
|
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
confidence=conf / 100.0,
|
150
|
-
bbox=BoundingBox.from_tuple(
|
151
|
-
coord=(
|
152
|
-
(l / self.scale) + ocr_rect.l,
|
153
|
-
(b / self.scale) + ocr_rect.t,
|
154
|
-
(r / self.scale) + ocr_rect.l,
|
155
|
-
(t / self.scale) + ocr_rect.t,
|
127
|
+
with tempfile.NamedTemporaryFile(
|
128
|
+
suffix=".png", mode="w"
|
129
|
+
) as image_file:
|
130
|
+
fname = image_file.name
|
131
|
+
high_res_image.save(fname)
|
132
|
+
|
133
|
+
df = self._run_tesseract(fname)
|
134
|
+
|
135
|
+
# _log.info(df)
|
136
|
+
|
137
|
+
# Print relevant columns (bounding box and text)
|
138
|
+
for ix, row in df.iterrows():
|
139
|
+
text = row["text"]
|
140
|
+
conf = row["conf"]
|
141
|
+
|
142
|
+
l = float(row["left"])
|
143
|
+
b = float(row["top"])
|
144
|
+
w = float(row["width"])
|
145
|
+
h = float(row["height"])
|
146
|
+
|
147
|
+
t = b + h
|
148
|
+
r = l + w
|
149
|
+
|
150
|
+
cell = OcrCell(
|
151
|
+
id=ix,
|
152
|
+
text=text,
|
153
|
+
confidence=conf / 100.0,
|
154
|
+
bbox=BoundingBox.from_tuple(
|
155
|
+
coord=(
|
156
|
+
(l / self.scale) + ocr_rect.l,
|
157
|
+
(b / self.scale) + ocr_rect.t,
|
158
|
+
(r / self.scale) + ocr_rect.l,
|
159
|
+
(t / self.scale) + ocr_rect.t,
|
160
|
+
),
|
161
|
+
origin=CoordOrigin.TOPLEFT,
|
156
162
|
),
|
157
|
-
|
158
|
-
)
|
159
|
-
)
|
160
|
-
all_ocr_cells.append(cell)
|
163
|
+
)
|
164
|
+
all_ocr_cells.append(cell)
|
161
165
|
|
162
|
-
|
163
|
-
|
166
|
+
## Remove OCR cells which overlap with programmatic cells.
|
167
|
+
filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
|
164
168
|
|
165
|
-
|
169
|
+
page.cells.extend(filtered_ocr_cells)
|
166
170
|
|
167
|
-
|
168
|
-
|
171
|
+
# DEBUG code:
|
172
|
+
# self.draw_ocr_rects_and_cells(page, ocr_rects)
|
169
173
|
|
170
|
-
|
174
|
+
yield page
|
@@ -69,57 +69,62 @@ class TesseractOcrModel(BaseOcrModel):
|
|
69
69
|
|
70
70
|
for page in page_batch:
|
71
71
|
assert page._backend is not None
|
72
|
-
|
72
|
+
if not page._backend.is_valid():
|
73
|
+
yield page
|
74
|
+
else:
|
75
|
+
assert self.reader is not None
|
73
76
|
|
74
|
-
|
77
|
+
ocr_rects = self.get_ocr_rects(page)
|
75
78
|
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
79
|
+
all_ocr_cells = []
|
80
|
+
for ocr_rect in ocr_rects:
|
81
|
+
# Skip zero area boxes
|
82
|
+
if ocr_rect.area() == 0:
|
83
|
+
continue
|
84
|
+
high_res_image = page._backend.get_page_image(
|
85
|
+
scale=self.scale, cropbox=ocr_rect
|
86
|
+
)
|
84
87
|
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
cells = []
|
90
|
-
for ix, (im, box, _, _) in enumerate(boxes):
|
91
|
-
# Set the area of interest. Tesseract uses Bottom-Left for the origin
|
92
|
-
self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
|
93
|
-
|
94
|
-
# Extract text within the bounding box
|
95
|
-
text = self.reader.GetUTF8Text().strip()
|
96
|
-
confidence = self.reader.MeanTextConf()
|
97
|
-
left = box["x"] / self.scale
|
98
|
-
bottom = box["y"] / self.scale
|
99
|
-
right = (box["x"] + box["w"]) / self.scale
|
100
|
-
top = (box["y"] + box["h"]) / self.scale
|
101
|
-
|
102
|
-
cells.append(
|
103
|
-
OcrCell(
|
104
|
-
id=ix,
|
105
|
-
text=text,
|
106
|
-
confidence=confidence,
|
107
|
-
bbox=BoundingBox.from_tuple(
|
108
|
-
coord=(left, top, right, bottom),
|
109
|
-
origin=CoordOrigin.TOPLEFT,
|
110
|
-
),
|
111
|
-
)
|
88
|
+
# Retrieve text snippets with their bounding boxes
|
89
|
+
self.reader.SetImage(high_res_image)
|
90
|
+
boxes = self.reader.GetComponentImages(
|
91
|
+
self.reader_RIL.TEXTLINE, True
|
112
92
|
)
|
113
93
|
|
114
|
-
|
115
|
-
|
94
|
+
cells = []
|
95
|
+
for ix, (im, box, _, _) in enumerate(boxes):
|
96
|
+
# Set the area of interest. Tesseract uses Bottom-Left for the origin
|
97
|
+
self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
|
98
|
+
|
99
|
+
# Extract text within the bounding box
|
100
|
+
text = self.reader.GetUTF8Text().strip()
|
101
|
+
confidence = self.reader.MeanTextConf()
|
102
|
+
left = box["x"] / self.scale
|
103
|
+
bottom = box["y"] / self.scale
|
104
|
+
right = (box["x"] + box["w"]) / self.scale
|
105
|
+
top = (box["y"] + box["h"]) / self.scale
|
106
|
+
|
107
|
+
cells.append(
|
108
|
+
OcrCell(
|
109
|
+
id=ix,
|
110
|
+
text=text,
|
111
|
+
confidence=confidence,
|
112
|
+
bbox=BoundingBox.from_tuple(
|
113
|
+
coord=(left, top, right, bottom),
|
114
|
+
origin=CoordOrigin.TOPLEFT,
|
115
|
+
),
|
116
|
+
)
|
117
|
+
)
|
118
|
+
|
119
|
+
# del high_res_image
|
120
|
+
all_ocr_cells.extend(cells)
|
116
121
|
|
117
|
-
|
118
|
-
|
122
|
+
## Remove OCR cells which overlap with programmatic cells.
|
123
|
+
filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
|
119
124
|
|
120
|
-
|
125
|
+
page.cells.extend(filtered_ocr_cells)
|
121
126
|
|
122
|
-
|
123
|
-
|
127
|
+
# DEBUG code:
|
128
|
+
# self.draw_ocr_rects_and_cells(page, ocr_rects)
|
124
129
|
|
125
|
-
|
130
|
+
yield page
|
@@ -134,13 +134,13 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
134
134
|
all_body = []
|
135
135
|
|
136
136
|
for p in conv_res.pages:
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
137
|
+
if p.assembled is not None:
|
138
|
+
for el in p.assembled.body:
|
139
|
+
all_body.append(el)
|
140
|
+
for el in p.assembled.headers:
|
141
|
+
all_headers.append(el)
|
142
|
+
for el in p.assembled.elements:
|
143
|
+
all_elements.append(el)
|
144
144
|
|
145
145
|
conv_res.assembled = AssembledUnit(
|
146
146
|
elements=all_elements, headers=all_headers, body=all_body
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.
|
3
|
+
Version: 2.2.0
|
4
4
|
Summary: Docling PDF conversion package
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -22,13 +22,14 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
22
|
Provides-Extra: tesserocr
|
23
23
|
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
24
24
|
Requires-Dist: certifi (>=2024.7.4)
|
25
|
-
Requires-Dist: deepsearch-glm (>=0.
|
26
|
-
Requires-Dist: docling-core (>=2.
|
25
|
+
Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
|
26
|
+
Requires-Dist: docling-core (>=2.1.0,<3.0.0)
|
27
27
|
Requires-Dist: docling-ibm-models (>=2.0.1,<3.0.0)
|
28
|
-
Requires-Dist: docling-parse (>=
|
28
|
+
Requires-Dist: docling-parse (>=2.0.0,<3.0.0)
|
29
29
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
30
30
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
31
31
|
Requires-Dist: huggingface_hub (>=0.23,<1)
|
32
|
+
Requires-Dist: marko (>=2.1.2,<3.0.0)
|
32
33
|
Requires-Dist: pandas (>=2.1.4,<3.0.0)
|
33
34
|
Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
|
34
35
|
Requires-Dist: pydantic (>=2.0.0,<3.0.0)
|
@@ -50,7 +51,7 @@ Description-Content-Type: text/markdown
|
|
50
51
|
|
51
52
|
<p align="center">
|
52
53
|
<a href="https://github.com/ds4sd/docling">
|
53
|
-
<img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/docs/assets/
|
54
|
+
<img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/docs/assets/docling_processing.png" width="100%"/>
|
54
55
|
</a>
|
55
56
|
</p>
|
56
57
|
|
@@ -69,6 +70,7 @@ Description-Content-Type: text/markdown
|
|
69
70
|
|
70
71
|
Docling parses documents and exports them to the desired format with ease and speed.
|
71
72
|
|
73
|
+
|
72
74
|
## Features
|
73
75
|
|
74
76
|
* 🗂️ Multi-format support for input (PDF, DOCX etc.) & output (Markdown, JSON etc.)
|
@@ -94,16 +96,15 @@ More [detailed installation instructions](https://ds4sd.github.io/docling/instal
|
|
94
96
|
|
95
97
|
## Getting started
|
96
98
|
|
97
|
-
To convert
|
99
|
+
To convert individual documents, use `convert()`, for example:
|
98
100
|
|
99
101
|
```python
|
100
102
|
from docling.document_converter import DocumentConverter
|
101
103
|
|
102
|
-
source = "https://arxiv.org/pdf/2408.09869" #
|
104
|
+
source = "https://arxiv.org/pdf/2408.09869" # document per local path or URL
|
103
105
|
converter = DocumentConverter()
|
104
106
|
result = converter.convert(source)
|
105
107
|
print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
|
106
|
-
print(result.document.export_to_document_tokens()) # output: "<document><title><page_1><loc_20>..."
|
107
108
|
```
|
108
109
|
|
109
110
|
|
@@ -144,6 +145,6 @@ If you use Docling in your projects, please consider citing the following:
|
|
144
145
|
|
145
146
|
## License
|
146
147
|
|
147
|
-
The Docling codebase is under MIT license.
|
148
|
+
The Docling codebase is under MIT license.
|
148
149
|
For individual model usage, please refer to the model licenses found in the original packages.
|
149
150
|
|
@@ -0,0 +1,44 @@
|
|
1
|
+
docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
+
docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
+
docling/backend/abstract_backend.py,sha256=-or6kWVV7egQeyIuN-vI0Tr7Q1htalBZSlhgq_G2RdU,1678
|
4
|
+
docling/backend/asciidoc_backend.py,sha256=WW0eIanPIObcg5ci9YcnqFxwipmqRFsRY8zjZDdKvJA,14116
|
5
|
+
docling/backend/docling_parse_backend.py,sha256=TaIMli9vePd3fz9L6S4t75JPYZDpgYBLRGfWjbc9Hbk,7632
|
6
|
+
docling/backend/docling_parse_v2_backend.py,sha256=QlVU8NgqKvVCa99E8oDa2Xvy__kq30C-myGY3o9Qoq4,8588
|
7
|
+
docling/backend/html_backend.py,sha256=wfh5PWEwoqsCXxFCQbFBdJvEtlqZhXgqfPfTYETWHfE,14974
|
8
|
+
docling/backend/md_backend.py,sha256=osYiNLnep9UgLq8mUH9bmwG3kP9RXxt69I8LlyeJN6g,11505
|
9
|
+
docling/backend/mspowerpoint_backend.py,sha256=J472AIH_IXvGg3D0FDmXhue1At_VSBD6n15c64Kxttw,15446
|
10
|
+
docling/backend/msword_backend.py,sha256=6bY0ebOaeSbpskUJY5t5pOf4a2VclWzeHeSo-vzsaO0,17470
|
11
|
+
docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
|
12
|
+
docling/backend/pypdfium2_backend.py,sha256=MJX6fQqwK3r967fyAAs-RA_YIkeQvhgsLkQAgaBTgaE,8995
|
13
|
+
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
14
|
+
docling/cli/main.py,sha256=NRVGz0z-3EBwYNMJGVnLtDBcfOeutaUyYdkM0ymRnGA,8008
|
15
|
+
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
16
|
+
docling/datamodel/base_models.py,sha256=Mx0xR6YmRP8thu8CjOxjbGHLUJctqIvFwRZQ-8tQowY,5380
|
17
|
+
docling/datamodel/document.py,sha256=mkPXDms9jtPFY1pfBSicNaVRZwbbfzYFUj0dJDbMgG8,20612
|
18
|
+
docling/datamodel/pipeline_options.py,sha256=WNjluKC-Ww63ifkGMHwws8zIDHnOS1z5Hw7_j3S0qao,2446
|
19
|
+
docling/datamodel/settings.py,sha256=KBFVeQviR1hoCFjA1ZwuLuQ6EAAYR7saIa6EUYiOkHI,767
|
20
|
+
docling/document_converter.py,sha256=T-Y2pWwbCIofW209XJ3wlc5TiGeQqMbDqgzcVWyZ_0Y,10227
|
21
|
+
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
22
|
+
docling/models/base_model.py,sha256=wSBGAIAbLqrqP_SMtkzXMuyFvvzjVU6iCqgSNnGIR4Y,603
|
23
|
+
docling/models/base_ocr_model.py,sha256=SYelQRValiUo6M_p_9-J7CqNIOFO-EkK58j90SMsKQY,5028
|
24
|
+
docling/models/ds_glm_model.py,sha256=vJLngchZonqFzGWbUr2izFSXk9DloPDhAfN2c3nkzNU,11254
|
25
|
+
docling/models/easyocr_model.py,sha256=YfvdodjZ20WuOfouQXJmDyQL78QDOqWYsWSs2zSxWFc,3327
|
26
|
+
docling/models/layout_model.py,sha256=zd2ULW3U6v9OJl4TnjWFEY6Q2O-lBfrIqtvrnDzF7HU,12596
|
27
|
+
docling/models/page_assemble_model.py,sha256=LOKHho-r-RpeIVh8CpJ9tid_QIp5um3ukcrucZsyUlY,6645
|
28
|
+
docling/models/page_preprocessing_model.py,sha256=cfhUIlGAGaX1RxILi69ZEV9Kmhhd3Y0XaSlQnGo18o4,1964
|
29
|
+
docling/models/table_structure_model.py,sha256=YWSZKOz56gvicjTzVgSE-8Z_hI3NcRD5EN0yOUoM-_g,6979
|
30
|
+
docling/models/tesseract_ocr_cli_model.py,sha256=fKc05V73ibMvAeuA4PForhYNtunpT5rR0k_xHZsew-E,5980
|
31
|
+
docling/models/tesseract_ocr_model.py,sha256=v6td0vq8NogePuRTJRZhKF0DtZXITj70r9rKJKO5u9k,4984
|
32
|
+
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
33
|
+
docling/pipeline/base_pipeline.py,sha256=7DTzVvM_jVHCxyY-BuuGRhmUsD_sgX4DD00oBFJWdB8,6723
|
34
|
+
docling/pipeline/simple_pipeline.py,sha256=pxce0-3He5Lqa-xXT-7h173XVOSMZiMHl6HOfAJmQ7o,2162
|
35
|
+
docling/pipeline/standard_pdf_pipeline.py,sha256=AVNSxGc6kPmBPDLWDc9eI8fryc25eOtiIVrOyVhZMZM,7527
|
36
|
+
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
37
|
+
docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
|
38
|
+
docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
|
39
|
+
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
40
|
+
docling-2.2.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
41
|
+
docling-2.2.0.dist-info/METADATA,sha256=TkaywA2l2ImdMc9WpUYWUQy3n50zG9Y9eC7ziElBlU0,6205
|
42
|
+
docling-2.2.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
43
|
+
docling-2.2.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
|
44
|
+
docling-2.2.0.dist-info/RECORD,,
|
docling-2.0.0.dist-info/RECORD
DELETED
@@ -1,42 +0,0 @@
|
|
1
|
-
docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
-
docling/backend/abstract_backend.py,sha256=8Lh1gf1P9AnzlwB989OVBgLmokTpfI0LxYRfuvYTqoo,1646
|
4
|
-
docling/backend/docling_parse_backend.py,sha256=UgBpopZIP5YkhwhybiqDnqVsSqv9DAAPFkafhfL0pPo,7623
|
5
|
-
docling/backend/docling_parse_v2_backend.py,sha256=VY7MsiyqjN3Vl0UkyezriiVJMLbLRrQVuKjWaTgIUwY,8336
|
6
|
-
docling/backend/html_backend.py,sha256=MlhEXaA0tgX_tLuQLnkex43gsKqpqHWnbkssxY4n_kc,14753
|
7
|
-
docling/backend/mspowerpoint_backend.py,sha256=2UYfMMeWwgDtvIKQELCA-bYv5Z-rGvbMiBNcidNL_uE,14332
|
8
|
-
docling/backend/msword_backend.py,sha256=4SDqZAZxLr6VV50OU3MRBAV8SwZMCyJCUbNVMVUpitc,17659
|
9
|
-
docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
|
10
|
-
docling/backend/pypdfium2_backend.py,sha256=MJX6fQqwK3r967fyAAs-RA_YIkeQvhgsLkQAgaBTgaE,8995
|
11
|
-
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
12
|
-
docling/cli/main.py,sha256=NRVGz0z-3EBwYNMJGVnLtDBcfOeutaUyYdkM0ymRnGA,8008
|
13
|
-
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
14
|
-
docling/datamodel/base_models.py,sha256=Ha-DoRZoksjHSZHWqUSiQ79MTBEfY5ur8U_LVtyBRYU,5153
|
15
|
-
docling/datamodel/document.py,sha256=GCARkUuv8TNtFO934E7KujOsTkBFqLXX5bogNprVXEM,19411
|
16
|
-
docling/datamodel/pipeline_options.py,sha256=mez7CiJMtm-xhOmZ-2-M_Q3YwC6EzHytWfg0E3tiVio,2329
|
17
|
-
docling/datamodel/settings.py,sha256=KBFVeQviR1hoCFjA1ZwuLuQ6EAAYR7saIa6EUYiOkHI,767
|
18
|
-
docling/document_converter.py,sha256=S_t9hs2uZfXC38LC0hTaAihrSJIrCvnTiuY5SvUccgk,9587
|
19
|
-
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
20
|
-
docling/models/base_model.py,sha256=wSBGAIAbLqrqP_SMtkzXMuyFvvzjVU6iCqgSNnGIR4Y,603
|
21
|
-
docling/models/base_ocr_model.py,sha256=N5pOQ4RQSWPU-bPZ81FySDdBnwNG64-6K0ldK6ENU0U,4672
|
22
|
-
docling/models/ds_glm_model.py,sha256=nUBHTsE-eRtrtPE6v_N4iZGr43bXIsOfb_8NFUMWJQk,11057
|
23
|
-
docling/models/easyocr_model.py,sha256=URhHzxwnBuErf6sskWyEWauX-Kne0upnrAguzKQi3SI,3090
|
24
|
-
docling/models/layout_model.py,sha256=B4Veff9V0WxcQXTBYzJM6rE7B3lszUI7zmg7EFE0WxU,12245
|
25
|
-
docling/models/page_assemble_model.py,sha256=ovwSki52w1rlrc7MgMbjh1Uc5H8XBCz9S2nHE44mzYU,6030
|
26
|
-
docling/models/page_preprocessing_model.py,sha256=PJ_jASz3w0Lus_Ep4NN5Vq_Redq7x8vAyVR8qXCb6Eg,1817
|
27
|
-
docling/models/table_structure_model.py,sha256=qcjXXiNZcMWjr6ys02sToKZlAr8S0rAJNICbBjK9Ijo,6426
|
28
|
-
docling/models/tesseract_ocr_cli_model.py,sha256=l-gRDU273opgack9fAxHaXPEdX5IdD5ZTnu6VsfKIWc,5665
|
29
|
-
docling/models/tesseract_ocr_model.py,sha256=tEEq-URSYnyQru7RoD5fx-s1trwMxPCcwJx94M4iuxc,4676
|
30
|
-
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
31
|
-
docling/pipeline/base_pipeline.py,sha256=7DTzVvM_jVHCxyY-BuuGRhmUsD_sgX4DD00oBFJWdB8,6723
|
32
|
-
docling/pipeline/simple_pipeline.py,sha256=pxce0-3He5Lqa-xXT-7h173XVOSMZiMHl6HOfAJmQ7o,2162
|
33
|
-
docling/pipeline/standard_pdf_pipeline.py,sha256=_gRGR9tsy55_tptFj-AiEJEedxhJ0iIjHb5qaj36d28,7506
|
34
|
-
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
35
|
-
docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
|
36
|
-
docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
|
37
|
-
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
38
|
-
docling-2.0.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
39
|
-
docling-2.0.0.dist-info/METADATA,sha256=RyawmIT2dz9la0DH8KsW749TNq4BpiSIndVEz83wauQ,6235
|
40
|
-
docling-2.0.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
41
|
-
docling-2.0.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
|
42
|
-
docling-2.0.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|