docling 2.0.0__py3-none-any.whl → 2.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -71,92 +71,101 @@ class TableStructureModel(BasePageModel):
71
71
 
72
72
  for page in page_batch:
73
73
  assert page._backend is not None
74
- assert page.predictions.layout is not None
75
- assert page.size is not None
76
-
77
- page.predictions.tablestructure = TableStructurePrediction() # dummy
78
-
79
- in_tables = [
80
- (
81
- cluster,
82
- [
83
- round(cluster.bbox.l) * self.scale,
84
- round(cluster.bbox.t) * self.scale,
85
- round(cluster.bbox.r) * self.scale,
86
- round(cluster.bbox.b) * self.scale,
87
- ],
88
- )
89
- for cluster in page.predictions.layout.clusters
90
- if cluster.label == DocItemLabel.TABLE
91
- ]
92
- if not len(in_tables):
74
+ if not page._backend.is_valid():
93
75
  yield page
94
- continue
95
-
96
- tokens = []
97
- for c in page.cells:
98
- for cluster, _ in in_tables:
99
- if c.bbox.area() > 0:
100
- if (
101
- c.bbox.intersection_area_with(cluster.bbox) / c.bbox.area()
102
- > 0.2
103
- ):
104
- # Only allow non empty stings (spaces) into the cells of a table
105
- if len(c.text.strip()) > 0:
106
- new_cell = copy.deepcopy(c)
107
- new_cell.bbox = new_cell.bbox.scaled(scale=self.scale)
108
-
109
- tokens.append(new_cell.model_dump())
110
-
111
- page_input = {
112
- "tokens": tokens,
113
- "width": page.size.width * self.scale,
114
- "height": page.size.height * self.scale,
115
- }
116
- page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
117
-
118
- table_clusters, table_bboxes = zip(*in_tables)
119
-
120
- if len(table_bboxes):
121
- tf_output = self.tf_predictor.multi_table_predict(
122
- page_input, table_bboxes, do_matching=self.do_cell_matching
123
- )
124
-
125
- for table_cluster, table_out in zip(table_clusters, tf_output):
126
- table_cells = []
127
- for element in table_out["tf_responses"]:
128
-
129
- if not self.do_cell_matching:
130
- the_bbox = BoundingBox.model_validate(
131
- element["bbox"]
132
- ).scaled(1 / self.scale)
133
- text_piece = page._backend.get_text_in_rect(the_bbox)
134
- element["bbox"]["token"] = text_piece
135
-
136
- tc = TableCell.model_validate(element)
137
- if self.do_cell_matching and tc.bbox is not None:
138
- tc.bbox = tc.bbox.scaled(1 / self.scale)
139
- table_cells.append(tc)
140
-
141
- # Retrieving cols/rows, after post processing:
142
- num_rows = table_out["predict_details"]["num_rows"]
143
- num_cols = table_out["predict_details"]["num_cols"]
144
- otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]
145
-
146
- tbl = Table(
147
- otsl_seq=otsl_seq,
148
- table_cells=table_cells,
149
- num_rows=num_rows,
150
- num_cols=num_cols,
151
- id=table_cluster.id,
152
- page_no=page.page_no,
153
- cluster=table_cluster,
154
- label=DocItemLabel.TABLE,
76
+ else:
77
+
78
+ assert page.predictions.layout is not None
79
+ assert page.size is not None
80
+
81
+ page.predictions.tablestructure = TableStructurePrediction() # dummy
82
+
83
+ in_tables = [
84
+ (
85
+ cluster,
86
+ [
87
+ round(cluster.bbox.l) * self.scale,
88
+ round(cluster.bbox.t) * self.scale,
89
+ round(cluster.bbox.r) * self.scale,
90
+ round(cluster.bbox.b) * self.scale,
91
+ ],
92
+ )
93
+ for cluster in page.predictions.layout.clusters
94
+ if cluster.label == DocItemLabel.TABLE
95
+ ]
96
+ if not len(in_tables):
97
+ yield page
98
+ continue
99
+
100
+ tokens = []
101
+ for c in page.cells:
102
+ for cluster, _ in in_tables:
103
+ if c.bbox.area() > 0:
104
+ if (
105
+ c.bbox.intersection_area_with(cluster.bbox)
106
+ / c.bbox.area()
107
+ > 0.2
108
+ ):
109
+ # Only allow non empty stings (spaces) into the cells of a table
110
+ if len(c.text.strip()) > 0:
111
+ new_cell = copy.deepcopy(c)
112
+ new_cell.bbox = new_cell.bbox.scaled(
113
+ scale=self.scale
114
+ )
115
+
116
+ tokens.append(new_cell.model_dump())
117
+
118
+ page_input = {
119
+ "tokens": tokens,
120
+ "width": page.size.width * self.scale,
121
+ "height": page.size.height * self.scale,
122
+ }
123
+ page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
124
+
125
+ table_clusters, table_bboxes = zip(*in_tables)
126
+
127
+ if len(table_bboxes):
128
+ tf_output = self.tf_predictor.multi_table_predict(
129
+ page_input, table_bboxes, do_matching=self.do_cell_matching
155
130
  )
156
131
 
157
- page.predictions.tablestructure.table_map[table_cluster.id] = tbl
158
-
159
- # For debugging purposes:
160
- # self.draw_table_and_cells(page, page.predictions.tablestructure.table_map.values())
132
+ for table_cluster, table_out in zip(table_clusters, tf_output):
133
+ table_cells = []
134
+ for element in table_out["tf_responses"]:
135
+
136
+ if not self.do_cell_matching:
137
+ the_bbox = BoundingBox.model_validate(
138
+ element["bbox"]
139
+ ).scaled(1 / self.scale)
140
+ text_piece = page._backend.get_text_in_rect(the_bbox)
141
+ element["bbox"]["token"] = text_piece
142
+
143
+ tc = TableCell.model_validate(element)
144
+ if self.do_cell_matching and tc.bbox is not None:
145
+ tc.bbox = tc.bbox.scaled(1 / self.scale)
146
+ table_cells.append(tc)
147
+
148
+ # Retrieving cols/rows, after post processing:
149
+ num_rows = table_out["predict_details"]["num_rows"]
150
+ num_cols = table_out["predict_details"]["num_cols"]
151
+ otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]
152
+
153
+ tbl = Table(
154
+ otsl_seq=otsl_seq,
155
+ table_cells=table_cells,
156
+ num_rows=num_rows,
157
+ num_cols=num_cols,
158
+ id=table_cluster.id,
159
+ page_no=page.page_no,
160
+ cluster=table_cluster,
161
+ label=DocItemLabel.TABLE,
162
+ )
163
+
164
+ page.predictions.tablestructure.table_map[table_cluster.id] = (
165
+ tbl
166
+ )
167
+
168
+ # For debugging purposes:
169
+ # self.draw_table_and_cells(page, page.predictions.tablestructure.table_map.values())
161
170
 
162
- yield page
171
+ yield page
@@ -110,61 +110,65 @@ class TesseractOcrCliModel(BaseOcrModel):
110
110
 
111
111
  for page in page_batch:
112
112
  assert page._backend is not None
113
+ if not page._backend.is_valid():
114
+ yield page
115
+ else:
116
+ ocr_rects = self.get_ocr_rects(page)
117
+
118
+ all_ocr_cells = []
119
+ for ocr_rect in ocr_rects:
120
+ # Skip zero area boxes
121
+ if ocr_rect.area() == 0:
122
+ continue
123
+ high_res_image = page._backend.get_page_image(
124
+ scale=self.scale, cropbox=ocr_rect
125
+ )
113
126
 
114
- ocr_rects = self.get_ocr_rects(page)
115
-
116
- all_ocr_cells = []
117
- for ocr_rect in ocr_rects:
118
- # Skip zero area boxes
119
- if ocr_rect.area() == 0:
120
- continue
121
- high_res_image = page._backend.get_page_image(
122
- scale=self.scale, cropbox=ocr_rect
123
- )
124
-
125
- with tempfile.NamedTemporaryFile(suffix=".png", mode="w") as image_file:
126
- fname = image_file.name
127
- high_res_image.save(fname)
128
-
129
- df = self._run_tesseract(fname)
130
-
131
- # _log.info(df)
132
-
133
- # Print relevant columns (bounding box and text)
134
- for ix, row in df.iterrows():
135
- text = row["text"]
136
- conf = row["conf"]
137
-
138
- l = float(row["left"])
139
- b = float(row["top"])
140
- w = float(row["width"])
141
- h = float(row["height"])
142
-
143
- t = b + h
144
- r = l + w
145
-
146
- cell = OcrCell(
147
- id=ix,
148
- text=text,
149
- confidence=conf / 100.0,
150
- bbox=BoundingBox.from_tuple(
151
- coord=(
152
- (l / self.scale) + ocr_rect.l,
153
- (b / self.scale) + ocr_rect.t,
154
- (r / self.scale) + ocr_rect.l,
155
- (t / self.scale) + ocr_rect.t,
127
+ with tempfile.NamedTemporaryFile(
128
+ suffix=".png", mode="w"
129
+ ) as image_file:
130
+ fname = image_file.name
131
+ high_res_image.save(fname)
132
+
133
+ df = self._run_tesseract(fname)
134
+
135
+ # _log.info(df)
136
+
137
+ # Print relevant columns (bounding box and text)
138
+ for ix, row in df.iterrows():
139
+ text = row["text"]
140
+ conf = row["conf"]
141
+
142
+ l = float(row["left"])
143
+ b = float(row["top"])
144
+ w = float(row["width"])
145
+ h = float(row["height"])
146
+
147
+ t = b + h
148
+ r = l + w
149
+
150
+ cell = OcrCell(
151
+ id=ix,
152
+ text=text,
153
+ confidence=conf / 100.0,
154
+ bbox=BoundingBox.from_tuple(
155
+ coord=(
156
+ (l / self.scale) + ocr_rect.l,
157
+ (b / self.scale) + ocr_rect.t,
158
+ (r / self.scale) + ocr_rect.l,
159
+ (t / self.scale) + ocr_rect.t,
160
+ ),
161
+ origin=CoordOrigin.TOPLEFT,
156
162
  ),
157
- origin=CoordOrigin.TOPLEFT,
158
- ),
159
- )
160
- all_ocr_cells.append(cell)
163
+ )
164
+ all_ocr_cells.append(cell)
161
165
 
162
- ## Remove OCR cells which overlap with programmatic cells.
163
- filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
166
+ ## Remove OCR cells which overlap with programmatic cells.
167
+ filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
164
168
 
165
- page.cells.extend(filtered_ocr_cells)
169
+ page.cells.extend(filtered_ocr_cells)
166
170
 
167
- # DEBUG code:
168
- # self.draw_ocr_rects_and_cells(page, ocr_rects)
171
+ # DEBUG code:
172
+ # self.draw_ocr_rects_and_cells(page, ocr_rects)
169
173
 
170
- yield page
174
+ yield page
@@ -69,57 +69,62 @@ class TesseractOcrModel(BaseOcrModel):
69
69
 
70
70
  for page in page_batch:
71
71
  assert page._backend is not None
72
- assert self.reader is not None
72
+ if not page._backend.is_valid():
73
+ yield page
74
+ else:
75
+ assert self.reader is not None
73
76
 
74
- ocr_rects = self.get_ocr_rects(page)
77
+ ocr_rects = self.get_ocr_rects(page)
75
78
 
76
- all_ocr_cells = []
77
- for ocr_rect in ocr_rects:
78
- # Skip zero area boxes
79
- if ocr_rect.area() == 0:
80
- continue
81
- high_res_image = page._backend.get_page_image(
82
- scale=self.scale, cropbox=ocr_rect
83
- )
79
+ all_ocr_cells = []
80
+ for ocr_rect in ocr_rects:
81
+ # Skip zero area boxes
82
+ if ocr_rect.area() == 0:
83
+ continue
84
+ high_res_image = page._backend.get_page_image(
85
+ scale=self.scale, cropbox=ocr_rect
86
+ )
84
87
 
85
- # Retrieve text snippets with their bounding boxes
86
- self.reader.SetImage(high_res_image)
87
- boxes = self.reader.GetComponentImages(self.reader_RIL.TEXTLINE, True)
88
-
89
- cells = []
90
- for ix, (im, box, _, _) in enumerate(boxes):
91
- # Set the area of interest. Tesseract uses Bottom-Left for the origin
92
- self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
93
-
94
- # Extract text within the bounding box
95
- text = self.reader.GetUTF8Text().strip()
96
- confidence = self.reader.MeanTextConf()
97
- left = box["x"] / self.scale
98
- bottom = box["y"] / self.scale
99
- right = (box["x"] + box["w"]) / self.scale
100
- top = (box["y"] + box["h"]) / self.scale
101
-
102
- cells.append(
103
- OcrCell(
104
- id=ix,
105
- text=text,
106
- confidence=confidence,
107
- bbox=BoundingBox.from_tuple(
108
- coord=(left, top, right, bottom),
109
- origin=CoordOrigin.TOPLEFT,
110
- ),
111
- )
88
+ # Retrieve text snippets with their bounding boxes
89
+ self.reader.SetImage(high_res_image)
90
+ boxes = self.reader.GetComponentImages(
91
+ self.reader_RIL.TEXTLINE, True
112
92
  )
113
93
 
114
- # del high_res_image
115
- all_ocr_cells.extend(cells)
94
+ cells = []
95
+ for ix, (im, box, _, _) in enumerate(boxes):
96
+ # Set the area of interest. Tesseract uses Bottom-Left for the origin
97
+ self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
98
+
99
+ # Extract text within the bounding box
100
+ text = self.reader.GetUTF8Text().strip()
101
+ confidence = self.reader.MeanTextConf()
102
+ left = box["x"] / self.scale
103
+ bottom = box["y"] / self.scale
104
+ right = (box["x"] + box["w"]) / self.scale
105
+ top = (box["y"] + box["h"]) / self.scale
106
+
107
+ cells.append(
108
+ OcrCell(
109
+ id=ix,
110
+ text=text,
111
+ confidence=confidence,
112
+ bbox=BoundingBox.from_tuple(
113
+ coord=(left, top, right, bottom),
114
+ origin=CoordOrigin.TOPLEFT,
115
+ ),
116
+ )
117
+ )
118
+
119
+ # del high_res_image
120
+ all_ocr_cells.extend(cells)
116
121
 
117
- ## Remove OCR cells which overlap with programmatic cells.
118
- filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
122
+ ## Remove OCR cells which overlap with programmatic cells.
123
+ filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
119
124
 
120
- page.cells.extend(filtered_ocr_cells)
125
+ page.cells.extend(filtered_ocr_cells)
121
126
 
122
- # DEBUG code:
123
- # self.draw_ocr_rects_and_cells(page, ocr_rects)
127
+ # DEBUG code:
128
+ # self.draw_ocr_rects_and_cells(page, ocr_rects)
124
129
 
125
- yield page
130
+ yield page
@@ -134,13 +134,13 @@ class StandardPdfPipeline(PaginatedPipeline):
134
134
  all_body = []
135
135
 
136
136
  for p in conv_res.pages:
137
- assert p.assembled is not None
138
- for el in p.assembled.body:
139
- all_body.append(el)
140
- for el in p.assembled.headers:
141
- all_headers.append(el)
142
- for el in p.assembled.elements:
143
- all_elements.append(el)
137
+ if p.assembled is not None:
138
+ for el in p.assembled.body:
139
+ all_body.append(el)
140
+ for el in p.assembled.headers:
141
+ all_headers.append(el)
142
+ for el in p.assembled.elements:
143
+ all_elements.append(el)
144
144
 
145
145
  conv_res.assembled = AssembledUnit(
146
146
  elements=all_elements, headers=all_headers, body=all_body
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.0.0
3
+ Version: 2.2.0
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -22,13 +22,14 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
22
  Provides-Extra: tesserocr
23
23
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
24
24
  Requires-Dist: certifi (>=2024.7.4)
25
- Requires-Dist: deepsearch-glm (>=0.25.0,<0.26.0)
26
- Requires-Dist: docling-core (>=2.0.0,<3.0.0)
25
+ Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
26
+ Requires-Dist: docling-core (>=2.1.0,<3.0.0)
27
27
  Requires-Dist: docling-ibm-models (>=2.0.1,<3.0.0)
28
- Requires-Dist: docling-parse (>=1.6.0,<2.0.0)
28
+ Requires-Dist: docling-parse (>=2.0.0,<3.0.0)
29
29
  Requires-Dist: easyocr (>=1.7,<2.0)
30
30
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
31
31
  Requires-Dist: huggingface_hub (>=0.23,<1)
32
+ Requires-Dist: marko (>=2.1.2,<3.0.0)
32
33
  Requires-Dist: pandas (>=2.1.4,<3.0.0)
33
34
  Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
34
35
  Requires-Dist: pydantic (>=2.0.0,<3.0.0)
@@ -50,7 +51,7 @@ Description-Content-Type: text/markdown
50
51
 
51
52
  <p align="center">
52
53
  <a href="https://github.com/ds4sd/docling">
53
- <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/docs/assets/logo.png" width="150" />
54
+ <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/docs/assets/docling_processing.png" width="100%"/>
54
55
  </a>
55
56
  </p>
56
57
 
@@ -69,6 +70,7 @@ Description-Content-Type: text/markdown
69
70
 
70
71
  Docling parses documents and exports them to the desired format with ease and speed.
71
72
 
73
+
72
74
  ## Features
73
75
 
74
76
  * 🗂️ Multi-format support for input (PDF, DOCX etc.) & output (Markdown, JSON etc.)
@@ -94,16 +96,15 @@ More [detailed installation instructions](https://ds4sd.github.io/docling/instal
94
96
 
95
97
  ## Getting started
96
98
 
97
- To convert invidual documents, use `convert()`, for example:
99
+ To convert individual documents, use `convert()`, for example:
98
100
 
99
101
  ```python
100
102
  from docling.document_converter import DocumentConverter
101
103
 
102
- source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
104
+ source = "https://arxiv.org/pdf/2408.09869" # document per local path or URL
103
105
  converter = DocumentConverter()
104
106
  result = converter.convert(source)
105
107
  print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
106
- print(result.document.export_to_document_tokens()) # output: "<document><title><page_1><loc_20>..."
107
108
  ```
108
109
 
109
110
 
@@ -144,6 +145,6 @@ If you use Docling in your projects, please consider citing the following:
144
145
 
145
146
  ## License
146
147
 
147
- The Docling codebase is under MIT license.
148
+ The Docling codebase is under MIT license.
148
149
  For individual model usage, please refer to the model licenses found in the original packages.
149
150
 
@@ -0,0 +1,44 @@
1
+ docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ docling/backend/abstract_backend.py,sha256=-or6kWVV7egQeyIuN-vI0Tr7Q1htalBZSlhgq_G2RdU,1678
4
+ docling/backend/asciidoc_backend.py,sha256=WW0eIanPIObcg5ci9YcnqFxwipmqRFsRY8zjZDdKvJA,14116
5
+ docling/backend/docling_parse_backend.py,sha256=TaIMli9vePd3fz9L6S4t75JPYZDpgYBLRGfWjbc9Hbk,7632
6
+ docling/backend/docling_parse_v2_backend.py,sha256=QlVU8NgqKvVCa99E8oDa2Xvy__kq30C-myGY3o9Qoq4,8588
7
+ docling/backend/html_backend.py,sha256=wfh5PWEwoqsCXxFCQbFBdJvEtlqZhXgqfPfTYETWHfE,14974
8
+ docling/backend/md_backend.py,sha256=osYiNLnep9UgLq8mUH9bmwG3kP9RXxt69I8LlyeJN6g,11505
9
+ docling/backend/mspowerpoint_backend.py,sha256=J472AIH_IXvGg3D0FDmXhue1At_VSBD6n15c64Kxttw,15446
10
+ docling/backend/msword_backend.py,sha256=6bY0ebOaeSbpskUJY5t5pOf4a2VclWzeHeSo-vzsaO0,17470
11
+ docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
12
+ docling/backend/pypdfium2_backend.py,sha256=MJX6fQqwK3r967fyAAs-RA_YIkeQvhgsLkQAgaBTgaE,8995
13
+ docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
+ docling/cli/main.py,sha256=NRVGz0z-3EBwYNMJGVnLtDBcfOeutaUyYdkM0ymRnGA,8008
15
+ docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
+ docling/datamodel/base_models.py,sha256=Mx0xR6YmRP8thu8CjOxjbGHLUJctqIvFwRZQ-8tQowY,5380
17
+ docling/datamodel/document.py,sha256=mkPXDms9jtPFY1pfBSicNaVRZwbbfzYFUj0dJDbMgG8,20612
18
+ docling/datamodel/pipeline_options.py,sha256=WNjluKC-Ww63ifkGMHwws8zIDHnOS1z5Hw7_j3S0qao,2446
19
+ docling/datamodel/settings.py,sha256=KBFVeQviR1hoCFjA1ZwuLuQ6EAAYR7saIa6EUYiOkHI,767
20
+ docling/document_converter.py,sha256=T-Y2pWwbCIofW209XJ3wlc5TiGeQqMbDqgzcVWyZ_0Y,10227
21
+ docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
+ docling/models/base_model.py,sha256=wSBGAIAbLqrqP_SMtkzXMuyFvvzjVU6iCqgSNnGIR4Y,603
23
+ docling/models/base_ocr_model.py,sha256=SYelQRValiUo6M_p_9-J7CqNIOFO-EkK58j90SMsKQY,5028
24
+ docling/models/ds_glm_model.py,sha256=vJLngchZonqFzGWbUr2izFSXk9DloPDhAfN2c3nkzNU,11254
25
+ docling/models/easyocr_model.py,sha256=YfvdodjZ20WuOfouQXJmDyQL78QDOqWYsWSs2zSxWFc,3327
26
+ docling/models/layout_model.py,sha256=zd2ULW3U6v9OJl4TnjWFEY6Q2O-lBfrIqtvrnDzF7HU,12596
27
+ docling/models/page_assemble_model.py,sha256=LOKHho-r-RpeIVh8CpJ9tid_QIp5um3ukcrucZsyUlY,6645
28
+ docling/models/page_preprocessing_model.py,sha256=cfhUIlGAGaX1RxILi69ZEV9Kmhhd3Y0XaSlQnGo18o4,1964
29
+ docling/models/table_structure_model.py,sha256=YWSZKOz56gvicjTzVgSE-8Z_hI3NcRD5EN0yOUoM-_g,6979
30
+ docling/models/tesseract_ocr_cli_model.py,sha256=fKc05V73ibMvAeuA4PForhYNtunpT5rR0k_xHZsew-E,5980
31
+ docling/models/tesseract_ocr_model.py,sha256=v6td0vq8NogePuRTJRZhKF0DtZXITj70r9rKJKO5u9k,4984
32
+ docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
33
+ docling/pipeline/base_pipeline.py,sha256=7DTzVvM_jVHCxyY-BuuGRhmUsD_sgX4DD00oBFJWdB8,6723
34
+ docling/pipeline/simple_pipeline.py,sha256=pxce0-3He5Lqa-xXT-7h173XVOSMZiMHl6HOfAJmQ7o,2162
35
+ docling/pipeline/standard_pdf_pipeline.py,sha256=AVNSxGc6kPmBPDLWDc9eI8fryc25eOtiIVrOyVhZMZM,7527
36
+ docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
37
+ docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
38
+ docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
39
+ docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
40
+ docling-2.2.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
41
+ docling-2.2.0.dist-info/METADATA,sha256=TkaywA2l2ImdMc9WpUYWUQy3n50zG9Y9eC7ziElBlU0,6205
42
+ docling-2.2.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
43
+ docling-2.2.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
44
+ docling-2.2.0.dist-info/RECORD,,
@@ -1,42 +0,0 @@
1
- docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- docling/backend/abstract_backend.py,sha256=8Lh1gf1P9AnzlwB989OVBgLmokTpfI0LxYRfuvYTqoo,1646
4
- docling/backend/docling_parse_backend.py,sha256=UgBpopZIP5YkhwhybiqDnqVsSqv9DAAPFkafhfL0pPo,7623
5
- docling/backend/docling_parse_v2_backend.py,sha256=VY7MsiyqjN3Vl0UkyezriiVJMLbLRrQVuKjWaTgIUwY,8336
6
- docling/backend/html_backend.py,sha256=MlhEXaA0tgX_tLuQLnkex43gsKqpqHWnbkssxY4n_kc,14753
7
- docling/backend/mspowerpoint_backend.py,sha256=2UYfMMeWwgDtvIKQELCA-bYv5Z-rGvbMiBNcidNL_uE,14332
8
- docling/backend/msword_backend.py,sha256=4SDqZAZxLr6VV50OU3MRBAV8SwZMCyJCUbNVMVUpitc,17659
9
- docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
10
- docling/backend/pypdfium2_backend.py,sha256=MJX6fQqwK3r967fyAAs-RA_YIkeQvhgsLkQAgaBTgaE,8995
11
- docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
- docling/cli/main.py,sha256=NRVGz0z-3EBwYNMJGVnLtDBcfOeutaUyYdkM0ymRnGA,8008
13
- docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
- docling/datamodel/base_models.py,sha256=Ha-DoRZoksjHSZHWqUSiQ79MTBEfY5ur8U_LVtyBRYU,5153
15
- docling/datamodel/document.py,sha256=GCARkUuv8TNtFO934E7KujOsTkBFqLXX5bogNprVXEM,19411
16
- docling/datamodel/pipeline_options.py,sha256=mez7CiJMtm-xhOmZ-2-M_Q3YwC6EzHytWfg0E3tiVio,2329
17
- docling/datamodel/settings.py,sha256=KBFVeQviR1hoCFjA1ZwuLuQ6EAAYR7saIa6EUYiOkHI,767
18
- docling/document_converter.py,sha256=S_t9hs2uZfXC38LC0hTaAihrSJIrCvnTiuY5SvUccgk,9587
19
- docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
- docling/models/base_model.py,sha256=wSBGAIAbLqrqP_SMtkzXMuyFvvzjVU6iCqgSNnGIR4Y,603
21
- docling/models/base_ocr_model.py,sha256=N5pOQ4RQSWPU-bPZ81FySDdBnwNG64-6K0ldK6ENU0U,4672
22
- docling/models/ds_glm_model.py,sha256=nUBHTsE-eRtrtPE6v_N4iZGr43bXIsOfb_8NFUMWJQk,11057
23
- docling/models/easyocr_model.py,sha256=URhHzxwnBuErf6sskWyEWauX-Kne0upnrAguzKQi3SI,3090
24
- docling/models/layout_model.py,sha256=B4Veff9V0WxcQXTBYzJM6rE7B3lszUI7zmg7EFE0WxU,12245
25
- docling/models/page_assemble_model.py,sha256=ovwSki52w1rlrc7MgMbjh1Uc5H8XBCz9S2nHE44mzYU,6030
26
- docling/models/page_preprocessing_model.py,sha256=PJ_jASz3w0Lus_Ep4NN5Vq_Redq7x8vAyVR8qXCb6Eg,1817
27
- docling/models/table_structure_model.py,sha256=qcjXXiNZcMWjr6ys02sToKZlAr8S0rAJNICbBjK9Ijo,6426
28
- docling/models/tesseract_ocr_cli_model.py,sha256=l-gRDU273opgack9fAxHaXPEdX5IdD5ZTnu6VsfKIWc,5665
29
- docling/models/tesseract_ocr_model.py,sha256=tEEq-URSYnyQru7RoD5fx-s1trwMxPCcwJx94M4iuxc,4676
30
- docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
- docling/pipeline/base_pipeline.py,sha256=7DTzVvM_jVHCxyY-BuuGRhmUsD_sgX4DD00oBFJWdB8,6723
32
- docling/pipeline/simple_pipeline.py,sha256=pxce0-3He5Lqa-xXT-7h173XVOSMZiMHl6HOfAJmQ7o,2162
33
- docling/pipeline/standard_pdf_pipeline.py,sha256=_gRGR9tsy55_tptFj-AiEJEedxhJ0iIjHb5qaj36d28,7506
34
- docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
35
- docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
36
- docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
37
- docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
38
- docling-2.0.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
39
- docling-2.0.0.dist-info/METADATA,sha256=RyawmIT2dz9la0DH8KsW749TNq4BpiSIndVEz83wauQ,6235
40
- docling-2.0.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
41
- docling-2.0.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
42
- docling-2.0.0.dist-info/RECORD,,