docling-core 1.3.0__tar.gz → 1.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (49) hide show
  1. {docling_core-1.3.0 → docling_core-1.4.1}/PKG-INFO +1 -1
  2. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/types/doc/base.py +98 -1
  3. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/types/doc/document.py +4 -4
  4. {docling_core-1.3.0 → docling_core-1.4.1}/pyproject.toml +1 -1
  5. {docling_core-1.3.0 → docling_core-1.4.1}/LICENSE +0 -0
  6. {docling_core-1.3.0 → docling_core-1.4.1}/README.md +0 -0
  7. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/__init__.py +0 -0
  8. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/py.typed +0 -0
  9. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/resources/schemas/doc/ANN.json +0 -0
  10. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/resources/schemas/doc/DOC.json +0 -0
  11. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  12. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/resources/schemas/doc/RAW.json +0 -0
  13. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  14. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  15. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  16. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  17. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/search/__init__.py +0 -0
  18. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  19. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/search/mapping.py +0 -0
  20. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/search/meta.py +0 -0
  21. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/search/package.py +0 -0
  22. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/transforms/__init__.py +0 -0
  23. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/transforms/chunker/__init__.py +0 -0
  24. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/transforms/chunker/base.py +0 -0
  25. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  26. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/types/__init__.py +0 -0
  27. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/types/base.py +0 -0
  28. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/types/doc/__init__.py +0 -0
  29. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/types/doc/doc_ann.py +0 -0
  30. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/types/doc/doc_ocr.py +0 -0
  31. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/types/doc/doc_raw.py +0 -0
  32. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/types/gen/__init__.py +0 -0
  33. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/types/gen/generic.py +0 -0
  34. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/types/nlp/__init__.py +0 -0
  35. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/types/nlp/qa.py +0 -0
  36. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/types/nlp/qa_labels.py +0 -0
  37. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/types/rec/__init__.py +0 -0
  38. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/types/rec/attribute.py +0 -0
  39. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/types/rec/base.py +0 -0
  40. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/types/rec/predicate.py +0 -0
  41. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/types/rec/record.py +0 -0
  42. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/types/rec/statement.py +0 -0
  43. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/types/rec/subject.py +0 -0
  44. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/utils/__init__.py +0 -0
  45. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/utils/alias.py +0 -0
  46. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/utils/ds_generate_docs.py +0 -0
  47. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/utils/ds_generate_jsonschema.py +0 -0
  48. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/utils/validate.py +0 -0
  49. {docling_core-1.3.0 → docling_core-1.4.1}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 1.3.0
3
+ Version: 1.4.1
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -4,8 +4,9 @@
4
4
  #
5
5
 
6
6
  """Define common models across CCS objects."""
7
- from typing import Annotated, Literal, Optional, Union
7
+ from typing import Annotated, List, Literal, Optional, Union
8
8
 
9
+ import pandas as pd
9
10
  from pydantic import BaseModel, Field, PositiveInt, StrictStr
10
11
 
11
12
  from docling_core.search.mapping import es_field
@@ -152,6 +153,102 @@ class Table(BaseCell):
152
153
  data: Optional[list[list[Union[GlmTableCell, TableCell]]]] = None
153
154
  model: Optional[str] = None
154
155
 
156
def _get_tablecell_span(self, cell: "TableCell", ix: int):
    """Summarise the extent of a cell along one position of its ``spans``.

    Args:
        cell: Cell whose ``spans`` attribute is inspected.
        ix: Index read from every entry of ``cell.spans``. The HTML export
            of this class passes 0 for the row span and 1 for the column
            span.

    Returns:
        A ``(count, first, last)`` tuple: the number of distinct values
        found at index ``ix``, followed by the smallest and the largest of
        them. When ``spans`` is ``None`` or empty, ``(1, None, None)`` is
        returned.
    """
    # The set removes duplicates, so ``count`` is the number of distinct
    # values at this index rather than the number of span entries.
    span = set() if cell.spans is None else {s[ix] for s in cell.spans}
    if not span:
        return 1, None, None
    return len(span), min(span), max(span)
164
+
165
def export_to_dataframe(self) -> pd.DataFrame:
    """Export the table as a Pandas DataFrame.

    Leading rows that contain at least one ``col_header`` cell are treated
    as header rows. Their cell texts are concatenated per column, joined by
    ``.``, to form the column names. Every row after the header rows
    becomes a row of the DataFrame.

    Returns:
        An empty DataFrame when ``data`` is ``None`` or either dimension is
        zero; otherwise a DataFrame of the cell texts. When no header row
        is found, pandas assigns its default column labels.

    Raises:
        RuntimeError: If an empty row is encountered while scanning for
            header rows.
    """
    if self.data is None or self.num_rows == 0 or self.num_cols == 0:
        return pd.DataFrame()

    # Count how many leading rows are column headers; the scan stops at the
    # first row without any col_header cell.
    num_headers = 0
    for row in self.data:
        if len(row) == 0:
            raise RuntimeError(f"Invalid table. {len(row)=} but {self.num_cols=}.")
        if not any(cell.obj_type == "col_header" for cell in row):
            break
        num_headers += 1

    # Create the column names from all col_headers
    columns: Optional[List[str]] = None
    if num_headers > 0:
        columns = ["" for _ in range(self.num_cols)]
        for header_row in self.data[:num_headers]:
            for j, cell in enumerate(header_row):
                # The first header text is used as-is; later ones are
                # appended with a "." separator.
                columns[j] += cell.text if columns[j] == "" else f".{cell.text}"

    # Create table data from the rows that follow the headers
    table_data = [[cell.text for cell in row] for row in self.data[num_headers:]]

    return pd.DataFrame(table_data, columns=columns)
205
+
206
def export_to_html(self) -> str:
    """Export the table as html.

    Iterates over the ``num_rows`` x ``num_cols`` grid of ``data`` and
    emits one ``<tr>`` per row. Cells labelled ``col_header`` or
    ``col_multi_header`` are written as ``<th>``, all others as ``<td>``.
    ``rowspan``/``colspan`` attributes are added when the span reported by
    ``_get_tablecell_span`` is larger than one.

    Cell text is stripped and HTML-escaped so that characters such as
    ``<``, ``>`` and ``&`` cannot break or inject markup.

    Returns:
        The ``<table>...</table>`` markup, or an empty string when ``data``
        is ``None``.
    """
    # Imported locally so this method does not rely on a module-level import.
    import html

    if self.data is None:
        return ""

    rows_html = []
    for i in range(self.num_rows):
        cells_html = []
        for j in range(self.num_cols):
            cell: TableCell = self.data[i][j]

            rowspan, rowstart, _ = self._get_tablecell_span(cell, 0)
            colspan, colstart, _ = self._get_tablecell_span(cell, 1)

            # Emit a cell only at the grid position that matches the start
            # (smallest row/column index) of its span; other positions that
            # carry the same span are skipped.
            if rowstart is not None and rowstart != i:
                continue
            if colstart is not None and colstart != j:
                continue

            is_col_header = cell.obj_type in ("col_header", "col_multi_header")
            celltag = "th" if is_col_header else "td"

            attrs = ""
            if rowspan > 1:
                attrs += f' rowspan="{rowspan}"'
            if colspan > 1:
                attrs += f' colspan="{colspan}"'

            content = html.escape(cell.text.strip())
            cells_html.append(f"<{celltag}{attrs}>{content}</{celltag}>")

        rows_html.append(f"<tr>{''.join(cells_html)}</tr>")

    return f"<table>{''.join(rows_html)}</table>"
251
+
155
252
 
156
253
  # FIXME: let's add some figure specific data-types later
157
254
  class Figure(BaseCell):
@@ -410,21 +410,21 @@ class DocumentToken(Enum):
410
410
  special_tokens = [token.value for token in cls]
411
411
 
412
412
  # Adding dynamically generated row and col tokens
413
- for i in range(0, max_rows):
413
+ for i in range(0, max_rows + 1):
414
414
  special_tokens += [f"<row_{i}>", f"</row_{i}>"]
415
415
 
416
- for i in range(0, max_cols):
416
+ for i in range(0, max_cols + 1):
417
417
  special_tokens += [f"<col_{i}>", f"</col_{i}>"]
418
418
 
419
419
  for i in range(6):
420
420
  special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]
421
421
 
422
422
  # Adding dynamically generated page-tokens
423
- for i in range(0, max_pages):
423
+ for i in range(0, max_pages + 1):
424
424
  special_tokens.append(f"<page_{i}>")
425
425
 
426
426
  # Adding dynamically generated location-tokens
427
- for i in range(0, max(page_dimension[0], page_dimension[1])):
427
+ for i in range(0, max(page_dimension[0] + 1, page_dimension[1] + 1)):
428
428
  special_tokens.append(f"<loc_{i}>")
429
429
 
430
430
  return special_tokens
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling-core"
3
- version = "1.3.0"
3
+ version = "1.4.1"
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  authors = [
File without changes
File without changes