docling-core 1.2.0__tar.gz → 1.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic; see the package registry's advisory page for details.

Files changed (49)
  1. {docling_core-1.2.0 → docling_core-1.4.0}/PKG-INFO +2 -1
  2. docling_core-1.4.0/docling_core/transforms/__init__.py +6 -0
  3. docling_core-1.4.0/docling_core/transforms/chunker/__init__.py +15 -0
  4. docling_core-1.4.0/docling_core/transforms/chunker/base.py +45 -0
  5. docling_core-1.4.0/docling_core/transforms/chunker/hierarchical_chunker.py +337 -0
  6. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/types/doc/base.py +98 -1
  7. {docling_core-1.2.0 → docling_core-1.4.0}/pyproject.toml +11 -3
  8. {docling_core-1.2.0 → docling_core-1.4.0}/LICENSE +0 -0
  9. {docling_core-1.2.0 → docling_core-1.4.0}/README.md +0 -0
  10. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/__init__.py +0 -0
  11. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/py.typed +0 -0
  12. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
  13. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
  14. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  15. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
  16. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  17. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  18. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  19. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  20. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/search/__init__.py +0 -0
  21. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  22. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/search/mapping.py +0 -0
  23. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/search/meta.py +0 -0
  24. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/search/package.py +0 -0
  25. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/types/__init__.py +0 -0
  26. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/types/base.py +0 -0
  27. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/types/doc/__init__.py +0 -0
  28. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/types/doc/doc_ann.py +0 -0
  29. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/types/doc/doc_ocr.py +0 -0
  30. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/types/doc/doc_raw.py +0 -0
  31. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/types/doc/document.py +0 -0
  32. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/types/gen/__init__.py +0 -0
  33. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/types/gen/generic.py +0 -0
  34. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/types/nlp/__init__.py +0 -0
  35. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/types/nlp/qa.py +0 -0
  36. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/types/nlp/qa_labels.py +0 -0
  37. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/types/rec/__init__.py +0 -0
  38. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/types/rec/attribute.py +0 -0
  39. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/types/rec/base.py +0 -0
  40. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/types/rec/predicate.py +0 -0
  41. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/types/rec/record.py +0 -0
  42. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/types/rec/statement.py +0 -0
  43. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/types/rec/subject.py +0 -0
  44. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/utils/__init__.py +0 -0
  45. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/utils/alias.py +0 -0
  46. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/utils/ds_generate_docs.py +0 -0
  47. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/utils/ds_generate_jsonschema.py +0 -0
  48. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/utils/validate.py +0 -0
  49. {docling_core-1.2.0 → docling_core-1.4.0}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 1.2.0
3
+ Version: 1.4.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -28,6 +28,7 @@ Classifier: Typing :: Typed
28
28
  Requires-Dist: json-schema-for-humans (>=1.0.0,<2.0.0)
29
29
  Requires-Dist: jsonref (>=1.1.0,<2.0.0)
30
30
  Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
31
+ Requires-Dist: pandas (>=2.2.2,<3.0.0)
31
32
  Requires-Dist: pydantic (>=2.6.0,<3.0.0)
32
33
  Requires-Dist: pyproject-toml (>=0.0.10,<0.0.11)
33
34
  Requires-Dist: tabulate (>=0.9.0,<0.10.0)
@@ -0,0 +1,6 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Data transformations package."""
@@ -0,0 +1,15 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Define the chunker types."""
7
+
8
+ from docling_core.transforms.chunker.base import ( # noqa
9
+ BaseChunker,
10
+ Chunk,
11
+ ChunkWithMetadata,
12
+ )
13
+ from docling_core.transforms.chunker.hierarchical_chunker import ( # noqa
14
+ HierarchicalChunker,
15
+ )
@@ -0,0 +1,45 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Define base classes for chunking."""
7
+ from abc import ABC, abstractmethod
8
+ from typing import Iterator, Optional
9
+
10
+ from pydantic import BaseModel
11
+
12
+ from docling_core.types import BoundingBox, Document
13
+
14
+
15
+ class Chunk(BaseModel):
16
+ """Data model for Chunk."""
17
+
18
+ path: str
19
+ text: str
20
+
21
+
22
+ class ChunkWithMetadata(Chunk):
23
+ """Data model for Chunk including metadata."""
24
+
25
+ page: Optional[int]
26
+ bbox: Optional[BoundingBox]
27
+
28
+
29
+ class BaseChunker(BaseModel, ABC):
30
+ """Base class for Chunker."""
31
+
32
+ @abstractmethod
33
+ def chunk(self, dl_doc: Document, **kwargs) -> Iterator[Chunk]:
34
+ """Chunk the provided document.
35
+
36
+ Args:
37
+ dl_doc (Document): document to chunk
38
+
39
+ Raises:
40
+ NotImplementedError: in this abstract implementation
41
+
42
+ Yields:
43
+ Iterator[Chunk]: iterator over extracted chunks
44
+ """
45
+ raise NotImplementedError()
@@ -0,0 +1,337 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Chunker implementation leveraging the document structure."""
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ from enum import Enum
12
+ from typing import Any, Iterator, Optional, Union
13
+
14
+ import pandas as pd
15
+ from pydantic import BaseModel, PositiveInt
16
+
17
+ from docling_core.transforms.chunker import BaseChunker, Chunk, ChunkWithMetadata
18
+ from docling_core.types import BaseText
19
+ from docling_core.types import Document as DLDocument
20
+ from docling_core.types import Ref, Table
21
+
22
+ _logger = logging.getLogger(__name__)
23
+
24
+
25
+ class HierarchicalChunker(BaseChunker):
26
+ """Chunker implementation leveraging the document layout."""
27
+
28
+ include_metadata: bool = True
29
+ min_chunk_len: PositiveInt = 64
30
+
31
+ class _NodeType(str, Enum):
32
+ PARAGRAPH = "paragraph"
33
+ SUBTITLE_LEVEL_1 = "subtitle-level-1"
34
+ TABLE = "table"
35
+ CAPTION = "caption"
36
+
37
+ class _NodeName(str, Enum):
38
+ TITLE = "title"
39
+ REFERENCE = "reference"
40
+ LIST_ITEM = "list-item"
41
+ SUBTITLE_LEVEL_1 = "subtitle-level-1"
42
+
43
+ _allowed_types: list[str] = [
44
+ _NodeType.PARAGRAPH,
45
+ _NodeType.SUBTITLE_LEVEL_1,
46
+ _NodeType.TABLE,
47
+ _NodeType.CAPTION,
48
+ ]
49
+ _disallowed_names_by_type: dict[str, list[str]] = {
50
+ _NodeType.PARAGRAPH: [
51
+ _NodeName.REFERENCE,
52
+ ],
53
+ }
54
+
55
+ @classmethod
56
+ def _norm(cls, text: Optional[str]) -> Optional[str]:
57
+ return text.lower() if text is not None else None
58
+
59
+ @classmethod
60
+ def _convert_table_to_dataframe(cls, table: Table) -> Optional[pd.DataFrame]:
61
+ if table.data:
62
+ table_content = [[cell.text for cell in row] for row in table.data]
63
+ return pd.DataFrame(table_content)
64
+ else:
65
+ return None
66
+
67
+ @classmethod
68
+ def _triplet_serialize(cls, table) -> Optional[str]:
69
+ output_text: Optional[str] = None
70
+ table_df = cls._convert_table_to_dataframe(table)
71
+ if table_df is not None and table_df.shape[0] > 1 and table_df.shape[1] > 1:
72
+ rows = [item.strip() for item in table_df.iloc[:, 0].to_list()]
73
+ cols = [item.strip() for item in table_df.iloc[0, :].to_list()]
74
+ nrows = table_df.shape[0]
75
+ ncols = table_df.shape[1]
76
+ texts = [
77
+ f"{rows[i]}, {cols[j]} = {table_df.iloc[i, j].strip()}"
78
+ for i in range(1, nrows)
79
+ for j in range(1, ncols)
80
+ ]
81
+ output_text = ". ".join(texts)
82
+
83
+ return output_text
84
+
85
+ @classmethod
86
+ def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str:
87
+ return f"$.{path_prefix}[{pos}]"
88
+
89
+ class _MainTextItemNode(BaseModel):
90
+ parent: Optional[int] = None
91
+ children: list[int] = []
92
+
93
+ class _TitleInfo(BaseModel):
94
+ text: str
95
+ path_in_doc: str
96
+
97
+ class _GlobalContext(BaseModel):
98
+ title: Optional[_HC._TitleInfo] = None
99
+
100
+ class _DocContext(BaseModel):
101
+ dmap: dict[int, _HC._MainTextItemNode] # main text element context
102
+ glob: _HC._GlobalContext # global context
103
+
104
+ @classmethod
105
+ def from_doc(cls, doc: DLDocument) -> _HC._DocContext:
106
+ dmap: dict[int, _HC._MainTextItemNode] = {}
107
+ glob: _HC._GlobalContext = _HC._GlobalContext()
108
+ if doc.description.title:
109
+ glob.title = _HC._TitleInfo(
110
+ text=doc.description.title,
111
+ path_in_doc="description.title",
112
+ )
113
+
114
+ parent = None
115
+ if doc.main_text:
116
+ idx = 0
117
+ while idx < len(doc.main_text):
118
+ item = doc.main_text[idx]
119
+ if (
120
+ not glob.title
121
+ and isinstance(item, BaseText)
122
+ and _HC._norm(item.name) == _HC._NodeName.TITLE
123
+ ):
124
+ glob.title = _HC._TitleInfo(
125
+ text=item.text,
126
+ path_in_doc=_HC._create_path(idx),
127
+ )
128
+
129
+ # start of a subtitle-level-1 parent
130
+ if (
131
+ isinstance(item, BaseText)
132
+ and _HC._norm(item.obj_type) == _HC._NodeType.SUBTITLE_LEVEL_1
133
+ ):
134
+ dmap[idx] = _HC._MainTextItemNode(parent=None)
135
+ parent = idx
136
+ if not glob.title:
137
+ glob.title = _HC._TitleInfo(
138
+ text=item.text,
139
+ path_in_doc=_HC._create_path(idx),
140
+ )
141
+
142
+ # start of a list parent
143
+ elif (
144
+ isinstance(item, BaseText)
145
+ and _HC._norm(item.name) != _HC._NodeName.LIST_ITEM
146
+ and idx + 1 < len(doc.main_text)
147
+ and _HC._norm(doc.main_text[idx + 1].name)
148
+ == _HC._NodeName.LIST_ITEM
149
+ ):
150
+ if parent is not None:
151
+ dmap[parent].children.append(idx)
152
+ dmap[idx] = _HC._MainTextItemNode(parent=parent)
153
+
154
+ # have all children register locally
155
+ li = idx + 1
156
+ while (
157
+ li < len(doc.main_text)
158
+ and _HC._norm(doc.main_text[li].name)
159
+ == _HC._NodeName.LIST_ITEM
160
+ ):
161
+ dmap[idx].children.append(li)
162
+ dmap[li] = _HC._MainTextItemNode(parent=idx)
163
+ li += 1
164
+ idx = li
165
+ continue
166
+
167
+ # normal case
168
+ else:
169
+ if parent is not None:
170
+ dmap[parent].children.append(idx)
171
+ dmap[idx] = _HC._MainTextItemNode(parent=parent)
172
+
173
+ idx += 1
174
+ else:
175
+ pass
176
+ return cls(
177
+ dmap=dmap,
178
+ glob=glob,
179
+ )
180
+
181
+ class _TextEntry(BaseModel):
182
+ text: str
183
+ path: str
184
+
185
+ def _build_chunk_impl(
186
+ self, doc: DLDocument, doc_map: _DocContext, idx: int, rec: bool = False
187
+ ) -> list[_TextEntry]:
188
+ if doc.main_text:
189
+ item = doc.main_text[idx]
190
+ item_type = _HC._norm(item.obj_type)
191
+ item_name = _HC._norm(item.name)
192
+ if (
193
+ item_type not in self._allowed_types
194
+ or item_name in self._disallowed_names_by_type.get(item_type, [])
195
+ ):
196
+ return []
197
+
198
+ c2p = doc_map.dmap
199
+
200
+ text_entries: list[_HC._TextEntry] = []
201
+ if (
202
+ isinstance(item, Ref)
203
+ and item_type == _HC._NodeType.TABLE
204
+ and doc.tables
205
+ ):
206
+ # resolve table reference
207
+ ref_nr = int(item.ref.split("/")[2]) # e.g. '#/tables/0'
208
+ table = doc.tables[ref_nr]
209
+ ser_out = _HC._triplet_serialize(table)
210
+ if table.data:
211
+ text_entries = (
212
+ [
213
+ self._TextEntry(
214
+ text=ser_out,
215
+ path=self._create_path(idx),
216
+ )
217
+ ]
218
+ if ser_out
219
+ else []
220
+ )
221
+ else:
222
+ return []
223
+ elif isinstance(item, BaseText):
224
+ text_entries = [
225
+ self._TextEntry(
226
+ text=item.text,
227
+ path=self._create_path(idx),
228
+ )
229
+ ]
230
+
231
+ # squash in any children of type list-item
232
+ if not rec:
233
+ if (
234
+ c2p[idx].children
235
+ and _HC._norm(doc.main_text[c2p[idx].children[0]].name)
236
+ == _HC._NodeName.LIST_ITEM
237
+ ):
238
+ text_entries = text_entries + [
239
+ self._TextEntry(
240
+ text=doc.main_text[c].text, # type: ignore[union-attr]
241
+ path=self._create_path(c),
242
+ )
243
+ for c in c2p[idx].children
244
+ if isinstance(doc.main_text[c], BaseText)
245
+ and _HC._norm(doc.main_text[c].name) == _HC._NodeName.LIST_ITEM
246
+ ]
247
+ elif item_name in [
248
+ _HC._NodeName.LIST_ITEM,
249
+ _HC._NodeName.SUBTITLE_LEVEL_1,
250
+ ]:
251
+ return []
252
+
253
+ if (parent := c2p[idx].parent) is not None:
254
+ # prepend with ancestors
255
+ return (
256
+ self._build_chunk_impl(
257
+ doc=doc, doc_map=doc_map, idx=parent, rec=True
258
+ )
259
+ + text_entries
260
+ )
261
+ else:
262
+ # if root, augment with title (if available and different)
263
+ return text_entries
264
+ else:
265
+ return []
266
+
267
+ def _build_chunk(
268
+ self,
269
+ doc: DLDocument,
270
+ doc_map: _DocContext,
271
+ idx: int,
272
+ delim: str,
273
+ rec: bool = False,
274
+ ) -> Optional[Chunk]:
275
+ texts = self._build_chunk_impl(doc=doc, doc_map=doc_map, idx=idx, rec=rec)
276
+ concat = delim.join([t.text for t in texts if t.text])
277
+ assert doc.main_text is not None
278
+ if len(concat) >= self.min_chunk_len:
279
+ orig_item = doc.main_text[idx]
280
+ item: Union[BaseText, Table]
281
+ if isinstance(orig_item, Ref):
282
+ if _HC._norm(orig_item.obj_type) == _HC._NodeType.TABLE and doc.tables:
283
+ pos = int(orig_item.ref.split("/")[2])
284
+ item = doc.tables[pos]
285
+ path = self._create_path(pos, path_prefix="tables")
286
+ else: # currently disregarding non-table references
287
+ return None
288
+ else:
289
+ item = orig_item
290
+ path = self._create_path(idx)
291
+
292
+ if self.include_metadata:
293
+ return ChunkWithMetadata(
294
+ text=concat,
295
+ path=path,
296
+ page=item.prov[0].page if item.prov else None,
297
+ bbox=item.prov[0].bbox if item.prov else None,
298
+ )
299
+ else:
300
+ return Chunk(
301
+ text=concat,
302
+ path=path,
303
+ )
304
+ else:
305
+ return None
306
+
307
+ def chunk(self, dl_doc: DLDocument, delim="\n", **kwargs: Any) -> Iterator[Chunk]:
308
+ r"""Chunk the provided document.
309
+
310
+ Args:
311
+ dl_doc (DLDocument): document to chunk
312
+ delim (str, optional): delimiter to use when concatenating sub-items.
313
+ Defaults to "\n".
314
+
315
+ Yields:
316
+ Iterator[Chunk]: iterator over extracted chunks
317
+ """
318
+ if dl_doc.main_text:
319
+ # extract doc structure incl. metadata for
320
+ # each item (e.g. parent, children)
321
+ doc_ctx = self._DocContext.from_doc(doc=dl_doc)
322
+ _logger.debug(f"{doc_ctx.model_dump()=}")
323
+
324
+ for i, item in enumerate(dl_doc.main_text):
325
+ if (
326
+ isinstance(item, BaseText)
327
+ or _HC._norm(item.obj_type) == _HC._NodeType.TABLE
328
+ ):
329
+ chunk = self._build_chunk(
330
+ doc=dl_doc, doc_map=doc_ctx, idx=i, delim=delim
331
+ )
332
+ if chunk:
333
+ _logger.info(f"{i=}, {chunk=}")
334
+ yield chunk
335
+
336
+
337
+ _HC = HierarchicalChunker
@@ -4,8 +4,9 @@
4
4
  #
5
5
 
6
6
  """Define common models across CCS objects."""
7
- from typing import Annotated, Literal, Optional, Union
7
+ from typing import Annotated, List, Literal, Optional, Union
8
8
 
9
+ import pandas as pd
9
10
  from pydantic import BaseModel, Field, PositiveInt, StrictStr
10
11
 
11
12
  from docling_core.search.mapping import es_field
@@ -152,6 +153,102 @@ class Table(BaseCell):
152
153
  data: Optional[list[list[Union[GlmTableCell, TableCell]]]] = None
153
154
  model: Optional[str] = None
154
155
 
156
+ def _get_tablecell_span(self, cell: TableCell, ix: int):
157
+ if cell.spans is None:
158
+ span = set()
159
+ else:
160
+ span = set([s[ix] for s in cell.spans])
161
+ if len(span) == 0:
162
+ return 1, None, None
163
+ return len(span), min(span), max(span)
164
+
165
+ def export_to_dataframe(self) -> pd.DataFrame:
166
+ """Export the table as a Pandas DataFrame."""
167
+ if self.data is None or self.num_rows == 0 or self.num_cols == 0:
168
+ return pd.DataFrame()
169
+
170
+ # Count how many rows are column headers
171
+ num_headers = 0
172
+ for i, row in enumerate(self.data):
173
+ if len(row) == 0:
174
+ raise RuntimeError(f"Invalid table. {len(row)=} but {self.num_cols=}.")
175
+
176
+ any_header = False
177
+ for cell in row:
178
+ if cell.obj_type == "col_header":
179
+ any_header = True
180
+ break
181
+
182
+ if any_header:
183
+ num_headers += 1
184
+ else:
185
+ break
186
+
187
+ # Create the column names from all col_headers
188
+ columns: Optional[List[str]] = None
189
+ if num_headers > 0:
190
+ columns = ["" for _ in range(self.num_cols)]
191
+ for i in range(num_headers):
192
+ for j, cell in enumerate(self.data[i]):
193
+ col_name = cell.text
194
+ if columns[j] != "":
195
+ col_name = f".{col_name}"
196
+ columns[j] += col_name
197
+
198
+ # Create table data
199
+ table_data = [[cell.text for cell in row] for row in self.data[num_headers:]]
200
+
201
+ # Create DataFrame
202
+ df = pd.DataFrame(table_data, columns=columns)
203
+
204
+ return df
205
+
206
+ def export_to_html(self) -> str:
207
+ """Export the table as html."""
208
+ body = ""
209
+ nrows = self.num_rows
210
+ ncols = self.num_cols
211
+
212
+ if self.data is None:
213
+ return ""
214
+ for i in range(nrows):
215
+ body += "<tr>"
216
+ for j in range(ncols):
217
+ cell: TableCell = self.data[i][j]
218
+
219
+ rowspan, rowstart, rowend = self._get_tablecell_span(cell, 0)
220
+ colspan, colstart, colend = self._get_tablecell_span(cell, 1)
221
+
222
+ if rowstart is not None and rowstart != i:
223
+ continue
224
+ if colstart is not None and colstart != j:
225
+ continue
226
+
227
+ if rowstart is None:
228
+ rowstart = i
229
+ if colstart is None:
230
+ colstart = j
231
+
232
+ content = cell.text.strip()
233
+ label = cell.obj_type
234
+ celltag = "td"
235
+ if label in ["row_header", "row_multi_header", "row_title"]:
236
+ pass
237
+ elif label in ["col_header", "col_multi_header"]:
238
+ celltag = "th"
239
+
240
+ opening_tag = f"{celltag}"
241
+ if rowspan > 1:
242
+ opening_tag += f' rowspan="{rowspan}"'
243
+ if colspan > 1:
244
+ opening_tag += f' colspan="{colspan}"'
245
+
246
+ body += f"<{opening_tag}>{content}</{celltag}>"
247
+ body += "</tr>"
248
+ body = f"<table>{body}</table>"
249
+
250
+ return body
251
+
155
252
 
156
253
  # FIXME: let's add some figure specific data-types later
157
254
  class Figure(BaseCell):
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling-core"
3
- version = "1.2.0"
3
+ version = "1.4.0"
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  authors = [
@@ -19,7 +19,7 @@ maintainers = [
19
19
  "Peter Staar <taa@zurich.ibm.com>",
20
20
  "Christoph Auer <cau@zurich.ibm.com>",
21
21
  "Michele Dolfi <dol@zurich.ibm.com>",
22
- "Panos Vagenas <pva@zurich.ibm.com>",
22
+ "Panos Vagenas <pva@zurich.ibm.com>",
23
23
  ]
24
24
  readme = "README.md"
25
25
  homepage = "https://ds4sd.github.io/"
@@ -53,6 +53,7 @@ jsonref = "^1.1.0"
53
53
  json-schema-for-humans = "^1.0.0"
54
54
  pyproject-toml = "^0.0.10"
55
55
  tabulate = "^0.9.0"
56
+ pandas = "^2.2.2"
56
57
 
57
58
  [tool.poetry.group.dev.dependencies]
58
59
  black = "^24.4.2"
@@ -111,7 +112,14 @@ python_version = "3.9"
111
112
  plugins = ["pydantic.mypy"]
112
113
 
113
114
  [[tool.mypy.overrides]]
114
- module = ["jsondiff.*", "jsonref.*", "jsonschema.*", "json_schema_for_humans.*", "tabulate.*"]
115
+ module = [
116
+ "jsondiff.*",
117
+ "jsonref.*",
118
+ "jsonschema.*",
119
+ "json_schema_for_humans.*",
120
+ "pandas.*",
121
+ "tabulate.*",
122
+ ]
115
123
  ignore_missing_imports = true
116
124
 
117
125
  [tool.semantic_release]
File without changes
File without changes