docling-core 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic.
- docling_core/types/doc/base.py +205 -12
- docling_core/types/doc/document.py +72 -198
- docling_core/types/doc/tokens.py +202 -0
- docling_core/utils/file.py +54 -0
- {docling_core-1.4.0.dist-info → docling_core-1.5.0.dist-info}/METADATA +1 -1
- {docling_core-1.4.0.dist-info → docling_core-1.5.0.dist-info}/RECORD +9 -7
- {docling_core-1.4.0.dist-info → docling_core-1.5.0.dist-info}/LICENSE +0 -0
- {docling_core-1.4.0.dist-info → docling_core-1.5.0.dist-info}/WHEEL +0 -0
- {docling_core-1.4.0.dist-info → docling_core-1.5.0.dist-info}/entry_points.txt +0 -0
docling_core/types/doc/base.py CHANGED
@@ -10,6 +10,7 @@ import pandas as pd
 from pydantic import BaseModel, Field, PositiveInt, StrictStr
 
 from docling_core.search.mapping import es_field
+from docling_core.types.doc.tokens import DocumentToken
 from docling_core.utils.alias import AliasModel
 
 CellData = tuple[float, float, float, float, str, str]
@@ -132,10 +133,6 @@ class GlmTableCell(TableCell):
 class BaseCell(AliasModel):
     """Base cell."""
 
-    # FIXME: we need to check why we have bounding_box (this should be in prov)
-    bounding_box: Optional[BoundingBoxContainer] = Field(
-        default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
-    )
     prov: Optional[list[Prov]] = None
     text: Optional[str] = Field(
         default=None, json_schema_extra=es_field(term_vector="with_positions_offsets")
@@ -144,6 +141,38 @@ class BaseCell(AliasModel):
         alias="type", json_schema_extra=es_field(type="keyword", ignore_above=8191)
     )
 
+    def get_location_tokens(
+        self,
+        new_line: str,
+        page_w: float,
+        page_h: float,
+        xsize: int = 100,
+        ysize: int = 100,
+        add_page_index: bool = True,
+    ) -> str:
+        """Get the location string for the BaseCell."""
+        if self.prov is None:
+            return ""
+
+        location = ""
+        for prov in self.prov:
+
+            page_i = -1
+            if add_page_index:
+                page_i = prov.page
+
+            loc_str = DocumentToken.get_location(
+                bbox=prov.bbox,
+                page_w=page_w,
+                page_h=page_h,
+                xsize=xsize,
+                ysize=ysize,
+                page_i=page_i,
+            )
+            location += f"{loc_str}{new_line}"
+
+        return location
+
 
 class Table(BaseCell):
     """Table."""
@@ -153,6 +182,11 @@ class Table(BaseCell):
     data: Optional[list[list[Union[GlmTableCell, TableCell]]]] = None
     model: Optional[str] = None
 
+    # FIXME: we need to check why we have bounding_box (this should be in prov)
+    bounding_box: Optional[BoundingBoxContainer] = Field(
+        default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
+    )
+
     def _get_tablecell_span(self, cell: TableCell, ix: int):
         if cell.spans is None:
             span = set()
@@ -249,26 +283,185 @@ class Table(BaseCell):
 
         return body
 
+    def export_to_document_tokens(
+        self,
+        new_line: str = "\n",
+        page_w: float = 0.0,
+        page_h: float = 0.0,
+        xsize: int = 100,
+        ysize: int = 100,
+        add_location: bool = True,
+        add_caption: bool = True,
+        add_content: bool = True,
+        add_cell_location: bool = True,
+        add_cell_label: bool = True,
+        add_cell_text: bool = True,
+        add_page_index: bool = True,
+    ):
+        """Export table to document tokens format."""
+        body = f"{DocumentToken.BEG_TABLE.value}{new_line}"
+
+        if add_location:
+            body += self.get_location_tokens(
+                new_line=new_line,
+                page_w=page_w,
+                page_h=page_h,
+                xsize=xsize,
+                ysize=ysize,
+                add_page_index=add_page_index,
+            )
+
+        if add_caption and self.text is not None and len(self.text) > 0:
+            body += f"{DocumentToken.BEG_CAPTION.value}"
+            body += f"{self.text.strip()}"
+            body += f"{DocumentToken.END_CAPTION.value}"
+            body += f"{new_line}"
+
+        if add_content and self.data is not None and len(self.data) > 0:
+            for i, row in enumerate(self.data):
+                body += f"<row_{i}>"
+                for j, col in enumerate(row):
+
+                    text = ""
+                    if add_cell_text:
+                        text = col.text.strip()
+
+                    cell_loc = ""
+                    if (
+                        col.bbox is not None
+                        and add_cell_location
+                        and add_page_index
+                        and self.prov is not None
+                        and len(self.prov) > 0
+                    ):
+                        cell_loc = DocumentToken.get_location(
+                            bbox=col.bbox,
+                            page_w=page_w,
+                            page_h=page_h,
+                            xsize=xsize,
+                            ysize=ysize,
+                            page_i=self.prov[0].page,
+                        )
+                    elif (
+                        col.bbox is not None
+                        and add_cell_location
+                        and not add_page_index
+                    ):
+                        cell_loc = DocumentToken.get_location(
+                            bbox=col.bbox,
+                            page_w=page_w,
+                            page_h=page_h,
+                            xsize=xsize,
+                            ysize=ysize,
+                            page_i=-1,
+                        )
+
+                    cell_label = ""
+                    if (
+                        add_cell_label
+                        and col.obj_type is not None
+                        and len(col.obj_type) > 0
+                    ):
+                        cell_label = f"<{col.obj_type}>"
+
+                    body += f"<col_{j}>{cell_loc}{cell_label}{text}</col_{j}>"
+
+                body += f"</row_{i}>{new_line}"
+
+        body += f"{DocumentToken.END_TABLE.value}{new_line}"
+
+        return body
+
 
 # FIXME: let's add some figure specific data-types later
 class Figure(BaseCell):
     """Figure."""
 
+    # FIXME: we need to check why we have bounding_box (this should be in prov)
+    bounding_box: Optional[BoundingBoxContainer] = Field(
+        default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
+    )
+
+    def export_to_document_tokens(
+        self,
+        new_line: str = "\n",
+        page_w: float = 0.0,
+        page_h: float = 0.0,
+        xsize: int = 100,
+        ysize: int = 100,
+        add_location: bool = True,
+        add_caption: bool = True,
+        add_content: bool = True,  # not used at the moment
+        add_page_index: bool = True,
+    ):
+        """Export figure to document tokens format."""
+        body = f"{DocumentToken.BEG_FIGURE.value}{new_line}"
+
+        if add_location:
+            body += self.get_location_tokens(
+                new_line=new_line,
+                page_w=page_w,
+                page_h=page_h,
+                xsize=xsize,
+                ysize=ysize,
+                add_page_index=add_page_index,
+            )
+
+        if add_caption and self.text is not None and len(self.text) > 0:
+            body += f"{DocumentToken.BEG_CAPTION.value}"
+            body += f"{self.text.strip()}"
+            body += f"{DocumentToken.END_CAPTION.value}"
+            body += f"{new_line}"
+
+        body += f"{DocumentToken.END_FIGURE.value}{new_line}"
+
+        return body
+
 
-class BaseText(
+class BaseText(BaseCell):
     """Base model for text objects."""
 
-
-        json_schema_extra=es_field(term_vector="with_positions_offsets")
-    )
-    obj_type: StrictStr = Field(
-        alias="type", json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
+    # FIXME: do we need these ???
     name: Optional[StrictStr] = Field(
         default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
     )
     font: Optional[str] = None
-
+
+    def export_to_document_tokens(
+        self,
+        new_line: str = "\n",
+        page_w: float = 0.0,
+        page_h: float = 0.0,
+        xsize: int = 100,
+        ysize: int = 100,
+        add_location: bool = True,
+        add_content: bool = True,
+        add_page_index: bool = True,
+    ):
+        """Export text element to document tokens format."""
+        body = f"<{self.obj_type}>"
+        # body = f"<{self.name}>"
+
+        assert DocumentToken.is_known_token(
+            body
+        ), f"failed DocumentToken.is_known_token({body})"
+
+        if add_location:
+            body += self.get_location_tokens(
+                new_line="",
+                page_w=page_w,
+                page_h=page_h,
+                xsize=xsize,
+                ysize=ysize,
+                add_page_index=add_page_index,
+            )
+
+        if add_content and self.text is not None:
+            body += self.text.strip()
+
+        body += f"</{self.obj_type}>{new_line}"
+
+        return body
 
 
 class ListItem(BaseText):
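The three export_to_document_tokens methods above all delegate location markup to DocumentToken.get_location from the new tokens.py module (diffed further below). A minimal sketch of what that helper emits for one provenance entry; the bbox values and page size here are hypothetical illustration, not data from the package:

    from docling_core.types.doc.tokens import DocumentToken

    # hypothetical cell bbox (x0, y0, x1, y1) on a 612 x 792 pt page
    loc = DocumentToken.get_location(
        bbox=[36.0, 540.0, 294.0, 556.0],
        page_w=612.0,
        page_h=792.0,
        xsize=100,
        ysize=100,
        page_i=1,
    )
    # each coordinate is normalized to a 0..100 grid and rounded:
    print(loc)  # <location><page_1><loc_6><loc_68><loc_48><loc_70></location>

BaseCell.get_location_tokens emits one such string per prov entry, so a text element exported with add_location=True serializes as, e.g., <text><location>...</location>the stripped text</text> (BaseText passes new_line="" to keep the location inline).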
docling_core/types/doc/document.py CHANGED
@@ -6,8 +6,7 @@
 """Models for the Docling Document data type."""
 
 from datetime import datetime
-from enum import Enum
-from typing import Generic, Optional, Tuple, Union
+from typing import Generic, Optional, Union
 
 from pydantic import (
     AnyHttpUrl,
@@ -43,6 +42,7 @@ from docling_core.types.doc.base import (
     S3Data,
     Table,
 )
+from docling_core.types.doc.tokens import DocumentToken
 from docling_core.utils.alias import AliasModel
 
 
@@ -347,107 +347,6 @@ class CCSDocument(
     return data
 
 
-class DocumentToken(Enum):
-    """Class to represent an LLM friendly representation of a Document."""
-
-    BEG_DOCUMENT = "<document>"
-    END_DOCUMENT = "</document>"
-
-    BEG_TITLE = "<title>"
-    END_TITLE = "</title>"
-
-    BEG_ABSTRACT = "<abstract>"
-    END_ABSTRACT = "</abstract>"
-
-    BEG_DOI = "<doi>"
-    END_DOI = "</doi>"
-    BEG_DATE = "<date>"
-    END_DATE = "</date>"
-
-    BEG_AUTHORS = "<authors>"
-    END_AUTHORS = "</authors>"
-    BEG_AUTHOR = "<author>"
-    END_AUTHOR = "</author>"
-
-    BEG_AFFILIATIONS = "<affiliations>"
-    END_AFFILIATIONS = "</affiliations>"
-    BEG_AFFILIATION = "<affiliation>"
-    END_AFFILIATION = "</affiliation>"
-
-    BEG_HEADER = "<section-header>"
-    END_HEADER = "</section-header>"
-    BEG_TEXT = "<text>"
-    END_TEXT = "</text>"
-    BEG_PARAGRAPH = "<paragraph>"
-    END_PARAGRAPH = "</paragraph>"
-    BEG_TABLE = "<table>"
-    END_TABLE = "</table>"
-    BEG_FIGURE = "<figure>"
-    END_FIGURE = "</figure>"
-    BEG_CAPTION = "<caption>"
-    END_CAPTION = "</caption>"
-    BEG_EQUATION = "<equation>"
-    END_EQUATION = "</equation>"
-    BEG_LIST = "<list>"
-    END_LIST = "</list>"
-    BEG_LISTITEM = "<list-item>"
-    END_LISTITEM = "</list-item>"
-
-    BEG_LOCATION = "<location>"
-    END_LOCATION = "</location>"
-    BEG_GROUP = "<group>"
-    END_GROUP = "</group>"
-
-    @classmethod
-    def get_special_tokens(
-        cls,
-        max_rows: int = 100,
-        max_cols: int = 100,
-        max_pages: int = 1000,
-        page_dimension: Tuple[int, int] = (100, 100),
-    ):
-        """Function to get all special document tokens."""
-        special_tokens = [token.value for token in cls]
-
-        # Adding dynamically generated row and col tokens
-        for i in range(0, max_rows):
-            special_tokens += [f"<row_{i}>", f"</row_{i}>"]
-
-        for i in range(0, max_cols):
-            special_tokens += [f"<col_{i}>", f"</col_{i}>"]
-
-        for i in range(6):
-            special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]
-
-        # Adding dynamically generated page-tokens
-        for i in range(0, max_pages):
-            special_tokens.append(f"<page_{i}>")
-
-        # Adding dynamically generated location-tokens
-        for i in range(0, max(page_dimension[0], page_dimension[1])):
-            special_tokens.append(f"<loc_{i}>")
-
-        return special_tokens
-
-    @staticmethod
-    def get_page_token(page: int):
-        """Function to get page tokens."""
-        return f"<page_{page}>"
-
-    @staticmethod
-    def get_location_token(val: float, rnorm: int = 100):
-        """Function to get location tokens."""
-        val_ = round(rnorm * val)
-
-        if val_ < 0:
-            return "<loc_0>"
-
-        if val_ > rnorm:
-            return f"<loc_{rnorm}>"
-
-        return f"<loc_{val_}>"
-
-
 class ExportedCCSDocument(
     MinimalDocument,
     Generic[
@@ -525,6 +424,16 @@ class ExportedCCSDocument(
 
         return result
 
+    def get_map_to_page_dimensions(self):
+        """Get a map from page-index (start at 1) to page-dim [width, height]."""
+        pagedims = {}
+
+        if self.page_dimensions is not None:
+            for _ in self.page_dimensions:
+                pagedims[_.page] = [_.width, _.height]
+
+        return pagedims
+
     def export_to_markdown(
         self,
         delim: str = "\n\n",
@@ -576,7 +485,7 @@ class ExportedCCSDocument(
             text = item.text
 
             # ignore repeated text
-            if prev_text == text:
+            if prev_text == text or text is None:
                 continue
             else:
                 prev_text = text
@@ -649,48 +558,32 @@ class ExportedCCSDocument(
             "table",
             "figure",
         ],
-        location_tagging: bool = True,
-        location_dimensions: Tuple[int, int] = (100, 100),
-        page_tagging: bool = True,
-        add_new_line: bool = True,
+        xsize: int = 100,
+        ysize: int = 100,
+        add_location: bool = True,
+        add_content: bool = True,
+        add_page_index: bool = True,
+        # table specific flags
+        add_table_cell_location: bool = False,
+        add_table_cell_label: bool = True,
+        add_table_cell_text: bool = True,
    ) -> str:
        r"""Exports the document content to an DocumentToken format.
 
        Operates on a slice of the document's main_text as defined through arguments
        main_text_start and main_text_stop; defaulting to the whole main_text.
 
-        Args:
-            delim (str, optional): The delimiter used to separate text blocks in the
-                exported XML. Default is two newline characters ("\n\n").
-            main_text_start (int, optional): The starting index of the main text to
-                be included in the XML. Default is 0 (the beginning of the text).
-            main_text_stop (Optional[int], optional): The stopping index of the main
-                text. If set to None, the export includes text up to the end.
-                Default is None.
-            main_text_labels (list[str], optional): A list of text labels that
-                categorize the different sections of the document (e.g., "title",
-                "subtitle-level-1", "paragraph", "caption"). Default labels are
-                "title", "subtitle-level-1", "paragraph", and "caption".
-            location_tagging (bool, optional): Determines whether to include
-                location-based tagging in the XML. If True, the exported XML will
-                contain information about the locations of the text elements.
-                Default is True.
-            location_dimensions (Tuple[int, int], optional): Specifies the dimensions
-                (width and height) for the location tagging, if enabled.
-                Default is [100, 100].
-            add_new_line (bool, optional): Whether to add new line characters after
-                each text block. If True, a new line is added after each block of
-                text in the XML. Default is True.
-
        Returns:
-            str: The content of the document formatted as
+            str: The content of the document formatted as a DocTags string.
        """
-        xml_str = DocumentToken.BEG_DOCUMENT.value
-
        new_line = ""
-        if add_new_line:
+        if delim:
            new_line = "\n"
 
+        doctags = f"{DocumentToken.BEG_DOCUMENT.value}{new_line}"
+
+        # pagedims = self.get_map_to_page_dimensions()
+
        if self.main_text is not None:
            for orig_item in self.main_text[main_text_start:main_text_stop]:
 
@@ -705,87 +598,68 @@ class ExportedCCSDocument(
 
                 prov = item.prov
 
-
+                page_i = -1
+                page_w = 0.0
+                page_h = 0.0
+
                 if (
-
+                    add_location
                     and self.page_dimensions is not None
                     and prov is not None
                     and len(prov) > 0
                 ):
 
-
-                    page_dim = self.page_dimensions[
+                    page_i = prov[0].page
+                    page_dim = self.page_dimensions[page_i - 1]
 
                     page_w = float(page_dim.width)
                     page_h = float(page_dim.height)
 
-                    x0 = float(prov[0].bbox[0]) / float(page_w)
-                    y0 = float(prov[0].bbox[1]) / float(page_h)
-                    x1 = float(prov[0].bbox[2]) / float(page_w)
-                    y1 = float(prov[0].bbox[3]) / float(page_h)
-
-                    page_tok = ""
-                    if page_tagging:
-                        page_tok = DocumentToken.get_page_token(page=page)
-
-                    x0_tok = DocumentToken.get_location_token(
-                        val=min(x0, x1), rnorm=location_dimensions[0]
-                    )
-                    y0_tok = DocumentToken.get_location_token(
-                        val=min(y0, y1), rnorm=location_dimensions[1]
-                    )
-                    x1_tok = DocumentToken.get_location_token(
-                        val=max(x0, x1), rnorm=location_dimensions[0]
-                    )
-                    y1_tok = DocumentToken.get_location_token(
-                        val=max(y0, y1), rnorm=location_dimensions[1]
-                    )
-
-                    # update
-                    loc_str = f"{DocumentToken.BEG_LOCATION.value}"
-                    loc_str += f"{page_tok}"
-                    loc_str += f"{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
-                    loc_str += f"{DocumentToken.END_LOCATION.value}"
-
                 item_type = item.obj_type
                 if isinstance(item, BaseText) and (item_type in main_text_labels):
-                    text = item.text
 
-
+                    doctags += item.export_to_document_tokens(
+                        new_line=new_line,
+                        page_w=page_w,
+                        page_h=page_h,
+                        xsize=xsize,
+                        ysize=ysize,
+                        add_location=add_location,
+                        add_content=add_content,
+                        add_page_index=add_page_index,
+                    )
 
                 elif isinstance(item, Table) and (item_type in main_text_labels):
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                        xml_str += f"</row_{i}>{new_line}"
-
-                    xml_str += f"</{item_type}>{new_line}"
+                    doctags += item.export_to_document_tokens(
+                        new_line=new_line,
+                        page_w=page_w,
+                        page_h=page_h,
+                        xsize=xsize,
+                        ysize=ysize,
+                        add_caption=True,
+                        add_location=add_location,
+                        add_content=add_content,
+                        add_cell_location=add_table_cell_location,
+                        add_cell_label=add_table_cell_label,
+                        add_cell_text=add_table_cell_text,
+                        add_page_index=add_page_index,
+                    )
 
                 elif isinstance(item, Figure) and (item_type in main_text_labels):
 
-
-
-
-
-
-
-
-
-
+                    doctags += item.export_to_document_tokens(
+                        new_line=new_line,
+                        page_w=page_w,
+                        page_h=page_h,
+                        xsize=xsize,
+                        ysize=ysize,
+                        add_caption=True,
+                        add_location=add_location,
+                        add_content=add_content,
+                        add_page_index=add_page_index,
+                    )
 
-
+        doctags += DocumentToken.END_DOCUMENT.value
 
-        return xml_str
+        return doctags
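With the per-element exporters in place, the document-level method reduces to the dispatch loop above. A rough usage sketch; the Document alias, the file name, and the parsing call are assumptions for illustration, not something this diff shows:

    import json

    from docling_core.types import Document  # assumed alias of ExportedCCSDocument

    with open("doc.json") as fp:  # hypothetical Docling JSON export
        doc = Document.model_validate(json.load(fp))

    doctags = doc.export_to_document_tokens(
        delim="\n",
        add_location=True,
        add_content=True,
        add_page_index=True,
        add_table_cell_location=False,  # cell locations are off by default
    )
    assert doctags.startswith("<document>")
    assert doctags.endswith("</document>")

When an element has no provenance, get_location_tokens returns an empty string, so its location block is simply omitted from the output.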
docling_core/types/doc/tokens.py ADDED
@@ -0,0 +1,202 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Tokens used in the docling document model."""
+
+from enum import Enum
+from typing import Annotated, Tuple
+
+from pydantic import Field
+
+
+class TableToken(Enum):
+    """Class to represent an LLM friendly representation of a Table."""
+
+    CELL_LABEL_COLUMN_HEADER = "<column_header>"
+    CELL_LABEL_ROW_HEADER = "<row_header>"
+    CELL_LABEL_SECTION_HEADERE = "<section_header>"
+    CELL_LABEL_DATA = "<data>"
+
+    OTSL_ECEL = "<ecel>"  # empty cell
+    OTSL_FCEL = "<fcel>"  # cell with content
+    OTSL_LCEL = "<lcel>"  # left looking cell,
+    OTSL_UCEL = "<ucel>"  # up looking cell,
+    OTSL_XCEL = "<xcel>"  # 2d extension cell (cross cell),
+    OTSL_NL = "<nl>"  # new line,
+    OTSL_CHED = "<ched>"  # - column header cell,
+    OTSL_RHED = "<rhed>"  # - row header cell,
+    OTSL_SROW = "<srow>"  # - section row cell
+
+    @classmethod
+    def get_special_tokens(cls):
+        """Function to get all special document tokens."""
+        special_tokens = [token.value for token in cls]
+        return special_tokens
+
+    @staticmethod
+    def is_known_token(label):
+        """Function to check if label is in tokens."""
+        return label in TableToken.get_special_tokens()
+
+
+class DocumentToken(Enum):
+    """Class to represent an LLM friendly representation of a Document."""
+
+    BEG_DOCUMENT = "<document>"
+    END_DOCUMENT = "</document>"
+
+    BEG_TITLE = "<title>"
+    END_TITLE = "</title>"
+
+    BEG_ABSTRACT = "<abstract>"
+    END_ABSTRACT = "</abstract>"
+
+    BEG_DOI = "<doi>"
+    END_DOI = "</doi>"
+    BEG_DATE = "<date>"
+    END_DATE = "</date>"
+
+    BEG_AUTHORS = "<authors>"
+    END_AUTHORS = "</authors>"
+    BEG_AUTHOR = "<author>"
+    END_AUTHOR = "</author>"
+
+    BEG_AFFILIATIONS = "<affiliations>"
+    END_AFFILIATIONS = "</affiliations>"
+    BEG_AFFILIATION = "<affiliation>"
+    END_AFFILIATION = "</affiliation>"
+
+    BEG_HEADER = "<section-header>"
+    END_HEADER = "</section-header>"
+    BEG_TEXT = "<text>"
+    END_TEXT = "</text>"
+    BEG_PARAGRAPH = "<paragraph>"
+    END_PARAGRAPH = "</paragraph>"
+    BEG_TABLE = "<table>"
+    END_TABLE = "</table>"
+    BEG_FIGURE = "<figure>"
+    END_FIGURE = "</figure>"
+    BEG_CAPTION = "<caption>"
+    END_CAPTION = "</caption>"
+    BEG_EQUATION = "<equation>"
+    END_EQUATION = "</equation>"
+    BEG_LIST = "<list>"
+    END_LIST = "</list>"
+    BEG_LISTITEM = "<list-item>"
+    END_LISTITEM = "</list-item>"
+
+    BEG_LOCATION = "<location>"
+    END_LOCATION = "</location>"
+    BEG_GROUP = "<group>"
+    END_GROUP = "</group>"
+
+    @classmethod
+    def get_special_tokens(
+        cls,
+        max_rows: int = 100,
+        max_cols: int = 100,
+        max_pages: int = 1000,
+        page_dimension: Tuple[int, int] = (100, 100),
+    ):
+        """Function to get all special document tokens."""
+        special_tokens = [token.value for token in cls]
+
+        # Adding dynamically generated row and col tokens
+        for i in range(0, max_rows + 1):
+            special_tokens += [f"<row_{i}>", f"</row_{i}>"]
+
+        for i in range(0, max_cols + 1):
+            special_tokens += [f"<col_{i}>", f"</col_{i}>"]
+
+        for i in range(6):
+            special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]
+
+        # FIXME: this is synonym of section header
+        for i in range(6):
+            special_tokens += [f"<subtitle-level-{i}>", f"</subtitle-level-{i}>"]
+
+        # Adding dynamically generated page-tokens
+        for i in range(0, max_pages + 1):
+            special_tokens.append(f"<page_{i}>")
+            special_tokens.append(f"</page_{i}>")
+
+        # Adding dynamically generated location-tokens
+        for i in range(0, max(page_dimension[0] + 1, page_dimension[1] + 1)):
+            special_tokens.append(f"<loc_{i}>")
+
+        return special_tokens
+
+    @staticmethod
+    def is_known_token(label):
+        """Function to check if label is in tokens."""
+        return label in DocumentToken.get_special_tokens()
+
+    @staticmethod
+    def get_row_token(row: int, beg=bool) -> str:
+        """Function to get page tokens."""
+        if beg:
+            return f"<row_{row}>"
+        else:
+            return f"</row_{row}>"
+
+    @staticmethod
+    def get_col_token(col: int, beg=bool) -> str:
+        """Function to get page tokens."""
+        if beg:
+            return f"<col_{col}>"
+        else:
+            return f"</col_{col}>"
+
+    @staticmethod
+    def get_page_token(page: int):
+        """Function to get page tokens."""
+        return f"<page_{page}>"
+
+    @staticmethod
+    def get_location_token(val: float, rnorm: int = 100):
+        """Function to get location tokens."""
+        val_ = round(rnorm * val)
+
+        if val_ < 0:
+            return "<loc_0>"
+
+        if val_ > rnorm:
+            return f"<loc_{rnorm}>"
+
+        return f"<loc_{val_}>"
+
+    @staticmethod
+    def get_location(
+        # bbox: Tuple[float, float, float, float],
+        bbox: Annotated[list[float], Field(min_length=4, max_length=4)],
+        page_w: float,
+        page_h: float,
+        xsize: int = 100,
+        ysize: int = 100,
+        page_i: int = -1,
+    ):
+        """Get the location string give bbox and page-dim."""
+        assert bbox[0] <= bbox[2], f"bbox[0]<=bbox[2] => {bbox[0]}<={bbox[2]}"
+        assert bbox[1] <= bbox[3], f"bbox[1]<=bbox[3] => {bbox[1]}<={bbox[3]}"
+
+        x0 = bbox[0] / page_w
+        y0 = bbox[1] / page_h
+        x1 = bbox[2] / page_w
+        y1 = bbox[3] / page_h
+
+        page_tok = ""
+        if page_i != -1:
+            page_tok = DocumentToken.get_page_token(page=page_i)
+
+        x0_tok = DocumentToken.get_location_token(val=min(x0, x1), rnorm=xsize)
+        y0_tok = DocumentToken.get_location_token(val=min(y0, y1), rnorm=ysize)
+        x1_tok = DocumentToken.get_location_token(val=max(x0, x1), rnorm=xsize)
+        y1_tok = DocumentToken.get_location_token(val=max(y0, y1), rnorm=ysize)
+
+        loc_str = f"{DocumentToken.BEG_LOCATION.value}"
+        loc_str += f"{page_tok}{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
+        loc_str += f"{DocumentToken.END_LOCATION.value}"
+
+        return loc_str
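Besides driving the exporters, the point of enumerating every tag is vocabulary construction: each tag is intended to act as a single special token in an LLM tokenizer. A small sketch grounded in the functions defined above; the tokenizer hand-off in the last comment is illustrative, not part of this module:

    from docling_core.types.doc.tokens import DocumentToken, TableToken

    # fixed tags plus generated <row_i>/<col_j>/<page_k>/<loc_v> variants
    doc_tokens = DocumentToken.get_special_tokens(
        max_rows=10, max_cols=10, max_pages=2, page_dimension=(100, 100)
    )
    otsl_tokens = TableToken.get_special_tokens()

    assert DocumentToken.is_known_token("<table>")
    assert DocumentToken.get_location_token(0.437) == "<loc_44>"  # round(100 * 0.437)
    assert TableToken.OTSL_FCEL.value == "<fcel>"

    vocab_additions = sorted(set(doc_tokens + otsl_tokens))
    # e.g. tokenizer.add_special_tokens({"additional_special_tokens": vocab_additions})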
docling_core/utils/file.py ADDED
@@ -0,0 +1,54 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""File-related utilities."""
+
+import tempfile
+from pathlib import Path
+from typing import Union
+
+import requests
+from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
+
+
+def resolve_file_source(source: Union[Path, AnyHttpUrl, str]) -> Path:
+    """Resolves the source (URL, path) of a file to a local file path.
+
+    If a URL is provided, the content is first downloaded to a temporary local file.
+
+    Args:
+        source (Path | AnyHttpUrl | str): The file input source. Can be a path or URL.
+
+    Raises:
+        ValueError: If source is of unexpected type.
+
+    Returns:
+        Path: The local file path.
+    """
+    try:
+        http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
+        res = requests.get(http_url, stream=True)
+        res.raise_for_status()
+        fname = None
+        # try to get filename from response header
+        if cont_disp := res.headers.get("Content-Disposition"):
+            for par in cont_disp.strip().split(";"):
+                # currently only handling directive "filename" (not "*filename")
+                if (split := par.split("=")) and split[0].strip() == "filename":
+                    fname = "=".join(split[1:]).strip().strip("'\"") or None
+                    break
+        # otherwise, use name from URL:
+        if fname is None:
+            fname = Path(http_url.path or "file").name
+        local_path = Path(tempfile.mkdtemp()) / fname
+        with open(local_path, "wb") as f:
+            for chunk in res.iter_content(chunk_size=1024):  # using 1-KB chunks
+                f.write(chunk)
+    except ValidationError:
+        try:
+            local_path = TypeAdapter(Path).validate_python(source)
+        except ValidationError:
+            raise ValueError(f"Unexpected source type encountered: {type(source)}")
+    return local_path
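resolve_file_source gives downstream code one entry point that accepts either a local path or an HTTP(S) URL. A brief sketch; both inputs are hypothetical:

    from pathlib import Path

    from docling_core.utils.file import resolve_file_source

    # a plain path is validated and returned as a Path unchanged
    local = resolve_file_source("data/report.json")  # hypothetical file

    # a URL is streamed into a fresh temporary directory, named from the
    # Content-Disposition header or, failing that, from the URL path
    remote = resolve_file_source("https://example.com/files/report.pdf")  # hypothetical URL
    assert isinstance(local, Path) and isinstance(remote, Path)

Note that the URL branch downloads eagerly and the temporary directory is never cleaned up here; the caller owns the lifetime of the returned file.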
{docling_core-1.4.0.dist-info → docling_core-1.5.0.dist-info}/RECORD RENAMED
@@ -20,11 +20,12 @@ docling_core/transforms/chunker/hierarchical_chunker.py,sha256=lAeHgJ4relA1EU0YV
 docling_core/types/__init__.py,sha256=6mrAEKRW85uHJwNQBufwjPcMWCjm3oocA6MaO4_NLgg,805
 docling_core/types/base.py,sha256=fNtfQ20NKa_RBNBWbq0DfO8o0zC1Cec8UAMu0Znsltk,8170
 docling_core/types/doc/__init__.py,sha256=Pzj_8rft6SJTVTCHgXRwHtuZjL6LK_6dcBWjikL9biY,125
-docling_core/types/doc/base.py,sha256=
+docling_core/types/doc/base.py,sha256=QQC8KzQeYWnHFPY2_BNGcbTp6J2_rPbnLjsnbehICno,14710
 docling_core/types/doc/doc_ann.py,sha256=8pV2efUglw19jxl4_oqB__mSxjWvtGIcllyCdqA-b2s,1196
 docling_core/types/doc/doc_ocr.py,sha256=6PC0C-OczF-MyfgRxEI1xs3PWgNOzi7i2yEQbTqZz0I,1387
 docling_core/types/doc/doc_raw.py,sha256=Y69G6IiauNDaoT-5el4xo1ypWpnBJQ75akGGkCMTZSc,3888
-docling_core/types/doc/document.py,sha256=
+docling_core/types/doc/document.py,sha256=FdAAyYfYnKXi3kqt0Qk2NYFAxsuGbHsNVB1EDIRVH3Y,22136
+docling_core/types/doc/tokens.py,sha256=uU_MYW_p7ypf7eYICFBvxdnVaPZ7CQnvZmbJ6oPrtEA,6134
 docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
 docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
 docling_core/types/nlp/__init__.py,sha256=hGcztAeVK7xkRBqRRvc4zbY4PGeJ0r0QrEsetnSx9nI,119
@@ -41,10 +42,11 @@ docling_core/utils/__init__.py,sha256=VauNNpWRHG0_ISKrsy5-gTxicrdQZSau6qMfuMl3iq
 docling_core/utils/alias.py,sha256=B6Lqvss8CbaNARHLR4qSmNh9OkB6LvqTpxfsFmkLAFo,874
 docling_core/utils/ds_generate_docs.py,sha256=0xGBagdC_PGjyeHXYZo90VnVrSTMZgHb0SYhFa6X7bQ,4248
 docling_core/utils/ds_generate_jsonschema.py,sha256=EhNQutqWJFWuN-yl9UUPFZ7DJTvGqg54qBIvUMHTHdA,1647
+docling_core/utils/file.py,sha256=VQgzjyvmJnAIHB6ex7ikcmbDAR4GA1ALreuO7Ubrp50,1895
 docling_core/utils/validate.py,sha256=3FmnxnKTDZC5J9OGxCL3U3DGRl0t0bBV1NcySXswdas,2031
 docling_core/utils/validators.py,sha256=fBdyWX4PvFh7o_d25ZTs4iwmeo75QTbrxsvXv2kXkTg,2777
-docling_core-1.4.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
-docling_core-1.4.0.dist-info/METADATA,sha256=
-docling_core-1.4.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-docling_core-1.4.0.dist-info/entry_points.txt,sha256=XHhtJEkdUuLxXSNxLdFIzx_siQ3z2UFQEKp-P8VYAE4,189
-docling_core-1.4.0.dist-info/RECORD,,
+docling_core-1.5.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
+docling_core-1.5.0.dist-info/METADATA,sha256=Z5kzVlogRs8FQN_ummLsikFOrf9BBr-rR-ESfwLvwHs,5432
+docling_core-1.5.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+docling_core-1.5.0.dist-info/entry_points.txt,sha256=XHhtJEkdUuLxXSNxLdFIzx_siQ3z2UFQEKp-P8VYAE4,189
+docling_core-1.5.0.dist-info/RECORD,,
{docling_core-1.4.0.dist-info → docling_core-1.5.0.dist-info}/LICENSE RENAMED
File without changes
{docling_core-1.4.0.dist-info → docling_core-1.5.0.dist-info}/WHEEL RENAMED
File without changes
{docling_core-1.4.0.dist-info → docling_core-1.5.0.dist-info}/entry_points.txt RENAMED
File without changes