docling-core 1.4.1__tar.gz → 1.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-1.4.1 → docling_core-1.6.0}/PKG-INFO +1 -1
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/doc/base.py +205 -12
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/doc/document.py +89 -199
- docling_core-1.6.0/docling_core/types/doc/tokens.py +202 -0
- docling_core-1.6.0/docling_core/utils/file.py +54 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/pyproject.toml +2 -1
- {docling_core-1.4.1 → docling_core-1.6.0}/LICENSE +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/README.md +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/__init__.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/py.typed +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/search/__init__.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/search/mapping.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/search/meta.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/search/package.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/transforms/__init__.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/__init__.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/base.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/doc/doc_ann.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/doc/doc_ocr.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/doc/doc_raw.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/rec/statement.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/utils/alias.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/utils/ds_generate_docs.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/utils/ds_generate_jsonschema.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/utils/validate.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/utils/validators.py +0 -0
|
@@ -10,6 +10,7 @@ import pandas as pd
|
|
|
10
10
|
from pydantic import BaseModel, Field, PositiveInt, StrictStr
|
|
11
11
|
|
|
12
12
|
from docling_core.search.mapping import es_field
|
|
13
|
+
from docling_core.types.doc.tokens import DocumentToken
|
|
13
14
|
from docling_core.utils.alias import AliasModel
|
|
14
15
|
|
|
15
16
|
CellData = tuple[float, float, float, float, str, str]
|
|
@@ -132,10 +133,6 @@ class GlmTableCell(TableCell):
|
|
|
132
133
|
class BaseCell(AliasModel):
|
|
133
134
|
"""Base cell."""
|
|
134
135
|
|
|
135
|
-
# FIXME: we need to check why we have bounding_box (this should be in prov)
|
|
136
|
-
bounding_box: Optional[BoundingBoxContainer] = Field(
|
|
137
|
-
default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
|
|
138
|
-
)
|
|
139
136
|
prov: Optional[list[Prov]] = None
|
|
140
137
|
text: Optional[str] = Field(
|
|
141
138
|
default=None, json_schema_extra=es_field(term_vector="with_positions_offsets")
|
|
@@ -144,6 +141,38 @@ class BaseCell(AliasModel):
|
|
|
144
141
|
alias="type", json_schema_extra=es_field(type="keyword", ignore_above=8191)
|
|
145
142
|
)
|
|
146
143
|
|
|
144
|
+
def get_location_tokens(
|
|
145
|
+
self,
|
|
146
|
+
new_line: str,
|
|
147
|
+
page_w: float,
|
|
148
|
+
page_h: float,
|
|
149
|
+
xsize: int = 100,
|
|
150
|
+
ysize: int = 100,
|
|
151
|
+
add_page_index: bool = True,
|
|
152
|
+
) -> str:
|
|
153
|
+
"""Get the location string for the BaseCell."""
|
|
154
|
+
if self.prov is None:
|
|
155
|
+
return ""
|
|
156
|
+
|
|
157
|
+
location = ""
|
|
158
|
+
for prov in self.prov:
|
|
159
|
+
|
|
160
|
+
page_i = -1
|
|
161
|
+
if add_page_index:
|
|
162
|
+
page_i = prov.page
|
|
163
|
+
|
|
164
|
+
loc_str = DocumentToken.get_location(
|
|
165
|
+
bbox=prov.bbox,
|
|
166
|
+
page_w=page_w,
|
|
167
|
+
page_h=page_h,
|
|
168
|
+
xsize=xsize,
|
|
169
|
+
ysize=ysize,
|
|
170
|
+
page_i=page_i,
|
|
171
|
+
)
|
|
172
|
+
location += f"{loc_str}{new_line}"
|
|
173
|
+
|
|
174
|
+
return location
|
|
175
|
+
|
|
147
176
|
|
|
148
177
|
class Table(BaseCell):
|
|
149
178
|
"""Table."""
|
|
@@ -153,6 +182,11 @@ class Table(BaseCell):
|
|
|
153
182
|
data: Optional[list[list[Union[GlmTableCell, TableCell]]]] = None
|
|
154
183
|
model: Optional[str] = None
|
|
155
184
|
|
|
185
|
+
# FIXME: we need to check why we have bounding_box (this should be in prov)
|
|
186
|
+
bounding_box: Optional[BoundingBoxContainer] = Field(
|
|
187
|
+
default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
|
|
188
|
+
)
|
|
189
|
+
|
|
156
190
|
def _get_tablecell_span(self, cell: TableCell, ix: int):
|
|
157
191
|
if cell.spans is None:
|
|
158
192
|
span = set()
|
|
@@ -249,26 +283,185 @@ class Table(BaseCell):
|
|
|
249
283
|
|
|
250
284
|
return body
|
|
251
285
|
|
|
286
|
+
def export_to_document_tokens(
|
|
287
|
+
self,
|
|
288
|
+
new_line: str = "\n",
|
|
289
|
+
page_w: float = 0.0,
|
|
290
|
+
page_h: float = 0.0,
|
|
291
|
+
xsize: int = 100,
|
|
292
|
+
ysize: int = 100,
|
|
293
|
+
add_location: bool = True,
|
|
294
|
+
add_caption: bool = True,
|
|
295
|
+
add_content: bool = True,
|
|
296
|
+
add_cell_location: bool = True,
|
|
297
|
+
add_cell_label: bool = True,
|
|
298
|
+
add_cell_text: bool = True,
|
|
299
|
+
add_page_index: bool = True,
|
|
300
|
+
):
|
|
301
|
+
"""Export table to document tokens format."""
|
|
302
|
+
body = f"{DocumentToken.BEG_TABLE.value}{new_line}"
|
|
303
|
+
|
|
304
|
+
if add_location:
|
|
305
|
+
body += self.get_location_tokens(
|
|
306
|
+
new_line=new_line,
|
|
307
|
+
page_w=page_w,
|
|
308
|
+
page_h=page_h,
|
|
309
|
+
xsize=xsize,
|
|
310
|
+
ysize=ysize,
|
|
311
|
+
add_page_index=add_page_index,
|
|
312
|
+
)
|
|
313
|
+
|
|
314
|
+
if add_caption and self.text is not None and len(self.text) > 0:
|
|
315
|
+
body += f"{DocumentToken.BEG_CAPTION.value}"
|
|
316
|
+
body += f"{self.text.strip()}"
|
|
317
|
+
body += f"{DocumentToken.END_CAPTION.value}"
|
|
318
|
+
body += f"{new_line}"
|
|
319
|
+
|
|
320
|
+
if add_content and self.data is not None and len(self.data) > 0:
|
|
321
|
+
for i, row in enumerate(self.data):
|
|
322
|
+
body += f"<row_{i}>"
|
|
323
|
+
for j, col in enumerate(row):
|
|
324
|
+
|
|
325
|
+
text = ""
|
|
326
|
+
if add_cell_text:
|
|
327
|
+
text = col.text.strip()
|
|
328
|
+
|
|
329
|
+
cell_loc = ""
|
|
330
|
+
if (
|
|
331
|
+
col.bbox is not None
|
|
332
|
+
and add_cell_location
|
|
333
|
+
and add_page_index
|
|
334
|
+
and self.prov is not None
|
|
335
|
+
and len(self.prov) > 0
|
|
336
|
+
):
|
|
337
|
+
cell_loc = DocumentToken.get_location(
|
|
338
|
+
bbox=col.bbox,
|
|
339
|
+
page_w=page_w,
|
|
340
|
+
page_h=page_h,
|
|
341
|
+
xsize=xsize,
|
|
342
|
+
ysize=ysize,
|
|
343
|
+
page_i=self.prov[0].page,
|
|
344
|
+
)
|
|
345
|
+
elif (
|
|
346
|
+
col.bbox is not None
|
|
347
|
+
and add_cell_location
|
|
348
|
+
and not add_page_index
|
|
349
|
+
):
|
|
350
|
+
cell_loc = DocumentToken.get_location(
|
|
351
|
+
bbox=col.bbox,
|
|
352
|
+
page_w=page_w,
|
|
353
|
+
page_h=page_h,
|
|
354
|
+
xsize=xsize,
|
|
355
|
+
ysize=ysize,
|
|
356
|
+
page_i=-1,
|
|
357
|
+
)
|
|
358
|
+
|
|
359
|
+
cell_label = ""
|
|
360
|
+
if (
|
|
361
|
+
add_cell_label
|
|
362
|
+
and col.obj_type is not None
|
|
363
|
+
and len(col.obj_type) > 0
|
|
364
|
+
):
|
|
365
|
+
cell_label = f"<{col.obj_type}>"
|
|
366
|
+
|
|
367
|
+
body += f"<col_{j}>{cell_loc}{cell_label}{text}</col_{j}>"
|
|
368
|
+
|
|
369
|
+
body += f"</row_{i}>{new_line}"
|
|
370
|
+
|
|
371
|
+
body += f"{DocumentToken.END_TABLE.value}{new_line}"
|
|
372
|
+
|
|
373
|
+
return body
|
|
374
|
+
|
|
252
375
|
|
|
253
376
|
# FIXME: let's add some figure specific data-types later
|
|
254
377
|
class Figure(BaseCell):
|
|
255
378
|
"""Figure."""
|
|
256
379
|
|
|
380
|
+
# FIXME: we need to check why we have bounding_box (this should be in prov)
|
|
381
|
+
bounding_box: Optional[BoundingBoxContainer] = Field(
|
|
382
|
+
default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
|
|
383
|
+
)
|
|
384
|
+
|
|
385
|
+
def export_to_document_tokens(
|
|
386
|
+
self,
|
|
387
|
+
new_line: str = "\n",
|
|
388
|
+
page_w: float = 0.0,
|
|
389
|
+
page_h: float = 0.0,
|
|
390
|
+
xsize: int = 100,
|
|
391
|
+
ysize: int = 100,
|
|
392
|
+
add_location: bool = True,
|
|
393
|
+
add_caption: bool = True,
|
|
394
|
+
add_content: bool = True, # not used at the moment
|
|
395
|
+
add_page_index: bool = True,
|
|
396
|
+
):
|
|
397
|
+
"""Export figure to document tokens format."""
|
|
398
|
+
body = f"{DocumentToken.BEG_FIGURE.value}{new_line}"
|
|
399
|
+
|
|
400
|
+
if add_location:
|
|
401
|
+
body += self.get_location_tokens(
|
|
402
|
+
new_line=new_line,
|
|
403
|
+
page_w=page_w,
|
|
404
|
+
page_h=page_h,
|
|
405
|
+
xsize=xsize,
|
|
406
|
+
ysize=ysize,
|
|
407
|
+
add_page_index=add_page_index,
|
|
408
|
+
)
|
|
409
|
+
|
|
410
|
+
if add_caption and self.text is not None and len(self.text) > 0:
|
|
411
|
+
body += f"{DocumentToken.BEG_CAPTION.value}"
|
|
412
|
+
body += f"{self.text.strip()}"
|
|
413
|
+
body += f"{DocumentToken.END_CAPTION.value}"
|
|
414
|
+
body += f"{new_line}"
|
|
415
|
+
|
|
416
|
+
body += f"{DocumentToken.END_FIGURE.value}{new_line}"
|
|
417
|
+
|
|
418
|
+
return body
|
|
419
|
+
|
|
257
420
|
|
|
258
|
-
class BaseText(
|
|
421
|
+
class BaseText(BaseCell):
|
|
259
422
|
"""Base model for text objects."""
|
|
260
423
|
|
|
261
|
-
|
|
262
|
-
json_schema_extra=es_field(term_vector="with_positions_offsets")
|
|
263
|
-
)
|
|
264
|
-
obj_type: StrictStr = Field(
|
|
265
|
-
alias="type", json_schema_extra=es_field(type="keyword", ignore_above=8191)
|
|
266
|
-
)
|
|
424
|
+
# FIXME: do we need these ???
|
|
267
425
|
name: Optional[StrictStr] = Field(
|
|
268
426
|
default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
|
|
269
427
|
)
|
|
270
428
|
font: Optional[str] = None
|
|
271
|
-
|
|
429
|
+
|
|
430
|
+
def export_to_document_tokens(
|
|
431
|
+
self,
|
|
432
|
+
new_line: str = "\n",
|
|
433
|
+
page_w: float = 0.0,
|
|
434
|
+
page_h: float = 0.0,
|
|
435
|
+
xsize: int = 100,
|
|
436
|
+
ysize: int = 100,
|
|
437
|
+
add_location: bool = True,
|
|
438
|
+
add_content: bool = True,
|
|
439
|
+
add_page_index: bool = True,
|
|
440
|
+
):
|
|
441
|
+
"""Export text element to document tokens format."""
|
|
442
|
+
body = f"<{self.obj_type}>"
|
|
443
|
+
# body = f"<{self.name}>"
|
|
444
|
+
|
|
445
|
+
assert DocumentToken.is_known_token(
|
|
446
|
+
body
|
|
447
|
+
), f"failed DocumentToken.is_known_token({body})"
|
|
448
|
+
|
|
449
|
+
if add_location:
|
|
450
|
+
body += self.get_location_tokens(
|
|
451
|
+
new_line="",
|
|
452
|
+
page_w=page_w,
|
|
453
|
+
page_h=page_h,
|
|
454
|
+
xsize=xsize,
|
|
455
|
+
ysize=ysize,
|
|
456
|
+
add_page_index=add_page_index,
|
|
457
|
+
)
|
|
458
|
+
|
|
459
|
+
if add_content and self.text is not None:
|
|
460
|
+
body += self.text.strip()
|
|
461
|
+
|
|
462
|
+
body += f"</{self.obj_type}>{new_line}"
|
|
463
|
+
|
|
464
|
+
return body
|
|
272
465
|
|
|
273
466
|
|
|
274
467
|
class ListItem(BaseText):
|
|
@@ -6,8 +6,7 @@
|
|
|
6
6
|
"""Models for the Docling Document data type."""
|
|
7
7
|
|
|
8
8
|
from datetime import datetime
|
|
9
|
-
from
|
|
10
|
-
from typing import Generic, Optional, Tuple, Union
|
|
9
|
+
from typing import Generic, Optional, Union
|
|
11
10
|
|
|
12
11
|
from pydantic import (
|
|
13
12
|
AnyHttpUrl,
|
|
@@ -43,6 +42,7 @@ from docling_core.types.doc.base import (
|
|
|
43
42
|
S3Data,
|
|
44
43
|
Table,
|
|
45
44
|
)
|
|
45
|
+
from docling_core.types.doc.tokens import DocumentToken
|
|
46
46
|
from docling_core.utils.alias import AliasModel
|
|
47
47
|
|
|
48
48
|
|
|
@@ -347,107 +347,6 @@ class CCSDocument(
|
|
|
347
347
|
return data
|
|
348
348
|
|
|
349
349
|
|
|
350
|
-
class DocumentToken(Enum):
|
|
351
|
-
"""Class to represent an LLM friendly representation of a Document."""
|
|
352
|
-
|
|
353
|
-
BEG_DOCUMENT = "<document>"
|
|
354
|
-
END_DOCUMENT = "</document>"
|
|
355
|
-
|
|
356
|
-
BEG_TITLE = "<title>"
|
|
357
|
-
END_TITLE = "</title>"
|
|
358
|
-
|
|
359
|
-
BEG_ABSTRACT = "<abstract>"
|
|
360
|
-
END_ABSTRACT = "</abstract>"
|
|
361
|
-
|
|
362
|
-
BEG_DOI = "<doi>"
|
|
363
|
-
END_DOI = "</doi>"
|
|
364
|
-
BEG_DATE = "<date>"
|
|
365
|
-
END_DATE = "</date>"
|
|
366
|
-
|
|
367
|
-
BEG_AUTHORS = "<authors>"
|
|
368
|
-
END_AUTHORS = "</authors>"
|
|
369
|
-
BEG_AUTHOR = "<author>"
|
|
370
|
-
END_AUTHOR = "</author>"
|
|
371
|
-
|
|
372
|
-
BEG_AFFILIATIONS = "<affiliations>"
|
|
373
|
-
END_AFFILIATIONS = "</affiliations>"
|
|
374
|
-
BEG_AFFILIATION = "<affiliation>"
|
|
375
|
-
END_AFFILIATION = "</affiliation>"
|
|
376
|
-
|
|
377
|
-
BEG_HEADER = "<section-header>"
|
|
378
|
-
END_HEADER = "</section-header>"
|
|
379
|
-
BEG_TEXT = "<text>"
|
|
380
|
-
END_TEXT = "</text>"
|
|
381
|
-
BEG_PARAGRAPH = "<paragraph>"
|
|
382
|
-
END_PARAGRAPH = "</paragraph>"
|
|
383
|
-
BEG_TABLE = "<table>"
|
|
384
|
-
END_TABLE = "</table>"
|
|
385
|
-
BEG_FIGURE = "<figure>"
|
|
386
|
-
END_FIGURE = "</figure>"
|
|
387
|
-
BEG_CAPTION = "<caption>"
|
|
388
|
-
END_CAPTION = "</caption>"
|
|
389
|
-
BEG_EQUATION = "<equation>"
|
|
390
|
-
END_EQUATION = "</equation>"
|
|
391
|
-
BEG_LIST = "<list>"
|
|
392
|
-
END_LIST = "</list>"
|
|
393
|
-
BEG_LISTITEM = "<list-item>"
|
|
394
|
-
END_LISTITEM = "</list-item>"
|
|
395
|
-
|
|
396
|
-
BEG_LOCATION = "<location>"
|
|
397
|
-
END_LOCATION = "</location>"
|
|
398
|
-
BEG_GROUP = "<group>"
|
|
399
|
-
END_GROUP = "</group>"
|
|
400
|
-
|
|
401
|
-
@classmethod
|
|
402
|
-
def get_special_tokens(
|
|
403
|
-
cls,
|
|
404
|
-
max_rows: int = 100,
|
|
405
|
-
max_cols: int = 100,
|
|
406
|
-
max_pages: int = 1000,
|
|
407
|
-
page_dimension: Tuple[int, int] = (100, 100),
|
|
408
|
-
):
|
|
409
|
-
"""Function to get all special document tokens."""
|
|
410
|
-
special_tokens = [token.value for token in cls]
|
|
411
|
-
|
|
412
|
-
# Adding dynamically generated row and col tokens
|
|
413
|
-
for i in range(0, max_rows + 1):
|
|
414
|
-
special_tokens += [f"<row_{i}>", f"</row_{i}>"]
|
|
415
|
-
|
|
416
|
-
for i in range(0, max_cols + 1):
|
|
417
|
-
special_tokens += [f"<col_{i}>", f"</col_{i}>"]
|
|
418
|
-
|
|
419
|
-
for i in range(6):
|
|
420
|
-
special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]
|
|
421
|
-
|
|
422
|
-
# Adding dynamically generated page-tokens
|
|
423
|
-
for i in range(0, max_pages + 1):
|
|
424
|
-
special_tokens.append(f"<page_{i}>")
|
|
425
|
-
|
|
426
|
-
# Adding dynamically generated location-tokens
|
|
427
|
-
for i in range(0, max(page_dimension[0] + 1, page_dimension[1] + 1)):
|
|
428
|
-
special_tokens.append(f"<loc_{i}>")
|
|
429
|
-
|
|
430
|
-
return special_tokens
|
|
431
|
-
|
|
432
|
-
@staticmethod
|
|
433
|
-
def get_page_token(page: int):
|
|
434
|
-
"""Function to get page tokens."""
|
|
435
|
-
return f"<page_{page}>"
|
|
436
|
-
|
|
437
|
-
@staticmethod
|
|
438
|
-
def get_location_token(val: float, rnorm: int = 100):
|
|
439
|
-
"""Function to get location tokens."""
|
|
440
|
-
val_ = round(rnorm * val)
|
|
441
|
-
|
|
442
|
-
if val_ < 0:
|
|
443
|
-
return "<loc_0>"
|
|
444
|
-
|
|
445
|
-
if val_ > rnorm:
|
|
446
|
-
return f"<loc_{rnorm}>"
|
|
447
|
-
|
|
448
|
-
return f"<loc_{val_}>"
|
|
449
|
-
|
|
450
|
-
|
|
451
350
|
class ExportedCCSDocument(
|
|
452
351
|
MinimalDocument,
|
|
453
352
|
Generic[
|
|
@@ -525,7 +424,17 @@ class ExportedCCSDocument(
|
|
|
525
424
|
|
|
526
425
|
return result
|
|
527
426
|
|
|
528
|
-
def
|
|
427
|
+
def get_map_to_page_dimensions(self):
|
|
428
|
+
"""Get a map from page-index (start at 1) to page-dim [width, height]."""
|
|
429
|
+
pagedims = {}
|
|
430
|
+
|
|
431
|
+
if self.page_dimensions is not None:
|
|
432
|
+
for _ in self.page_dimensions:
|
|
433
|
+
pagedims[_.page] = [_.width, _.height]
|
|
434
|
+
|
|
435
|
+
return pagedims
|
|
436
|
+
|
|
437
|
+
def export_to_markdown( # noqa: C901
|
|
529
438
|
self,
|
|
530
439
|
delim: str = "\n\n",
|
|
531
440
|
main_text_start: int = 0,
|
|
@@ -536,8 +445,10 @@ class ExportedCCSDocument(
|
|
|
536
445
|
"paragraph",
|
|
537
446
|
"caption",
|
|
538
447
|
"table",
|
|
448
|
+
"figure",
|
|
539
449
|
],
|
|
540
450
|
strict_text: bool = False,
|
|
451
|
+
image_placeholder: str = "<!-- image -->",
|
|
541
452
|
) -> str:
|
|
542
453
|
r"""Serialize to Markdown.
|
|
543
454
|
|
|
@@ -551,6 +462,12 @@ class ExportedCCSDocument(
|
|
|
551
462
|
Defaults to 0.
|
|
552
463
|
main_text_end (Optional[int], optional): Main-text slicing stop index
|
|
553
464
|
(exclusive). Defaults to None.
|
|
465
|
+
main_text_labels (list[str], optional): The labels to include in the
|
|
466
|
+
markdown.
|
|
467
|
+
strict_text (bool, optional): if true, the output will be only plain text
|
|
468
|
+
without any markdown styling. Defaults to False.
|
|
469
|
+
image_placeholder (str, optional): the placeholder to include to position
|
|
470
|
+
images in the markdown. Defaults to a markdown comment "<!-- image -->".
|
|
554
471
|
|
|
555
472
|
Returns:
|
|
556
473
|
str: The exported Markdown representation.
|
|
@@ -576,7 +493,7 @@ class ExportedCCSDocument(
|
|
|
576
493
|
text = item.text
|
|
577
494
|
|
|
578
495
|
# ignore repeated text
|
|
579
|
-
if prev_text == text:
|
|
496
|
+
if prev_text == text or text is None:
|
|
580
497
|
continue
|
|
581
498
|
else:
|
|
582
499
|
prev_text = text
|
|
@@ -630,6 +547,14 @@ class ExportedCCSDocument(
|
|
|
630
547
|
|
|
631
548
|
markdown_text = md_table
|
|
632
549
|
|
|
550
|
+
elif isinstance(item, Figure) and item_type in main_text_labels:
|
|
551
|
+
|
|
552
|
+
markdown_text = ""
|
|
553
|
+
if not strict_text:
|
|
554
|
+
markdown_text = f"{image_placeholder}"
|
|
555
|
+
if item.text:
|
|
556
|
+
markdown_text += "\n" + item.text
|
|
557
|
+
|
|
633
558
|
if markdown_text:
|
|
634
559
|
md_texts.append(markdown_text)
|
|
635
560
|
|
|
@@ -649,48 +574,32 @@ class ExportedCCSDocument(
|
|
|
649
574
|
"table",
|
|
650
575
|
"figure",
|
|
651
576
|
],
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
577
|
+
xsize: int = 100,
|
|
578
|
+
ysize: int = 100,
|
|
579
|
+
add_location: bool = True,
|
|
580
|
+
add_content: bool = True,
|
|
581
|
+
add_page_index: bool = True,
|
|
582
|
+
# table specific flags
|
|
583
|
+
add_table_cell_location: bool = False,
|
|
584
|
+
add_table_cell_label: bool = True,
|
|
585
|
+
add_table_cell_text: bool = True,
|
|
656
586
|
) -> str:
|
|
657
587
|
r"""Exports the document content to an DocumentToken format.
|
|
658
588
|
|
|
659
589
|
Operates on a slice of the document's main_text as defined through arguments
|
|
660
590
|
main_text_start and main_text_stop; defaulting to the whole main_text.
|
|
661
591
|
|
|
662
|
-
Args:
|
|
663
|
-
delim (str, optional): The delimiter used to separate text blocks in the
|
|
664
|
-
exported XML. Default is two newline characters ("\n\n").
|
|
665
|
-
main_text_start (int, optional): The starting index of the main text to
|
|
666
|
-
be included in the XML. Default is 0 (the beginning of the text).
|
|
667
|
-
main_text_stop (Optional[int], optional): The stopping index of the main
|
|
668
|
-
text. If set to None, the export includes text up to the end.
|
|
669
|
-
Default is None.
|
|
670
|
-
main_text_labels (list[str], optional): A list of text labels that
|
|
671
|
-
categorize the different sections of the document (e.g., "title",
|
|
672
|
-
"subtitle-level-1", "paragraph", "caption"). Default labels are
|
|
673
|
-
"title", "subtitle-level-1", "paragraph", and "caption".
|
|
674
|
-
location_tagging (bool, optional): Determines whether to include
|
|
675
|
-
location-based tagging in the XML. If True, the exported XML will
|
|
676
|
-
contain information about the locations of the text elements.
|
|
677
|
-
Default is True.
|
|
678
|
-
location_dimensions (Tuple[int, int], optional): Specifies the dimensions
|
|
679
|
-
(width and height) for the location tagging, if enabled.
|
|
680
|
-
Default is [100, 100].
|
|
681
|
-
add_new_line (bool, optional): Whether to add new line characters after
|
|
682
|
-
each text block. If True, a new line is added after each block of
|
|
683
|
-
text in the XML. Default is True.
|
|
684
|
-
|
|
685
592
|
Returns:
|
|
686
|
-
str: The content of the document formatted as
|
|
593
|
+
str: The content of the document formatted as a DocTags string.
|
|
687
594
|
"""
|
|
688
|
-
xml_str = DocumentToken.BEG_DOCUMENT.value
|
|
689
|
-
|
|
690
595
|
new_line = ""
|
|
691
|
-
if
|
|
596
|
+
if delim:
|
|
692
597
|
new_line = "\n"
|
|
693
598
|
|
|
599
|
+
doctags = f"{DocumentToken.BEG_DOCUMENT.value}{new_line}"
|
|
600
|
+
|
|
601
|
+
# pagedims = self.get_map_to_page_dimensions()
|
|
602
|
+
|
|
694
603
|
if self.main_text is not None:
|
|
695
604
|
for orig_item in self.main_text[main_text_start:main_text_stop]:
|
|
696
605
|
|
|
@@ -705,87 +614,68 @@ class ExportedCCSDocument(
|
|
|
705
614
|
|
|
706
615
|
prov = item.prov
|
|
707
616
|
|
|
708
|
-
|
|
617
|
+
page_i = -1
|
|
618
|
+
page_w = 0.0
|
|
619
|
+
page_h = 0.0
|
|
620
|
+
|
|
709
621
|
if (
|
|
710
|
-
|
|
622
|
+
add_location
|
|
711
623
|
and self.page_dimensions is not None
|
|
712
624
|
and prov is not None
|
|
713
625
|
and len(prov) > 0
|
|
714
626
|
):
|
|
715
627
|
|
|
716
|
-
|
|
717
|
-
page_dim = self.page_dimensions[
|
|
628
|
+
page_i = prov[0].page
|
|
629
|
+
page_dim = self.page_dimensions[page_i - 1]
|
|
718
630
|
|
|
719
631
|
page_w = float(page_dim.width)
|
|
720
632
|
page_h = float(page_dim.height)
|
|
721
633
|
|
|
722
|
-
x0 = float(prov[0].bbox[0]) / float(page_w)
|
|
723
|
-
y0 = float(prov[0].bbox[1]) / float(page_h)
|
|
724
|
-
x1 = float(prov[0].bbox[2]) / float(page_w)
|
|
725
|
-
y1 = float(prov[0].bbox[3]) / float(page_h)
|
|
726
|
-
|
|
727
|
-
page_tok = ""
|
|
728
|
-
if page_tagging:
|
|
729
|
-
page_tok = DocumentToken.get_page_token(page=page)
|
|
730
|
-
|
|
731
|
-
x0_tok = DocumentToken.get_location_token(
|
|
732
|
-
val=min(x0, x1), rnorm=location_dimensions[0]
|
|
733
|
-
)
|
|
734
|
-
y0_tok = DocumentToken.get_location_token(
|
|
735
|
-
val=min(y0, y1), rnorm=location_dimensions[1]
|
|
736
|
-
)
|
|
737
|
-
x1_tok = DocumentToken.get_location_token(
|
|
738
|
-
val=max(x0, x1), rnorm=location_dimensions[0]
|
|
739
|
-
)
|
|
740
|
-
y1_tok = DocumentToken.get_location_token(
|
|
741
|
-
val=max(y0, y1), rnorm=location_dimensions[1]
|
|
742
|
-
)
|
|
743
|
-
|
|
744
|
-
# update
|
|
745
|
-
loc_str = f"{DocumentToken.BEG_LOCATION.value}"
|
|
746
|
-
loc_str += f"{page_tok}"
|
|
747
|
-
loc_str += f"{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
|
|
748
|
-
loc_str += f"{DocumentToken.END_LOCATION.value}"
|
|
749
|
-
|
|
750
634
|
item_type = item.obj_type
|
|
751
635
|
if isinstance(item, BaseText) and (item_type in main_text_labels):
|
|
752
|
-
text = item.text
|
|
753
636
|
|
|
754
|
-
|
|
637
|
+
doctags += item.export_to_document_tokens(
|
|
638
|
+
new_line=new_line,
|
|
639
|
+
page_w=page_w,
|
|
640
|
+
page_h=page_h,
|
|
641
|
+
xsize=xsize,
|
|
642
|
+
ysize=ysize,
|
|
643
|
+
add_location=add_location,
|
|
644
|
+
add_content=add_content,
|
|
645
|
+
add_page_index=add_page_index,
|
|
646
|
+
)
|
|
755
647
|
|
|
756
648
|
elif isinstance(item, Table) and (item_type in main_text_labels):
|
|
757
649
|
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
xml_str += f"</row_{i}>{new_line}"
|
|
774
|
-
|
|
775
|
-
xml_str += f"</{item_type}>{new_line}"
|
|
650
|
+
doctags += item.export_to_document_tokens(
|
|
651
|
+
new_line=new_line,
|
|
652
|
+
page_w=page_w,
|
|
653
|
+
page_h=page_h,
|
|
654
|
+
xsize=xsize,
|
|
655
|
+
ysize=ysize,
|
|
656
|
+
add_caption=True,
|
|
657
|
+
add_location=add_location,
|
|
658
|
+
add_content=add_content,
|
|
659
|
+
add_cell_location=add_table_cell_location,
|
|
660
|
+
add_cell_label=add_table_cell_label,
|
|
661
|
+
add_cell_text=add_table_cell_text,
|
|
662
|
+
add_page_index=add_page_index,
|
|
663
|
+
)
|
|
776
664
|
|
|
777
665
|
elif isinstance(item, Figure) and (item_type in main_text_labels):
|
|
778
666
|
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
667
|
+
doctags += item.export_to_document_tokens(
|
|
668
|
+
new_line=new_line,
|
|
669
|
+
page_w=page_w,
|
|
670
|
+
page_h=page_h,
|
|
671
|
+
xsize=xsize,
|
|
672
|
+
ysize=ysize,
|
|
673
|
+
add_caption=True,
|
|
674
|
+
add_location=add_location,
|
|
675
|
+
add_content=add_content,
|
|
676
|
+
add_page_index=add_page_index,
|
|
677
|
+
)
|
|
788
678
|
|
|
789
|
-
|
|
679
|
+
doctags += DocumentToken.END_DOCUMENT.value
|
|
790
680
|
|
|
791
|
-
return
|
|
681
|
+
return doctags
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Tokens used in the docling document model."""
|
|
7
|
+
|
|
8
|
+
from enum import Enum
|
|
9
|
+
from typing import Annotated, Tuple
|
|
10
|
+
|
|
11
|
+
from pydantic import Field
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class TableToken(Enum):
|
|
15
|
+
"""Class to represent an LLM friendly representation of a Table."""
|
|
16
|
+
|
|
17
|
+
CELL_LABEL_COLUMN_HEADER = "<column_header>"
|
|
18
|
+
CELL_LABEL_ROW_HEADER = "<row_header>"
|
|
19
|
+
CELL_LABEL_SECTION_HEADERE = "<section_header>"
|
|
20
|
+
CELL_LABEL_DATA = "<data>"
|
|
21
|
+
|
|
22
|
+
OTSL_ECEL = "<ecel>" # empty cell
|
|
23
|
+
OTSL_FCEL = "<fcel>" # cell with content
|
|
24
|
+
OTSL_LCEL = "<lcel>" # left looking cell,
|
|
25
|
+
OTSL_UCEL = "<ucel>" # up looking cell,
|
|
26
|
+
OTSL_XCEL = "<xcel>" # 2d extension cell (cross cell),
|
|
27
|
+
OTSL_NL = "<nl>" # new line,
|
|
28
|
+
OTSL_CHED = "<ched>" # - column header cell,
|
|
29
|
+
OTSL_RHED = "<rhed>" # - row header cell,
|
|
30
|
+
OTSL_SROW = "<srow>" # - section row cell
|
|
31
|
+
|
|
32
|
+
@classmethod
|
|
33
|
+
def get_special_tokens(cls):
|
|
34
|
+
"""Function to get all special document tokens."""
|
|
35
|
+
special_tokens = [token.value for token in cls]
|
|
36
|
+
return special_tokens
|
|
37
|
+
|
|
38
|
+
@staticmethod
|
|
39
|
+
def is_known_token(label):
|
|
40
|
+
"""Function to check if label is in tokens."""
|
|
41
|
+
return label in TableToken.get_special_tokens()
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class DocumentToken(Enum):
|
|
45
|
+
"""Class to represent an LLM friendly representation of a Document."""
|
|
46
|
+
|
|
47
|
+
BEG_DOCUMENT = "<document>"
|
|
48
|
+
END_DOCUMENT = "</document>"
|
|
49
|
+
|
|
50
|
+
BEG_TITLE = "<title>"
|
|
51
|
+
END_TITLE = "</title>"
|
|
52
|
+
|
|
53
|
+
BEG_ABSTRACT = "<abstract>"
|
|
54
|
+
END_ABSTRACT = "</abstract>"
|
|
55
|
+
|
|
56
|
+
BEG_DOI = "<doi>"
|
|
57
|
+
END_DOI = "</doi>"
|
|
58
|
+
BEG_DATE = "<date>"
|
|
59
|
+
END_DATE = "</date>"
|
|
60
|
+
|
|
61
|
+
BEG_AUTHORS = "<authors>"
|
|
62
|
+
END_AUTHORS = "</authors>"
|
|
63
|
+
BEG_AUTHOR = "<author>"
|
|
64
|
+
END_AUTHOR = "</author>"
|
|
65
|
+
|
|
66
|
+
BEG_AFFILIATIONS = "<affiliations>"
|
|
67
|
+
END_AFFILIATIONS = "</affiliations>"
|
|
68
|
+
BEG_AFFILIATION = "<affiliation>"
|
|
69
|
+
END_AFFILIATION = "</affiliation>"
|
|
70
|
+
|
|
71
|
+
BEG_HEADER = "<section-header>"
|
|
72
|
+
END_HEADER = "</section-header>"
|
|
73
|
+
BEG_TEXT = "<text>"
|
|
74
|
+
END_TEXT = "</text>"
|
|
75
|
+
BEG_PARAGRAPH = "<paragraph>"
|
|
76
|
+
END_PARAGRAPH = "</paragraph>"
|
|
77
|
+
BEG_TABLE = "<table>"
|
|
78
|
+
END_TABLE = "</table>"
|
|
79
|
+
BEG_FIGURE = "<figure>"
|
|
80
|
+
END_FIGURE = "</figure>"
|
|
81
|
+
BEG_CAPTION = "<caption>"
|
|
82
|
+
END_CAPTION = "</caption>"
|
|
83
|
+
BEG_EQUATION = "<equation>"
|
|
84
|
+
END_EQUATION = "</equation>"
|
|
85
|
+
BEG_LIST = "<list>"
|
|
86
|
+
END_LIST = "</list>"
|
|
87
|
+
BEG_LISTITEM = "<list-item>"
|
|
88
|
+
END_LISTITEM = "</list-item>"
|
|
89
|
+
|
|
90
|
+
BEG_LOCATION = "<location>"
|
|
91
|
+
END_LOCATION = "</location>"
|
|
92
|
+
BEG_GROUP = "<group>"
|
|
93
|
+
END_GROUP = "</group>"
|
|
94
|
+
|
|
95
|
+
@classmethod
|
|
96
|
+
def get_special_tokens(
|
|
97
|
+
cls,
|
|
98
|
+
max_rows: int = 100,
|
|
99
|
+
max_cols: int = 100,
|
|
100
|
+
max_pages: int = 1000,
|
|
101
|
+
page_dimension: Tuple[int, int] = (100, 100),
|
|
102
|
+
):
|
|
103
|
+
"""Function to get all special document tokens."""
|
|
104
|
+
special_tokens = [token.value for token in cls]
|
|
105
|
+
|
|
106
|
+
# Adding dynamically generated row and col tokens
|
|
107
|
+
for i in range(0, max_rows + 1):
|
|
108
|
+
special_tokens += [f"<row_{i}>", f"</row_{i}>"]
|
|
109
|
+
|
|
110
|
+
for i in range(0, max_cols + 1):
|
|
111
|
+
special_tokens += [f"<col_{i}>", f"</col_{i}>"]
|
|
112
|
+
|
|
113
|
+
for i in range(6):
|
|
114
|
+
special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]
|
|
115
|
+
|
|
116
|
+
# FIXME: this is synonym of section header
|
|
117
|
+
for i in range(6):
|
|
118
|
+
special_tokens += [f"<subtitle-level-{i}>", f"</subtitle-level-{i}>"]
|
|
119
|
+
|
|
120
|
+
# Adding dynamically generated page-tokens
|
|
121
|
+
for i in range(0, max_pages + 1):
|
|
122
|
+
special_tokens.append(f"<page_{i}>")
|
|
123
|
+
special_tokens.append(f"</page_{i}>")
|
|
124
|
+
|
|
125
|
+
# Adding dynamically generated location-tokens
|
|
126
|
+
for i in range(0, max(page_dimension[0] + 1, page_dimension[1] + 1)):
|
|
127
|
+
special_tokens.append(f"<loc_{i}>")
|
|
128
|
+
|
|
129
|
+
return special_tokens
|
|
130
|
+
|
|
131
|
+
@staticmethod
|
|
132
|
+
def is_known_token(label):
|
|
133
|
+
"""Function to check if label is in tokens."""
|
|
134
|
+
return label in DocumentToken.get_special_tokens()
|
|
135
|
+
|
|
136
|
+
@staticmethod
|
|
137
|
+
def get_row_token(row: int, beg=bool) -> str:
|
|
138
|
+
"""Function to get page tokens."""
|
|
139
|
+
if beg:
|
|
140
|
+
return f"<row_{row}>"
|
|
141
|
+
else:
|
|
142
|
+
return f"</row_{row}>"
|
|
143
|
+
|
|
144
|
+
@staticmethod
|
|
145
|
+
def get_col_token(col: int, beg=bool) -> str:
|
|
146
|
+
"""Function to get page tokens."""
|
|
147
|
+
if beg:
|
|
148
|
+
return f"<col_{col}>"
|
|
149
|
+
else:
|
|
150
|
+
return f"</col_{col}>"
|
|
151
|
+
|
|
152
|
+
@staticmethod
|
|
153
|
+
def get_page_token(page: int):
|
|
154
|
+
"""Function to get page tokens."""
|
|
155
|
+
return f"<page_{page}>"
|
|
156
|
+
|
|
157
|
+
@staticmethod
|
|
158
|
+
def get_location_token(val: float, rnorm: int = 100):
|
|
159
|
+
"""Function to get location tokens."""
|
|
160
|
+
val_ = round(rnorm * val)
|
|
161
|
+
|
|
162
|
+
if val_ < 0:
|
|
163
|
+
return "<loc_0>"
|
|
164
|
+
|
|
165
|
+
if val_ > rnorm:
|
|
166
|
+
return f"<loc_{rnorm}>"
|
|
167
|
+
|
|
168
|
+
return f"<loc_{val_}>"
|
|
169
|
+
|
|
170
|
+
@staticmethod
|
|
171
|
+
def get_location(
|
|
172
|
+
# bbox: Tuple[float, float, float, float],
|
|
173
|
+
bbox: Annotated[list[float], Field(min_length=4, max_length=4)],
|
|
174
|
+
page_w: float,
|
|
175
|
+
page_h: float,
|
|
176
|
+
xsize: int = 100,
|
|
177
|
+
ysize: int = 100,
|
|
178
|
+
page_i: int = -1,
|
|
179
|
+
):
|
|
180
|
+
"""Get the location string give bbox and page-dim."""
|
|
181
|
+
assert bbox[0] <= bbox[2], f"bbox[0]<=bbox[2] => {bbox[0]}<={bbox[2]}"
|
|
182
|
+
assert bbox[1] <= bbox[3], f"bbox[1]<=bbox[3] => {bbox[1]}<={bbox[3]}"
|
|
183
|
+
|
|
184
|
+
x0 = bbox[0] / page_w
|
|
185
|
+
y0 = bbox[1] / page_h
|
|
186
|
+
x1 = bbox[2] / page_w
|
|
187
|
+
y1 = bbox[3] / page_h
|
|
188
|
+
|
|
189
|
+
page_tok = ""
|
|
190
|
+
if page_i != -1:
|
|
191
|
+
page_tok = DocumentToken.get_page_token(page=page_i)
|
|
192
|
+
|
|
193
|
+
x0_tok = DocumentToken.get_location_token(val=min(x0, x1), rnorm=xsize)
|
|
194
|
+
y0_tok = DocumentToken.get_location_token(val=min(y0, y1), rnorm=ysize)
|
|
195
|
+
x1_tok = DocumentToken.get_location_token(val=max(x0, x1), rnorm=xsize)
|
|
196
|
+
y1_tok = DocumentToken.get_location_token(val=max(y0, y1), rnorm=ysize)
|
|
197
|
+
|
|
198
|
+
loc_str = f"{DocumentToken.BEG_LOCATION.value}"
|
|
199
|
+
loc_str += f"{page_tok}{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
|
|
200
|
+
loc_str += f"{DocumentToken.END_LOCATION.value}"
|
|
201
|
+
|
|
202
|
+
return loc_str
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""File-related utilities."""
|
|
7
|
+
|
|
8
|
+
import tempfile
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Union
|
|
11
|
+
|
|
12
|
+
import requests
|
|
13
|
+
from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def resolve_file_source(
    source: Union[Path, AnyHttpUrl, str],
    timeout: float = 30.0,
) -> Path:
    """Resolves the source (URL, path) of a file to a local file path.

    If a URL is provided, the content is first downloaded to a temporary local file.

    Args:
        source (Path | AnyHttpUrl | str): The file input source. Can be a path or URL.
        timeout (float): Seconds to wait for the server during a download;
            without it an unresponsive host would hang the caller indefinitely.

    Raises:
        ValueError: If source is of unexpected type.

    Returns:
        Path: The local file path.

    Note:
        For URL sources, the file is written into a fresh temporary directory
        that is NOT cleaned up automatically; the caller owns its lifetime.
    """
    try:
        http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
        # stream=True avoids loading the whole body into memory; the timeout
        # bounds both connect and per-chunk read waits.
        res = requests.get(http_url, stream=True, timeout=timeout)
        res.raise_for_status()
        fname = None
        # try to get filename from response header
        if cont_disp := res.headers.get("Content-Disposition"):
            for par in cont_disp.strip().split(";"):
                # currently only handling directive "filename" (not "*filename")
                if (split := par.split("=")) and split[0].strip() == "filename":
                    fname = "=".join(split[1:]).strip().strip("'\"") or None
                    break
        # otherwise, use name from URL:
        if fname is None:
            fname = Path(http_url.path or "file").name
        local_path = Path(tempfile.mkdtemp()) / fname
        with open(local_path, "wb") as f:
            for chunk in res.iter_content(chunk_size=1024):  # using 1-KB chunks
                f.write(chunk)
    except ValidationError:
        # Not a URL: fall back to interpreting the source as a local path.
        try:
            local_path = TypeAdapter(Path).validate_python(source)
        except ValidationError:
            raise ValueError(f"Unexpected source type encountered: {type(source)}")
    return local_path
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "docling-core"
|
|
3
|
-
version = "1.
|
|
3
|
+
version = "1.6.0"
|
|
4
4
|
description = "A python library to define and validate data types in Docling."
|
|
5
5
|
license = "MIT"
|
|
6
6
|
authors = [
|
|
@@ -118,6 +118,7 @@ module = [
|
|
|
118
118
|
"jsonschema.*",
|
|
119
119
|
"json_schema_for_humans.*",
|
|
120
120
|
"pandas.*",
|
|
121
|
+
"requests.*",
|
|
121
122
|
"tabulate.*",
|
|
122
123
|
]
|
|
123
124
|
ignore_missing_imports = true
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-1.4.1 → docling_core-1.6.0}/docling_core/resources/schemas/doc/OCR-output.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-1.4.1 → docling_core-1.6.0}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-1.4.1 → docling_core-1.6.0}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|