docling-core 1.1.3__tar.gz → 1.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-1.1.3 → docling_core-1.2.0}/PKG-INFO +11 -10
- {docling_core-1.1.3 → docling_core-1.2.0}/README.md +10 -8
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/doc/base.py +6 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/doc/document.py +286 -11
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/rec/statement.py +34 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/pyproject.toml +1 -2
- {docling_core-1.1.3 → docling_core-1.2.0}/LICENSE +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/__init__.py +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/py.typed +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/search/__init__.py +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/search/mapping.py +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/search/meta.py +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/search/package.py +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/__init__.py +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/base.py +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/doc/doc_ann.py +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/doc/doc_ocr.py +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/doc/doc_raw.py +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/utils/alias.py +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/utils/ds_generate_docs.py +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/utils/ds_generate_jsonschema.py +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/utils/validate.py +0 -0
- {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/utils/validators.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 1.1.3
|
|
3
|
+
Version: 1.2.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Home-page: https://ds4sd.github.io/
|
|
6
6
|
License: MIT
|
|
@@ -28,7 +28,6 @@ Classifier: Typing :: Typed
|
|
|
28
28
|
Requires-Dist: json-schema-for-humans (>=1.0.0,<2.0.0)
|
|
29
29
|
Requires-Dist: jsonref (>=1.1.0,<2.0.0)
|
|
30
30
|
Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
|
|
31
|
-
Requires-Dist: poetry (>=1.8.3,<2.0.0)
|
|
32
31
|
Requires-Dist: pydantic (>=2.6.0,<3.0.0)
|
|
33
32
|
Requires-Dist: pyproject-toml (>=0.0.10,<0.0.11)
|
|
34
33
|
Requires-Dist: tabulate (>=0.9.0,<0.10.0)
|
|
@@ -47,7 +46,7 @@ Description-Content-Type: text/markdown
|
|
|
47
46
|
[](https://github.com/pre-commit/pre-commit)
|
|
48
47
|
[](https://opensource.org/licenses/MIT)
|
|
49
48
|
|
|
50
|
-
Docling Core is a library that defines the data types in [Docling](https://
|
|
49
|
+
Docling Core is a library that defines the data types in [Docling](https://github.com/DS4SD/docling), leveraging pydantic models.
|
|
51
50
|
|
|
52
51
|
## Installation
|
|
53
52
|
|
|
@@ -116,13 +115,15 @@ Please read [Contributing to Docling Core](./CONTRIBUTING.md) for details.
|
|
|
116
115
|
If you use Docling Core in your projects, please consider citing the following:
|
|
117
116
|
|
|
118
117
|
```bib
|
|
119
|
-
@
|
|
120
|
-
author =
|
|
121
|
-
month =
|
|
122
|
-
title =
|
|
123
|
-
url =
|
|
124
|
-
|
|
125
|
-
|
|
118
|
+
@techreport{Docling,
|
|
119
|
+
author = "Deep Search Team",
|
|
120
|
+
month = 8,
|
|
121
|
+
title = "Docling Technical Report",
|
|
122
|
+
url = "https://arxiv.org/abs/2408.09869",
|
|
123
|
+
eprint = "2408.09869",
|
|
124
|
+
doi = "10.48550/arXiv.2408.09869",
|
|
125
|
+
version = "1.0.0",
|
|
126
|
+
year = 2024
|
|
126
127
|
}
|
|
127
128
|
```
|
|
128
129
|
|
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
[](https://github.com/pre-commit/pre-commit)
|
|
11
11
|
[](https://opensource.org/licenses/MIT)
|
|
12
12
|
|
|
13
|
-
Docling Core is a library that defines the data types in [Docling](https://
|
|
13
|
+
Docling Core is a library that defines the data types in [Docling](https://github.com/DS4SD/docling), leveraging pydantic models.
|
|
14
14
|
|
|
15
15
|
## Installation
|
|
16
16
|
|
|
@@ -79,13 +79,15 @@ Please read [Contributing to Docling Core](./CONTRIBUTING.md) for details.
|
|
|
79
79
|
If you use Docling Core in your projects, please consider citing the following:
|
|
80
80
|
|
|
81
81
|
```bib
|
|
82
|
-
@
|
|
83
|
-
author =
|
|
84
|
-
month =
|
|
85
|
-
title =
|
|
86
|
-
url =
|
|
87
|
-
|
|
88
|
-
|
|
82
|
+
@techreport{Docling,
|
|
83
|
+
author = "Deep Search Team",
|
|
84
|
+
month = 8,
|
|
85
|
+
title = "Docling Technical Report",
|
|
86
|
+
url = "https://arxiv.org/abs/2408.09869",
|
|
87
|
+
eprint = "2408.09869",
|
|
88
|
+
doi = "10.48550/arXiv.2408.09869",
|
|
89
|
+
version = "1.0.0",
|
|
90
|
+
year = 2024
|
|
89
91
|
}
|
|
90
92
|
```
|
|
91
93
|
|
|
@@ -131,6 +131,7 @@ class GlmTableCell(TableCell):
|
|
|
131
131
|
class BaseCell(AliasModel):
|
|
132
132
|
"""Base cell."""
|
|
133
133
|
|
|
134
|
+
# FIXME: we need to check why we have bounding_box (this should be in prov)
|
|
134
135
|
bounding_box: Optional[BoundingBoxContainer] = Field(
|
|
135
136
|
default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
|
|
136
137
|
)
|
|
@@ -152,6 +153,11 @@ class Table(BaseCell):
|
|
|
152
153
|
model: Optional[str] = None
|
|
153
154
|
|
|
154
155
|
|
|
156
|
+
# FIXME: let's add some figure specific data-types later
|
|
157
|
+
class Figure(BaseCell):
|
|
158
|
+
"""Figure."""
|
|
159
|
+
|
|
160
|
+
|
|
155
161
|
class BaseText(AliasModel):
|
|
156
162
|
"""Base model for text objects."""
|
|
157
163
|
|
|
@@ -6,7 +6,8 @@
|
|
|
6
6
|
"""Models for the Docling Document data type."""
|
|
7
7
|
|
|
8
8
|
from datetime import datetime
|
|
9
|
-
from typing import Generic, Optional, Union
|
|
9
|
+
from enum import Enum
|
|
10
|
+
from typing import Generic, Optional, Tuple, Union
|
|
10
11
|
|
|
11
12
|
from pydantic import (
|
|
12
13
|
AnyHttpUrl,
|
|
@@ -35,6 +36,7 @@ from docling_core.types.doc.base import (
|
|
|
35
36
|
BaseCell,
|
|
36
37
|
BaseText,
|
|
37
38
|
BitmapObject,
|
|
39
|
+
Figure,
|
|
38
40
|
PageDimensions,
|
|
39
41
|
PageReference,
|
|
40
42
|
Ref,
|
|
@@ -275,7 +277,7 @@ class MinimalDocument(
|
|
|
275
277
|
main_text: Optional[list[Union[Ref, BaseText]]] = Field(
|
|
276
278
|
default=None, alias="main-text"
|
|
277
279
|
)
|
|
278
|
-
figures: Optional[list[BaseCell]] = None
|
|
280
|
+
figures: Optional[list[Figure]] = None
|
|
279
281
|
tables: Optional[list[Table]] = None
|
|
280
282
|
|
|
281
283
|
|
|
@@ -311,6 +313,8 @@ class CCSDocument(
|
|
|
311
313
|
@classmethod
|
|
312
314
|
def from_dict(cls, data):
|
|
313
315
|
"""Validates and fixes the input data."""
|
|
316
|
+
if not isinstance(data, dict):
|
|
317
|
+
return data
|
|
314
318
|
description_collection = data["description"].get("collection")
|
|
315
319
|
if not description_collection:
|
|
316
320
|
data["description"].setdefault("collection", {})
|
|
@@ -343,6 +347,107 @@ class CCSDocument(
|
|
|
343
347
|
return data
|
|
344
348
|
|
|
345
349
|
|
|
350
|
+
class DocumentToken(Enum):
|
|
351
|
+
"""Class to represent an LLM friendly representation of a Document."""
|
|
352
|
+
|
|
353
|
+
BEG_DOCUMENT = "<document>"
|
|
354
|
+
END_DOCUMENT = "</document>"
|
|
355
|
+
|
|
356
|
+
BEG_TITLE = "<title>"
|
|
357
|
+
END_TITLE = "</title>"
|
|
358
|
+
|
|
359
|
+
BEG_ABSTRACT = "<abstract>"
|
|
360
|
+
END_ABSTRACT = "</abstract>"
|
|
361
|
+
|
|
362
|
+
BEG_DOI = "<doi>"
|
|
363
|
+
END_DOI = "</doi>"
|
|
364
|
+
BEG_DATE = "<date>"
|
|
365
|
+
END_DATE = "</date>"
|
|
366
|
+
|
|
367
|
+
BEG_AUTHORS = "<authors>"
|
|
368
|
+
END_AUTHORS = "</authors>"
|
|
369
|
+
BEG_AUTHOR = "<author>"
|
|
370
|
+
END_AUTHOR = "</author>"
|
|
371
|
+
|
|
372
|
+
BEG_AFFILIATIONS = "<affiliations>"
|
|
373
|
+
END_AFFILIATIONS = "</affiliations>"
|
|
374
|
+
BEG_AFFILIATION = "<affiliation>"
|
|
375
|
+
END_AFFILIATION = "</affiliation>"
|
|
376
|
+
|
|
377
|
+
BEG_HEADER = "<section-header>"
|
|
378
|
+
END_HEADER = "</section-header>"
|
|
379
|
+
BEG_TEXT = "<text>"
|
|
380
|
+
END_TEXT = "</text>"
|
|
381
|
+
BEG_PARAGRAPH = "<paragraph>"
|
|
382
|
+
END_PARAGRAPH = "</paragraph>"
|
|
383
|
+
BEG_TABLE = "<table>"
|
|
384
|
+
END_TABLE = "</table>"
|
|
385
|
+
BEG_FIGURE = "<figure>"
|
|
386
|
+
END_FIGURE = "</figure>"
|
|
387
|
+
BEG_CAPTION = "<caption>"
|
|
388
|
+
END_CAPTION = "</caption>"
|
|
389
|
+
BEG_EQUATION = "<equation>"
|
|
390
|
+
END_EQUATION = "</equation>"
|
|
391
|
+
BEG_LIST = "<list>"
|
|
392
|
+
END_LIST = "</list>"
|
|
393
|
+
BEG_LISTITEM = "<list-item>"
|
|
394
|
+
END_LISTITEM = "</list-item>"
|
|
395
|
+
|
|
396
|
+
BEG_LOCATION = "<location>"
|
|
397
|
+
END_LOCATION = "</location>"
|
|
398
|
+
BEG_GROUP = "<group>"
|
|
399
|
+
END_GROUP = "</group>"
|
|
400
|
+
|
|
401
|
+
@classmethod
|
|
402
|
+
def get_special_tokens(
|
|
403
|
+
cls,
|
|
404
|
+
max_rows: int = 100,
|
|
405
|
+
max_cols: int = 100,
|
|
406
|
+
max_pages: int = 1000,
|
|
407
|
+
page_dimension: Tuple[int, int] = (100, 100),
|
|
408
|
+
):
|
|
409
|
+
"""Function to get all special document tokens."""
|
|
410
|
+
special_tokens = [token.value for token in cls]
|
|
411
|
+
|
|
412
|
+
# Adding dynamically generated row and col tokens
|
|
413
|
+
for i in range(0, max_rows):
|
|
414
|
+
special_tokens += [f"<row_{i}>", f"</row_{i}>"]
|
|
415
|
+
|
|
416
|
+
for i in range(0, max_cols):
|
|
417
|
+
special_tokens += [f"<col_{i}>", f"</col_{i}>"]
|
|
418
|
+
|
|
419
|
+
for i in range(6):
|
|
420
|
+
special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]
|
|
421
|
+
|
|
422
|
+
# Adding dynamically generated page-tokens
|
|
423
|
+
for i in range(0, max_pages):
|
|
424
|
+
special_tokens.append(f"<page_{i}>")
|
|
425
|
+
|
|
426
|
+
# Adding dynamically generated location-tokens
|
|
427
|
+
for i in range(0, max(page_dimension[0], page_dimension[1])):
|
|
428
|
+
special_tokens.append(f"<loc_{i}>")
|
|
429
|
+
|
|
430
|
+
return special_tokens
|
|
431
|
+
|
|
432
|
+
@staticmethod
|
|
433
|
+
def get_page_token(page: int):
|
|
434
|
+
"""Function to get page tokens."""
|
|
435
|
+
return f"<page_{page}>"
|
|
436
|
+
|
|
437
|
+
@staticmethod
|
|
438
|
+
def get_location_token(val: float, rnorm: int = 100):
|
|
439
|
+
"""Function to get location tokens."""
|
|
440
|
+
val_ = round(rnorm * val)
|
|
441
|
+
|
|
442
|
+
if val_ < 0:
|
|
443
|
+
return "<loc_0>"
|
|
444
|
+
|
|
445
|
+
if val_ > rnorm:
|
|
446
|
+
return f"<loc_{rnorm}>"
|
|
447
|
+
|
|
448
|
+
return f"<loc_{val_}>"
|
|
449
|
+
|
|
450
|
+
|
|
346
451
|
class ExportedCCSDocument(
|
|
347
452
|
MinimalDocument,
|
|
348
453
|
Generic[
|
|
@@ -386,6 +491,8 @@ class ExportedCCSDocument(
|
|
|
386
491
|
@classmethod
|
|
387
492
|
def from_dict(cls, data):
|
|
388
493
|
"""Fix ref in main-text."""
|
|
494
|
+
if not isinstance(data, dict):
|
|
495
|
+
return data
|
|
389
496
|
if data.get("main-text"):
|
|
390
497
|
for item in data["main-text"]:
|
|
391
498
|
if ref := item.pop("__ref", None):
|
|
@@ -423,6 +530,14 @@ class ExportedCCSDocument(
|
|
|
423
530
|
delim: str = "\n\n",
|
|
424
531
|
main_text_start: int = 0,
|
|
425
532
|
main_text_stop: Optional[int] = None,
|
|
533
|
+
main_text_labels: list[str] = [
|
|
534
|
+
"title",
|
|
535
|
+
"subtitle-level-1",
|
|
536
|
+
"paragraph",
|
|
537
|
+
"caption",
|
|
538
|
+
"table",
|
|
539
|
+
],
|
|
540
|
+
strict_text: bool = False,
|
|
426
541
|
) -> str:
|
|
427
542
|
r"""Serialize to Markdown.
|
|
428
543
|
|
|
@@ -457,12 +572,7 @@ class ExportedCCSDocument(
|
|
|
457
572
|
continue
|
|
458
573
|
|
|
459
574
|
item_type = item.obj_type
|
|
460
|
-
if isinstance(item, BaseText) and item_type in {
|
|
461
|
-
"title",
|
|
462
|
-
"subtitle-level-1",
|
|
463
|
-
"paragraph",
|
|
464
|
-
"caption",
|
|
465
|
-
}:
|
|
575
|
+
if isinstance(item, BaseText) and item_type in main_text_labels:
|
|
466
576
|
text = item.text
|
|
467
577
|
|
|
468
578
|
# ignore repeated text
|
|
@@ -473,20 +583,31 @@ class ExportedCCSDocument(
|
|
|
473
583
|
|
|
474
584
|
# first title match
|
|
475
585
|
if item_type == "title" and not has_title:
|
|
476
|
-
|
|
586
|
+
if strict_text:
|
|
587
|
+
markdown_text = f"{text}"
|
|
588
|
+
else:
|
|
589
|
+
markdown_text = f"# {text}"
|
|
477
590
|
has_title = True
|
|
478
591
|
|
|
479
592
|
# secondary titles
|
|
480
593
|
elif item_type in {"title", "subtitle-level-1"} or (
|
|
481
594
|
has_title and item_type == "title"
|
|
482
595
|
):
|
|
483
|
-
|
|
596
|
+
if strict_text:
|
|
597
|
+
markdown_text = f"{text}"
|
|
598
|
+
else:
|
|
599
|
+
markdown_text = f"## {text}"
|
|
484
600
|
|
|
485
601
|
# normal text
|
|
486
602
|
else:
|
|
487
603
|
markdown_text = text
|
|
488
604
|
|
|
489
|
-
elif isinstance(item, Table) and item.data:
|
|
605
|
+
elif (
|
|
606
|
+
isinstance(item, Table)
|
|
607
|
+
and item.data
|
|
608
|
+
and item_type in main_text_labels
|
|
609
|
+
and not strict_text
|
|
610
|
+
):
|
|
490
611
|
table = []
|
|
491
612
|
for row in item.data:
|
|
492
613
|
tmp = []
|
|
@@ -514,3 +635,157 @@ class ExportedCCSDocument(
|
|
|
514
635
|
|
|
515
636
|
result = delim.join(md_texts)
|
|
516
637
|
return result
|
|
638
|
+
|
|
639
|
+
def export_to_document_tokens(
|
|
640
|
+
self,
|
|
641
|
+
delim: str = "\n\n",
|
|
642
|
+
main_text_start: int = 0,
|
|
643
|
+
main_text_stop: Optional[int] = None,
|
|
644
|
+
main_text_labels: list[str] = [
|
|
645
|
+
"title",
|
|
646
|
+
"subtitle-level-1",
|
|
647
|
+
"paragraph",
|
|
648
|
+
"caption",
|
|
649
|
+
"table",
|
|
650
|
+
"figure",
|
|
651
|
+
],
|
|
652
|
+
page_tagging: bool = True,
|
|
653
|
+
location_tagging: bool = True,
|
|
654
|
+
location_dimensions: Tuple[int, int] = (100, 100),
|
|
655
|
+
add_new_line: bool = True,
|
|
656
|
+
) -> str:
|
|
657
|
+
r"""Exports the document content to an DocumentToken format.
|
|
658
|
+
|
|
659
|
+
Operates on a slice of the document's main_text as defined through arguments
|
|
660
|
+
main_text_start and main_text_stop; defaulting to the whole main_text.
|
|
661
|
+
|
|
662
|
+
Args:
|
|
663
|
+
delim (str, optional): The delimiter used to separate text blocks in the
|
|
664
|
+
exported XML. Default is two newline characters ("\n\n").
|
|
665
|
+
main_text_start (int, optional): The starting index of the main text to
|
|
666
|
+
be included in the XML. Default is 0 (the beginning of the text).
|
|
667
|
+
main_text_stop (Optional[int], optional): The stopping index of the main
|
|
668
|
+
text. If set to None, the export includes text up to the end.
|
|
669
|
+
Default is None.
|
|
670
|
+
main_text_labels (list[str], optional): A list of text labels that
|
|
671
|
+
categorize the different sections of the document (e.g., "title",
|
|
672
|
+
"subtitle-level-1", "paragraph", "caption"). Default labels are
|
|
673
|
+
"title", "subtitle-level-1", "paragraph", and "caption".
|
|
674
|
+
location_tagging (bool, optional): Determines whether to include
|
|
675
|
+
location-based tagging in the XML. If True, the exported XML will
|
|
676
|
+
contain information about the locations of the text elements.
|
|
677
|
+
Default is True.
|
|
678
|
+
location_dimensions (Tuple[int, int], optional): Specifies the dimensions
|
|
679
|
+
(width and height) for the location tagging, if enabled.
|
|
680
|
+
Default is [100, 100].
|
|
681
|
+
add_new_line (bool, optional): Whether to add new line characters after
|
|
682
|
+
each text block. If True, a new line is added after each block of
|
|
683
|
+
text in the XML. Default is True.
|
|
684
|
+
|
|
685
|
+
Returns:
|
|
686
|
+
str: The content of the document formatted as an XML string.
|
|
687
|
+
"""
|
|
688
|
+
xml_str = DocumentToken.BEG_DOCUMENT.value
|
|
689
|
+
|
|
690
|
+
new_line = ""
|
|
691
|
+
if add_new_line:
|
|
692
|
+
new_line = "\n"
|
|
693
|
+
|
|
694
|
+
if self.main_text is not None:
|
|
695
|
+
for orig_item in self.main_text[main_text_start:main_text_stop]:
|
|
696
|
+
|
|
697
|
+
item = (
|
|
698
|
+
self._resolve_ref(orig_item)
|
|
699
|
+
if isinstance(orig_item, Ref)
|
|
700
|
+
else orig_item
|
|
701
|
+
)
|
|
702
|
+
|
|
703
|
+
if item is None:
|
|
704
|
+
continue
|
|
705
|
+
|
|
706
|
+
prov = item.prov
|
|
707
|
+
|
|
708
|
+
loc_str = "" # default is zero
|
|
709
|
+
if (
|
|
710
|
+
location_tagging
|
|
711
|
+
and self.page_dimensions is not None
|
|
712
|
+
and prov is not None
|
|
713
|
+
and len(prov) > 0
|
|
714
|
+
):
|
|
715
|
+
|
|
716
|
+
page = prov[0].page
|
|
717
|
+
page_dim = self.page_dimensions[page - 1]
|
|
718
|
+
|
|
719
|
+
page_w = float(page_dim.width)
|
|
720
|
+
page_h = float(page_dim.height)
|
|
721
|
+
|
|
722
|
+
x0 = float(prov[0].bbox[0]) / float(page_w)
|
|
723
|
+
y0 = float(prov[0].bbox[1]) / float(page_h)
|
|
724
|
+
x1 = float(prov[0].bbox[2]) / float(page_w)
|
|
725
|
+
y1 = float(prov[0].bbox[3]) / float(page_h)
|
|
726
|
+
|
|
727
|
+
page_tok = ""
|
|
728
|
+
if page_tagging:
|
|
729
|
+
page_tok = DocumentToken.get_page_token(page=page)
|
|
730
|
+
|
|
731
|
+
x0_tok = DocumentToken.get_location_token(
|
|
732
|
+
val=min(x0, x1), rnorm=location_dimensions[0]
|
|
733
|
+
)
|
|
734
|
+
y0_tok = DocumentToken.get_location_token(
|
|
735
|
+
val=min(y0, y1), rnorm=location_dimensions[1]
|
|
736
|
+
)
|
|
737
|
+
x1_tok = DocumentToken.get_location_token(
|
|
738
|
+
val=max(x0, x1), rnorm=location_dimensions[0]
|
|
739
|
+
)
|
|
740
|
+
y1_tok = DocumentToken.get_location_token(
|
|
741
|
+
val=max(y0, y1), rnorm=location_dimensions[1]
|
|
742
|
+
)
|
|
743
|
+
|
|
744
|
+
# update
|
|
745
|
+
loc_str = f"{DocumentToken.BEG_LOCATION.value}"
|
|
746
|
+
loc_str += f"{page_tok}"
|
|
747
|
+
loc_str += f"{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
|
|
748
|
+
loc_str += f"{DocumentToken.END_LOCATION.value}"
|
|
749
|
+
|
|
750
|
+
item_type = item.obj_type
|
|
751
|
+
if isinstance(item, BaseText) and (item_type in main_text_labels):
|
|
752
|
+
text = item.text
|
|
753
|
+
|
|
754
|
+
xml_str += f"<{item_type}>{loc_str}{text}</{item_type}>{new_line}"
|
|
755
|
+
|
|
756
|
+
elif isinstance(item, Table) and (item_type in main_text_labels):
|
|
757
|
+
|
|
758
|
+
xml_str += f"<{item_type}>{loc_str}"
|
|
759
|
+
|
|
760
|
+
if item.text is not None and len(item.text) > 0:
|
|
761
|
+
xml_str += f"{DocumentToken.BEG_CAPTION.value}"
|
|
762
|
+
xml_str += (
|
|
763
|
+
f"{item.text}{DocumentToken.END_CAPTION.value}{new_line}"
|
|
764
|
+
)
|
|
765
|
+
|
|
766
|
+
if item.data is not None and len(item.data) > 0:
|
|
767
|
+
for i, row in enumerate(item.data):
|
|
768
|
+
xml_str += f"<row_{i}>"
|
|
769
|
+
for j, col in enumerate(row):
|
|
770
|
+
text = col.text
|
|
771
|
+
xml_str += f"<col_{j}>{text}</col_{j}>"
|
|
772
|
+
|
|
773
|
+
xml_str += f"</row_{i}>{new_line}"
|
|
774
|
+
|
|
775
|
+
xml_str += f"</{item_type}>{new_line}"
|
|
776
|
+
|
|
777
|
+
elif isinstance(item, Figure) and (item_type in main_text_labels):
|
|
778
|
+
|
|
779
|
+
xml_str += f"<{item_type}>{loc_str}"
|
|
780
|
+
|
|
781
|
+
if item.text is not None and len(item.text) > 0:
|
|
782
|
+
xml_str += f"{DocumentToken.BEG_CAPTION.value}"
|
|
783
|
+
xml_str += (
|
|
784
|
+
f"{item.text}{DocumentToken.END_CAPTION.value}{new_line}"
|
|
785
|
+
)
|
|
786
|
+
|
|
787
|
+
xml_str += f"</{item_type}>{new_line}"
|
|
788
|
+
|
|
789
|
+
xml_str += DocumentToken.END_DOCUMENT.value
|
|
790
|
+
|
|
791
|
+
return xml_str
|
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
#
|
|
5
5
|
|
|
6
6
|
"""Define the model Statement."""
|
|
7
|
+
from enum import Enum
|
|
7
8
|
from typing import Generic
|
|
8
9
|
|
|
9
10
|
from pydantic import Field
|
|
@@ -21,6 +22,39 @@ from docling_core.types.rec.attribute import Attribute
|
|
|
21
22
|
from docling_core.types.rec.subject import Subject
|
|
22
23
|
|
|
23
24
|
|
|
25
|
+
class StatementToken(Enum):
|
|
26
|
+
"""Class to represent an LLM friendly representation of statements."""
|
|
27
|
+
|
|
28
|
+
BEG_STATEMENTS = "<statements>"
|
|
29
|
+
END_STATEMENTS = "</statements>"
|
|
30
|
+
|
|
31
|
+
BEG_STATEMENT = "<statement>"
|
|
32
|
+
END_STATEMENT = "</statement>"
|
|
33
|
+
|
|
34
|
+
BEG_PROV = "<prov>"
|
|
35
|
+
END_PROV = "</prov>"
|
|
36
|
+
|
|
37
|
+
BEG_SUBJECT = "<subject>"
|
|
38
|
+
END_SUBJECT = "</subject>"
|
|
39
|
+
|
|
40
|
+
BEG_PREDICATE = "<predicate>"
|
|
41
|
+
END_PREDICATE = "</predicate>"
|
|
42
|
+
|
|
43
|
+
BEG_PROPERTY = "<property>"
|
|
44
|
+
END_PROPERTY = "</property>"
|
|
45
|
+
|
|
46
|
+
BEG_VALUE = "<value>"
|
|
47
|
+
END_VALUE = "</value>"
|
|
48
|
+
|
|
49
|
+
BEG_UNIT = "<unit>"
|
|
50
|
+
END_UNIT = "</unit>"
|
|
51
|
+
|
|
52
|
+
@classmethod
|
|
53
|
+
def get_special_tokens(cls):
|
|
54
|
+
"""Function to get all special statements tokens."""
|
|
55
|
+
return [token.value for token in cls]
|
|
56
|
+
|
|
57
|
+
|
|
24
58
|
class Statement(
|
|
25
59
|
Attribute,
|
|
26
60
|
Generic[
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "docling-core"
|
|
3
|
-
version = "1.1.3"
|
|
3
|
+
version = "1.2.0"
|
|
4
4
|
description = "A python library to define and validate data types in Docling."
|
|
5
5
|
license = "MIT"
|
|
6
6
|
authors = [
|
|
@@ -51,7 +51,6 @@ jsonschema = "^4.16.0"
|
|
|
51
51
|
pydantic = "^2.6.0"
|
|
52
52
|
jsonref = "^1.1.0"
|
|
53
53
|
json-schema-for-humans = "^1.0.0"
|
|
54
|
-
poetry = "^1.8.3"
|
|
55
54
|
pyproject-toml = "^0.0.10"
|
|
56
55
|
tabulate = "^0.9.0"
|
|
57
56
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-1.1.3 → docling_core-1.2.0}/docling_core/resources/schemas/doc/OCR-output.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-1.1.3 → docling_core-1.2.0}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|