docling-core 1.0.0__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/search/package.py +1 -1
- docling_core/types/doc/document.py +112 -3
- {docling_core-1.0.0.dist-info → docling_core-1.1.1.dist-info}/METADATA +15 -15
- {docling_core-1.0.0.dist-info → docling_core-1.1.1.dist-info}/RECORD +7 -7
- {docling_core-1.0.0.dist-info → docling_core-1.1.1.dist-info}/LICENSE +0 -0
- {docling_core-1.0.0.dist-info → docling_core-1.1.1.dist-info}/WHEEL +0 -0
- {docling_core-1.0.0.dist-info → docling_core-1.1.1.dist-info}/entry_points.txt +0 -0
docling_core/search/package.py
CHANGED
|
@@ -26,7 +26,7 @@ class Package(BaseModel, extra="forbid"):
|
|
|
26
26
|
The version needs to comply with Semantic Versioning 2.0.0.
|
|
27
27
|
"""
|
|
28
28
|
|
|
29
|
-
name: StrictStr
|
|
29
|
+
name: StrictStr = "docling-core"
|
|
30
30
|
version: Annotated[str, StringConstraints(strict=True, pattern=VERSION_PATTERN)] = (
|
|
31
31
|
importlib.metadata.version("docling-core")
|
|
32
32
|
)
|
|
@@ -16,6 +16,7 @@ from pydantic import (
|
|
|
16
16
|
StrictStr,
|
|
17
17
|
model_validator,
|
|
18
18
|
)
|
|
19
|
+
from tabulate import tabulate
|
|
19
20
|
|
|
20
21
|
from docling_core.search.mapping import es_field
|
|
21
22
|
from docling_core.types.base import (
|
|
@@ -262,7 +263,7 @@ class MinimalDocument(
|
|
|
262
263
|
"""Minimal model for a document."""
|
|
263
264
|
|
|
264
265
|
name: StrictStr = Field(alias="_name")
|
|
265
|
-
obj_type: StrictStr = Field("document", alias="type")
|
|
266
|
+
obj_type: Optional[StrictStr] = Field("document", alias="type")
|
|
266
267
|
description: CCSDocumentDescription[
|
|
267
268
|
DescriptionAdvancedT,
|
|
268
269
|
DescriptionAnalyticsT,
|
|
@@ -290,7 +291,7 @@ class CCSDocument(
|
|
|
290
291
|
):
|
|
291
292
|
"""Model for a CCS-generated document."""
|
|
292
293
|
|
|
293
|
-
obj_type: StrictStr = Field("pdf-document", alias="type")
|
|
294
|
+
obj_type: Optional[StrictStr] = Field("pdf-document", alias="type")
|
|
294
295
|
bitmaps: Optional[list[BitmapObject]] = None
|
|
295
296
|
equations: Optional[list[BaseCell]] = None
|
|
296
297
|
footnotes: Optional[list[BaseText]] = None
|
|
@@ -354,7 +355,7 @@ class ExportedCCSDocument(
|
|
|
354
355
|
):
|
|
355
356
|
"""Document model for Docling."""
|
|
356
357
|
|
|
357
|
-
obj_type: StrictStr = Field(
|
|
358
|
+
obj_type: Optional[StrictStr] = Field(
|
|
358
359
|
"pdf-document",
|
|
359
360
|
alias="type",
|
|
360
361
|
json_schema_extra=es_field(type="keyword", ignore_above=8191),
|
|
@@ -391,3 +392,111 @@ class ExportedCCSDocument(
|
|
|
391
392
|
item["$ref"] = ref
|
|
392
393
|
|
|
393
394
|
return data
|
|
395
|
+
|
|
396
|
+
def _resolve_ref(self, item: Ref) -> Optional[Table]:
|
|
397
|
+
"""Return the resolved reference in case of table reference, otherwise None."""
|
|
398
|
+
result: Optional[Table] = None
|
|
399
|
+
|
|
400
|
+
# NOTE: currently only resolves table refs & makes assumptions on ref parts
|
|
401
|
+
if item.obj_type == "table" and self.tables:
|
|
402
|
+
parts = item.ref.split("/")
|
|
403
|
+
result = self.tables[int(parts[2])]
|
|
404
|
+
|
|
405
|
+
return result
|
|
406
|
+
|
|
407
|
+
def export_to_markdown(
|
|
408
|
+
self,
|
|
409
|
+
delim: str = "\n\n",
|
|
410
|
+
main_text_start: int = 0,
|
|
411
|
+
main_text_stop: Optional[int] = None,
|
|
412
|
+
) -> str:
|
|
413
|
+
r"""Serialize to Markdown.
|
|
414
|
+
|
|
415
|
+
Operates on a slice of the document's main_text as defined through arguments
|
|
416
|
+
main_text_start and main_text_stop; defaulting to the whole main_text.
|
|
417
|
+
|
|
418
|
+
Args:
|
|
419
|
+
delim (str, optional): Delimiter to use when concatenating the various
|
|
420
|
+
Markdown parts. Defaults to "\n\n".
|
|
421
|
+
main_text_start (int, optional): Main-text slicing start index (inclusive).
|
|
422
|
+
Defaults to 0.
|
|
423
|
+
main_text_end (Optional[int], optional): Main-text slicing stop index
|
|
424
|
+
(exclusive). Defaults to None.
|
|
425
|
+
|
|
426
|
+
Returns:
|
|
427
|
+
str: The exported Markdown representation.
|
|
428
|
+
"""
|
|
429
|
+
has_title = False
|
|
430
|
+
prev_text = ""
|
|
431
|
+
md_texts: list[str] = []
|
|
432
|
+
|
|
433
|
+
if self.main_text is not None:
|
|
434
|
+
for orig_item in self.main_text[main_text_start:main_text_stop]:
|
|
435
|
+
markdown_text = ""
|
|
436
|
+
|
|
437
|
+
item = (
|
|
438
|
+
self._resolve_ref(orig_item)
|
|
439
|
+
if isinstance(orig_item, Ref)
|
|
440
|
+
else orig_item
|
|
441
|
+
)
|
|
442
|
+
if item is None:
|
|
443
|
+
continue
|
|
444
|
+
|
|
445
|
+
item_type = item.obj_type
|
|
446
|
+
if isinstance(item, BaseText) and item_type in {
|
|
447
|
+
"title",
|
|
448
|
+
"subtitle-level-1",
|
|
449
|
+
"paragraph",
|
|
450
|
+
"caption",
|
|
451
|
+
}:
|
|
452
|
+
text = item.text
|
|
453
|
+
|
|
454
|
+
# ignore repeated text
|
|
455
|
+
if prev_text == text:
|
|
456
|
+
continue
|
|
457
|
+
else:
|
|
458
|
+
prev_text = text
|
|
459
|
+
|
|
460
|
+
# first title match
|
|
461
|
+
if item_type == "title" and not has_title:
|
|
462
|
+
markdown_text = f"# {text}"
|
|
463
|
+
has_title = True
|
|
464
|
+
|
|
465
|
+
# secondary titles
|
|
466
|
+
elif item_type in {"title", "subtitle-level-1"} or (
|
|
467
|
+
has_title and item_type == "title"
|
|
468
|
+
):
|
|
469
|
+
markdown_text = f"## {text}"
|
|
470
|
+
|
|
471
|
+
# normal text
|
|
472
|
+
else:
|
|
473
|
+
markdown_text = text
|
|
474
|
+
|
|
475
|
+
elif isinstance(item, Table) and item.data:
|
|
476
|
+
table = []
|
|
477
|
+
for row in item.data:
|
|
478
|
+
tmp = []
|
|
479
|
+
for col in row:
|
|
480
|
+
tmp.append(col.text)
|
|
481
|
+
table.append(tmp)
|
|
482
|
+
|
|
483
|
+
if len(table) > 1 and len(table[0]) > 0:
|
|
484
|
+
try:
|
|
485
|
+
md_table = tabulate(
|
|
486
|
+
table[1:], headers=table[0], tablefmt="github"
|
|
487
|
+
)
|
|
488
|
+
except ValueError:
|
|
489
|
+
md_table = tabulate(
|
|
490
|
+
table[1:],
|
|
491
|
+
headers=table[0],
|
|
492
|
+
tablefmt="github",
|
|
493
|
+
disable_numparse=True,
|
|
494
|
+
)
|
|
495
|
+
|
|
496
|
+
markdown_text = md_table
|
|
497
|
+
|
|
498
|
+
if markdown_text:
|
|
499
|
+
md_texts.append(markdown_text)
|
|
500
|
+
|
|
501
|
+
result = delim.join(md_texts)
|
|
502
|
+
return result
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 1.0.0
|
|
3
|
+
Version: 1.1.1
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Home-page: https://ds4sd.github.io/
|
|
6
6
|
License: MIT
|
|
@@ -31,39 +31,39 @@ Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
|
|
|
31
31
|
Requires-Dist: poetry (>=1.8.3,<2.0.0)
|
|
32
32
|
Requires-Dist: pydantic (>=2.6.0,<3.0.0)
|
|
33
33
|
Requires-Dist: pyproject-toml (>=0.0.10,<0.0.11)
|
|
34
|
+
Requires-Dist: tabulate (>=0.9.0,<0.10.0)
|
|
34
35
|
Project-URL: Repository, https://github.com/DS4SD/docling-core
|
|
35
36
|
Description-Content-Type: text/markdown
|
|
36
37
|
|
|
38
|
+
# Docling Core
|
|
39
|
+
|
|
37
40
|
[](https://pypi.org/project/docling-core/)
|
|
38
|
-

|
|
41
|
+

|
|
39
42
|
[](https://python-poetry.org/)
|
|
40
43
|
[](https://github.com/psf/black)
|
|
41
44
|
[](https://pycqa.github.io/isort/)
|
|
42
45
|
[](https://mypy-lang.org/)
|
|
43
46
|
[](https://pydantic.dev)
|
|
44
47
|
[](https://github.com/pre-commit/pre-commit)
|
|
45
|
-
[](https://opensource.org/licenses/MIT)
|
|
48
49
|
|
|
49
50
|
Docling Core is a library that defines the data types in [Docling](https://ds4sd.github.io), leveraging pydantic models.
|
|
50
51
|
|
|
51
52
|
## Installation
|
|
52
53
|
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
poetry shell
|
|
54
|
+
To use Docling Core, simply install `docling-core` from your package manager, e.g. pip:
|
|
55
|
+
```bash
|
|
56
|
+
pip install docling-core
|
|
57
57
|
```
|
|
58
58
|
|
|
59
|
-
|
|
59
|
+
### Development setup
|
|
60
60
|
|
|
61
|
-
|
|
61
|
+
To develop for Docling Core, you need Python 3.9 / 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
|
|
62
|
+
```bash
|
|
62
63
|
poetry install
|
|
63
64
|
```
|
|
64
65
|
|
|
65
|
-
|
|
66
|
-
|
|
66
|
+
To run the pytest suite, execute:
|
|
67
67
|
```
|
|
68
68
|
poetry run pytest test
|
|
69
69
|
```
|
|
@@ -113,7 +113,7 @@ Please read [Contributing to Docling Core](./CONTRIBUTING.md) for details.
|
|
|
113
113
|
|
|
114
114
|
## References
|
|
115
115
|
|
|
116
|
-
If you use
|
|
116
|
+
If you use Docling Core in your projects, please consider citing the following:
|
|
117
117
|
|
|
118
118
|
```bib
|
|
119
119
|
@software{Docling,
|
|
@@ -128,6 +128,6 @@ year = {2024}
|
|
|
128
128
|
|
|
129
129
|
## License
|
|
130
130
|
|
|
131
|
-
The
|
|
131
|
+
The Docling Core codebase is under MIT license.
|
|
132
132
|
For individual model usage, please refer to the model licenses found in the original packages.
|
|
133
133
|
|
|
@@ -12,7 +12,7 @@ docling_core/search/__init__.py,sha256=RucCUQjDlTZ7VfgbfnKDRBL-A-_Lcc2JWWeiVWHto
|
|
|
12
12
|
docling_core/search/json_schema_to_search_mapper.py,sha256=9crSFuSbcXrJej7j1rYWK6b0x37cHDmPF6va5j3gknA,12933
|
|
13
13
|
docling_core/search/mapping.py,sha256=6rqG7LgYSeWmooKNEcRa5gFDLp1ZdzPqDGlwTA5gpOk,724
|
|
14
14
|
docling_core/search/meta.py,sha256=wSurrsqdP1N3gQKx027fVdzVmc33a7Y6rPl-FClQvtA,3318
|
|
15
|
-
docling_core/search/package.py,sha256=
|
|
15
|
+
docling_core/search/package.py,sha256=Lz2ml2eDy5t0ZimnGTq-DXHAn-f18w0bn4H5xrhs75A,1841
|
|
16
16
|
docling_core/types/__init__.py,sha256=6mrAEKRW85uHJwNQBufwjPcMWCjm3oocA6MaO4_NLgg,805
|
|
17
17
|
docling_core/types/base.py,sha256=fNtfQ20NKa_RBNBWbq0DfO8o0zC1Cec8UAMu0Znsltk,8170
|
|
18
18
|
docling_core/types/doc/__init__.py,sha256=Pzj_8rft6SJTVTCHgXRwHtuZjL6LK_6dcBWjikL9biY,125
|
|
@@ -20,7 +20,7 @@ docling_core/types/doc/base.py,sha256=-j4vVs3JZuaUjm0fHIkLU9TD_4IZXQuGouLrddEAwP
|
|
|
20
20
|
docling_core/types/doc/doc_ann.py,sha256=8pV2efUglw19jxl4_oqB__mSxjWvtGIcllyCdqA-b2s,1196
|
|
21
21
|
docling_core/types/doc/doc_ocr.py,sha256=6PC0C-OczF-MyfgRxEI1xs3PWgNOzi7i2yEQbTqZz0I,1387
|
|
22
22
|
docling_core/types/doc/doc_raw.py,sha256=Y69G6IiauNDaoT-5el4xo1ypWpnBJQ75akGGkCMTZSc,3888
|
|
23
|
-
docling_core/types/doc/document.py,sha256=
|
|
23
|
+
docling_core/types/doc/document.py,sha256=nkJbdYLm0NEuYwvB0UumQCS1Gb_8dHJPXWh9HE1ot4I,16466
|
|
24
24
|
docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
|
|
25
25
|
docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
|
|
26
26
|
docling_core/types/nlp/__init__.py,sha256=hGcztAeVK7xkRBqRRvc4zbY4PGeJ0r0QrEsetnSx9nI,119
|
|
@@ -39,8 +39,8 @@ docling_core/utils/ds_generate_docs.py,sha256=0xGBagdC_PGjyeHXYZo90VnVrSTMZgHb0S
|
|
|
39
39
|
docling_core/utils/ds_generate_jsonschema.py,sha256=EhNQutqWJFWuN-yl9UUPFZ7DJTvGqg54qBIvUMHTHdA,1647
|
|
40
40
|
docling_core/utils/validate.py,sha256=3FmnxnKTDZC5J9OGxCL3U3DGRl0t0bBV1NcySXswdas,2031
|
|
41
41
|
docling_core/utils/validators.py,sha256=fBdyWX4PvFh7o_d25ZTs4iwmeo75QTbrxsvXv2kXkTg,2777
|
|
42
|
-
docling_core-1.0.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
43
|
-
docling_core-1.0.0.dist-info/METADATA,sha256=
|
|
44
|
-
docling_core-1.0.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
45
|
-
docling_core-1.0.0.dist-info/entry_points.txt,sha256=XHhtJEkdUuLxXSNxLdFIzx_siQ3z2UFQEKp-P8VYAE4,189
|
|
46
|
-
docling_core-1.0.0.dist-info/RECORD,,
|
|
42
|
+
docling_core-1.1.1.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
43
|
+
docling_core-1.1.1.dist-info/METADATA,sha256=IyDQiHgqcwSgNkZU0WiK-2_6kqk6A5KKNi8AUmyZ0nc,5335
|
|
44
|
+
docling_core-1.1.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
45
|
+
docling_core-1.1.1.dist-info/entry_points.txt,sha256=XHhtJEkdUuLxXSNxLdFIzx_siQ3z2UFQEKp-P8VYAE4,189
|
|
46
|
+
docling_core-1.1.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|