docling-core 1.0.0__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -26,7 +26,7 @@ class Package(BaseModel, extra="forbid"):
26
26
  The version needs to comply with Semantic Versioning 2.0.0.
27
27
  """
28
28
 
29
- name: StrictStr
29
+ name: StrictStr = "docling-core"
30
30
  version: Annotated[str, StringConstraints(strict=True, pattern=VERSION_PATTERN)] = (
31
31
  importlib.metadata.version("docling-core")
32
32
  )
@@ -16,6 +16,7 @@ from pydantic import (
16
16
  StrictStr,
17
17
  model_validator,
18
18
  )
19
+ from tabulate import tabulate
19
20
 
20
21
  from docling_core.search.mapping import es_field
21
22
  from docling_core.types.base import (
@@ -262,7 +263,7 @@ class MinimalDocument(
262
263
  """Minimal model for a document."""
263
264
 
264
265
  name: StrictStr = Field(alias="_name")
265
- obj_type: StrictStr = Field("document", alias="type")
266
+ obj_type: Optional[StrictStr] = Field("document", alias="type")
266
267
  description: CCSDocumentDescription[
267
268
  DescriptionAdvancedT,
268
269
  DescriptionAnalyticsT,
@@ -290,7 +291,7 @@ class CCSDocument(
290
291
  ):
291
292
  """Model for a CCS-generated document."""
292
293
 
293
- obj_type: StrictStr = Field("pdf-document", alias="type")
294
+ obj_type: Optional[StrictStr] = Field("pdf-document", alias="type")
294
295
  bitmaps: Optional[list[BitmapObject]] = None
295
296
  equations: Optional[list[BaseCell]] = None
296
297
  footnotes: Optional[list[BaseText]] = None
@@ -354,7 +355,7 @@ class ExportedCCSDocument(
354
355
  ):
355
356
  """Document model for Docling."""
356
357
 
357
- obj_type: StrictStr = Field(
358
+ obj_type: Optional[StrictStr] = Field(
358
359
  "pdf-document",
359
360
  alias="type",
360
361
  json_schema_extra=es_field(type="keyword", ignore_above=8191),
@@ -391,3 +392,111 @@ class ExportedCCSDocument(
391
392
  item["$ref"] = ref
392
393
 
393
394
  return data
395
+
396
+ def _resolve_ref(self, item: Ref) -> Optional[Table]:
397
+ """Return the resolved reference in case of table reference, otherwise None."""
398
+ result: Optional[Table] = None
399
+
400
+ # NOTE: currently only resolves table refs & makes assumptions on ref parts
401
+ if item.obj_type == "table" and self.tables:
402
+ parts = item.ref.split("/")
403
+ result = self.tables[int(parts[2])]
404
+
405
+ return result
406
+
407
+ def export_to_markdown(
408
+ self,
409
+ delim: str = "\n\n",
410
+ main_text_start: int = 0,
411
+ main_text_stop: Optional[int] = None,
412
+ ) -> str:
413
+ r"""Serialize to Markdown.
414
+
415
+ Operates on a slice of the document's main_text as defined through arguments
416
+ main_text_start and main_text_stop; defaulting to the whole main_text.
417
+
418
+ Args:
419
+ delim (str, optional): Delimiter to use when concatenating the various
420
+ Markdown parts. Defaults to "\n\n".
421
+ main_text_start (int, optional): Main-text slicing start index (inclusive).
422
+ Defaults to 0.
423
+ main_text_stop (Optional[int], optional): Main-text slicing stop index
424
+ (exclusive). Defaults to None.
425
+
426
+ Returns:
427
+ str: The exported Markdown representation.
428
+ """
429
+ has_title = False
430
+ prev_text = ""
431
+ md_texts: list[str] = []
432
+
433
+ if self.main_text is not None:
434
+ for orig_item in self.main_text[main_text_start:main_text_stop]:
435
+ markdown_text = ""
436
+
437
+ item = (
438
+ self._resolve_ref(orig_item)
439
+ if isinstance(orig_item, Ref)
440
+ else orig_item
441
+ )
442
+ if item is None:
443
+ continue
444
+
445
+ item_type = item.obj_type
446
+ if isinstance(item, BaseText) and item_type in {
447
+ "title",
448
+ "subtitle-level-1",
449
+ "paragraph",
450
+ "caption",
451
+ }:
452
+ text = item.text
453
+
454
+ # ignore repeated text
455
+ if prev_text == text:
456
+ continue
457
+ else:
458
+ prev_text = text
459
+
460
+ # first title match
461
+ if item_type == "title" and not has_title:
462
+ markdown_text = f"# {text}"
463
+ has_title = True
464
+
465
+ # secondary titles
466
+ elif item_type in {"title", "subtitle-level-1"} or (
467
+ has_title and item_type == "title"
468
+ ):
469
+ markdown_text = f"## {text}"
470
+
471
+ # normal text
472
+ else:
473
+ markdown_text = text
474
+
475
+ elif isinstance(item, Table) and item.data:
476
+ table = []
477
+ for row in item.data:
478
+ tmp = []
479
+ for col in row:
480
+ tmp.append(col.text)
481
+ table.append(tmp)
482
+
483
+ if len(table) > 1 and len(table[0]) > 0:
484
+ try:
485
+ md_table = tabulate(
486
+ table[1:], headers=table[0], tablefmt="github"
487
+ )
488
+ except ValueError:
489
+ md_table = tabulate(
490
+ table[1:],
491
+ headers=table[0],
492
+ tablefmt="github",
493
+ disable_numparse=True,
494
+ )
495
+
496
+ markdown_text = md_table
497
+
498
+ if markdown_text:
499
+ md_texts.append(markdown_text)
500
+
501
+ result = delim.join(md_texts)
502
+ return result
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 1.0.0
3
+ Version: 1.1.1
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -31,39 +31,39 @@ Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
31
31
  Requires-Dist: poetry (>=1.8.3,<2.0.0)
32
32
  Requires-Dist: pydantic (>=2.6.0,<3.0.0)
33
33
  Requires-Dist: pyproject-toml (>=0.0.10,<0.0.11)
34
+ Requires-Dist: tabulate (>=0.9.0,<0.10.0)
34
35
  Project-URL: Repository, https://github.com/DS4SD/docling-core
35
36
  Description-Content-Type: text/markdown
36
37
 
38
+ # Docling Core
39
+
37
40
  [![PyPI version](https://img.shields.io/pypi/v/docling-core)](https://pypi.org/project/docling-core/)
38
- ![Python](https://img.shields.io/badge/python-3.9%20%7C%203.10%20%7C%203.11-blue)
41
+ ![Python](https://img.shields.io/badge/python-3.9%20%7C%203.10%20%7C%203.11%20%7C%203.12-blue)
39
42
  [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
40
43
  [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
41
44
  [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
42
45
  [![Checked with mypy](https://www.mypy-lang.org/static/mypy_badge.svg)](https://mypy-lang.org/)
43
46
  [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
44
47
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
45
- [![License MIT](https://img.shields.io/github/license/ds4sd/deepsearch-toolkit)](https://opensource.org/licenses/MIT)
46
-
47
- # Docling Core
48
+ [![License MIT](https://img.shields.io/github/license/ds4sd/docling-core)](https://opensource.org/licenses/MIT)
48
49
 
49
50
  Docling Core is a library that defines the data types in [Docling](https://ds4sd.github.io), leveraging pydantic models.
50
51
 
51
52
  ## Installation
52
53
 
53
- Using [Poetry](https://python-poetry.org), create and activate a virtual environment.
54
-
55
- ```
56
- poetry shell
54
+ To use Docling Core, simply install `docling-core` from your package manager, e.g. pip:
55
+ ```bash
56
+ pip install docling-core
57
57
  ```
58
58
 
59
- Install the defined dependencies of the project.
59
+ ### Development setup
60
60
 
61
- ```
61
+ To develop for Docling Core, you need Python 3.9 / 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
62
+ ```bash
62
63
  poetry install
63
64
  ```
64
65
 
65
- Test the installation running the pytest suite.
66
-
66
+ To run the pytest suite, execute:
67
67
  ```
68
68
  poetry run pytest test
69
69
  ```
@@ -113,7 +113,7 @@ Please read [Contributing to Docling Core](./CONTRIBUTING.md) for details.
113
113
 
114
114
  ## References
115
115
 
116
- If you use `Docling Core` in your projects, please consider citing the following:
116
+ If you use Docling Core in your projects, please consider citing the following:
117
117
 
118
118
  ```bib
119
119
  @software{Docling,
@@ -128,6 +128,6 @@ year = {2024}
128
128
 
129
129
  ## License
130
130
 
131
- The `Docling Core` codebase is under MIT license.
131
+ The Docling Core codebase is under MIT license.
132
132
  For individual model usage, please refer to the model licenses found in the original packages.
133
133
 
@@ -12,7 +12,7 @@ docling_core/search/__init__.py,sha256=RucCUQjDlTZ7VfgbfnKDRBL-A-_Lcc2JWWeiVWHto
12
12
  docling_core/search/json_schema_to_search_mapper.py,sha256=9crSFuSbcXrJej7j1rYWK6b0x37cHDmPF6va5j3gknA,12933
13
13
  docling_core/search/mapping.py,sha256=6rqG7LgYSeWmooKNEcRa5gFDLp1ZdzPqDGlwTA5gpOk,724
14
14
  docling_core/search/meta.py,sha256=wSurrsqdP1N3gQKx027fVdzVmc33a7Y6rPl-FClQvtA,3318
15
- docling_core/search/package.py,sha256=Q0_FAWFt71_g0ifcFkCuXEpVAgpVFiT9mOdzq1fqeDM,1824
15
+ docling_core/search/package.py,sha256=Lz2ml2eDy5t0ZimnGTq-DXHAn-f18w0bn4H5xrhs75A,1841
16
16
  docling_core/types/__init__.py,sha256=6mrAEKRW85uHJwNQBufwjPcMWCjm3oocA6MaO4_NLgg,805
17
17
  docling_core/types/base.py,sha256=fNtfQ20NKa_RBNBWbq0DfO8o0zC1Cec8UAMu0Znsltk,8170
18
18
  docling_core/types/doc/__init__.py,sha256=Pzj_8rft6SJTVTCHgXRwHtuZjL6LK_6dcBWjikL9biY,125
@@ -20,7 +20,7 @@ docling_core/types/doc/base.py,sha256=-j4vVs3JZuaUjm0fHIkLU9TD_4IZXQuGouLrddEAwP
20
20
  docling_core/types/doc/doc_ann.py,sha256=8pV2efUglw19jxl4_oqB__mSxjWvtGIcllyCdqA-b2s,1196
21
21
  docling_core/types/doc/doc_ocr.py,sha256=6PC0C-OczF-MyfgRxEI1xs3PWgNOzi7i2yEQbTqZz0I,1387
22
22
  docling_core/types/doc/doc_raw.py,sha256=Y69G6IiauNDaoT-5el4xo1ypWpnBJQ75akGGkCMTZSc,3888
23
- docling_core/types/doc/document.py,sha256=cMduCiFkPVCmXQehvNkXqXtDiXJJtB72o7_LZXz_S6I,12549
23
+ docling_core/types/doc/document.py,sha256=nkJbdYLm0NEuYwvB0UumQCS1Gb_8dHJPXWh9HE1ot4I,16466
24
24
  docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
25
25
  docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
26
26
  docling_core/types/nlp/__init__.py,sha256=hGcztAeVK7xkRBqRRvc4zbY4PGeJ0r0QrEsetnSx9nI,119
@@ -39,8 +39,8 @@ docling_core/utils/ds_generate_docs.py,sha256=0xGBagdC_PGjyeHXYZo90VnVrSTMZgHb0S
39
39
  docling_core/utils/ds_generate_jsonschema.py,sha256=EhNQutqWJFWuN-yl9UUPFZ7DJTvGqg54qBIvUMHTHdA,1647
40
40
  docling_core/utils/validate.py,sha256=3FmnxnKTDZC5J9OGxCL3U3DGRl0t0bBV1NcySXswdas,2031
41
41
  docling_core/utils/validators.py,sha256=fBdyWX4PvFh7o_d25ZTs4iwmeo75QTbrxsvXv2kXkTg,2777
42
- docling_core-1.0.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
43
- docling_core-1.0.0.dist-info/METADATA,sha256=RPsZbjVEs0aIfMYDmK25CxR1b77iRCqbu8WbodN4l9g,5174
44
- docling_core-1.0.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
45
- docling_core-1.0.0.dist-info/entry_points.txt,sha256=XHhtJEkdUuLxXSNxLdFIzx_siQ3z2UFQEKp-P8VYAE4,189
46
- docling_core-1.0.0.dist-info/RECORD,,
42
+ docling_core-1.1.1.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
43
+ docling_core-1.1.1.dist-info/METADATA,sha256=IyDQiHgqcwSgNkZU0WiK-2_6kqk6A5KKNi8AUmyZ0nc,5335
44
+ docling_core-1.1.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
45
+ docling_core-1.1.1.dist-info/entry_points.txt,sha256=XHhtJEkdUuLxXSNxLdFIzx_siQ3z2UFQEKp-P8VYAE4,189
46
+ docling_core-1.1.1.dist-info/RECORD,,