docling-core 1.0.0__tar.gz → 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-1.0.0 → docling_core-1.1.0}/PKG-INFO +2 -1
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/doc/document.py +109 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/pyproject.toml +3 -2
- {docling_core-1.0.0 → docling_core-1.1.0}/LICENSE +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/README.md +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/__init__.py +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/py.typed +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/search/__init__.py +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/search/mapping.py +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/search/meta.py +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/search/package.py +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/__init__.py +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/base.py +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/doc/base.py +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/doc/doc_ann.py +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/doc/doc_ocr.py +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/doc/doc_raw.py +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/rec/statement.py +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/utils/alias.py +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/utils/ds_generate_docs.py +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/utils/ds_generate_jsonschema.py +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/utils/validate.py +0 -0
- {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/utils/validators.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 1.0.0
|
|
3
|
+
Version: 1.1.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Home-page: https://ds4sd.github.io/
|
|
6
6
|
License: MIT
|
|
@@ -31,6 +31,7 @@ Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
|
|
|
31
31
|
Requires-Dist: poetry (>=1.8.3,<2.0.0)
|
|
32
32
|
Requires-Dist: pydantic (>=2.6.0,<3.0.0)
|
|
33
33
|
Requires-Dist: pyproject-toml (>=0.0.10,<0.0.11)
|
|
34
|
+
Requires-Dist: tabulate (>=0.9.0,<0.10.0)
|
|
34
35
|
Project-URL: Repository, https://github.com/DS4SD/docling-core
|
|
35
36
|
Description-Content-Type: text/markdown
|
|
36
37
|
|
|
@@ -16,6 +16,7 @@ from pydantic import (
|
|
|
16
16
|
StrictStr,
|
|
17
17
|
model_validator,
|
|
18
18
|
)
|
|
19
|
+
from tabulate import tabulate
|
|
19
20
|
|
|
20
21
|
from docling_core.search.mapping import es_field
|
|
21
22
|
from docling_core.types.base import (
|
|
@@ -391,3 +392,111 @@ class ExportedCCSDocument(
|
|
|
391
392
|
item["$ref"] = ref
|
|
392
393
|
|
|
393
394
|
return data
|
|
395
|
+
|
|
396
|
+
def _resolve_ref(self, item: Ref) -> Optional[Table]:
|
|
397
|
+
"""Return the resolved reference in case of table reference, otherwise None."""
|
|
398
|
+
result: Optional[Table] = None
|
|
399
|
+
|
|
400
|
+
# NOTE: currently only resolves table refs & makes assumptions on ref parts
|
|
401
|
+
if item.obj_type == "table" and self.tables:
|
|
402
|
+
parts = item.ref.split("/")
|
|
403
|
+
result = self.tables[int(parts[2])]
|
|
404
|
+
|
|
405
|
+
return result
|
|
406
|
+
|
|
407
|
+
def export_to_markdown(
|
|
408
|
+
self,
|
|
409
|
+
delim: str = "\n\n",
|
|
410
|
+
main_text_start: int = 0,
|
|
411
|
+
main_text_stop: Optional[int] = None,
|
|
412
|
+
) -> str:
|
|
413
|
+
r"""Serialize to Markdown.
|
|
414
|
+
|
|
415
|
+
Operates on a slice of the document's main_text as defined through arguments
|
|
416
|
+
main_text_start and main_text_stop; defaulting to the whole main_text.
|
|
417
|
+
|
|
418
|
+
Args:
|
|
419
|
+
delim (str, optional): Delimiter to use when concatenating the various
|
|
420
|
+
Markdown parts. Defaults to "\n\n".
|
|
421
|
+
main_text_start (int, optional): Main-text slicing start index (inclusive).
|
|
422
|
+
Defaults to 0.
|
|
423
|
+
main_text_end (Optional[int], optional): Main-text slicing stop index
|
|
424
|
+
(exclusive). Defaults to None.
|
|
425
|
+
|
|
426
|
+
Returns:
|
|
427
|
+
str: The exported Markdown representation.
|
|
428
|
+
"""
|
|
429
|
+
has_title = False
|
|
430
|
+
prev_text = ""
|
|
431
|
+
md_texts: list[str] = []
|
|
432
|
+
|
|
433
|
+
if self.main_text is not None:
|
|
434
|
+
for orig_item in self.main_text[main_text_start:main_text_stop]:
|
|
435
|
+
markdown_text = ""
|
|
436
|
+
|
|
437
|
+
item = (
|
|
438
|
+
self._resolve_ref(orig_item)
|
|
439
|
+
if isinstance(orig_item, Ref)
|
|
440
|
+
else orig_item
|
|
441
|
+
)
|
|
442
|
+
if item is None:
|
|
443
|
+
continue
|
|
444
|
+
|
|
445
|
+
item_type = item.obj_type
|
|
446
|
+
if isinstance(item, BaseText) and item_type in {
|
|
447
|
+
"title",
|
|
448
|
+
"subtitle-level-1",
|
|
449
|
+
"paragraph",
|
|
450
|
+
"caption",
|
|
451
|
+
}:
|
|
452
|
+
text = item.text
|
|
453
|
+
|
|
454
|
+
# ignore repeated text
|
|
455
|
+
if prev_text == text:
|
|
456
|
+
continue
|
|
457
|
+
else:
|
|
458
|
+
prev_text = text
|
|
459
|
+
|
|
460
|
+
# first title match
|
|
461
|
+
if item_type == "title" and not has_title:
|
|
462
|
+
markdown_text = f"# {text}"
|
|
463
|
+
has_title = True
|
|
464
|
+
|
|
465
|
+
# secondary titles
|
|
466
|
+
elif item_type in {"title", "subtitle-level-1"} or (
|
|
467
|
+
has_title and item_type == "title"
|
|
468
|
+
):
|
|
469
|
+
markdown_text = f"## {text}"
|
|
470
|
+
|
|
471
|
+
# normal text
|
|
472
|
+
else:
|
|
473
|
+
markdown_text = text
|
|
474
|
+
|
|
475
|
+
elif isinstance(item, Table) and item.data:
|
|
476
|
+
table = []
|
|
477
|
+
for row in item.data:
|
|
478
|
+
tmp = []
|
|
479
|
+
for col in row:
|
|
480
|
+
tmp.append(col.text)
|
|
481
|
+
table.append(tmp)
|
|
482
|
+
|
|
483
|
+
if len(table) > 1 and len(table[0]) > 0:
|
|
484
|
+
try:
|
|
485
|
+
md_table = tabulate(
|
|
486
|
+
table[1:], headers=table[0], tablefmt="github"
|
|
487
|
+
)
|
|
488
|
+
except ValueError:
|
|
489
|
+
md_table = tabulate(
|
|
490
|
+
table[1:],
|
|
491
|
+
headers=table[0],
|
|
492
|
+
tablefmt="github",
|
|
493
|
+
disable_numparse=True,
|
|
494
|
+
)
|
|
495
|
+
|
|
496
|
+
markdown_text = md_table
|
|
497
|
+
|
|
498
|
+
if markdown_text:
|
|
499
|
+
md_texts.append(markdown_text)
|
|
500
|
+
|
|
501
|
+
result = delim.join(md_texts)
|
|
502
|
+
return result
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "docling-core"
|
|
3
|
-
version = "1.0.0"
|
|
3
|
+
version = "1.1.0"
|
|
4
4
|
description = "A python library to define and validate data types in Docling."
|
|
5
5
|
license = "MIT"
|
|
6
6
|
authors = [
|
|
@@ -53,6 +53,7 @@ jsonref = "^1.1.0"
|
|
|
53
53
|
json-schema-for-humans = "^1.0.0"
|
|
54
54
|
poetry = "^1.8.3"
|
|
55
55
|
pyproject-toml = "^0.0.10"
|
|
56
|
+
tabulate = "^0.9.0"
|
|
56
57
|
|
|
57
58
|
[tool.poetry.group.dev.dependencies]
|
|
58
59
|
black = "^24.4.2"
|
|
@@ -111,7 +112,7 @@ python_version = "3.9"
|
|
|
111
112
|
plugins = ["pydantic.mypy"]
|
|
112
113
|
|
|
113
114
|
[[tool.mypy.overrides]]
|
|
114
|
-
module = ["jsondiff.*", "jsonref.*", "jsonschema.*", "json_schema_for_humans.*"]
|
|
115
|
+
module = ["jsondiff.*", "jsonref.*", "jsonschema.*", "json_schema_for_humans.*", "tabulate.*"]
|
|
115
116
|
ignore_missing_imports = true
|
|
116
117
|
|
|
117
118
|
[tool.semantic_release]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-1.0.0 → docling_core-1.1.0}/docling_core/resources/schemas/doc/OCR-output.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-1.0.0 → docling_core-1.1.0}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|