docling-core 2.23.0__tar.gz → 2.23.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.23.0 → docling_core-2.23.2}/PKG-INFO +5 -5
- {docling_core-2.23.0 → docling_core-2.23.2}/README.md +2 -2
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/types/doc/document.py +31 -4
- {docling_core-2.23.0 → docling_core-2.23.2}/pyproject.toml +3 -3
- {docling_core-2.23.0 → docling_core-2.23.2}/LICENSE +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/__init__.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/cli/view.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/experimental/__init__.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/experimental/serializer/__init__.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/experimental/serializer/base.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/experimental/serializer/common.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/experimental/serializer/markdown.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/py.typed +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/search/__init__.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/search/mapping.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/search/meta.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/search/package.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/types/__init__.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/types/base.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/types/doc/page.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/types/doc/utils.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/utils/alias.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/utils/file.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/utils/legacy.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/utils/validate.py +0 -0
- {docling_core-2.23.0 → docling_core-2.23.2}/docling_core/utils/validators.py +0 -0
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.23.0
|
|
3
|
+
Version: 2.23.2
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
|
-
Home-page: https://
|
|
5
|
+
Home-page: https://github.com/docling-project
|
|
6
6
|
License: MIT
|
|
7
7
|
Keywords: docling,discovery,etl,information retrieval,analytics,database,database schema,schema,JSON
|
|
8
8
|
Author: Cesar Berrospi Ramis
|
|
@@ -38,7 +38,7 @@ Requires-Dist: tabulate (>=0.9.0,<0.10.0)
|
|
|
38
38
|
Requires-Dist: transformers (>=4.34.0,<5.0.0) ; extra == "chunking"
|
|
39
39
|
Requires-Dist: typer (>=0.12.5,<0.13.0)
|
|
40
40
|
Requires-Dist: typing-extensions (>=4.12.2,<5.0.0)
|
|
41
|
-
Project-URL: Repository, https://github.com/
|
|
41
|
+
Project-URL: Repository, https://github.com/docling-project/docling-core
|
|
42
42
|
Description-Content-Type: text/markdown
|
|
43
43
|
|
|
44
44
|
# Docling Core
|
|
@@ -51,9 +51,9 @@ Description-Content-Type: text/markdown
|
|
|
51
51
|
[](https://mypy-lang.org/)
|
|
52
52
|
[](https://pydantic.dev)
|
|
53
53
|
[](https://github.com/pre-commit/pre-commit)
|
|
54
|
-
[](https://opensource.org/licenses/MIT)
|
|
55
55
|
|
|
56
|
-
Docling Core is a library that defines the data types in [Docling](https://github.com/
|
|
56
|
+
Docling Core is a library that defines the data types in [Docling](https://github.com/docling-project/docling), leveraging pydantic models.
|
|
57
57
|
|
|
58
58
|
## Installation
|
|
59
59
|
|
|
@@ -8,9 +8,9 @@
|
|
|
8
8
|
[](https://mypy-lang.org/)
|
|
9
9
|
[](https://pydantic.dev)
|
|
10
10
|
[](https://github.com/pre-commit/pre-commit)
|
|
11
|
-
[](https://opensource.org/licenses/MIT)
|
|
12
12
|
|
|
13
|
-
Docling Core is a library that defines the data types in [Docling](https://github.com/
|
|
13
|
+
Docling Core is a library that defines the data types in [Docling](https://github.com/docling-project/docling), leveraging pydantic models.
|
|
14
14
|
|
|
15
15
|
## Installation
|
|
16
16
|
|
|
@@ -1548,7 +1548,7 @@ class DoclingDocument(BaseModel):
|
|
|
1548
1548
|
|
|
1549
1549
|
_HTML_DEFAULT_HEAD: str = r"""<head>
|
|
1550
1550
|
<link rel="icon" type="image/png"
|
|
1551
|
-
href="https://
|
|
1551
|
+
href="https://raw.githubusercontent.com/docling-project/docling/refs/heads/main/docs/assets/logo.svg"/>
|
|
1552
1552
|
<meta charset="UTF-8">
|
|
1553
1553
|
<title>
|
|
1554
1554
|
Powered by Docling
|
|
@@ -3051,6 +3051,25 @@ class DoclingDocument(BaseModel):
|
|
|
3051
3051
|
"""Strip all <...> tags inside the chunk to get the raw text content."""
|
|
3052
3052
|
return re.sub(r"<.*?>", "", text_chunk, flags=re.DOTALL).strip()
|
|
3053
3053
|
|
|
3054
|
+
def extract_caption(
|
|
3055
|
+
text_chunk: str,
|
|
3056
|
+
) -> tuple[Optional[TextItem], Optional[BoundingBox]]:
|
|
3057
|
+
"""Extract caption text from the chunk."""
|
|
3058
|
+
caption = re.search(r"<caption>(.*?)</caption>", text_chunk)
|
|
3059
|
+
if caption is not None:
|
|
3060
|
+
caption_content = caption.group(1)
|
|
3061
|
+
bbox = extract_bounding_box(caption_content)
|
|
3062
|
+
caption_text = extract_inner_text(caption_content)
|
|
3063
|
+
caption_item = self.add_text(
|
|
3064
|
+
label=DocItemLabel.CAPTION,
|
|
3065
|
+
text=caption_text,
|
|
3066
|
+
parent=None,
|
|
3067
|
+
)
|
|
3068
|
+
else:
|
|
3069
|
+
caption_item = None
|
|
3070
|
+
bbox = None
|
|
3071
|
+
return caption_item, bbox
|
|
3072
|
+
|
|
3054
3073
|
def otsl_parse_texts(texts, tokens):
|
|
3055
3074
|
split_word = TableToken.OTSL_NL.value
|
|
3056
3075
|
split_row_tokens = [
|
|
@@ -3261,16 +3280,24 @@ class DoclingDocument(BaseModel):
|
|
|
3261
3280
|
if tag_name == DocumentToken.OTSL.value:
|
|
3262
3281
|
table_data = parse_table_content(full_chunk)
|
|
3263
3282
|
bbox = extract_bounding_box(full_chunk) if image else None
|
|
3264
|
-
|
|
3283
|
+
caption, caption_bbox = extract_caption(full_chunk)
|
|
3284
|
+
if caption is not None and caption_bbox is not None:
|
|
3285
|
+
caption.prov.append(
|
|
3286
|
+
ProvenanceItem(
|
|
3287
|
+
bbox=caption_bbox.resize_by_scale(pg_width, pg_height),
|
|
3288
|
+
charspan=(0, 0),
|
|
3289
|
+
page_no=page_no,
|
|
3290
|
+
)
|
|
3291
|
+
)
|
|
3265
3292
|
if bbox:
|
|
3266
3293
|
prov = ProvenanceItem(
|
|
3267
3294
|
bbox=bbox.resize_by_scale(pg_width, pg_height),
|
|
3268
3295
|
charspan=(0, 0),
|
|
3269
3296
|
page_no=page_no,
|
|
3270
3297
|
)
|
|
3271
|
-
self.add_table(data=table_data, prov=prov)
|
|
3298
|
+
self.add_table(data=table_data, prov=prov, caption=caption)
|
|
3272
3299
|
else:
|
|
3273
|
-
self.add_table(data=table_data)
|
|
3300
|
+
self.add_table(data=table_data, caption=caption)
|
|
3274
3301
|
|
|
3275
3302
|
elif tag_name == DocItemLabel.PICTURE:
|
|
3276
3303
|
text_caption_content = extract_inner_text(full_chunk)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "docling-core"
|
|
3
|
-
version = "2.23.0"
|
|
3
|
+
version = "2.23.2"
|
|
4
4
|
description = "A python library to define and validate data types in Docling."
|
|
5
5
|
license = "MIT"
|
|
6
6
|
authors = [
|
|
@@ -22,8 +22,8 @@ maintainers = [
|
|
|
22
22
|
"Panos Vagenas <pva@zurich.ibm.com>",
|
|
23
23
|
]
|
|
24
24
|
readme = "README.md"
|
|
25
|
-
homepage = "https://
|
|
26
|
-
repository = "https://github.com/
|
|
25
|
+
homepage = "https://github.com/docling-project"
|
|
26
|
+
repository = "https://github.com/docling-project/docling-core"
|
|
27
27
|
keywords = ["docling", "discovery", "etl", "information retrieval", "analytics", "database", "database schema", "schema", "JSON"]
|
|
28
28
|
classifiers=[
|
|
29
29
|
"Development Status :: 5 - Production/Stable",
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.23.0 → docling_core-2.23.2}/docling_core/experimental/serializer/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.23.0 → docling_core-2.23.2}/docling_core/experimental/serializer/markdown.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.23.0 → docling_core-2.23.2}/docling_core/resources/schemas/doc/OCR-output.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.23.0 → docling_core-2.23.2}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.23.0 → docling_core-2.23.2}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
|
File without changes
|
{docling_core-2.23.0 → docling_core-2.23.2}/docling_core/transforms/chunker/hybrid_chunker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|