docling-core 2.23.1__tar.gz → 2.23.2__tar.gz
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.23.1 → docling_core-2.23.2}/PKG-INFO +1 -1
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/doc/document.py +30 -3
- {docling_core-2.23.1 → docling_core-2.23.2}/pyproject.toml +1 -1
- {docling_core-2.23.1 → docling_core-2.23.2}/LICENSE +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/README.md +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/__init__.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/cli/view.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/experimental/__init__.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/experimental/serializer/__init__.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/experimental/serializer/base.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/experimental/serializer/common.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/experimental/serializer/markdown.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/py.typed +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/search/__init__.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/search/mapping.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/search/meta.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/search/package.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/__init__.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/base.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/doc/page.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/doc/utils.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/utils/alias.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/utils/file.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/utils/legacy.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/utils/validate.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/utils/validators.py +0 -0
|
@@ -3051,6 +3051,25 @@ class DoclingDocument(BaseModel):
|
|
|
3051
3051
|
"""Strip all <...> tags inside the chunk to get the raw text content."""
|
|
3052
3052
|
return re.sub(r"<.*?>", "", text_chunk, flags=re.DOTALL).strip()
|
|
3053
3053
|
|
|
3054
|
+
def extract_caption(
|
|
3055
|
+
text_chunk: str,
|
|
3056
|
+
) -> tuple[Optional[TextItem], Optional[BoundingBox]]:
|
|
3057
|
+
"""Extract caption text from the chunk."""
|
|
3058
|
+
caption = re.search(r"<caption>(.*?)</caption>", text_chunk)
|
|
3059
|
+
if caption is not None:
|
|
3060
|
+
caption_content = caption.group(1)
|
|
3061
|
+
bbox = extract_bounding_box(caption_content)
|
|
3062
|
+
caption_text = extract_inner_text(caption_content)
|
|
3063
|
+
caption_item = self.add_text(
|
|
3064
|
+
label=DocItemLabel.CAPTION,
|
|
3065
|
+
text=caption_text,
|
|
3066
|
+
parent=None,
|
|
3067
|
+
)
|
|
3068
|
+
else:
|
|
3069
|
+
caption_item = None
|
|
3070
|
+
bbox = None
|
|
3071
|
+
return caption_item, bbox
|
|
3072
|
+
|
|
3054
3073
|
def otsl_parse_texts(texts, tokens):
|
|
3055
3074
|
split_word = TableToken.OTSL_NL.value
|
|
3056
3075
|
split_row_tokens = [
|
|
@@ -3261,16 +3280,24 @@ class DoclingDocument(BaseModel):
|
|
|
3261
3280
|
if tag_name == DocumentToken.OTSL.value:
|
|
3262
3281
|
table_data = parse_table_content(full_chunk)
|
|
3263
3282
|
bbox = extract_bounding_box(full_chunk) if image else None
|
|
3264
|
-
|
|
3283
|
+
caption, caption_bbox = extract_caption(full_chunk)
|
|
3284
|
+
if caption is not None and caption_bbox is not None:
|
|
3285
|
+
caption.prov.append(
|
|
3286
|
+
ProvenanceItem(
|
|
3287
|
+
bbox=caption_bbox.resize_by_scale(pg_width, pg_height),
|
|
3288
|
+
charspan=(0, 0),
|
|
3289
|
+
page_no=page_no,
|
|
3290
|
+
)
|
|
3291
|
+
)
|
|
3265
3292
|
if bbox:
|
|
3266
3293
|
prov = ProvenanceItem(
|
|
3267
3294
|
bbox=bbox.resize_by_scale(pg_width, pg_height),
|
|
3268
3295
|
charspan=(0, 0),
|
|
3269
3296
|
page_no=page_no,
|
|
3270
3297
|
)
|
|
3271
|
-
self.add_table(data=table_data, prov=prov)
|
|
3298
|
+
self.add_table(data=table_data, prov=prov, caption=caption)
|
|
3272
3299
|
else:
|
|
3273
|
-
self.add_table(data=table_data)
|
|
3300
|
+
self.add_table(data=table_data, caption=caption)
|
|
3274
3301
|
|
|
3275
3302
|
elif tag_name == DocItemLabel.PICTURE:
|
|
3276
3303
|
text_caption_content = extract_inner_text(full_chunk)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.23.1 → docling_core-2.23.2}/docling_core/experimental/serializer/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.23.1 → docling_core-2.23.2}/docling_core/experimental/serializer/markdown.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.23.1 → docling_core-2.23.2}/docling_core/resources/schemas/doc/OCR-output.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.23.1 → docling_core-2.23.2}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.23.1 → docling_core-2.23.2}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
|
File without changes
|
{docling_core-2.23.1 → docling_core-2.23.2}/docling_core/transforms/chunker/hybrid_chunker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|