docling-core 2.23.1__tar.gz → 2.23.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.23.1 → docling_core-2.23.3}/PKG-INFO +1 -1
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/experimental/serializer/markdown.py +10 -2
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/types/doc/document.py +30 -3
- {docling_core-2.23.1 → docling_core-2.23.3}/pyproject.toml +1 -1
- {docling_core-2.23.1 → docling_core-2.23.3}/LICENSE +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/README.md +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/__init__.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/cli/view.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/experimental/__init__.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/experimental/serializer/__init__.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/experimental/serializer/base.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/experimental/serializer/common.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/py.typed +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/search/__init__.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/search/mapping.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/search/meta.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/search/package.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/types/__init__.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/types/base.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/types/doc/page.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/types/doc/utils.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/utils/alias.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/utils/file.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/utils/legacy.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/utils/validate.py +0 -0
- {docling_core-2.23.1 → docling_core-2.23.3}/docling_core/utils/validators.py +0 -0
{docling_core-2.23.1 → docling_core-2.23.3}/docling_core/experimental/serializer/markdown.py
RENAMED
|
@@ -310,9 +310,17 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
|
|
|
310
310
|
is_inline_scope=is_inline_scope,
|
|
311
311
|
visited=my_visited,
|
|
312
312
|
)
|
|
313
|
+
sep = "\n"
|
|
314
|
+
my_parts: list[SerializationResult] = []
|
|
315
|
+
for p in parts:
|
|
316
|
+
if p.text and p.text[0] == " " and my_parts:
|
|
317
|
+
my_parts[-1].text = sep.join([my_parts[-1].text, p.text]) # update last
|
|
318
|
+
else:
|
|
319
|
+
my_parts.append(p)
|
|
320
|
+
|
|
313
321
|
indent_str = list_level * self.indent * " "
|
|
314
322
|
is_ol = isinstance(item, OrderedList)
|
|
315
|
-
text_res = "\n".join(
|
|
323
|
+
text_res = sep.join(
|
|
316
324
|
[
|
|
317
325
|
# avoid additional marker on already evaled sublists
|
|
318
326
|
(
|
|
@@ -320,7 +328,7 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
|
|
|
320
328
|
if c.text and c.text[0] == " "
|
|
321
329
|
else f"{indent_str}{f'{i + 1}.' if is_ol else '-'} {c.text}"
|
|
322
330
|
)
|
|
323
|
-
for i, c in enumerate(parts)
|
|
331
|
+
for i, c in enumerate(my_parts)
|
|
324
332
|
]
|
|
325
333
|
)
|
|
326
334
|
return SerializationResult(text=text_res)
|
|
{docling_core-2.23.1 → docling_core-2.23.3}/docling_core/types/doc/document.py
RENAMED
|
@@ -3051,6 +3051,25 @@ class DoclingDocument(BaseModel):
|
|
|
3051
3051
|
"""Strip all <...> tags inside the chunk to get the raw text content."""
|
|
3052
3052
|
return re.sub(r"<.*?>", "", text_chunk, flags=re.DOTALL).strip()
|
|
3053
3053
|
|
|
3054
|
+
def extract_caption(
|
|
3055
|
+
text_chunk: str,
|
|
3056
|
+
) -> tuple[Optional[TextItem], Optional[BoundingBox]]:
|
|
3057
|
+
"""Extract caption text from the chunk."""
|
|
3058
|
+
caption = re.search(r"<caption>(.*?)</caption>", text_chunk)
|
|
3059
|
+
if caption is not None:
|
|
3060
|
+
caption_content = caption.group(1)
|
|
3061
|
+
bbox = extract_bounding_box(caption_content)
|
|
3062
|
+
caption_text = extract_inner_text(caption_content)
|
|
3063
|
+
caption_item = self.add_text(
|
|
3064
|
+
label=DocItemLabel.CAPTION,
|
|
3065
|
+
text=caption_text,
|
|
3066
|
+
parent=None,
|
|
3067
|
+
)
|
|
3068
|
+
else:
|
|
3069
|
+
caption_item = None
|
|
3070
|
+
bbox = None
|
|
3071
|
+
return caption_item, bbox
|
|
3072
|
+
|
|
3054
3073
|
def otsl_parse_texts(texts, tokens):
|
|
3055
3074
|
split_word = TableToken.OTSL_NL.value
|
|
3056
3075
|
split_row_tokens = [
|
|
@@ -3261,16 +3280,24 @@ class DoclingDocument(BaseModel):
|
|
|
3261
3280
|
if tag_name == DocumentToken.OTSL.value:
|
|
3262
3281
|
table_data = parse_table_content(full_chunk)
|
|
3263
3282
|
bbox = extract_bounding_box(full_chunk) if image else None
|
|
3264
|
-
|
|
3283
|
+
caption, caption_bbox = extract_caption(full_chunk)
|
|
3284
|
+
if caption is not None and caption_bbox is not None:
|
|
3285
|
+
caption.prov.append(
|
|
3286
|
+
ProvenanceItem(
|
|
3287
|
+
bbox=caption_bbox.resize_by_scale(pg_width, pg_height),
|
|
3288
|
+
charspan=(0, 0),
|
|
3289
|
+
page_no=page_no,
|
|
3290
|
+
)
|
|
3291
|
+
)
|
|
3265
3292
|
if bbox:
|
|
3266
3293
|
prov = ProvenanceItem(
|
|
3267
3294
|
bbox=bbox.resize_by_scale(pg_width, pg_height),
|
|
3268
3295
|
charspan=(0, 0),
|
|
3269
3296
|
page_no=page_no,
|
|
3270
3297
|
)
|
|
3271
|
-
self.add_table(data=table_data, prov=prov)
|
|
3298
|
+
self.add_table(data=table_data, prov=prov, caption=caption)
|
|
3272
3299
|
else:
|
|
3273
|
-
self.add_table(data=table_data)
|
|
3300
|
+
self.add_table(data=table_data, caption=caption)
|
|
3274
3301
|
|
|
3275
3302
|
elif tag_name == DocItemLabel.PICTURE:
|
|
3276
3303
|
text_caption_content = extract_inner_text(full_chunk)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.23.1 → docling_core-2.23.3}/docling_core/experimental/serializer/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.23.1 → docling_core-2.23.3}/docling_core/resources/schemas/doc/OCR-output.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.23.1 → docling_core-2.23.3}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.23.1 → docling_core-2.23.3}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
|
File without changes
|
{docling_core-2.23.1 → docling_core-2.23.3}/docling_core/transforms/chunker/hybrid_chunker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|