docling-core 2.23.0__py3-none-any.whl → 2.23.2__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -1548,7 +1548,7 @@ class DoclingDocument(BaseModel):
1548
1548
 
1549
1549
  _HTML_DEFAULT_HEAD: str = r"""<head>
1550
1550
  <link rel="icon" type="image/png"
1551
- href="https://ds4sd.github.io/docling/assets/logo.png"/>
1551
+ href="https://raw.githubusercontent.com/docling-project/docling/refs/heads/main/docs/assets/logo.svg"/>
1552
1552
  <meta charset="UTF-8">
1553
1553
  <title>
1554
1554
  Powered by Docling
@@ -3051,6 +3051,25 @@ class DoclingDocument(BaseModel):
3051
3051
  """Strip all <...> tags inside the chunk to get the raw text content."""
3052
3052
  return re.sub(r"<.*?>", "", text_chunk, flags=re.DOTALL).strip()
3053
3053
 
3054
+ def extract_caption(
3055
+ text_chunk: str,
3056
+ ) -> tuple[Optional[TextItem], Optional[BoundingBox]]:
3057
+ """Extract caption text from the chunk."""
3058
+ caption = re.search(r"<caption>(.*?)</caption>", text_chunk)
3059
+ if caption is not None:
3060
+ caption_content = caption.group(1)
3061
+ bbox = extract_bounding_box(caption_content)
3062
+ caption_text = extract_inner_text(caption_content)
3063
+ caption_item = self.add_text(
3064
+ label=DocItemLabel.CAPTION,
3065
+ text=caption_text,
3066
+ parent=None,
3067
+ )
3068
+ else:
3069
+ caption_item = None
3070
+ bbox = None
3071
+ return caption_item, bbox
3072
+
3054
3073
  def otsl_parse_texts(texts, tokens):
3055
3074
  split_word = TableToken.OTSL_NL.value
3056
3075
  split_row_tokens = [
@@ -3261,16 +3280,24 @@ class DoclingDocument(BaseModel):
3261
3280
  if tag_name == DocumentToken.OTSL.value:
3262
3281
  table_data = parse_table_content(full_chunk)
3263
3282
  bbox = extract_bounding_box(full_chunk) if image else None
3264
-
3283
+ caption, caption_bbox = extract_caption(full_chunk)
3284
+ if caption is not None and caption_bbox is not None:
3285
+ caption.prov.append(
3286
+ ProvenanceItem(
3287
+ bbox=caption_bbox.resize_by_scale(pg_width, pg_height),
3288
+ charspan=(0, 0),
3289
+ page_no=page_no,
3290
+ )
3291
+ )
3265
3292
  if bbox:
3266
3293
  prov = ProvenanceItem(
3267
3294
  bbox=bbox.resize_by_scale(pg_width, pg_height),
3268
3295
  charspan=(0, 0),
3269
3296
  page_no=page_no,
3270
3297
  )
3271
- self.add_table(data=table_data, prov=prov)
3298
+ self.add_table(data=table_data, prov=prov, caption=caption)
3272
3299
  else:
3273
- self.add_table(data=table_data)
3300
+ self.add_table(data=table_data, caption=caption)
3274
3301
 
3275
3302
  elif tag_name == DocItemLabel.PICTURE:
3276
3303
  text_caption_content = extract_inner_text(full_chunk)
@@ -1,8 +1,8 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.23.0
3
+ Version: 2.23.2
4
4
  Summary: A python library to define and validate data types in Docling.
5
- Home-page: https://ds4sd.github.io/
5
+ Home-page: https://github.com/docling-project
6
6
  License: MIT
7
7
  Keywords: docling,discovery,etl,information retrieval,analytics,database,database schema,schema,JSON
8
8
  Author: Cesar Berrospi Ramis
@@ -38,7 +38,7 @@ Requires-Dist: tabulate (>=0.9.0,<0.10.0)
38
38
  Requires-Dist: transformers (>=4.34.0,<5.0.0) ; extra == "chunking"
39
39
  Requires-Dist: typer (>=0.12.5,<0.13.0)
40
40
  Requires-Dist: typing-extensions (>=4.12.2,<5.0.0)
41
- Project-URL: Repository, https://github.com/DS4SD/docling-core
41
+ Project-URL: Repository, https://github.com/docling-project/docling-core
42
42
  Description-Content-Type: text/markdown
43
43
 
44
44
  # Docling Core
@@ -51,9 +51,9 @@ Description-Content-Type: text/markdown
51
51
  [![Checked with mypy](https://www.mypy-lang.org/static/mypy_badge.svg)](https://mypy-lang.org/)
52
52
  [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
53
53
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
54
- [![License MIT](https://img.shields.io/github/license/ds4sd/docling-core)](https://opensource.org/licenses/MIT)
54
+ [![License MIT](https://img.shields.io/github/license/docling-project/docling-core)](https://opensource.org/licenses/MIT)
55
55
 
56
- Docling Core is a library that defines the data types in [Docling](https://github.com/DS4SD/docling), leveraging pydantic models.
56
+ Docling Core is a library that defines the data types in [Docling](https://github.com/docling-project/docling), leveraging pydantic models.
57
57
 
58
58
  ## Installation
59
59
 
@@ -29,7 +29,7 @@ docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HX
29
29
  docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
30
30
  docling_core/types/doc/__init__.py,sha256=bysJn2iwjAHwThSWDPXEdVUUij7p_ax12_nx2_0CMdg,653
31
31
  docling_core/types/doc/base.py,sha256=22U1qDlD-2ICmgzbdZrjNayoPHnq4S1ks1GRoqB7y1Q,12542
32
- docling_core/types/doc/document.py,sha256=zlB-J7qrVgYl-LhiSJMMuy31zCues6XzFAA1mQ8CiKw,127097
32
+ docling_core/types/doc/document.py,sha256=j3v1hL2O6_DzN9n8Ak0Ho46sRhElqmRXU_Gd4zqThLA,128422
33
33
  docling_core/types/doc/labels.py,sha256=0J9Gsqz-jQ4FP2yxs9wOxoTr3qg97BniFX7MJVziUmk,5684
34
34
  docling_core/types/doc/page.py,sha256=8A9sM-6mNad_JzaoaIXlfsBoo6zbw29uk7fp6j24omg,39461
35
35
  docling_core/types/doc/tokens.py,sha256=Z2FuzHWinYQzWZdTvOBsEQACAKPcBiSf777w5S9NJms,3947
@@ -62,8 +62,8 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
62
62
  docling_core/utils/legacy.py,sha256=SqNQAxl97aHfoJEsC9vZcMJg5FNkmqKPFi-wdSrnfI0,24442
63
63
  docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
64
64
  docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
65
- docling_core-2.23.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
66
- docling_core-2.23.0.dist-info/METADATA,sha256=PO2_Q0F0Coo_y-7K-pkbfC5I0up3Us-vMvFpmn4Z-fY,5803
67
- docling_core-2.23.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
68
- docling_core-2.23.0.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
69
- docling_core-2.23.0.dist-info/RECORD,,
65
+ docling_core-2.23.2.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
66
+ docling_core-2.23.2.dist-info/METADATA,sha256=3l2ExHgU9RvydsDmp-8N2TwmHNCGmaLgVmceikfOnZ0,5843
67
+ docling_core-2.23.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
68
+ docling_core-2.23.2.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
69
+ docling_core-2.23.2.dist-info/RECORD,,