docling-core 2.23.1__tar.gz → 2.23.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic.

Files changed (68)
  1. {docling_core-2.23.1 → docling_core-2.23.2}/PKG-INFO +1 -1
  2. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/doc/document.py +30 -3
  3. {docling_core-2.23.1 → docling_core-2.23.2}/pyproject.toml +1 -1
  4. {docling_core-2.23.1 → docling_core-2.23.2}/LICENSE +0 -0
  5. {docling_core-2.23.1 → docling_core-2.23.2}/README.md +0 -0
  6. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/__init__.py +0 -0
  7. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/cli/__init__.py +0 -0
  8. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/cli/view.py +0 -0
  9. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/experimental/__init__.py +0 -0
  10. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/experimental/serializer/__init__.py +0 -0
  11. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/experimental/serializer/base.py +0 -0
  12. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/experimental/serializer/common.py +0 -0
  13. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/experimental/serializer/markdown.py +0 -0
  14. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/py.typed +0 -0
  15. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/resources/schemas/doc/ANN.json +0 -0
  16. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/resources/schemas/doc/DOC.json +0 -0
  17. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  18. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/resources/schemas/doc/RAW.json +0 -0
  19. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  20. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  21. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  22. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  23. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/search/__init__.py +0 -0
  24. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  25. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/search/mapping.py +0 -0
  26. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/search/meta.py +0 -0
  27. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/search/package.py +0 -0
  28. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/transforms/__init__.py +0 -0
  29. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/transforms/chunker/__init__.py +0 -0
  30. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/transforms/chunker/base.py +0 -0
  31. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  32. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
  33. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/__init__.py +0 -0
  34. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/base.py +0 -0
  35. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/doc/__init__.py +0 -0
  36. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/doc/base.py +0 -0
  37. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/doc/labels.py +0 -0
  38. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/doc/page.py +0 -0
  39. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/doc/tokens.py +0 -0
  40. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/doc/utils.py +0 -0
  41. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/gen/__init__.py +0 -0
  42. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/gen/generic.py +0 -0
  43. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/io/__init__.py +0 -0
  44. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/legacy_doc/__init__.py +0 -0
  45. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/legacy_doc/base.py +0 -0
  46. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  47. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  48. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  49. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/legacy_doc/document.py +0 -0
  50. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/legacy_doc/tokens.py +0 -0
  51. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/nlp/__init__.py +0 -0
  52. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/nlp/qa.py +0 -0
  53. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/nlp/qa_labels.py +0 -0
  54. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/rec/__init__.py +0 -0
  55. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/rec/attribute.py +0 -0
  56. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/rec/base.py +0 -0
  57. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/rec/predicate.py +0 -0
  58. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/rec/record.py +0 -0
  59. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/rec/statement.py +0 -0
  60. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/types/rec/subject.py +0 -0
  61. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/utils/__init__.py +0 -0
  62. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/utils/alias.py +0 -0
  63. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/utils/file.py +0 -0
  64. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/utils/generate_docs.py +0 -0
  65. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/utils/generate_jsonschema.py +0 -0
  66. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/utils/legacy.py +0 -0
  67. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/utils/validate.py +0 -0
  68. {docling_core-2.23.1 → docling_core-2.23.2}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.23.1
3
+ Version: 2.23.2
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://github.com/docling-project
6
6
  License: MIT
@@ -3051,6 +3051,25 @@ class DoclingDocument(BaseModel):
3051
3051
  """Strip all <...> tags inside the chunk to get the raw text content."""
3052
3052
  return re.sub(r"<.*?>", "", text_chunk, flags=re.DOTALL).strip()
3053
3053
 
3054
+ def extract_caption(
3055
+ text_chunk: str,
3056
+ ) -> tuple[Optional[TextItem], Optional[BoundingBox]]:
3057
+ """Extract caption text from the chunk."""
3058
+ caption = re.search(r"<caption>(.*?)</caption>", text_chunk)
3059
+ if caption is not None:
3060
+ caption_content = caption.group(1)
3061
+ bbox = extract_bounding_box(caption_content)
3062
+ caption_text = extract_inner_text(caption_content)
3063
+ caption_item = self.add_text(
3064
+ label=DocItemLabel.CAPTION,
3065
+ text=caption_text,
3066
+ parent=None,
3067
+ )
3068
+ else:
3069
+ caption_item = None
3070
+ bbox = None
3071
+ return caption_item, bbox
3072
+
3054
3073
  def otsl_parse_texts(texts, tokens):
3055
3074
  split_word = TableToken.OTSL_NL.value
3056
3075
  split_row_tokens = [
@@ -3261,16 +3280,24 @@ class DoclingDocument(BaseModel):
3261
3280
  if tag_name == DocumentToken.OTSL.value:
3262
3281
  table_data = parse_table_content(full_chunk)
3263
3282
  bbox = extract_bounding_box(full_chunk) if image else None
3264
-
3283
+ caption, caption_bbox = extract_caption(full_chunk)
3284
+ if caption is not None and caption_bbox is not None:
3285
+ caption.prov.append(
3286
+ ProvenanceItem(
3287
+ bbox=caption_bbox.resize_by_scale(pg_width, pg_height),
3288
+ charspan=(0, 0),
3289
+ page_no=page_no,
3290
+ )
3291
+ )
3265
3292
  if bbox:
3266
3293
  prov = ProvenanceItem(
3267
3294
  bbox=bbox.resize_by_scale(pg_width, pg_height),
3268
3295
  charspan=(0, 0),
3269
3296
  page_no=page_no,
3270
3297
  )
3271
- self.add_table(data=table_data, prov=prov)
3298
+ self.add_table(data=table_data, prov=prov, caption=caption)
3272
3299
  else:
3273
- self.add_table(data=table_data)
3300
+ self.add_table(data=table_data, caption=caption)
3274
3301
 
3275
3302
  elif tag_name == DocItemLabel.PICTURE:
3276
3303
  text_caption_content = extract_inner_text(full_chunk)
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling-core"
3
- version = "2.23.1"
3
+ version = "2.23.2"
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  authors = [
File without changes
File without changes