docling-core 2.44.2__tar.gz → 2.46.0__tar.gz

This diff compares the contents of two publicly available versions of the package, each released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (111)
  1. {docling_core-2.44.2 → docling_core-2.46.0}/PKG-INFO +1 -1
  2. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/serializer/common.py +1 -0
  3. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/serializer/doctags.py +2 -0
  4. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/serializer/html.py +18 -12
  5. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/serializer/markdown.py +8 -1
  6. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/doc/__init__.py +2 -0
  7. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/doc/document.py +285 -257
  8. docling_core-2.46.0/docling_core/types/doc/utils.py +282 -0
  9. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/utils/legacy.py +1 -1
  10. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core.egg-info/PKG-INFO +1 -1
  11. {docling_core-2.44.2 → docling_core-2.46.0}/pyproject.toml +1 -1
  12. {docling_core-2.44.2 → docling_core-2.46.0}/test/test_docling_doc.py +332 -1
  13. {docling_core-2.44.2 → docling_core-2.46.0}/test/test_serialization.py +29 -1
  14. docling_core-2.44.2/docling_core/types/doc/utils.py +0 -86
  15. {docling_core-2.44.2 → docling_core-2.46.0}/LICENSE +0 -0
  16. {docling_core-2.44.2 → docling_core-2.46.0}/README.md +0 -0
  17. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/__init__.py +0 -0
  18. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/cli/__init__.py +0 -0
  19. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/cli/view.py +0 -0
  20. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/experimental/__init__.py +0 -0
  21. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/py.typed +0 -0
  22. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
  23. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
  24. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  25. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
  26. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  27. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  28. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  29. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  30. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/search/__init__.py +0 -0
  31. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  32. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/search/mapping.py +0 -0
  33. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/search/meta.py +0 -0
  34. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/search/package.py +0 -0
  35. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/__init__.py +0 -0
  36. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/chunker/__init__.py +0 -0
  37. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/chunker/base.py +0 -0
  38. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  39. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
  40. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/chunker/page_chunker.py +0 -0
  41. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
  42. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
  43. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/chunker/tokenizer/huggingface.py +0 -0
  44. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
  45. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/serializer/__init__.py +0 -0
  46. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/serializer/base.py +0 -0
  47. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/serializer/html_styles.py +0 -0
  48. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/visualizer/__init__.py +0 -0
  49. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/visualizer/base.py +0 -0
  50. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/visualizer/key_value_visualizer.py +0 -0
  51. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/visualizer/layout_visualizer.py +0 -0
  52. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/visualizer/reading_order_visualizer.py +0 -0
  53. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/visualizer/table_visualizer.py +0 -0
  54. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/__init__.py +0 -0
  55. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/base.py +0 -0
  56. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/doc/base.py +0 -0
  57. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/doc/labels.py +0 -0
  58. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/doc/page.py +0 -0
  59. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/doc/tokens.py +0 -0
  60. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/gen/__init__.py +0 -0
  61. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/gen/generic.py +0 -0
  62. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/io/__init__.py +0 -0
  63. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/legacy_doc/__init__.py +0 -0
  64. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/legacy_doc/base.py +0 -0
  65. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  66. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  67. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  68. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/legacy_doc/document.py +0 -0
  69. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/legacy_doc/tokens.py +0 -0
  70. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/nlp/__init__.py +0 -0
  71. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/nlp/qa.py +0 -0
  72. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/nlp/qa_labels.py +0 -0
  73. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/rec/__init__.py +0 -0
  74. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/rec/attribute.py +0 -0
  75. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/rec/base.py +0 -0
  76. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/rec/predicate.py +0 -0
  77. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/rec/record.py +0 -0
  78. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/rec/statement.py +0 -0
  79. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/rec/subject.py +0 -0
  80. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/utils/__init__.py +0 -0
  81. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/utils/alias.py +0 -0
  82. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/utils/file.py +0 -0
  83. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/utils/generate_docs.py +0 -0
  84. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/utils/generate_jsonschema.py +0 -0
  85. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/utils/validate.py +0 -0
  86. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/utils/validators.py +0 -0
  87. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core.egg-info/SOURCES.txt +0 -0
  88. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core.egg-info/dependency_links.txt +0 -0
  89. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core.egg-info/entry_points.txt +0 -0
  90. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core.egg-info/requires.txt +0 -0
  91. {docling_core-2.44.2 → docling_core-2.46.0}/docling_core.egg-info/top_level.txt +0 -0
  92. {docling_core-2.44.2 → docling_core-2.46.0}/setup.cfg +0 -0
  93. {docling_core-2.44.2 → docling_core-2.46.0}/test/test_base.py +0 -0
  94. {docling_core-2.44.2 → docling_core-2.46.0}/test/test_collection.py +0 -0
  95. {docling_core-2.44.2 → docling_core-2.46.0}/test/test_data_gen_flag.py +0 -0
  96. {docling_core-2.44.2 → docling_core-2.46.0}/test/test_doc_base.py +0 -0
  97. {docling_core-2.44.2 → docling_core-2.46.0}/test/test_doc_legacy_convert.py +0 -0
  98. {docling_core-2.44.2 → docling_core-2.46.0}/test/test_doc_schema.py +0 -0
  99. {docling_core-2.44.2 → docling_core-2.46.0}/test/test_doc_schema_extractor.py +0 -0
  100. {docling_core-2.44.2 → docling_core-2.46.0}/test/test_doctags_load.py +0 -0
  101. {docling_core-2.44.2 → docling_core-2.46.0}/test/test_hierarchical_chunker.py +0 -0
  102. {docling_core-2.44.2 → docling_core-2.46.0}/test/test_hybrid_chunker.py +0 -0
  103. {docling_core-2.44.2 → docling_core-2.46.0}/test/test_json_schema_to_search_mapper.py +0 -0
  104. {docling_core-2.44.2 → docling_core-2.46.0}/test/test_nlp_qa.py +0 -0
  105. {docling_core-2.44.2 → docling_core-2.46.0}/test/test_otsl_table_export.py +0 -0
  106. {docling_core-2.44.2 → docling_core-2.46.0}/test/test_page.py +0 -0
  107. {docling_core-2.44.2 → docling_core-2.46.0}/test/test_page_chunker.py +0 -0
  108. {docling_core-2.44.2 → docling_core-2.46.0}/test/test_rec_schema.py +0 -0
  109. {docling_core-2.44.2 → docling_core-2.46.0}/test/test_search_meta.py +0 -0
  110. {docling_core-2.44.2 → docling_core-2.46.0}/test/test_utils.py +0 -0
  111. {docling_core-2.44.2 → docling_core-2.46.0}/test/test_visualization.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling-core
3
- Version: 2.44.2
3
+ Version: 2.46.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
@@ -359,6 +359,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
359
359
  item=item,
360
360
  doc_serializer=self,
361
361
  doc=self.doc,
362
+ visited=my_visited,
362
363
  **my_kwargs,
363
364
  )
364
365
  elif isinstance(item, PictureItem):
@@ -157,6 +157,7 @@ class DocTagsTableSerializer(BaseTableSerializer):
157
157
  item: TableItem,
158
158
  doc_serializer: BaseDocSerializer,
159
159
  doc: DoclingDocument,
160
+ visited: Optional[set[str]] = None,
160
161
  **kwargs: Any,
161
162
  ) -> SerializationResult:
162
163
  """Serializes the passed item."""
@@ -179,6 +180,7 @@ class DocTagsTableSerializer(BaseTableSerializer):
179
180
  add_cell_text=params.add_table_cell_text,
180
181
  xsize=params.xsize,
181
182
  ysize=params.ysize,
183
+ visited=visited,
182
184
  )
183
185
  res_parts.append(create_ser_result(text=otsl_text, span_source=item))
184
186
 
@@ -65,8 +65,8 @@ from docling_core.types.doc.document import (
65
65
  PictureItem,
66
66
  PictureMoleculeData,
67
67
  PictureTabularChartData,
68
+ RichTableCell,
68
69
  SectionHeaderItem,
69
- TableCell,
70
70
  TableItem,
71
71
  TextItem,
72
72
  TitleItem,
@@ -346,9 +346,6 @@ class HTMLTableSerializer(BaseTableSerializer):
346
346
  **kwargs: Any,
347
347
  ) -> SerializationResult:
348
348
  """Serializes the passed table item to HTML."""
349
- nrows = item.data.num_rows
350
- ncols = item.data.num_cols
351
-
352
349
  res_parts: list[SerializationResult] = []
353
350
  cap_res = doc_serializer.serialize_captions(item=item, tag="caption", **kwargs)
354
351
  if cap_res.text:
@@ -356,11 +353,11 @@ class HTMLTableSerializer(BaseTableSerializer):
356
353
 
357
354
  if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
358
355
  body = ""
356
+ span_source: Union[DocItem, list[SerializationResult]] = []
359
357
 
360
- for i in range(nrows):
358
+ for i, row in enumerate(item.data.grid):
361
359
  body += "<tr>"
362
- for j in range(ncols):
363
- cell: TableCell = item.data.grid[i][j]
360
+ for j, cell in enumerate(row):
364
361
 
365
362
  rowspan, rowstart = (
366
363
  cell.row_span,
@@ -376,7 +373,16 @@ class HTMLTableSerializer(BaseTableSerializer):
376
373
  if colstart != j:
377
374
  continue
378
375
 
379
- content = html.escape(cell.text.strip())
376
+ if isinstance(cell, RichTableCell):
377
+ ser_res = doc_serializer.serialize(
378
+ item=cell.ref.resolve(doc=doc), **kwargs
379
+ )
380
+ content = ser_res.text
381
+ span_source = [ser_res]
382
+ else:
383
+ content = html.escape(cell.text.strip())
384
+ span_source = item
385
+
380
386
  celltag = "td"
381
387
  if cell.column_header or cell.row_header or cell.row_section:
382
388
  celltag = "th"
@@ -389,14 +395,14 @@ class HTMLTableSerializer(BaseTableSerializer):
389
395
 
390
396
  text_dir = get_text_direction(content)
391
397
  if text_dir == "rtl":
392
- opening_tag += f' dir="{dir}"'
398
+ opening_tag += f' dir="{text_dir}"'
393
399
 
394
400
  body += f"<{opening_tag}>{content}</{celltag}>"
395
401
  body += "</tr>"
396
402
 
397
403
  if body:
398
404
  body = f"<tbody>{body}</tbody>"
399
- res_parts.append(create_ser_result(text=body, span_source=item))
405
+ res_parts.append(create_ser_result(text=body, span_source=span_source))
400
406
 
401
407
  text_res = "".join([r.text for r in res_parts])
402
408
  text_res = f"<table>{text_res}</table>" if text_res else ""
@@ -1057,7 +1063,7 @@ class HTMLDocSerializer(DocSerializer):
1057
1063
  if self.params.html_head is not None:
1058
1064
  return self.params.html_head
1059
1065
 
1060
- head_parts = ["<head>", '<meta charset="UTF-8">']
1066
+ head_parts = ["<head>", '<meta charset="UTF-8"/>']
1061
1067
 
1062
1068
  # Add metadata if requested
1063
1069
  if params.add_document_metadata:
@@ -1067,7 +1073,7 @@ class HTMLDocSerializer(DocSerializer):
1067
1073
  head_parts.append("<title>Docling Document</title>")
1068
1074
 
1069
1075
  head_parts.append(
1070
- '<meta name="generator" content="Docling HTML Serializer">'
1076
+ '<meta name="generator" content="Docling HTML Serializer"/>'
1071
1077
  )
1072
1078
 
1073
1079
  # Add default styles or custom CSS
@@ -55,6 +55,7 @@ from docling_core.types.doc.document import (
55
55
  PictureItem,
56
56
  PictureMoleculeData,
57
57
  PictureTabularChartData,
58
+ RichTableCell,
58
59
  SectionHeaderItem,
59
60
  TableItem,
60
61
  TextItem,
@@ -320,7 +321,13 @@ class MarkdownTableSerializer(BaseTableSerializer):
320
321
  [
321
322
  # make sure that md tables are not broken
322
323
  # due to newline chars in the text
323
- col.text.replace("\n", " ")
324
+ (
325
+ doc_serializer.serialize(
326
+ item=col.ref.resolve(doc=doc), **kwargs
327
+ ).text
328
+ if isinstance(col, RichTableCell)
329
+ else col.text
330
+ ).replace("\n", " ")
324
331
  for col in row
325
332
  ]
326
333
  for row in item.data.grid
@@ -7,6 +7,7 @@
7
7
 
8
8
  from .base import BoundingBox, CoordOrigin, ImageRefMode, Size
9
9
  from .document import (
10
+ AnyTableCell,
10
11
  BaseAnnotation,
11
12
  ChartBar,
12
13
  ChartLine,
@@ -52,6 +53,7 @@ from .document import (
52
53
  PictureTabularChartData,
53
54
  ProvenanceItem,
54
55
  RefItem,
56
+ RichTableCell,
55
57
  Script,
56
58
  SectionHeaderItem,
57
59
  TableCell,