docling-core 2.11.0__py3-none-any.whl → 2.12.1__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

docling_core/cli/view.py CHANGED
@@ -57,7 +57,7 @@ def view(
57
57
  doc = DoclingDocument.load_from_json(filename=path)
58
58
  target_path = Path(tempfile.mkdtemp()) / "out.html"
59
59
  html_output = doc.export_to_html(image_mode=ImageRefMode.EMBEDDED)
60
- with open(target_path, "w") as f:
60
+ with open(target_path, "w", encoding="utf-8") as f:
61
61
  f.write(html_output)
62
62
  webbrowser.open(url=f"file://{target_path.absolute().resolve()}")
63
63
 
docling_core/types/doc/document.py CHANGED
@@ -1884,7 +1884,7 @@ class DoclingDocument(BaseModel):
1884
1884
  )
1885
1885
 
1886
1886
  out = new_doc.export_to_dict()
1887
- with open(filename, "w") as fw:
1887
+ with open(filename, "w", encoding="utf-8") as fw:
1888
1888
  json.dump(out, fw, indent=indent)
1889
1889
 
1890
1890
  @classmethod
@@ -1898,7 +1898,7 @@ class DoclingDocument(BaseModel):
1898
1898
  :rtype: DoclingDocument
1899
1899
 
1900
1900
  """
1901
- with open(filename, "r") as f:
1901
+ with open(filename, "r", encoding="utf-8") as f:
1902
1902
  return cls.model_validate_json(f.read())
1903
1903
 
1904
1904
  def save_as_yaml(
@@ -1919,7 +1919,7 @@ class DoclingDocument(BaseModel):
1919
1919
  )
1920
1920
 
1921
1921
  out = new_doc.export_to_dict()
1922
- with open(filename, "w") as fw:
1922
+ with open(filename, "w", encoding="utf-8") as fw:
1923
1923
  yaml.dump(out, fw, default_flow_style=default_flow_style)
1924
1924
 
1925
1925
  def export_to_dict(
@@ -1971,7 +1971,7 @@ class DoclingDocument(BaseModel):
1971
1971
  page_no=page_no,
1972
1972
  )
1973
1973
 
1974
- with open(filename, "w") as fw:
1974
+ with open(filename, "w", encoding="utf-8") as fw:
1975
1975
  fw.write(md_out)
1976
1976
 
1977
1977
  def export_to_markdown( # noqa: C901
@@ -2038,6 +2038,9 @@ class DoclingDocument(BaseModel):
2038
2038
  if ix < from_element or to_element <= ix:
2039
2039
  continue # skip as many items as you want
2040
2040
 
2041
+ if (isinstance(item, DocItem)) and (item.label not in labels):
2042
+ continue # skip any label that is not whitelisted
2043
+
2041
2044
  # Handle newlines between different types of content
2042
2045
  if (
2043
2046
  len(mdtexts) > 0
@@ -2224,7 +2227,7 @@ class DoclingDocument(BaseModel):
2224
2227
  html_head=html_head,
2225
2228
  )
2226
2229
 
2227
- with open(filename, "w") as fw:
2230
+ with open(filename, "w", encoding="utf-8") as fw:
2228
2231
  fw.write(html_out)
2229
2232
 
2230
2233
  def _get_output_paths(
@@ -2462,7 +2465,7 @@ class DoclingDocument(BaseModel):
2462
2465
  with_groups=with_groups,
2463
2466
  )
2464
2467
 
2465
- with open(filename, "w") as fw:
2468
+ with open(filename, "w", encoding="utf-8") as fw:
2466
2469
  fw.write(out)
2467
2470
 
2468
2471
  def export_to_document_tokens(
docling_core/types/doc/labels.py CHANGED
@@ -48,6 +48,7 @@ class GroupLabel(str, Enum):
48
48
  SLIDE = "slide"
49
49
  FORM_AREA = "form_area"
50
50
  KEY_VALUE_AREA = "key_value_area"
51
+ COMMENT_SECTION = "comment_section"
51
52
 
52
53
  def __str__(self):
53
54
  """Get string value."""
docling_core/types/legacy_doc/document.py CHANGED
@@ -550,17 +550,18 @@ class ExportedCCSDocument(
550
550
 
551
551
  elif (
552
552
  isinstance(item, Table)
553
- and item.data
553
+ and (item.data or item.text)
554
554
  and item_type in main_text_labels
555
555
  ):
556
556
 
557
557
  md_table = ""
558
558
  table = []
559
- for row in item.data:
560
- tmp = []
561
- for col in row:
562
- tmp.append(col.text)
563
- table.append(tmp)
559
+ if item.data is not None:
560
+ for row in item.data:
561
+ tmp = []
562
+ for col in row:
563
+ tmp.append(col.text)
564
+ table.append(tmp)
564
565
 
565
566
  if len(table) > 1 and len(table[0]) > 0:
566
567
  try:
@@ -579,7 +580,9 @@ class ExportedCCSDocument(
579
580
  if item.text:
580
581
  markdown_text = item.text
581
582
  if not strict_text:
582
- markdown_text += "\n\n" + md_table
583
+ markdown_text += (
584
+ "\n\n" if len(markdown_text) > 0 else ""
585
+ ) + md_table
583
586
 
584
587
  elif isinstance(item, Figure) and item_type in main_text_labels:
585
588
 
@@ -587,7 +590,9 @@ class ExportedCCSDocument(
587
590
  if item.text:
588
591
  markdown_text = item.text
589
592
  if not strict_text:
590
- markdown_text += f"\n{image_placeholder}"
593
+ markdown_text += (
594
+ "\n" if len(markdown_text) > 0 else ""
595
+ ) + image_placeholder
591
596
 
592
597
  if markdown_text:
593
598
  md_texts.append(markdown_text)
docling_core/utils/validate.py CHANGED
@@ -38,7 +38,7 @@ def run():
38
38
  """Run the validation of a file containing a Document."""
39
39
  file_format, input_file = parse_arguments()
40
40
 
41
- with open(input_file, "r") as fd:
41
+ with open(input_file, "r", encoding="utf-8") as fd:
42
42
  file_ = json.load(fd)
43
43
 
44
44
  result = (False, "Empty result")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.11.0
3
+ Version: 2.12.1
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -1,6 +1,6 @@
1
1
  docling_core/__init__.py,sha256=D0afxif-BMUrgx2cYk1cwxiwATRYaGXsIMk_z4nw1Vs,90
2
2
  docling_core/cli/__init__.py,sha256=C63yWifzpA0IV7YWDatpAdrhoV8zjqxAKv0xMf09VdM,19
3
- docling_core/cli/view.py,sha256=bhxvPQWIJVo2g_pRL0GjQwjDw-jdiRXp1-BTbG849go,1746
3
+ docling_core/cli/view.py,sha256=gwxSBYhGqwznMR8pdXaEuAh2bjFD5X_g11xFYSgFgtM,1764
4
4
  docling_core/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
5
  docling_core/resources/schemas/doc/ANN.json,sha256=04U5j-PU9m5w7IagJ_rHcAx7qUtLkUuaWZO9GuYHnTA,4202
6
6
  docling_core/resources/schemas/doc/DOC.json,sha256=9tVKpCqDGGq3074Nn5qlUCdTN-5k1Q0ri_scJblwnLE,6686
@@ -24,8 +24,8 @@ docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HX
24
24
  docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
25
25
  docling_core/types/doc/__init__.py,sha256=bEL4zKVOG7Wxm6xQrgF58mu-Teds9aSavuEAKVNhrTU,639
26
26
  docling_core/types/doc/base.py,sha256=_ttU8QI8wXDTQRUnN5n7L6D9wYFVLSAibxlFoMbgAsk,4557
27
- docling_core/types/doc/document.py,sha256=9t6FPvrxT9gKtUaYMP_Kyhz_izo2p6TQX_LlG2Fj5hY,91593
28
- docling_core/types/doc/labels.py,sha256=4BG_wNG1qDc5E3qQHixPjM_IAxGjGo14hobNyfTycZw,1662
27
+ docling_core/types/doc/document.py,sha256=2W4wZunI0K_EOxNtY5jbKeyw7bYWKKNLiljxfN8anHc,91844
28
+ docling_core/types/doc/labels.py,sha256=Pc5avKtGM2fv-w7mXinoxs9BkhktmFaJ6ACsgFiAAm4,1702
29
29
  docling_core/types/doc/tokens.py,sha256=uU_MYW_p7ypf7eYICFBvxdnVaPZ7CQnvZmbJ6oPrtEA,6134
30
30
  docling_core/types/doc/utils.py,sha256=YDOh_ZD1Y7OmCEDdCLJ_MO5K3HA67nc_acfhOK6WztU,1439
31
31
  docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
@@ -36,7 +36,7 @@ docling_core/types/legacy_doc/base.py,sha256=aBKBunw6M6nvEq4lqP1cfFWK3GpGa6PXwNQ
36
36
  docling_core/types/legacy_doc/doc_ann.py,sha256=CIQHW8yzu70bsMR9gtu7dqe4oz603Tq2eDDt9sh-tYo,1203
37
37
  docling_core/types/legacy_doc/doc_ocr.py,sha256=FfFqHAyMSbFt5cKeE7QLcxS0qUweBilBJoN9CH2TsQs,1394
38
38
  docling_core/types/legacy_doc/doc_raw.py,sha256=LrvQ9DhNjBRy98p_F9PUyHZeTGAxMKWqJzY4WJ7v-xs,3895
39
- docling_core/types/legacy_doc/document.py,sha256=A_cTYOjx6pNIICpOUm09YsfwPrIGDEZTKdetb2fx4PM,24273
39
+ docling_core/types/legacy_doc/document.py,sha256=AW8AIBM19k-HtTmXPsFKagqd6gi9THJdB4RsPb1C5F0,24534
40
40
  docling_core/types/legacy_doc/tokens.py,sha256=uU_MYW_p7ypf7eYICFBvxdnVaPZ7CQnvZmbJ6oPrtEA,6134
41
41
  docling_core/types/nlp/__init__.py,sha256=hGcztAeVK7xkRBqRRvc4zbY4PGeJ0r0QrEsetnSx9nI,119
42
42
  docling_core/types/nlp/qa.py,sha256=TyZjubqkEoREv0YzmuLKlq4WW_TnJNj7BoBY1_r2a1E,2731
@@ -54,10 +54,10 @@ docling_core/utils/file.py,sha256=GzX0pclvewwPoqHJSaVUuULzSJwJgkCUwgKgJ7G5ohQ,56
54
54
  docling_core/utils/generate_docs.py,sha256=BdKAoduWXOc7YMvcmlhjoJOFlUxij1ybxglj6LZDtC8,2290
55
55
  docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2tyi_OhHepHYtZg,1654
56
56
  docling_core/utils/legacy.py,sha256=xfp7U0JqjI60K3loWiNTk8w08_KfCUzTb2MNULBOIz4,24396
57
- docling_core/utils/validate.py,sha256=3FmnxnKTDZC5J9OGxCL3U3DGRl0t0bBV1NcySXswdas,2031
57
+ docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
58
58
  docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
59
- docling_core-2.11.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
60
- docling_core-2.11.0.dist-info/METADATA,sha256=4Xb7VqXg4dAxRWiT-KThSn4i_TiIsoIXdhyN8eZOWSk,5744
61
- docling_core-2.11.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
62
- docling_core-2.11.0.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
63
- docling_core-2.11.0.dist-info/RECORD,,
59
+ docling_core-2.12.1.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
60
+ docling_core-2.12.1.dist-info/METADATA,sha256=7GQuAsiJL6EGBIEjeiX-XQs7gELqhCGWsNeSrsGm5Kk,5744
61
+ docling_core-2.12.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
62
+ docling_core-2.12.1.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
63
+ docling_core-2.12.1.dist-info/RECORD,,