docling-core 2.11.0__tar.gz → 2.12.1__tar.gz

This diff compares the contents of two publicly available package versions, each released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (62)
  1. {docling_core-2.11.0 → docling_core-2.12.1}/PKG-INFO +1 -1
  2. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/cli/view.py +1 -1
  3. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/doc/document.py +9 -6
  4. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/doc/labels.py +1 -0
  5. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/legacy_doc/document.py +13 -8
  6. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/utils/validate.py +1 -1
  7. {docling_core-2.11.0 → docling_core-2.12.1}/pyproject.toml +1 -1
  8. {docling_core-2.11.0 → docling_core-2.12.1}/LICENSE +0 -0
  9. {docling_core-2.11.0 → docling_core-2.12.1}/README.md +0 -0
  10. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/__init__.py +0 -0
  11. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/cli/__init__.py +0 -0
  12. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/py.typed +0 -0
  13. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/resources/schemas/doc/ANN.json +0 -0
  14. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/resources/schemas/doc/DOC.json +0 -0
  15. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  16. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/resources/schemas/doc/RAW.json +0 -0
  17. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  18. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  19. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  20. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  21. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/search/__init__.py +0 -0
  22. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  23. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/search/mapping.py +0 -0
  24. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/search/meta.py +0 -0
  25. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/search/package.py +0 -0
  26. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/transforms/__init__.py +0 -0
  27. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/transforms/chunker/__init__.py +0 -0
  28. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/transforms/chunker/base.py +0 -0
  29. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  30. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
  31. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/__init__.py +0 -0
  32. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/base.py +0 -0
  33. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/doc/__init__.py +0 -0
  34. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/doc/base.py +0 -0
  35. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/doc/tokens.py +0 -0
  36. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/doc/utils.py +0 -0
  37. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/gen/__init__.py +0 -0
  38. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/gen/generic.py +0 -0
  39. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/io/__init__.py +0 -0
  40. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/legacy_doc/__init__.py +0 -0
  41. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/legacy_doc/base.py +0 -0
  42. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  43. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  44. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  45. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/legacy_doc/tokens.py +0 -0
  46. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/nlp/__init__.py +0 -0
  47. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/nlp/qa.py +0 -0
  48. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/nlp/qa_labels.py +0 -0
  49. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/rec/__init__.py +0 -0
  50. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/rec/attribute.py +0 -0
  51. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/rec/base.py +0 -0
  52. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/rec/predicate.py +0 -0
  53. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/rec/record.py +0 -0
  54. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/rec/statement.py +0 -0
  55. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/rec/subject.py +0 -0
  56. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/utils/__init__.py +0 -0
  57. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/utils/alias.py +0 -0
  58. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/utils/file.py +0 -0
  59. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/utils/generate_docs.py +0 -0
  60. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/utils/generate_jsonschema.py +0 -0
  61. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/utils/legacy.py +0 -0
  62. {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.11.0
3
+ Version: 2.12.1
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -57,7 +57,7 @@ def view(
57
57
  doc = DoclingDocument.load_from_json(filename=path)
58
58
  target_path = Path(tempfile.mkdtemp()) / "out.html"
59
59
  html_output = doc.export_to_html(image_mode=ImageRefMode.EMBEDDED)
60
- with open(target_path, "w") as f:
60
+ with open(target_path, "w", encoding="utf-8") as f:
61
61
  f.write(html_output)
62
62
  webbrowser.open(url=f"file://{target_path.absolute().resolve()}")
63
63
 
@@ -1884,7 +1884,7 @@ class DoclingDocument(BaseModel):
1884
1884
  )
1885
1885
 
1886
1886
  out = new_doc.export_to_dict()
1887
- with open(filename, "w") as fw:
1887
+ with open(filename, "w", encoding="utf-8") as fw:
1888
1888
  json.dump(out, fw, indent=indent)
1889
1889
 
1890
1890
  @classmethod
@@ -1898,7 +1898,7 @@ class DoclingDocument(BaseModel):
1898
1898
  :rtype: DoclingDocument
1899
1899
 
1900
1900
  """
1901
- with open(filename, "r") as f:
1901
+ with open(filename, "r", encoding="utf-8") as f:
1902
1902
  return cls.model_validate_json(f.read())
1903
1903
 
1904
1904
  def save_as_yaml(
@@ -1919,7 +1919,7 @@ class DoclingDocument(BaseModel):
1919
1919
  )
1920
1920
 
1921
1921
  out = new_doc.export_to_dict()
1922
- with open(filename, "w") as fw:
1922
+ with open(filename, "w", encoding="utf-8") as fw:
1923
1923
  yaml.dump(out, fw, default_flow_style=default_flow_style)
1924
1924
 
1925
1925
  def export_to_dict(
@@ -1971,7 +1971,7 @@ class DoclingDocument(BaseModel):
1971
1971
  page_no=page_no,
1972
1972
  )
1973
1973
 
1974
- with open(filename, "w") as fw:
1974
+ with open(filename, "w", encoding="utf-8") as fw:
1975
1975
  fw.write(md_out)
1976
1976
 
1977
1977
  def export_to_markdown( # noqa: C901
@@ -2038,6 +2038,9 @@ class DoclingDocument(BaseModel):
2038
2038
  if ix < from_element or to_element <= ix:
2039
2039
  continue # skip as many items as you want
2040
2040
 
2041
+ if (isinstance(item, DocItem)) and (item.label not in labels):
2042
+ continue # skip any label that is not whitelisted
2043
+
2041
2044
  # Handle newlines between different types of content
2042
2045
  if (
2043
2046
  len(mdtexts) > 0
@@ -2224,7 +2227,7 @@ class DoclingDocument(BaseModel):
2224
2227
  html_head=html_head,
2225
2228
  )
2226
2229
 
2227
- with open(filename, "w") as fw:
2230
+ with open(filename, "w", encoding="utf-8") as fw:
2228
2231
  fw.write(html_out)
2229
2232
 
2230
2233
  def _get_output_paths(
@@ -2462,7 +2465,7 @@ class DoclingDocument(BaseModel):
2462
2465
  with_groups=with_groups,
2463
2466
  )
2464
2467
 
2465
- with open(filename, "w") as fw:
2468
+ with open(filename, "w", encoding="utf-8") as fw:
2466
2469
  fw.write(out)
2467
2470
 
2468
2471
  def export_to_document_tokens(
@@ -48,6 +48,7 @@ class GroupLabel(str, Enum):
48
48
  SLIDE = "slide"
49
49
  FORM_AREA = "form_area"
50
50
  KEY_VALUE_AREA = "key_value_area"
51
+ COMMENT_SECTION = "comment_section"
51
52
 
52
53
  def __str__(self):
53
54
  """Get string value."""
@@ -550,17 +550,18 @@ class ExportedCCSDocument(
550
550
 
551
551
  elif (
552
552
  isinstance(item, Table)
553
- and item.data
553
+ and (item.data or item.text)
554
554
  and item_type in main_text_labels
555
555
  ):
556
556
 
557
557
  md_table = ""
558
558
  table = []
559
- for row in item.data:
560
- tmp = []
561
- for col in row:
562
- tmp.append(col.text)
563
- table.append(tmp)
559
+ if item.data is not None:
560
+ for row in item.data:
561
+ tmp = []
562
+ for col in row:
563
+ tmp.append(col.text)
564
+ table.append(tmp)
564
565
 
565
566
  if len(table) > 1 and len(table[0]) > 0:
566
567
  try:
@@ -579,7 +580,9 @@ class ExportedCCSDocument(
579
580
  if item.text:
580
581
  markdown_text = item.text
581
582
  if not strict_text:
582
- markdown_text += "\n\n" + md_table
583
+ markdown_text += (
584
+ "\n\n" if len(markdown_text) > 0 else ""
585
+ ) + md_table
583
586
 
584
587
  elif isinstance(item, Figure) and item_type in main_text_labels:
585
588
 
@@ -587,7 +590,9 @@ class ExportedCCSDocument(
587
590
  if item.text:
588
591
  markdown_text = item.text
589
592
  if not strict_text:
590
- markdown_text += f"\n{image_placeholder}"
593
+ markdown_text += (
594
+ "\n" if len(markdown_text) > 0 else ""
595
+ ) + image_placeholder
591
596
 
592
597
  if markdown_text:
593
598
  md_texts.append(markdown_text)
@@ -38,7 +38,7 @@ def run():
38
38
  """Run the validation of a file containing a Document."""
39
39
  file_format, input_file = parse_arguments()
40
40
 
41
- with open(input_file, "r") as fd:
41
+ with open(input_file, "r", encoding="utf-8") as fd:
42
42
  file_ = json.load(fd)
43
43
 
44
44
  result = (False, "Empty result")
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling-core"
3
- version = "2.11.0"
3
+ version = "2.12.1"
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  authors = [
File without changes
File without changes