docling-core 2.38.0__tar.gz → 2.38.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (107)
  1. {docling_core-2.38.0 → docling_core-2.38.2}/PKG-INFO +1 -1
  2. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/transforms/serializer/common.py +1 -0
  3. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/transforms/serializer/markdown.py +43 -18
  4. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/types/doc/document.py +3 -0
  5. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/types/doc/page.py +4 -0
  6. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core.egg-info/PKG-INFO +1 -1
  7. {docling_core-2.38.0 → docling_core-2.38.2}/pyproject.toml +1 -1
  8. {docling_core-2.38.0 → docling_core-2.38.2}/test/test_serialization.py +14 -0
  9. {docling_core-2.38.0 → docling_core-2.38.2}/LICENSE +0 -0
  10. {docling_core-2.38.0 → docling_core-2.38.2}/README.md +0 -0
  11. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/__init__.py +0 -0
  12. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/cli/__init__.py +0 -0
  13. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/cli/view.py +0 -0
  14. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/experimental/__init__.py +0 -0
  15. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/py.typed +0 -0
  16. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/resources/schemas/doc/ANN.json +0 -0
  17. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/resources/schemas/doc/DOC.json +0 -0
  18. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  19. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/resources/schemas/doc/RAW.json +0 -0
  20. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  21. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  22. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  23. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  24. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/search/__init__.py +0 -0
  25. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  26. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/search/mapping.py +0 -0
  27. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/search/meta.py +0 -0
  28. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/search/package.py +0 -0
  29. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/transforms/__init__.py +0 -0
  30. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/transforms/chunker/__init__.py +0 -0
  31. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/transforms/chunker/base.py +0 -0
  32. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  33. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
  34. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
  35. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
  36. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/transforms/chunker/tokenizer/huggingface.py +0 -0
  37. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
  38. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/transforms/serializer/__init__.py +0 -0
  39. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/transforms/serializer/base.py +0 -0
  40. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/transforms/serializer/doctags.py +0 -0
  41. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/transforms/serializer/html.py +0 -0
  42. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/transforms/serializer/html_styles.py +0 -0
  43. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/transforms/visualizer/__init__.py +0 -0
  44. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/transforms/visualizer/base.py +0 -0
  45. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/transforms/visualizer/layout_visualizer.py +0 -0
  46. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/transforms/visualizer/reading_order_visualizer.py +0 -0
  47. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/transforms/visualizer/table_visualizer.py +0 -0
  48. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/types/__init__.py +0 -0
  49. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/types/base.py +0 -0
  50. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/types/doc/__init__.py +0 -0
  51. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/types/doc/base.py +0 -0
  52. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/types/doc/labels.py +0 -0
  53. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/types/doc/tokens.py +0 -0
  54. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/types/doc/utils.py +0 -0
  55. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/types/gen/__init__.py +0 -0
  56. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/types/gen/generic.py +0 -0
  57. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/types/io/__init__.py +0 -0
  58. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/types/legacy_doc/__init__.py +0 -0
  59. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/types/legacy_doc/base.py +0 -0
  60. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  61. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  62. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  63. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/types/legacy_doc/document.py +0 -0
  64. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/types/legacy_doc/tokens.py +0 -0
  65. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/types/nlp/__init__.py +0 -0
  66. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/types/nlp/qa.py +0 -0
  67. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/types/nlp/qa_labels.py +0 -0
  68. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/types/rec/__init__.py +0 -0
  69. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/types/rec/attribute.py +0 -0
  70. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/types/rec/base.py +0 -0
  71. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/types/rec/predicate.py +0 -0
  72. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/types/rec/record.py +0 -0
  73. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/types/rec/statement.py +0 -0
  74. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/types/rec/subject.py +0 -0
  75. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/utils/__init__.py +0 -0
  76. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/utils/alias.py +0 -0
  77. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/utils/file.py +0 -0
  78. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/utils/generate_docs.py +0 -0
  79. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/utils/generate_jsonschema.py +0 -0
  80. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/utils/legacy.py +0 -0
  81. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/utils/validate.py +0 -0
  82. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core/utils/validators.py +0 -0
  83. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core.egg-info/SOURCES.txt +0 -0
  84. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core.egg-info/dependency_links.txt +0 -0
  85. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core.egg-info/entry_points.txt +0 -0
  86. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core.egg-info/requires.txt +0 -0
  87. {docling_core-2.38.0 → docling_core-2.38.2}/docling_core.egg-info/top_level.txt +0 -0
  88. {docling_core-2.38.0 → docling_core-2.38.2}/setup.cfg +0 -0
  89. {docling_core-2.38.0 → docling_core-2.38.2}/test/test_base.py +0 -0
  90. {docling_core-2.38.0 → docling_core-2.38.2}/test/test_collection.py +0 -0
  91. {docling_core-2.38.0 → docling_core-2.38.2}/test/test_data_gen_flag.py +0 -0
  92. {docling_core-2.38.0 → docling_core-2.38.2}/test/test_doc_base.py +0 -0
  93. {docling_core-2.38.0 → docling_core-2.38.2}/test/test_doc_legacy_convert.py +0 -0
  94. {docling_core-2.38.0 → docling_core-2.38.2}/test/test_doc_schema.py +0 -0
  95. {docling_core-2.38.0 → docling_core-2.38.2}/test/test_doc_schema_extractor.py +0 -0
  96. {docling_core-2.38.0 → docling_core-2.38.2}/test/test_docling_doc.py +0 -0
  97. {docling_core-2.38.0 → docling_core-2.38.2}/test/test_doctags_load.py +0 -0
  98. {docling_core-2.38.0 → docling_core-2.38.2}/test/test_hierarchical_chunker.py +0 -0
  99. {docling_core-2.38.0 → docling_core-2.38.2}/test/test_hybrid_chunker.py +0 -0
  100. {docling_core-2.38.0 → docling_core-2.38.2}/test/test_json_schema_to_search_mapper.py +0 -0
  101. {docling_core-2.38.0 → docling_core-2.38.2}/test/test_nlp_qa.py +0 -0
  102. {docling_core-2.38.0 → docling_core-2.38.2}/test/test_otsl_table_export.py +0 -0
  103. {docling_core-2.38.0 → docling_core-2.38.2}/test/test_page.py +0 -0
  104. {docling_core-2.38.0 → docling_core-2.38.2}/test/test_rec_schema.py +0 -0
  105. {docling_core-2.38.0 → docling_core-2.38.2}/test/test_search_meta.py +0 -0
  106. {docling_core-2.38.0 → docling_core-2.38.2}/test/test_utils.py +0 -0
  107. {docling_core-2.38.0 → docling_core-2.38.2}/test/test_visualization.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling-core
3
- Version: 2.38.0
3
+ Version: 2.38.2
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
@@ -349,6 +349,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
349
349
  doc_serializer=self,
350
350
  doc=self.doc,
351
351
  is_inline_scope=is_inline_scope,
352
+ visited=my_visited,
352
353
  **my_kwargs,
353
354
  )
354
355
  if item.self_ref not in self.get_excluded_refs(**kwargs)
@@ -106,26 +106,49 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
106
106
  doc_serializer: BaseDocSerializer,
107
107
  doc: DoclingDocument,
108
108
  is_inline_scope: bool = False,
109
+ visited: Optional[set[str]] = None, # refs of visited items
109
110
  **kwargs: Any,
110
111
  ) -> SerializationResult:
111
112
  """Serializes the passed item."""
113
+ my_visited = visited if visited is not None else set()
112
114
  params = MarkdownParams(**kwargs)
113
115
  res_parts: list[SerializationResult] = []
116
+ text = item.text
114
117
  escape_html = True
115
118
  escape_underscores = True
116
- if isinstance(item, TitleItem):
117
- text_part = f"# {item.text}"
118
- elif isinstance(item, SectionHeaderItem):
119
- text_part = f"{(item.level + 1) * '#'} {item.text}"
119
+ processing_pending = True
120
+ if isinstance(item, (TitleItem, SectionHeaderItem)):
121
+ # case where processing/formatting should be applied first (in inner scope)
122
+ processing_pending = False
123
+ if (
124
+ text == ""
125
+ and len(item.children) == 1
126
+ and isinstance(
127
+ (child_group := item.children[0].resolve(doc)), InlineGroup
128
+ )
129
+ ):
130
+ # case of heading with inline
131
+ ser_res = doc_serializer.serialize(item=child_group)
132
+ text = ser_res.text
133
+ for span in ser_res.spans:
134
+ my_visited.add(span.item.self_ref)
135
+ else:
136
+ text = doc_serializer.post_process(
137
+ text=text,
138
+ escape_html=escape_html,
139
+ escape_underscores=escape_underscores,
140
+ formatting=item.formatting,
141
+ hyperlink=item.hyperlink,
142
+ )
143
+ num_hashes = 1 if isinstance(item, TitleItem) else item.level + 1
144
+ text_part = f"{num_hashes * '#'} {text}"
120
145
  elif isinstance(item, CodeItem):
121
- text_part = (
122
- f"`{item.text}`" if is_inline_scope else f"```\n{item.text}\n```"
123
- )
146
+ text_part = f"`{text}`" if is_inline_scope else f"```\n{text}\n```"
124
147
  escape_html = False
125
148
  escape_underscores = False
126
149
  elif isinstance(item, FormulaItem):
127
- if item.text:
128
- text_part = f"${item.text}$" if is_inline_scope else f"$${item.text}$$"
150
+ if text:
151
+ text_part = f"${text}$" if is_inline_scope else f"$${text}$$"
129
152
  elif item.orig:
130
153
  text_part = "<!-- formula-not-decoded -->"
131
154
  else:
@@ -133,9 +156,10 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
133
156
  escape_html = False
134
157
  escape_underscores = False
135
158
  elif params.wrap_width:
136
- text_part = textwrap.fill(item.text, width=params.wrap_width)
159
+ # although wrapping is not guaranteed if post-processing makes changes
160
+ text_part = textwrap.fill(text, width=params.wrap_width)
137
161
  else:
138
- text_part = item.text
162
+ text_part = text
139
163
 
140
164
  if text_part:
141
165
  text_res = create_ser_result(text=text_part, span_source=item)
@@ -147,13 +171,14 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
147
171
  res_parts.append(cap_res)
148
172
 
149
173
  text = (" " if is_inline_scope else "\n\n").join([r.text for r in res_parts])
150
- text = doc_serializer.post_process(
151
- text=text,
152
- escape_html=escape_html,
153
- escape_underscores=escape_underscores,
154
- formatting=item.formatting,
155
- hyperlink=item.hyperlink,
156
- )
174
+ if processing_pending:
175
+ text = doc_serializer.post_process(
176
+ text=text,
177
+ escape_html=escape_html,
178
+ escape_underscores=escape_underscores,
179
+ formatting=item.formatting,
180
+ hyperlink=item.hyperlink,
181
+ )
157
182
  return create_ser_result(text=text, span_source=res_parts)
158
183
 
159
184
 
@@ -540,6 +540,9 @@ class DocumentOrigin(BaseModel):
540
540
  "text/asciidoc",
541
541
  "text/markdown",
542
542
  "text/csv",
543
+ "audio/x-wav",
544
+ "audio/wav",
545
+ "audio/mp3",
543
546
  ]
544
547
 
545
548
  @field_validator("binary_hash", mode="before")
@@ -649,6 +649,7 @@ class SegmentedPdfPage(SegmentedPage):
649
649
  add_location: bool = True,
650
650
  add_fontkey: bool = False,
651
651
  add_fontname: bool = True,
652
+ add_text_direction: bool = True,
652
653
  ) -> List[str]:
653
654
  """Export text cells as formatted text lines.
654
655
 
@@ -676,6 +677,9 @@ class SegmentedPdfPage(SegmentedPage):
676
677
  if add_fontname and isinstance(cell, PdfTextCell):
677
678
  line += f"{cell.font_name:>10} "
678
679
 
680
+ if add_text_direction and isinstance(cell, PdfTextCell):
681
+ line += f"{cell.text_direction} "
682
+
679
683
  line += f"{cell.text}"
680
684
  lines.append(line)
681
685
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling-core
3
- Version: 2.38.0
3
+ Version: 2.38.2
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "docling-core"
3
- version = "2.38.0" # DO NOT EDIT, updated automatically
3
+ version = "2.38.2" # DO NOT EDIT, updated automatically
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  license-files = ["LICENSE"]
@@ -196,6 +196,20 @@ def test_md_charts():
196
196
  verify(exp_file=src.parent / f"{src.stem}.gt.md", actual=actual)
197
197
 
198
198
 
199
+ def test_md_inline_and_formatting():
200
+ src = Path("./test/data/doc/inline_and_formatting.yaml")
201
+ doc = DoclingDocument.load_from_yaml(src)
202
+
203
+ ser = MarkdownDocSerializer(
204
+ doc=doc,
205
+ params=MarkdownParams(
206
+ image_mode=ImageRefMode.PLACEHOLDER,
207
+ ),
208
+ )
209
+ actual = ser.serialize().text
210
+ verify(exp_file=src.parent / f"{src.stem}.md", actual=actual)
211
+
212
+
199
213
  def test_html_cross_page_list_page_break():
200
214
  src = Path("./test/data/doc/activities.json")
201
215
  doc = DoclingDocument.load_from_json(src)
File without changes
File without changes
File without changes