docling-core 2.38.0__py3-none-any.whl → 2.38.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -349,6 +349,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
349
349
  doc_serializer=self,
350
350
  doc=self.doc,
351
351
  is_inline_scope=is_inline_scope,
352
+ visited=my_visited,
352
353
  **my_kwargs,
353
354
  )
354
355
  if item.self_ref not in self.get_excluded_refs(**kwargs)
@@ -106,26 +106,49 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
106
106
  doc_serializer: BaseDocSerializer,
107
107
  doc: DoclingDocument,
108
108
  is_inline_scope: bool = False,
109
+ visited: Optional[set[str]] = None, # refs of visited items
109
110
  **kwargs: Any,
110
111
  ) -> SerializationResult:
111
112
  """Serializes the passed item."""
113
+ my_visited = visited if visited is not None else set()
112
114
  params = MarkdownParams(**kwargs)
113
115
  res_parts: list[SerializationResult] = []
116
+ text = item.text
114
117
  escape_html = True
115
118
  escape_underscores = True
116
- if isinstance(item, TitleItem):
117
- text_part = f"# {item.text}"
118
- elif isinstance(item, SectionHeaderItem):
119
- text_part = f"{(item.level + 1) * '#'} {item.text}"
119
+ processing_pending = True
120
+ if isinstance(item, (TitleItem, SectionHeaderItem)):
121
+ # case where processing/formatting should be applied first (in inner scope)
122
+ processing_pending = False
123
+ if (
124
+ text == ""
125
+ and len(item.children) == 1
126
+ and isinstance(
127
+ (child_group := item.children[0].resolve(doc)), InlineGroup
128
+ )
129
+ ):
130
+ # case of heading with inline
131
+ ser_res = doc_serializer.serialize(item=child_group)
132
+ text = ser_res.text
133
+ for span in ser_res.spans:
134
+ my_visited.add(span.item.self_ref)
135
+ else:
136
+ text = doc_serializer.post_process(
137
+ text=text,
138
+ escape_html=escape_html,
139
+ escape_underscores=escape_underscores,
140
+ formatting=item.formatting,
141
+ hyperlink=item.hyperlink,
142
+ )
143
+ num_hashes = 1 if isinstance(item, TitleItem) else item.level + 1
144
+ text_part = f"{num_hashes * '#'} {text}"
120
145
  elif isinstance(item, CodeItem):
121
- text_part = (
122
- f"`{item.text}`" if is_inline_scope else f"```\n{item.text}\n```"
123
- )
146
+ text_part = f"`{text}`" if is_inline_scope else f"```\n{text}\n```"
124
147
  escape_html = False
125
148
  escape_underscores = False
126
149
  elif isinstance(item, FormulaItem):
127
- if item.text:
128
- text_part = f"${item.text}$" if is_inline_scope else f"$${item.text}$$"
150
+ if text:
151
+ text_part = f"${text}$" if is_inline_scope else f"$${text}$$"
129
152
  elif item.orig:
130
153
  text_part = "<!-- formula-not-decoded -->"
131
154
  else:
@@ -133,9 +156,10 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
133
156
  escape_html = False
134
157
  escape_underscores = False
135
158
  elif params.wrap_width:
136
- text_part = textwrap.fill(item.text, width=params.wrap_width)
159
+ # although wrapping is not guaranteed if post-processing makes changes
160
+ text_part = textwrap.fill(text, width=params.wrap_width)
137
161
  else:
138
- text_part = item.text
162
+ text_part = text
139
163
 
140
164
  if text_part:
141
165
  text_res = create_ser_result(text=text_part, span_source=item)
@@ -147,13 +171,14 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
147
171
  res_parts.append(cap_res)
148
172
 
149
173
  text = (" " if is_inline_scope else "\n\n").join([r.text for r in res_parts])
150
- text = doc_serializer.post_process(
151
- text=text,
152
- escape_html=escape_html,
153
- escape_underscores=escape_underscores,
154
- formatting=item.formatting,
155
- hyperlink=item.hyperlink,
156
- )
174
+ if processing_pending:
175
+ text = doc_serializer.post_process(
176
+ text=text,
177
+ escape_html=escape_html,
178
+ escape_underscores=escape_underscores,
179
+ formatting=item.formatting,
180
+ hyperlink=item.hyperlink,
181
+ )
157
182
  return create_ser_result(text=text, span_source=res_parts)
158
183
 
159
184
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling-core
3
- Version: 2.38.0
3
+ Version: 2.38.1
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
@@ -27,11 +27,11 @@ docling_core/transforms/chunker/tokenizer/huggingface.py,sha256=aZ_RNQIzcNkAHGHZ
27
27
  docling_core/transforms/chunker/tokenizer/openai.py,sha256=zt2kwcC-r8MafeEG0CESab8E4RIC9aaFXxxnxOGyTMA,918
28
28
  docling_core/transforms/serializer/__init__.py,sha256=CECQlMoCDUxkg4RAUdC3itA3I3qFhKhe2HcYghN6_xw,105
29
29
  docling_core/transforms/serializer/base.py,sha256=ZFIiZeplL-QbBs9EDUb1awqxapQ23PsApVetJtAs7Vs,6891
30
- docling_core/transforms/serializer/common.py,sha256=WP-qO-woidrKyvZ56m0vlKMysoLrMzzZtHSCIwsl3ek,19119
30
+ docling_core/transforms/serializer/common.py,sha256=RO2KWl3sZq_PIvzWzuGJTWntKjLOAy3n17cgZi84AAs,19163
31
31
  docling_core/transforms/serializer/doctags.py,sha256=PuAExlP-2HxcDSP_R_phtYQU0yKBW94RrPgb85IUxck,19905
32
32
  docling_core/transforms/serializer/html.py,sha256=SZgQa0QnknEoRwMFLdgmVsLQqLF2rQl3D7XyEZzUHCE,37151
33
33
  docling_core/transforms/serializer/html_styles.py,sha256=-jBwS4EU7yfKoz0GSoxhwx90OmIKieO6TwPw57IuxcA,4692
34
- docling_core/transforms/serializer/markdown.py,sha256=wfMNrjA4wMehWLCejAhEN1eQPRixUO1SyL6ojkKkzZY,20614
34
+ docling_core/transforms/serializer/markdown.py,sha256=2wV0ydqWKSm-HAW94gF0IRBpjWgoqUjL4JHRYS8DDgY,21803
35
35
  docling_core/transforms/visualizer/__init__.py,sha256=gUfF25yiJ_KO46ZIUNqZQOZGy2PLx6gnnr6AZYxKHXI,35
36
36
  docling_core/transforms/visualizer/base.py,sha256=aEF7b3rHq6DVdX8zDYEPoq55BHDYe4Hh_97lBdcW4lY,555
37
37
  docling_core/transforms/visualizer/layout_visualizer.py,sha256=zHzQTWcy-z1J2BcsjvakLkrp8pgStgnxhDl8YqIAotY,8035
@@ -74,9 +74,9 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
74
74
  docling_core/utils/legacy.py,sha256=DrI3QGoL755ZCIoKHF74-pTWm8R0zfFo2C2vB5dT2aY,24463
75
75
  docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
76
76
  docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
77
- docling_core-2.38.0.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
78
- docling_core-2.38.0.dist-info/METADATA,sha256=llcycAVzvc09CX0igt4VIGrGWT8UuMjnWN5rrQoEJ6s,6453
79
- docling_core-2.38.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
80
- docling_core-2.38.0.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
81
- docling_core-2.38.0.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
82
- docling_core-2.38.0.dist-info/RECORD,,
77
+ docling_core-2.38.1.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
78
+ docling_core-2.38.1.dist-info/METADATA,sha256=MnIJIe_3840fVysMwWJFKu792iTKG2wzp_3dOjxi6Yg,6453
79
+ docling_core-2.38.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
80
+ docling_core-2.38.1.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
81
+ docling_core-2.38.1.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
82
+ docling_core-2.38.1.dist-info/RECORD,,