docling 2.2.0__py3-none-any.whl → 2.2.1__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
@@ -136,7 +136,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
136
136
  def get_direct_text(self, item):
137
137
  """Get the direct text of the <li> element (ignoring nested lists)."""
138
138
  text = item.find(string=True, recursive=False)
139
-
140
139
  if isinstance(text, str):
141
140
  return text.strip()
142
141
 
@@ -149,21 +148,20 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
149
148
  if isinstance(item, str):
150
149
  return [item]
151
150
 
152
- result.append(self.get_direct_text(item))
153
-
154
- try:
155
- # Iterate over the children (and their text and tails)
156
- for child in item:
157
- try:
158
- # Recursively get the child's text content
159
- result.extend(self.extract_text_recursively(child))
160
- except:
161
- pass
162
- except:
163
- _log.warn("item has no children")
164
- pass
165
-
166
- return " ".join(result)
151
+ if item.name not in ["ul", "ol"]:
152
+ try:
153
+ # Iterate over the children (and their text and tails)
154
+ for child in item:
155
+ try:
156
+ # Recursively get the child's text content
157
+ result.extend(self.extract_text_recursively(child))
158
+ except:
159
+ pass
160
+ except:
161
+ _log.warn("item has no children")
162
+ pass
163
+
164
+ return "".join(result) + " "
167
165
 
168
166
  def handle_header(self, element, idx, doc):
169
167
  """Handles header tags (h1, h2, etc.)."""
@@ -182,11 +180,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
182
180
  parent=self.parents[0], label=DocItemLabel.TITLE, text=text
183
181
  )
184
182
 
185
- elif hlevel == self.level:
186
- self.parents[hlevel] = doc.add_text(
187
- parent=self.parents[hlevel - 1], label=label, text=text
188
- )
189
-
190
183
  elif hlevel > self.level:
191
184
 
192
185
  # add invisible group
@@ -196,10 +189,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
196
189
  label=GroupLabel.SECTION,
197
190
  parent=self.parents[i - 1],
198
191
  )
199
-
200
- self.parents[hlevel] = doc.add_text(
201
- parent=self.parents[hlevel - 1], label=label, text=text
202
- )
203
192
  self.level = hlevel
204
193
 
205
194
  elif hlevel < self.level:
@@ -208,12 +197,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
208
197
  for key, val in self.parents.items():
209
198
  if key > hlevel:
210
199
  self.parents[key] = None
211
-
212
- self.parents[hlevel] = doc.add_text(
213
- parent=self.parents[hlevel - 1], label=label, text=text
214
- )
215
200
  self.level = hlevel
216
201
 
202
+ self.parents[hlevel] = doc.add_heading(
203
+ parent=self.parents[hlevel - 1],
204
+ text=text,
205
+ level=hlevel,
206
+ )
207
+
217
208
  def handle_paragraph(self, element, idx, doc):
218
209
  """Handles paragraph tags (p)."""
219
210
  if element.text is None:
@@ -255,7 +246,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
255
246
 
256
247
  if nested_lists:
257
248
  name = element.name
258
- text = self.get_direct_text(element)
249
+ # Text in list item can be hidden within hierarchy, hence
250
+ # we need to extract it recursively
251
+ text = self.extract_text_recursively(element)
252
+ # Flatten text, remove break lines:
253
+ text = text.replace("\n", "").replace("\r", "")
254
+ text = " ".join(text.split()).strip()
259
255
 
260
256
  marker = ""
261
257
  enumerated = False
@@ -263,14 +259,15 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
263
259
  marker = str(index_in_list)
264
260
  enumerated = True
265
261
 
266
- # create a list-item
267
- self.parents[self.level + 1] = doc.add_list_item(
268
- text=text,
269
- enumerated=enumerated,
270
- marker=marker,
271
- parent=self.parents[self.level],
272
- )
273
- self.level += 1
262
+ if len(text) > 0:
263
+ # create a list-item
264
+ self.parents[self.level + 1] = doc.add_list_item(
265
+ text=text,
266
+ enumerated=enumerated,
267
+ marker=marker,
268
+ parent=self.parents[self.level],
269
+ )
270
+ self.level += 1
274
271
 
275
272
  self.walk(element, doc)
276
273
 
@@ -1,4 +1,6 @@
1
1
  import logging
2
+ import re
3
+ import warnings
2
4
  from io import BytesIO
3
5
  from pathlib import Path
4
6
  from typing import Set, Union
@@ -25,6 +27,30 @@ _log = logging.getLogger(__name__)
25
27
 
26
28
 
27
29
  class MarkdownDocumentBackend(DeclarativeDocumentBackend):
30
+
31
+ def shorten_underscore_sequences(self, markdown_text, max_length=10):
32
+ # This regex will match any sequence of underscores
33
+ pattern = r"_+"
34
+
35
+ def replace_match(match):
36
+ underscore_sequence = match.group(
37
+ 0
38
+ ) # Get the full match (sequence of underscores)
39
+
40
+ # Shorten the sequence if it exceeds max_length
41
+ if len(underscore_sequence) > max_length:
42
+ return "_" * max_length
43
+ else:
44
+ return underscore_sequence # Leave it unchanged if it is shorter or equal to max_length
45
+
46
+ # Use re.sub to replace long underscore sequences
47
+ shortened_text = re.sub(pattern, replace_match, markdown_text)
48
+
49
+ if len(shortened_text) != len(markdown_text):
50
+ warnings.warn("Detected potentially incorrect Markdown, correcting...")
51
+
52
+ return shortened_text
53
+
28
54
  def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
29
55
  super().__init__(in_doc, path_or_stream)
30
56
 
@@ -42,11 +68,19 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
42
68
  try:
43
69
  if isinstance(self.path_or_stream, BytesIO):
44
70
  text_stream = self.path_or_stream.getvalue().decode("utf-8")
45
- self.markdown = text_stream
71
+ # remove invalid sequences
72
+ # very long sequences of underscores will lead to unnecessary long processing times.
73
+ # In any proper Markdown files, underscores have to be escaped,
74
+ # otherwise they represent emphasis (bold or italic)
75
+ self.markdown = self.shorten_underscore_sequences(text_stream)
46
76
  if isinstance(self.path_or_stream, Path):
47
77
  with open(self.path_or_stream, "r", encoding="utf-8") as f:
48
78
  md_content = f.read()
49
- self.markdown = md_content
79
+ # remove invalid sequences
80
+ # very long sequences of underscores will lead to unnecessary long processing times.
81
+ # In any proper Markdown files, underscores have to be escaped,
82
+ # otherwise they represent emphasis (bold or italic)
83
+ self.markdown = self.shorten_underscore_sequences(md_content)
50
84
  self.valid = True
51
85
 
52
86
  _log.debug(self.markdown)
@@ -135,11 +169,29 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
135
169
  doc_label = DocItemLabel.TITLE
136
170
  else:
137
171
  doc_label = DocItemLabel.SECTION_HEADER
138
- snippet_text = element.children[0].children.strip()
139
172
 
140
- parent_element = doc.add_text(
141
- label=doc_label, parent=parent_element, text=snippet_text
142
- )
173
+ # Header could have arbitrary inclusion of bold, italic or emphasis,
174
+ # hence we need to traverse the tree to get full text of a header
175
+ strings = []
176
+
177
+ # Define a recursive function to traverse the tree
178
+ def traverse(node):
179
+ # Check if the node has a "children" attribute
180
+ if hasattr(node, "children"):
181
+ # If "children" is a list, continue traversal
182
+ if isinstance(node.children, list):
183
+ for child in node.children:
184
+ traverse(child)
185
+ # If "children" is text, add it to header text
186
+ elif isinstance(node.children, str):
187
+ strings.append(node.children)
188
+
189
+ traverse(element)
190
+ snippet_text = "".join(strings)
191
+ if len(snippet_text) > 0:
192
+ parent_element = doc.add_text(
193
+ label=doc_label, parent=parent_element, text=snippet_text
194
+ )
143
195
 
144
196
  elif isinstance(element, marko.block.List):
145
197
  self.close_table(doc)
@@ -286,6 +338,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
286
338
  parsed_ast = marko_parser.parse(self.markdown)
287
339
  # Start iterating from the root of the AST
288
340
  self.iterate_elements(parsed_ast, 0, doc, None)
341
+ self.process_inline_text(None, doc) # handle last hanging inline text
289
342
  else:
290
343
  raise RuntimeError(
291
344
  f"Cannot convert md with {self.document_hash} because the backend failed to init."
@@ -294,13 +294,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
294
294
  level = self.get_level()
295
295
  if isinstance(curr_level, int):
296
296
 
297
- if curr_level == level:
298
-
299
- self.parents[level] = doc.add_heading(
300
- parent=self.parents[level - 1], text=text
301
- )
302
-
303
- elif curr_level > level:
297
+ if curr_level > level:
304
298
 
305
299
  # add invisible group
306
300
  for i in range(level, curr_level):
@@ -310,10 +304,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
310
304
  name=f"header-{i}",
311
305
  )
312
306
 
313
- self.parents[curr_level] = doc.add_heading(
314
- parent=self.parents[curr_level - 1], text=text
315
- )
316
-
317
307
  elif curr_level < level:
318
308
 
319
309
  # remove the tail
@@ -321,13 +311,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
321
311
  if key >= curr_level:
322
312
  self.parents[key] = None
323
313
 
324
- self.parents[curr_level] = doc.add_heading(
325
- parent=self.parents[curr_level - 1], text=text
326
- )
314
+ self.parents[curr_level] = doc.add_heading(
315
+ parent=self.parents[curr_level - 1],
316
+ text=text,
317
+ level=curr_level,
318
+ )
327
319
 
328
320
  else:
329
321
  self.parents[self.level] = doc.add_heading(
330
- parent=self.parents[self.level - 1], text=text
322
+ parent=self.parents[self.level - 1],
323
+ text=text,
324
+ level=1,
331
325
  )
332
326
  return
333
327
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.2.0
3
+ Version: 2.2.1
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -23,7 +23,7 @@ Provides-Extra: tesserocr
23
23
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
24
24
  Requires-Dist: certifi (>=2024.7.4)
25
25
  Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
26
- Requires-Dist: docling-core (>=2.1.0,<3.0.0)
26
+ Requires-Dist: docling-core (>=2.2.1,<3.0.0)
27
27
  Requires-Dist: docling-ibm-models (>=2.0.1,<3.0.0)
28
28
  Requires-Dist: docling-parse (>=2.0.0,<3.0.0)
29
29
  Requires-Dist: easyocr (>=1.7,<2.0)
@@ -4,10 +4,10 @@ docling/backend/abstract_backend.py,sha256=-or6kWVV7egQeyIuN-vI0Tr7Q1htalBZSlhgq
4
4
  docling/backend/asciidoc_backend.py,sha256=WW0eIanPIObcg5ci9YcnqFxwipmqRFsRY8zjZDdKvJA,14116
5
5
  docling/backend/docling_parse_backend.py,sha256=TaIMli9vePd3fz9L6S4t75JPYZDpgYBLRGfWjbc9Hbk,7632
6
6
  docling/backend/docling_parse_v2_backend.py,sha256=QlVU8NgqKvVCa99E8oDa2Xvy__kq30C-myGY3o9Qoq4,8588
7
- docling/backend/html_backend.py,sha256=wfh5PWEwoqsCXxFCQbFBdJvEtlqZhXgqfPfTYETWHfE,14974
8
- docling/backend/md_backend.py,sha256=osYiNLnep9UgLq8mUH9bmwG3kP9RXxt69I8LlyeJN6g,11505
7
+ docling/backend/html_backend.py,sha256=TUY5EVv3bo28A_w5CvBgNW4ZqL1d-VxOQPh1_taPHgU,15070
8
+ docling/backend/md_backend.py,sha256=tmuSCghjor9PqKIiVieCuZ4_t5JEjZMy3cq7u3yTgyU,14032
9
9
  docling/backend/mspowerpoint_backend.py,sha256=J472AIH_IXvGg3D0FDmXhue1At_VSBD6n15c64Kxttw,15446
10
- docling/backend/msword_backend.py,sha256=6bY0ebOaeSbpskUJY5t5pOf4a2VclWzeHeSo-vzsaO0,17470
10
+ docling/backend/msword_backend.py,sha256=FAUdP74QxGKo2xMZQ4WQGYwtpIBCTJ_FG17PBpRwhxI,17230
11
11
  docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
12
12
  docling/backend/pypdfium2_backend.py,sha256=MJX6fQqwK3r967fyAAs-RA_YIkeQvhgsLkQAgaBTgaE,8995
13
13
  docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -37,8 +37,8 @@ docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
37
37
  docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
38
38
  docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
39
39
  docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
40
- docling-2.2.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
41
- docling-2.2.0.dist-info/METADATA,sha256=TkaywA2l2ImdMc9WpUYWUQy3n50zG9Y9eC7ziElBlU0,6205
42
- docling-2.2.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
43
- docling-2.2.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
44
- docling-2.2.0.dist-info/RECORD,,
40
+ docling-2.2.1.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
41
+ docling-2.2.1.dist-info/METADATA,sha256=BOYg-5kaA2Fjxc2bwaJOuAd9LmrQerOzQLHCyaiQ1aE,6205
42
+ docling-2.2.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
43
+ docling-2.2.1.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
44
+ docling-2.2.1.dist-info/RECORD,,