docling 2.47.0__py3-none-any.whl → 2.48.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,6 @@
1
1
  import logging
2
2
  import re
3
+ import traceback
3
4
  from contextlib import contextmanager
4
5
  from copy import deepcopy
5
6
  from io import BytesIO
@@ -45,20 +46,22 @@ _BLOCK_TAGS: Final = {
45
46
  "h4",
46
47
  "h5",
47
48
  "h6",
49
+ "ol",
48
50
  "p",
49
51
  "pre",
50
- "code",
51
- "ul",
52
- "ol",
53
52
  "summary",
54
53
  "table",
54
+ "ul",
55
55
  }
56
56
 
57
+ _CODE_TAG_SET: Final = {"code", "kbd", "samp"}
58
+
57
59
  _FORMAT_TAG_MAP: Final = {
58
60
  "b": {"bold": True},
59
61
  "strong": {"bold": True},
60
62
  "i": {"italic": True},
61
63
  "em": {"italic": True},
64
+ "var": {"italic": True},
62
65
  # "mark",
63
66
  # "small",
64
67
  "s": {"strikethrough": True},
@@ -67,6 +70,7 @@ _FORMAT_TAG_MAP: Final = {
67
70
  "ins": {"underline": True},
68
71
  "sub": {"script": Script.SUB},
69
72
  "sup": {"script": Script.SUPER},
73
+ **{k: {} for k in _CODE_TAG_SET},
70
74
  }
71
75
 
72
76
 
@@ -79,6 +83,7 @@ class AnnotatedText(BaseModel):
79
83
  text: str
80
84
  hyperlink: Union[AnyUrl, Path, None] = None
81
85
  formatting: Union[Formatting, None] = None
86
+ code: bool = False
82
87
 
83
88
 
84
89
  class AnnotatedTextList(list):
@@ -86,10 +91,12 @@ class AnnotatedTextList(list):
86
91
  current_h = None
87
92
  current_text = ""
88
93
  current_f = None
94
+ current_code = False
89
95
  for at in self:
90
96
  t = at.text
91
97
  h = at.hyperlink
92
98
  f = at.formatting
99
+ c = at.code
93
100
  current_text += t.strip() + " "
94
101
  if f is not None and current_f is None:
95
102
  current_f = f
@@ -103,8 +110,13 @@ class AnnotatedTextList(list):
103
110
  _log.warning(
104
111
  f"Clashing hyperlinks: '{h}' and '{current_h}'! Chose '{current_h}'"
105
112
  )
113
+ current_code = c if c else current_code
114
+
106
115
  return AnnotatedText(
107
- text=current_text.strip(), hyperlink=current_h, formatting=current_f
116
+ text=current_text.strip(),
117
+ hyperlink=current_h,
118
+ formatting=current_f,
119
+ code=current_code,
108
120
  )
109
121
 
110
122
  def simplify_text_elements(self) -> "AnnotatedTextList":
@@ -114,9 +126,14 @@ class AnnotatedTextList(list):
114
126
  text = self[0].text
115
127
  hyperlink = self[0].hyperlink
116
128
  formatting = self[0].formatting
129
+ code = self[0].code
117
130
  last_elm = text
118
131
  for i in range(1, len(self)):
119
- if hyperlink == self[i].hyperlink and formatting == self[i].formatting:
132
+ if (
133
+ hyperlink == self[i].hyperlink
134
+ and formatting == self[i].formatting
135
+ and code == self[i].code
136
+ ):
120
137
  sep = " "
121
138
  if not self[i].text.strip() or not last_elm.strip():
122
139
  sep = ""
@@ -124,15 +141,20 @@ class AnnotatedTextList(list):
124
141
  last_elm = self[i].text
125
142
  else:
126
143
  simplified.append(
127
- AnnotatedText(text=text, hyperlink=hyperlink, formatting=formatting)
144
+ AnnotatedText(
145
+ text=text, hyperlink=hyperlink, formatting=formatting, code=code
146
+ )
128
147
  )
129
148
  text = self[i].text
130
149
  last_elm = text
131
150
  hyperlink = self[i].hyperlink
132
151
  formatting = self[i].formatting
152
+ code = self[i].code
133
153
  if text:
134
154
  simplified.append(
135
- AnnotatedText(text=text, hyperlink=hyperlink, formatting=formatting)
155
+ AnnotatedText(
156
+ text=text, hyperlink=hyperlink, formatting=formatting, code=code
157
+ )
136
158
  )
137
159
  return simplified
138
160
 
@@ -174,7 +196,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
174
196
  self.ctx = _Context()
175
197
  for i in range(self.max_levels):
176
198
  self.parents[i] = None
177
- self.hyperlink = None
199
+ self.hyperlink: Union[AnyUrl, Path, None] = None
178
200
  self.original_url = original_url
179
201
  self.format_tags: list[str] = []
180
202
 
@@ -235,9 +257,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
235
257
  orig=title_text,
236
258
  content_layer=ContentLayer.FURNITURE,
237
259
  )
238
- # remove scripts/styles
260
+ # remove script and style tags
239
261
  for tag in self.soup(["script", "style"]):
240
262
  tag.decompose()
263
+ # remove any hidden tag
264
+ for tag in self.soup(hidden=True):
265
+ tag.decompose()
266
+
241
267
  content = self.soup.body or self.soup
242
268
  # normalize <br> tags
243
269
  for br in content("br"):
@@ -268,7 +294,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
268
294
  def flush_buffer():
269
295
  if not buffer:
270
296
  return
271
- annotated_text_list = buffer.simplify_text_elements()
297
+ annotated_text_list: AnnotatedTextList = buffer.simplify_text_elements()
272
298
  parts = annotated_text_list.split_by_newline()
273
299
  buffer.clear()
274
300
 
@@ -276,20 +302,29 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
276
302
  return
277
303
 
278
304
  for annotated_text_list in parts:
279
- with self.use_inline_group(annotated_text_list, doc):
305
+ with self._use_inline_group(annotated_text_list, doc):
280
306
  for annotated_text in annotated_text_list:
281
307
  if annotated_text.text.strip():
282
308
  seg_clean = HTMLDocumentBackend._clean_unicode(
283
309
  annotated_text.text.strip()
284
310
  )
285
- doc.add_text(
286
- parent=self.parents[self.level],
287
- label=DocItemLabel.TEXT,
288
- text=seg_clean,
289
- content_layer=self.content_layer,
290
- formatting=annotated_text.formatting,
291
- hyperlink=annotated_text.hyperlink,
292
- )
311
+ if annotated_text.code:
312
+ doc.add_code(
313
+ parent=self.parents[self.level],
314
+ text=seg_clean,
315
+ content_layer=self.content_layer,
316
+ formatting=annotated_text.formatting,
317
+ hyperlink=annotated_text.hyperlink,
318
+ )
319
+ else:
320
+ doc.add_text(
321
+ parent=self.parents[self.level],
322
+ label=DocItemLabel.TEXT,
323
+ text=seg_clean,
324
+ content_layer=self.content_layer,
325
+ formatting=annotated_text.formatting,
326
+ hyperlink=annotated_text.hyperlink,
327
+ )
293
328
 
294
329
  for node in element.contents:
295
330
  if isinstance(node, Tag):
@@ -298,10 +333,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
298
333
  flush_buffer()
299
334
  self._emit_image(node, doc)
300
335
  elif name in _FORMAT_TAG_MAP:
301
- with self.use_format([name]):
336
+ with self._use_format([name]):
302
337
  self._walk(node, doc)
303
338
  elif name == "a":
304
- with self.use_hyperlink(node):
339
+ with self._use_hyperlink(node):
305
340
  self._walk(node, doc)
306
341
  elif name in _BLOCK_TAGS:
307
342
  flush_buffer()
@@ -367,8 +402,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
367
402
  this_parent = item.parent
368
403
  while this_parent is not None:
369
404
  if this_parent.name == "a" and this_parent.get("href"):
370
- with self.use_format(format_tags):
371
- with self.use_hyperlink(this_parent):
405
+ with self._use_format(format_tags):
406
+ with self._use_hyperlink(this_parent):
372
407
  return self._extract_text_and_hyperlink_recursively(
373
408
  item, ignore_list
374
409
  )
@@ -379,6 +414,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
379
414
 
380
415
  if isinstance(item, NavigableString):
381
416
  text = item.strip()
417
+ code = any(code_tag in self.format_tags for code_tag in _CODE_TAG_SET)
382
418
  if text:
383
419
  return AnnotatedTextList(
384
420
  [
@@ -386,6 +422,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
386
422
  text=text,
387
423
  hyperlink=self.hyperlink,
388
424
  formatting=self._formatting,
425
+ code=code,
389
426
  )
390
427
  ]
391
428
  )
@@ -396,6 +433,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
396
433
  text="\n",
397
434
  hyperlink=self.hyperlink,
398
435
  formatting=self._formatting,
436
+ code=code,
399
437
  )
400
438
  ]
401
439
  )
@@ -405,14 +443,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
405
443
  if not ignore_list or (tag.name not in ["ul", "ol"]):
406
444
  for child in tag:
407
445
  if isinstance(child, Tag) and child.name in _FORMAT_TAG_MAP:
408
- with self.use_format([child.name]):
446
+ with self._use_format([child.name]):
409
447
  result.extend(
410
448
  self._extract_text_and_hyperlink_recursively(
411
449
  child, ignore_list, keep_newlines=keep_newlines
412
450
  )
413
451
  )
414
452
  elif isinstance(child, Tag) and child.name == "a":
415
- with self.use_hyperlink(child):
453
+ with self._use_hyperlink(child):
416
454
  result.extend(
417
455
  self._extract_text_and_hyperlink_recursively(
418
456
  child, ignore_list, keep_newlines=keep_newlines
@@ -428,29 +466,30 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
428
466
  return result
429
467
 
430
468
  @contextmanager
431
- def use_hyperlink(self, tag):
469
+ def _use_hyperlink(self, tag: Tag):
432
470
  this_href = tag.get("href")
433
471
  if this_href is None:
434
472
  yield None
435
473
  else:
436
- if this_href:
437
- old_hyperlink = self.hyperlink
474
+ if isinstance(this_href, str) and this_href:
475
+ old_hyperlink: Union[AnyUrl, Path, None] = self.hyperlink
476
+ new_hyperlink: Union[AnyUrl, Path, None] = None
438
477
  if self.original_url is not None:
439
- this_href = urljoin(self.original_url, this_href)
478
+ this_href = urljoin(str(self.original_url), str(this_href))
440
479
  # ugly fix for relative links since pydantic does not support them.
441
480
  try:
442
- AnyUrl(this_href)
481
+ new_hyperlink = AnyUrl(this_href)
443
482
  except ValidationError:
444
- this_href = Path(this_href)
445
- self.hyperlink = this_href
483
+ new_hyperlink = Path(this_href)
484
+ self.hyperlink = new_hyperlink
446
485
  try:
447
486
  yield None
448
487
  finally:
449
- if this_href:
488
+ if new_hyperlink:
450
489
  self.hyperlink = old_hyperlink
451
490
 
452
491
  @contextmanager
453
- def use_format(self, tags: list[str]):
492
+ def _use_format(self, tags: list[str]):
454
493
  if not tags:
455
494
  yield None
456
495
  else:
@@ -461,7 +500,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
461
500
  self.format_tags = self.format_tags[: -len(tags)]
462
501
 
463
502
  @contextmanager
464
- def use_inline_group(
503
+ def _use_inline_group(
465
504
  self, annotated_text_list: AnnotatedTextList, doc: DoclingDocument
466
505
  ):
467
506
  """Create an inline group for annotated texts.
@@ -473,9 +512,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
473
512
  Args:
474
513
  annotated_text_list (AnnotatedTextList): Annotated text
475
514
  doc (DoclingDocument): Currently used document
476
-
477
- Yields:
478
- None: _description_
479
515
  """
480
516
  if len(annotated_text_list) > 1:
481
517
  inline_fmt = doc.add_group(
@@ -493,6 +529,57 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
493
529
  else:
494
530
  yield None
495
531
 
532
+ @contextmanager
533
+ def _use_details(self, tag: Tag, doc: DoclingDocument):
534
+ """Create a group with the content of a details tag.
535
+
536
+ While the context manager is active, the hierarchy level is set one
537
+ level higher than the current parent.
538
+
539
+ Args:
540
+ tag: The details tag.
541
+ doc: Currently used document.
542
+ """
543
+ self.parents[self.level + 1] = doc.add_group(
544
+ name=tag.name,
545
+ label=GroupLabel.SECTION,
546
+ parent=self.parents[self.level],
547
+ content_layer=self.content_layer,
548
+ )
549
+ self.level += 1
550
+ try:
551
+ yield None
552
+ finally:
553
+ self.parents[self.level + 1] = None
554
+ self.level -= 1
555
+
556
+ @contextmanager
557
+ def _use_footer(self, tag: Tag, doc: DoclingDocument):
558
+ """Create a group with a footer.
559
+
560
+ Create a group with the content of a footer tag. While the context manager
561
+ is active, the hierarchy level is set one level higher than the current parent.
562
+
563
+ Args:
564
+ tag: The footer tag.
565
+ doc: Currently used document.
566
+ """
567
+ current_layer = self.content_layer
568
+ self.content_layer = ContentLayer.FURNITURE
569
+ self.parents[self.level + 1] = doc.add_group(
570
+ name=tag.name,
571
+ label=GroupLabel.SECTION,
572
+ parent=self.parents[self.level],
573
+ content_layer=self.content_layer,
574
+ )
575
+ self.level += 1
576
+ try:
577
+ yield None
578
+ finally:
579
+ self.parents[self.level + 1] = None
580
+ self.level -= 1
581
+ self.content_layer = current_layer
582
+
496
583
  def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
497
584
  tag_name = tag.name.lower()
498
585
  # set default content layer to BODY as soon as we encounter a heading
@@ -611,20 +698,29 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
611
698
  content_layer=self.content_layer,
612
699
  )
613
700
  self.level += 1
614
- with self.use_inline_group(min_parts, doc):
701
+ with self._use_inline_group(min_parts, doc):
615
702
  for annotated_text in min_parts:
616
703
  li_text = re.sub(
617
704
  r"\s+|\n+", " ", annotated_text.text
618
705
  ).strip()
619
706
  li_clean = HTMLDocumentBackend._clean_unicode(li_text)
620
- doc.add_text(
621
- parent=self.parents[self.level],
622
- label=DocItemLabel.TEXT,
623
- text=li_clean,
624
- content_layer=self.content_layer,
625
- formatting=annotated_text.formatting,
626
- hyperlink=annotated_text.hyperlink,
627
- )
707
+ if annotated_text.code:
708
+ doc.add_code(
709
+ parent=self.parents[self.level],
710
+ text=li_clean,
711
+ content_layer=self.content_layer,
712
+ formatting=annotated_text.formatting,
713
+ hyperlink=annotated_text.hyperlink,
714
+ )
715
+ else:
716
+ doc.add_text(
717
+ parent=self.parents[self.level],
718
+ label=DocItemLabel.TEXT,
719
+ text=li_clean,
720
+ content_layer=self.content_layer,
721
+ formatting=annotated_text.formatting,
722
+ hyperlink=annotated_text.hyperlink,
723
+ )
628
724
 
629
725
  # 4) recurse into any nested lists, attaching them to this <li> item
630
726
  for sublist in li({"ul", "ol"}, recursive=False):
@@ -687,20 +783,29 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
687
783
  text_list = self._extract_text_and_hyperlink_recursively(
688
784
  tag, find_parent_annotation=True
689
785
  )
690
- annotated_texts = text_list.simplify_text_elements()
786
+ annotated_texts: AnnotatedTextList = text_list.simplify_text_elements()
691
787
  for part in annotated_texts.split_by_newline():
692
- with self.use_inline_group(part, doc):
788
+ with self._use_inline_group(part, doc):
693
789
  for annotated_text in part:
694
790
  if seg := annotated_text.text.strip():
695
791
  seg_clean = HTMLDocumentBackend._clean_unicode(seg)
696
- doc.add_text(
697
- parent=self.parents[self.level],
698
- label=DocItemLabel.TEXT,
699
- text=seg_clean,
700
- content_layer=self.content_layer,
701
- formatting=annotated_text.formatting,
702
- hyperlink=annotated_text.hyperlink,
703
- )
792
+ if annotated_text.code:
793
+ doc.add_code(
794
+ parent=self.parents[self.level],
795
+ text=seg_clean,
796
+ content_layer=self.content_layer,
797
+ formatting=annotated_text.formatting,
798
+ hyperlink=annotated_text.hyperlink,
799
+ )
800
+ else:
801
+ doc.add_text(
802
+ parent=self.parents[self.level],
803
+ label=DocItemLabel.TEXT,
804
+ text=seg_clean,
805
+ content_layer=self.content_layer,
806
+ formatting=annotated_text.formatting,
807
+ hyperlink=annotated_text.hyperlink,
808
+ )
704
809
 
705
810
  for img_tag in tag("img"):
706
811
  if isinstance(img_tag, Tag):
@@ -718,13 +823,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
718
823
  content_layer=self.content_layer,
719
824
  )
720
825
 
721
- elif tag_name in {"pre", "code"}:
826
+ elif tag_name in {"pre"}:
722
827
  # handle monospace code snippets (pre).
723
828
  text_list = self._extract_text_and_hyperlink_recursively(
724
- tag, find_parent_annotation=True
829
+ tag, find_parent_annotation=True, keep_newlines=True
725
830
  )
726
831
  annotated_texts = text_list.simplify_text_elements()
727
- with self.use_inline_group(annotated_texts, doc):
832
+ with self._use_inline_group(annotated_texts, doc):
728
833
  for annotated_text in annotated_texts:
729
834
  text_clean = HTMLDocumentBackend._clean_unicode(
730
835
  annotated_text.text.strip()
@@ -737,22 +842,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
737
842
  hyperlink=annotated_text.hyperlink,
738
843
  )
739
844
 
740
- elif tag_name in {"details", "footer"}:
741
- if tag_name == "footer":
742
- current_layer = self.content_layer
743
- self.content_layer = ContentLayer.FURNITURE
744
- self.parents[self.level + 1] = doc.add_group(
745
- name=tag_name,
746
- label=GroupLabel.SECTION,
747
- parent=self.parents[self.level],
748
- content_layer=self.content_layer,
749
- )
750
- self.level += 1
751
- self._walk(tag, doc)
752
- self.parents[self.level + 1] = None
753
- self.level -= 1
754
- if tag_name == "footer":
755
- self.content_layer = current_layer
845
+ elif tag_name == "footer":
846
+ with self._use_footer(tag, doc):
847
+ self._walk(tag, doc)
848
+
849
+ elif tag_name == "details":
850
+ with self._use_details(tag, doc):
851
+ self._walk(tag, doc)
756
852
 
757
853
  def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
758
854
  figure = img_tag.find_parent("figure")
@@ -99,6 +99,8 @@ class RapidOcrOptions(OcrOptions):
99
99
  # For more details on the following options visit
100
100
  # https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/
101
101
 
102
+ # https://rapidai.github.io/RapidOCRDocs/main/install_usage/rapidocr/usage/#__tabbed_3_4
103
+ backend: Literal["onnxruntime", "openvino", "paddle", "torch"] = "onnxruntime"
102
104
  text_score: float = 0.5 # same default as rapidocr
103
105
 
104
106
  use_det: Optional[bool] = None # same default as rapidocr
@@ -42,10 +42,10 @@ class RapidOcrModel(BaseOcrModel):
42
42
 
43
43
  if self.enabled:
44
44
  try:
45
- from rapidocr_onnxruntime import RapidOCR # type: ignore
45
+ from rapidocr import EngineType, RapidOCR # type: ignore
46
46
  except ImportError:
47
47
  raise ImportError(
48
- "RapidOCR is not installed. Please install it via `pip install rapidocr_onnxruntime` to use this OCR engine. "
48
+ "RapidOCR is not installed. Please install it via `pip install rapidocr onnxruntime` to use this OCR engine. "
49
49
  "Alternatively, Docling has support for other OCR engines. See the documentation."
50
50
  )
51
51
 
@@ -54,21 +54,39 @@ class RapidOcrModel(BaseOcrModel):
54
54
  use_cuda = str(AcceleratorDevice.CUDA.value).lower() in device
55
55
  use_dml = accelerator_options.device == AcceleratorDevice.AUTO
56
56
  intra_op_num_threads = accelerator_options.num_threads
57
+ _ALIASES = {
58
+ "onnxruntime": EngineType.ONNXRUNTIME,
59
+ "openvino": EngineType.OPENVINO,
60
+ "paddle": EngineType.PADDLE,
61
+ "torch": EngineType.TORCH,
62
+ }
63
+ backend_enum = _ALIASES.get(self.options.backend, EngineType.ONNXRUNTIME)
57
64
 
58
65
  self.reader = RapidOCR(
59
- text_score=self.options.text_score,
60
- cls_use_cuda=use_cuda,
61
- rec_use_cuda=use_cuda,
62
- det_use_cuda=use_cuda,
63
- det_use_dml=use_dml,
64
- cls_use_dml=use_dml,
65
- rec_use_dml=use_dml,
66
- intra_op_num_threads=intra_op_num_threads,
67
- print_verbose=self.options.print_verbose,
68
- det_model_path=self.options.det_model_path,
69
- cls_model_path=self.options.cls_model_path,
70
- rec_model_path=self.options.rec_model_path,
71
- rec_keys_path=self.options.rec_keys_path,
66
+ params={
67
+ # Global settings (these are still correct)
68
+ "Global.text_score": self.options.text_score,
69
+ # "Global.verbose": self.options.print_verbose,
70
+ # Detection model settings
71
+ "Det.model_path": self.options.det_model_path,
72
+ "Det.use_cuda": use_cuda,
73
+ "Det.use_dml": use_dml,
74
+ "Det.intra_op_num_threads": intra_op_num_threads,
75
+ # Classification model settings
76
+ "Cls.model_path": self.options.cls_model_path,
77
+ "Cls.use_cuda": use_cuda,
78
+ "Cls.use_dml": use_dml,
79
+ "Cls.intra_op_num_threads": intra_op_num_threads,
80
+ # Recognition model settings
81
+ "Rec.model_path": self.options.rec_model_path,
82
+ "Rec.keys_path": self.options.rec_keys_path,
83
+ "Rec.use_cuda": use_cuda,
84
+ "Rec.use_dml": use_dml,
85
+ "Rec.intra_op_num_threads": intra_op_num_threads,
86
+ "Det.engine_type": backend_enum,
87
+ "Cls.engine_type": backend_enum,
88
+ "Rec.engine_type": backend_enum,
89
+ }
72
90
  )
73
91
 
74
92
  def __call__(
@@ -95,12 +113,15 @@ class RapidOcrModel(BaseOcrModel):
95
113
  scale=self.scale, cropbox=ocr_rect
96
114
  )
97
115
  im = numpy.array(high_res_image)
98
- result, _ = self.reader(
116
+ result = self.reader(
99
117
  im,
100
118
  use_det=self.options.use_det,
101
119
  use_cls=self.options.use_cls,
102
120
  use_rec=self.options.use_rec,
103
121
  )
122
+ result = list(
123
+ zip(result.boxes.tolist(), result.txts, result.scores)
124
+ )
104
125
 
105
126
  del high_res_image
106
127
  del im
@@ -146,6 +146,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
146
146
  conv_res.pages.append(Page(page_no=i))
147
147
 
148
148
  try:
149
+ total_pages_processed = 0
149
150
  # Iterate batches of pages (page_batch_size) in the doc
150
151
  for page_batch in chunkify(
151
152
  conv_res.pages, settings.perf.page_batch_size
@@ -186,9 +187,9 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
186
187
  )
187
188
  conv_res.status = ConversionStatus.PARTIAL_SUCCESS
188
189
  break
189
-
190
+ total_pages_processed += len(page_batch)
190
191
  _log.debug(
191
- f"Finished converting page batch time={end_batch_time:.3f}"
192
+ f"Finished converting pages {total_pages_processed}/{len(conv_res.pages)} time={end_batch_time:.3f}"
192
193
  )
193
194
 
194
195
  except Exception as e:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.47.0
3
+ Version: 2.48.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -59,10 +59,11 @@ Provides-Extra: vlm
59
59
  Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
60
60
  Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
61
61
  Requires-Dist: mlx-vlm<1.0.0,>=0.3.0; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
62
- Requires-Dist: vllm<1.0.0,>=0.10.0; (python_version >= "3.10" and sys_platform == "linux") and extra == "vlm"
62
+ Requires-Dist: vllm<1.0.0,>=0.10.0; (python_version >= "3.10" and sys_platform == "linux" and platform_machine == "x86_64") and extra == "vlm"
63
63
  Provides-Extra: rapidocr
64
- Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
64
+ Requires-Dist: rapidocr<4.0.0,>=3.3; python_version < "3.14" and extra == "rapidocr"
65
65
  Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
66
+ Requires-Dist: modelscope>=1.29.0; extra == "rapidocr"
66
67
  Provides-Extra: asr
67
68
  Requires-Dist: openai-whisper>=20250625; extra == "asr"
68
69
  Dynamic: license-file
@@ -9,7 +9,7 @@ docling/backend/csv_backend.py,sha256=2g9famYG2W-ID9jEdZPxc6O8QGv1vWQfjN8pL-QMBE
9
9
  docling/backend/docling_parse_backend.py,sha256=9rUo1vPxX6QLzGqF-2B2iEYglZg6YQ3Uea00XrLluTg,7918
10
10
  docling/backend/docling_parse_v2_backend.py,sha256=3ckTfke8IICjaImlIzc3TRhG7KDuxDDba0AuCEcjA-M,9500
11
11
  docling/backend/docling_parse_v4_backend.py,sha256=MbCMxNGmoW4iuev9tX1Vt4jtIeak2kC9Uac3xQSRxeo,7509
12
- docling/backend/html_backend.py,sha256=qPXmMiKxskSDJJK5e0a46xhkSiATgjyi02eMrY_ahR8,38323
12
+ docling/backend/html_backend.py,sha256=MqtU9fA83lcjqb85lFTmGDedOH72WxTmwvj0ZzPur1I,42224
13
13
  docling/backend/md_backend.py,sha256=qCI7SD9hnWWGrkG_drpzQv2Z7DVBG4Tsq3hhTsYV790,22562
14
14
  docling/backend/mets_gbs_backend.py,sha256=EA8sY6tbmGiysKGYPPZiNlK-i7Adn8bLTo-7Ym15hTU,12774
15
15
  docling/backend/msexcel_backend.py,sha256=cq8MQ2RSh6pqCiVrldjOerSww7dOPTWmCQoCBI57i6w,18579
@@ -38,7 +38,7 @@ docling/datamodel/asr_model_specs.py,sha256=Wg7z3zm_wXIWu122iPVy0RMECsA_JCFHrlFF
38
38
  docling/datamodel/base_models.py,sha256=OI2-tBjH3PZMF_Zyyc4eezJ4gFXIBiKT4BYKYy6n81E,11924
39
39
  docling/datamodel/document.py,sha256=zsxFYXvo6GtwGNogSDoBB1TFvkm7IOrP_VnqXNqBhJs,17329
40
40
  docling/datamodel/layout_model_specs.py,sha256=GSkJ-Z_0PVgwWGi7C7TsxbzRjlrWS9ZrHJjHumv-Z5U,2339
41
- docling/datamodel/pipeline_options.py,sha256=x0RlEdTiEU9gH27YDRov1ZVMpTlx4BnqEoEtmOHd08k,10584
41
+ docling/datamodel/pipeline_options.py,sha256=0Qk2nyzEo90NWxSKaiHaVhIV_6zB20CXwC-Icn7g3gw,10760
42
42
  docling/datamodel/pipeline_options_asr_model.py,sha256=7X068xl-qpbyPxC7-TwX7Q6tLyZXGT5h1osZ_xLNLM0,1454
43
43
  docling/datamodel/pipeline_options_vlm_model.py,sha256=AcqqThSW74hwQ6x7pazzm57LnJiUqB7gQi5wFayGlbk,2628
44
44
  docling/datamodel/settings.py,sha256=c0MTw6pO5be_BKxHKYl4SaBJAw_qL-aapxp-g5HHj1A,2084
@@ -57,7 +57,7 @@ docling/models/page_preprocessing_model.py,sha256=rHNX1uP1ScTjVUlsxZ0eamK2uNUqI9
57
57
  docling/models/picture_description_api_model.py,sha256=o3EkV5aHW_6WzE_fdj_VRnNCrS_btclO_ZCLAUqrfl0,2377
58
58
  docling/models/picture_description_base_model.py,sha256=kLthLhdlgwhootQ4_xhhcAk6A-vso5-qcsFJ3TcYfO0,2991
59
59
  docling/models/picture_description_vlm_model.py,sha256=5BJvaF3PHuL9lCVYqPv9krh3h_7YwNSdKYw1EVEj13k,4156
60
- docling/models/rapid_ocr_model.py,sha256=AMdc66s_iWO4p6nQ0LNjQMUYVxrDSxMyLNPpjPYt6N8,5916
60
+ docling/models/rapid_ocr_model.py,sha256=h5f-UMPzGoKv7jJKkH1bkb1OcB33zxs3yZpIFOgZdsw,7037
61
61
  docling/models/readingorder_model.py,sha256=bZoXHaSwUsa8niSmJrbCuy784ixCeBXT-RQBUfgHJ4A,14925
62
62
  docling/models/table_structure_model.py,sha256=RFXo73f2q4XuKyaSqbxpznh7JVtlLcT0FsOWl9oZbSg,12518
63
63
  docling/models/tesseract_ocr_cli_model.py,sha256=I3Gn28Y-LD8OfvyCElN9fLiNgpo2sT0uMkVt258253s,12881
@@ -76,7 +76,7 @@ docling/models/vlm_models_inline/mlx_model.py,sha256=VP05v97mqzmaG4o9bOpJcxIlEqv
76
76
  docling/models/vlm_models_inline/vllm_model.py,sha256=_EnK1nfpAPJky7aRlyp8SUIghiZOQO8AkDN_hHqXLZg,8615
77
77
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
78
78
  docling/pipeline/asr_pipeline.py,sha256=tQkhu9fXdkSuYIL22xzV2YRUlQh-9qktHBbs2qeXhJI,9070
79
- docling/pipeline/base_pipeline.py,sha256=VYVYndifTPSD2GWHKjfi4Y76M5qgt1DiygO-jowKsqM,9919
79
+ docling/pipeline/base_pipeline.py,sha256=Tl_C3adFABNxtE7hX83VSdx-j7D8GRvoFcno5A3Z-YQ,10062
80
80
  docling/pipeline/simple_pipeline.py,sha256=TXZOwR7hZRji462ZTIpte0VJjzbxvNVE8dbLFANDhSU,2253
81
81
  docling/pipeline/standard_pdf_pipeline.py,sha256=yFishq4Cu01BiBGHk3Irr7ogcTQKeSC0QZImQVAhIaY,12740
82
82
  docling/pipeline/threaded_standard_pdf_pipeline.py,sha256=NgdZxpfpElnvCgGlrQ8kSvq44LNzJcc6wOqD-AMrKZ0,26132
@@ -94,9 +94,9 @@ docling/utils/orientation.py,sha256=jTyLxyT31FlOodZoBMlADHNQK2lAWKYVs5z7pXd_6Cg,
94
94
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
95
95
  docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
96
96
  docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
97
- docling-2.47.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
98
- docling-2.47.0.dist-info/METADATA,sha256=4cBB6CG8LFZzsDFRenI4f09ypOTCswZk2mVwczxEcVs,10569
99
- docling-2.47.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
100
- docling-2.47.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
101
- docling-2.47.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
102
- docling-2.47.0.dist-info/RECORD,,
97
+ docling-2.48.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
98
+ docling-2.48.0.dist-info/METADATA,sha256=EEjk7em4miqz1ZEyCZg9lRnzPBsoOljSwSFfi12a98g,10643
99
+ docling-2.48.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
100
+ docling-2.48.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
101
+ docling-2.48.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
102
+ docling-2.48.0.dist-info/RECORD,,