docling 2.15.1__py3-none-any.whl → 2.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. docling/backend/abstract_backend.py +0 -1
  2. docling/backend/asciidoc_backend.py +0 -1
  3. docling/backend/docling_parse_backend.py +1 -1
  4. docling/backend/docling_parse_v2_backend.py +1 -1
  5. docling/backend/html_backend.py +1 -1
  6. docling/backend/json/__init__.py +0 -0
  7. docling/backend/json/docling_json_backend.py +58 -0
  8. docling/backend/md_backend.py +44 -27
  9. docling/backend/msexcel_backend.py +50 -38
  10. docling/backend/msword_backend.py +0 -1
  11. docling/backend/pdf_backend.py +0 -2
  12. docling/backend/pypdfium2_backend.py +1 -1
  13. docling/datamodel/base_models.py +30 -3
  14. docling/datamodel/document.py +2 -0
  15. docling/datamodel/pipeline_options.py +6 -9
  16. docling/document_converter.py +4 -0
  17. docling/models/base_model.py +62 -6
  18. docling/models/code_formula_model.py +245 -0
  19. docling/models/document_picture_classifier.py +187 -0
  20. docling/models/layout_model.py +10 -86
  21. docling/models/page_assemble_model.py +1 -33
  22. docling/models/tesseract_ocr_cli_model.py +0 -1
  23. docling/models/tesseract_ocr_model.py +63 -15
  24. docling/pipeline/base_pipeline.py +40 -17
  25. docling/pipeline/standard_pdf_pipeline.py +31 -2
  26. docling/utils/glm_utils.py +4 -1
  27. docling/utils/visualization.py +80 -0
  28. {docling-2.15.1.dist-info → docling-2.16.0.dist-info}/METADATA +5 -4
  29. docling-2.16.0.dist-info/RECORD +61 -0
  30. docling-2.15.1.dist-info/RECORD +0 -56
  31. {docling-2.15.1.dist-info → docling-2.16.0.dist-info}/LICENSE +0 -0
  32. {docling-2.15.1.dist-info → docling-2.16.0.dist-info}/WHEEL +0 -0
  33. {docling-2.15.1.dist-info → docling-2.16.0.dist-info}/entry_points.txt +0 -0
@@ -54,43 +54,56 @@ class TesseractOcrModel(BaseOcrModel):
54
54
  # Initialize the tesseractAPI
55
55
  _log.debug("Initializing TesserOCR: %s", tesseract_version)
56
56
  lang = "+".join(self.options.lang)
57
+
58
+ self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
59
+
60
+ if any([l.startswith("script/") for l in tesserocr_languages]):
61
+ self.script_prefix = "script/"
62
+ else:
63
+ self.script_prefix = ""
64
+
65
+ tesserocr_kwargs = {
66
+ "psm": tesserocr.PSM.AUTO,
67
+ "init": True,
68
+ "oem": tesserocr.OEM.DEFAULT,
69
+ }
70
+
57
71
  if self.options.path is not None:
72
+ tesserocr_kwargs["path"] = self.options.path
73
+
74
+ if lang == "auto":
58
75
  self.reader = tesserocr.PyTessBaseAPI(
59
- path=self.options.path,
60
- lang=lang,
61
- psm=tesserocr.PSM.AUTO,
62
- init=True,
63
- oem=tesserocr.OEM.DEFAULT,
76
+ **{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
64
77
  )
65
78
  else:
66
79
  self.reader = tesserocr.PyTessBaseAPI(
67
- lang=lang,
68
- psm=tesserocr.PSM.AUTO,
69
- init=True,
70
- oem=tesserocr.OEM.DEFAULT,
80
+ **{"lang": lang} | tesserocr_kwargs,
71
81
  )
82
+
72
83
  self.reader_RIL = tesserocr.RIL
73
84
 
74
85
  def __del__(self):
75
86
  if self.reader is not None:
76
87
  # Finalize the tesseractAPI
77
88
  self.reader.End()
89
+ for script in self.script_readers:
90
+ self.script_readers[script].End()
78
91
 
79
92
  def __call__(
80
93
  self, conv_res: ConversionResult, page_batch: Iterable[Page]
81
94
  ) -> Iterable[Page]:
82
-
83
95
  if not self.enabled:
84
96
  yield from page_batch
85
97
  return
86
98
 
99
+ import tesserocr
100
+
87
101
  for page in page_batch:
88
102
  assert page._backend is not None
89
103
  if not page._backend.is_valid():
90
104
  yield page
91
105
  else:
92
106
  with TimeRecorder(conv_res, "ocr"):
93
-
94
107
  assert self.reader is not None
95
108
 
96
109
  ocr_rects = self.get_ocr_rects(page)
@@ -106,20 +119,55 @@ class TesseractOcrModel(BaseOcrModel):
106
119
 
107
120
  # Retrieve text snippets with their bounding boxes
108
121
  self.reader.SetImage(high_res_image)
109
- boxes = self.reader.GetComponentImages(
122
+
123
+ if self.options.lang == ["auto"]:
124
+ osd = self.reader.DetectOrientationScript()
125
+
126
+ # No text, probably
127
+ if osd is None:
128
+ continue
129
+
130
+ script = osd["script_name"]
131
+
132
+ if script == "Katakana" or script == "Hiragana":
133
+ script = "Japanese"
134
+ elif script == "Han":
135
+ script = "HanS"
136
+ elif script == "Korean":
137
+ script = "Hangul"
138
+
139
+ _log.debug(
140
+ f'Using model for the detected script "{script}"'
141
+ )
142
+
143
+ if script not in self.script_readers:
144
+ self.script_readers[script] = tesserocr.PyTessBaseAPI(
145
+ path=self.reader.GetDatapath(),
146
+ lang=f"{self.script_prefix}{script}",
147
+ psm=tesserocr.PSM.AUTO,
148
+ init=True,
149
+ oem=tesserocr.OEM.DEFAULT,
150
+ )
151
+
152
+ local_reader = self.script_readers[script]
153
+ local_reader.SetImage(high_res_image)
154
+ else:
155
+ local_reader = self.reader
156
+
157
+ boxes = local_reader.GetComponentImages(
110
158
  self.reader_RIL.TEXTLINE, True
111
159
  )
112
160
 
113
161
  cells = []
114
162
  for ix, (im, box, _, _) in enumerate(boxes):
115
163
  # Set the area of interest. Tesseract uses Bottom-Left for the origin
116
- self.reader.SetRectangle(
164
+ local_reader.SetRectangle(
117
165
  box["x"], box["y"], box["w"], box["h"]
118
166
  )
119
167
 
120
168
  # Extract text within the bounding box
121
- text = self.reader.GetUTF8Text().strip()
122
- confidence = self.reader.MeanTextConf()
169
+ text = local_reader.GetUTF8Text().strip()
170
+ confidence = local_reader.MeanTextConf()
123
171
  left = box["x"] / self.scale
124
172
  bottom = box["y"] / self.scale
125
173
  right = (box["x"] + box["w"]) / self.scale
@@ -3,7 +3,7 @@ import logging
3
3
  import time
4
4
  import traceback
5
5
  from abc import ABC, abstractmethod
6
- from typing import Callable, Iterable, List
6
+ from typing import Any, Callable, Iterable, List
7
7
 
8
8
  from docling_core.types.doc import DoclingDocument, NodeItem
9
9
 
@@ -18,7 +18,7 @@ from docling.datamodel.base_models import (
18
18
  from docling.datamodel.document import ConversionResult, InputDocument
19
19
  from docling.datamodel.pipeline_options import PipelineOptions
20
20
  from docling.datamodel.settings import settings
21
- from docling.models.base_model import BaseEnrichmentModel
21
+ from docling.models.base_model import GenericEnrichmentModel
22
22
  from docling.utils.profiling import ProfilingScope, TimeRecorder
23
23
  from docling.utils.utils import chunkify
24
24
 
@@ -28,8 +28,9 @@ _log = logging.getLogger(__name__)
28
28
  class BasePipeline(ABC):
29
29
  def __init__(self, pipeline_options: PipelineOptions):
30
30
  self.pipeline_options = pipeline_options
31
+ self.keep_images = False
31
32
  self.build_pipe: List[Callable] = []
32
- self.enrichment_pipe: List[BaseEnrichmentModel] = []
33
+ self.enrichment_pipe: List[GenericEnrichmentModel[Any]] = []
33
34
 
34
35
  def execute(self, in_doc: InputDocument, raises_on_error: bool) -> ConversionResult:
35
36
  conv_res = ConversionResult(input=in_doc)
@@ -40,7 +41,7 @@ class BasePipeline(ABC):
40
41
  conv_res, "pipeline_total", scope=ProfilingScope.DOCUMENT
41
42
  ):
42
43
  # These steps are building and assembling the structure of the
43
- # output DoclingDocument
44
+ # output DoclingDocument.
44
45
  conv_res = self._build_document(conv_res)
45
46
  conv_res = self._assemble_document(conv_res)
46
47
  # From this stage, all operations should rely only on conv_res.output
@@ -50,6 +51,8 @@ class BasePipeline(ABC):
50
51
  conv_res.status = ConversionStatus.FAILURE
51
52
  if raises_on_error:
52
53
  raise e
54
+ finally:
55
+ self._unload(conv_res)
53
56
 
54
57
  return conv_res
55
58
 
@@ -62,21 +65,22 @@ class BasePipeline(ABC):
62
65
 
63
66
  def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
64
67
 
65
- def _filter_elements(
66
- doc: DoclingDocument, model: BaseEnrichmentModel
68
+ def _prepare_elements(
69
+ conv_res: ConversionResult, model: GenericEnrichmentModel[Any]
67
70
  ) -> Iterable[NodeItem]:
68
- for element, _level in doc.iterate_items():
69
- if model.is_processable(doc=doc, element=element):
70
- yield element
71
+ for doc_element, _level in conv_res.document.iterate_items():
72
+ prepared_element = model.prepare_element(
73
+ conv_res=conv_res, element=doc_element
74
+ )
75
+ if prepared_element is not None:
76
+ yield prepared_element
71
77
 
72
78
  with TimeRecorder(conv_res, "doc_enrich", scope=ProfilingScope.DOCUMENT):
73
79
  for model in self.enrichment_pipe:
74
80
  for element_batch in chunkify(
75
- _filter_elements(conv_res.document, model),
81
+ _prepare_elements(conv_res, model),
76
82
  settings.perf.elements_batch_size,
77
83
  ):
78
- # TODO: currently we assume the element itself is modified, because
79
- # we don't have an interface to save the element back to the document
80
84
  for element in model(
81
85
  doc=conv_res.document, element_batch=element_batch
82
86
  ): # Must exhaust!
@@ -88,6 +92,9 @@ class BasePipeline(ABC):
88
92
  def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
89
93
  pass
90
94
 
95
+ def _unload(self, conv_res: ConversionResult):
96
+ pass
97
+
91
98
  @classmethod
92
99
  @abstractmethod
93
100
  def get_default_options(cls) -> PipelineOptions:
@@ -107,6 +114,10 @@ class BasePipeline(ABC):
107
114
 
108
115
  class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
109
116
 
117
+ def __init__(self, pipeline_options: PipelineOptions):
118
+ super().__init__(pipeline_options)
119
+ self.keep_backend = False
120
+
110
121
  def _apply_on_pages(
111
122
  self, conv_res: ConversionResult, page_batch: Iterable[Page]
112
123
  ) -> Iterable[Page]:
@@ -148,7 +159,14 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
148
159
  pipeline_pages = self._apply_on_pages(conv_res, init_pages)
149
160
 
150
161
  for p in pipeline_pages: # Must exhaust!
151
- pass
162
+
163
+ # Cleanup cached images
164
+ if not self.keep_images:
165
+ p._image_cache = {}
166
+
167
+ # Cleanup page backends
168
+ if not self.keep_backend and p._backend is not None:
169
+ p._backend.unload()
152
170
 
153
171
  end_batch_time = time.monotonic()
154
172
  total_elapsed_time += end_batch_time - start_batch_time
@@ -177,10 +195,15 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
177
195
  )
178
196
  raise e
179
197
 
180
- finally:
181
- # Always unload the PDF backend, even in case of failure
182
- if conv_res.input._backend:
183
- conv_res.input._backend.unload()
198
+ return conv_res
199
+
200
+ def _unload(self, conv_res: ConversionResult) -> ConversionResult:
201
+ for page in conv_res.pages:
202
+ if page._backend is not None:
203
+ page._backend.unload()
204
+
205
+ if conv_res.input._backend:
206
+ conv_res.input._backend.unload()
184
207
 
185
208
  return conv_res
186
209
 
@@ -18,6 +18,11 @@ from docling.datamodel.pipeline_options import (
18
18
  TesseractOcrOptions,
19
19
  )
20
20
  from docling.models.base_ocr_model import BaseOcrModel
21
+ from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
22
+ from docling.models.document_picture_classifier import (
23
+ DocumentPictureClassifier,
24
+ DocumentPictureClassifierOptions,
25
+ )
21
26
  from docling.models.ds_glm_model import GlmModel, GlmOptions
22
27
  from docling.models.easyocr_model import EasyOcrModel
23
28
  from docling.models.layout_model import LayoutModel
@@ -50,7 +55,7 @@ class StandardPdfPipeline(PaginatedPipeline):
50
55
  else:
51
56
  self.artifacts_path = Path(pipeline_options.artifacts_path)
52
57
 
53
- keep_images = (
58
+ self.keep_images = (
54
59
  self.pipeline_options.generate_page_images
55
60
  or self.pipeline_options.generate_picture_images
56
61
  or self.pipeline_options.generate_table_images
@@ -87,13 +92,37 @@ class StandardPdfPipeline(PaginatedPipeline):
87
92
  accelerator_options=pipeline_options.accelerator_options,
88
93
  ),
89
94
  # Page assemble
90
- PageAssembleModel(options=PageAssembleOptions(keep_images=keep_images)),
95
+ PageAssembleModel(options=PageAssembleOptions()),
91
96
  ]
92
97
 
93
98
  self.enrichment_pipe = [
94
99
  # Other models working on `NodeItem` elements in the DoclingDocument
100
+ # Code Formula Enrichment Model
101
+ CodeFormulaModel(
102
+ enabled=pipeline_options.do_code_enrichment
103
+ or pipeline_options.do_formula_enrichment,
104
+ artifacts_path=pipeline_options.artifacts_path,
105
+ options=CodeFormulaModelOptions(
106
+ do_code_enrichment=pipeline_options.do_code_enrichment,
107
+ do_formula_enrichment=pipeline_options.do_formula_enrichment,
108
+ ),
109
+ accelerator_options=pipeline_options.accelerator_options,
110
+ ),
111
+ # Document Picture Classifier
112
+ DocumentPictureClassifier(
113
+ enabled=pipeline_options.do_picture_classification,
114
+ artifacts_path=pipeline_options.artifacts_path,
115
+ options=DocumentPictureClassifierOptions(),
116
+ accelerator_options=pipeline_options.accelerator_options,
117
+ ),
95
118
  ]
96
119
 
120
+ if (
121
+ self.pipeline_options.do_formula_enrichment
122
+ or self.pipeline_options.do_code_enrichment
123
+ ):
124
+ self.keep_backend = True
125
+
97
126
  @staticmethod
98
127
  def download_models_hf(
99
128
  local_dir: Optional[Path] = None, force: bool = False
@@ -270,7 +270,6 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
270
270
  container_el = doc.add_group(label=group_label)
271
271
 
272
272
  _add_child_elements(container_el, doc, obj, pelem)
273
-
274
273
  elif "text" in obj:
275
274
  text = obj["text"][span_i:span_j]
276
275
 
@@ -304,6 +303,10 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
304
303
  current_list = None
305
304
 
306
305
  doc.add_heading(text=text, prov=prov)
306
+ elif label == DocItemLabel.CODE:
307
+ current_list = None
308
+
309
+ doc.add_code(text=text, prov=prov)
307
310
  else:
308
311
  current_list = None
309
312
 
@@ -0,0 +1,80 @@
1
+ from docling_core.types.doc import DocItemLabel
2
+ from PIL import Image, ImageDraw, ImageFont
3
+ from PIL.ImageFont import FreeTypeFont
4
+
5
+ from docling.datamodel.base_models import Cluster
6
+
7
+
8
def draw_clusters(
    image: Image.Image, clusters: list[Cluster], scale_x: float, scale_y: float
) -> None:
    """Draw clusters, their cells and a label tag onto ``image`` in place.

    For each top-level cluster and each of its children, the cells are painted
    first as translucent black rectangles, then the cluster box is drawn
    filled and outlined in the colour given by ``DocItemLabel.get_color``,
    and finally a text tag with the label name and confidence is written at
    the box's ``(x0, y0)`` corner over a semi-transparent white background.

    Args:
        image: Image to draw on; it is modified in place.
        clusters: Top-level clusters; each cluster's ``children`` are drawn too.
        scale_x: Factor applied to the x coordinates of every bounding box.
        scale_y: Factor applied to the y coordinates of every bounding box.
    """

    def _to_pixels(bbox) -> list[tuple[float, float]]:
        # Scale x coordinates by scale_x and BOTH y coordinates by scale_y.
        # (Previously the first y coordinate was multiplied by scale_x, which
        # misplaced boxes whenever the two scale factors differed.)
        x0, y0, x1, y1 = bbox.as_tuple()
        return [(x0 * scale_x, y0 * scale_y), (x1 * scale_x, y1 * scale_y)]

    draw = ImageDraw.Draw(image, "RGBA")

    # Create a smaller font for the labels
    font: ImageFont.ImageFont | FreeTypeFont
    try:
        font = ImageFont.truetype("arial.ttf", 12)
    except OSError:
        # Fallback to default font if arial is not available
        font = ImageFont.load_default()

    cell_color = (0, 0, 0, 40)  # Transparent black for cells
    text_bg_padding = 2

    for c_tl in clusters:
        for c in [c_tl, *c_tl.children]:
            # Draw cells first (underneath)
            for tc in c.cells:
                draw.rectangle(
                    _to_pixels(tc.bbox),
                    outline=None,
                    fill=cell_color,
                )

            # Draw cluster rectangle
            corners = _to_pixels(c.bbox)
            origin = corners[0]
            base_color = tuple(DocItemLabel.get_color(c.label))
            draw.rectangle(
                corners,
                outline=(*base_color, 255),
                fill=(*base_color, 70),
            )

            # Add label name and confidence
            label_text = f"{c.label.name} ({c.confidence:.2f})"

            # Create semi-transparent background for text
            text_bbox = draw.textbbox(origin, label_text, font=font)
            draw.rectangle(
                [
                    (
                        text_bbox[0] - text_bg_padding,
                        text_bbox[1] - text_bg_padding,
                    ),
                    (
                        text_bbox[2] + text_bg_padding,
                        text_bbox[3] + text_bg_padding,
                    ),
                ],
                fill=(255, 255, 255, 180),  # Semi-transparent white
            )

            # Draw text
            draw.text(
                origin,
                label_text,
                fill=(0, 0, 0, 255),  # Solid black
                font=font,
            )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.15.1
3
+ Version: 2.16.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -26,9 +26,9 @@ Provides-Extra: tesserocr
26
26
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
27
27
  Requires-Dist: certifi (>=2024.7.4)
28
28
  Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
29
- Requires-Dist: docling-core[chunking] (>=2.13.1,<3.0.0)
30
- Requires-Dist: docling-ibm-models (>=3.1.0,<4.0.0)
31
- Requires-Dist: docling-parse (>=3.0.0,<4.0.0)
29
+ Requires-Dist: docling-core[chunking] (>=2.15.1,<3.0.0)
30
+ Requires-Dist: docling-ibm-models (>=3.3.0,<4.0.0)
31
+ Requires-Dist: docling-parse (>=3.1.0,<4.0.0)
32
32
  Requires-Dist: easyocr (>=1.7,<2.0)
33
33
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
34
34
  Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -39,6 +39,7 @@ Requires-Dist: onnxruntime (>=1.7.0,<1.20.0) ; (python_version < "3.10") and (ex
39
39
  Requires-Dist: onnxruntime (>=1.7.0,<2.0.0) ; (python_version >= "3.10") and (extra == "rapidocr")
40
40
  Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
41
41
  Requires-Dist: pandas (>=2.1.4,<3.0.0)
42
+ Requires-Dist: pillow (>=10.0.0,<11.0.0)
42
43
  Requires-Dist: pydantic (>=2.0.0,<3.0.0)
43
44
  Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
44
45
  Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
@@ -0,0 +1,61 @@
1
+ docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ docling/backend/abstract_backend.py,sha256=1lNxzwDTn303aXduPDVmTyXn-5ZIoWMLYqNxANGWmQQ,1658
4
+ docling/backend/asciidoc_backend.py,sha256=zyHxlG_BvlLwvpdNca3P6aopxOJZw8wbDFkJQQknNXk,14050
5
+ docling/backend/docling_parse_backend.py,sha256=hEEJibI1oJS0LAnFoIs6gMshS3bCqGtVxHnDNvBGZuA,7649
6
+ docling/backend/docling_parse_v2_backend.py,sha256=IpwrBrtLGwNRl5AYO-o3NjEfNRsAkuMhzvDt2HXb9Ko,8655
7
+ docling/backend/html_backend.py,sha256=vUEfx0h24gEaHO2taQyWNs8zCkDox7kopEeMbWBXss0,15560
8
+ docling/backend/json/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
+ docling/backend/json/docling_json_backend.py,sha256=LlFMVoZrrCfVwbDuRbNN4Xg96Lujh4xxrTBt9jGhY9I,1984
10
+ docling/backend/md_backend.py,sha256=ajEooDWNnWPHnPQMgUDh-K44Ch1X-sTBHqa1xBp7yJs,14645
11
+ docling/backend/msexcel_backend.py,sha256=lyJc4ShJGAN2ZfNTTuhdYTF-44cZsGyn_8Djstp3IEU,12700
12
+ docling/backend/mspowerpoint_backend.py,sha256=kOGawhcn0BFq4M_C6kW0mY8vMIB24_6R6q6GaszbSt0,15957
13
+ docling/backend/msword_backend.py,sha256=WcQmRYmpH8o2snGoWGxNRkCtUI3mf2JL3-9CxAfDAJg,19232
14
+ docling/backend/pdf_backend.py,sha256=17Pr8dWsD1C4FYUprrwMM9trDGW-JYLjrcScx1Ul4io,2048
15
+ docling/backend/pypdfium2_backend.py,sha256=QSPfp903ZtSpoNqPmcIek0HmvETrJ1kkwrdxnF5pjS0,9014
16
+ docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
+ docling/backend/xml/pubmed_backend.py,sha256=LMnpowjnxa5SydfNC00Ll840BYraL8dCJu-FfC9iSKk,20447
18
+ docling/backend/xml/uspto_backend.py,sha256=2YsnB-WRARIAaHPL6gxHePP24GQGi-Up2_K8ZapD3k4,70974
19
+ docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
20
+ docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
+ docling/cli/main.py,sha256=NR7NEt8Sf3FE9D7sHpEmABM9mFMTMO5w0VPwYIIvVsk,15481
22
+ docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
23
+ docling/datamodel/base_models.py,sha256=vewP1X99qfAwiUsiC2m8CBDGiQPsGyp_WkKJHYPoYn4,7026
24
+ docling/datamodel/document.py,sha256=R748mLCFai4MeiE8ougQrQVJF_16t3f4CUrrEes5AV0,13202
25
+ docling/datamodel/pipeline_options.py,sha256=GA5LwywfOkcBDvG2LhDHikqDQYlFlUPJa93tPSx-vFw,7820
26
+ docling/datamodel/settings.py,sha256=Sw0rN_f8rdLV1eNvVeKiyET2Oe6oz9jtW3lJzniW9Do,1302
27
+ docling/document_converter.py,sha256=qtYPEkWuMUUGmFko2in38iSHdYrjAFf_GHNoXRRvEVs,12631
28
+ docling/exceptions.py,sha256=-FoP46rFJgz_jn5uDv2V052udEEg8gckk6uhoItchXc,85
29
+ docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
+ docling/models/base_model.py,sha256=H5X-exVaAN-XMTzxpgUc-rwH-D8Uk7-VuZtq2soNGXI,2567
31
+ docling/models/base_ocr_model.py,sha256=YiUMvdjnHw9SHjnfJKT5INrPMoIGEf_Z2OApfl_VRTE,6919
32
+ docling/models/code_formula_model.py,sha256=bOIKJvckZ0QpnDZ-CDiYv-CvuGvaGzJgp2PiYAidKBQ,8422
33
+ docling/models/document_picture_classifier.py,sha256=RLB80ueqWZ86hdXtTKmSynCU13nT-T10vUp2sky9110,6078
34
+ docling/models/ds_glm_model.py,sha256=CkhsP0cEWwm4wb1g3cLFriVGpVtELiUK3REDMkPwAMw,13028
35
+ docling/models/easyocr_model.py,sha256=Kakb20ioBxDmNsIqoGvSSs_vbqAWN3QQNHYtEi-eErg,4990
36
+ docling/models/layout_model.py,sha256=3Fw7OM6g0j7NgItKsQOgFOCd1q6lp1DacN_db7f6QCw,6090
37
+ docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpan0,4502
38
+ docling/models/page_assemble_model.py,sha256=c5KLKwkUIdW0JcDHizWsqrpb5x_3DK28x82Q8o-3VJM,5968
39
+ docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
40
+ docling/models/rapid_ocr_model.py,sha256=LOIvczJs3_db2o8mtrKk-pIXgC-xqWqRLu2cjA3wvy4,4980
41
+ docling/models/table_structure_model.py,sha256=fUpCHthO4Uk3BhA99a85BHBm51fmdE9kfqhAk3WjuBw,9392
42
+ docling/models/tesseract_ocr_cli_model.py,sha256=FP9cnSkSyj6-EETHtabV720Fr3x9K_oBP2UuJi4VUwE,6621
43
+ docling/models/tesseract_ocr_model.py,sha256=N27xjo8aPb5x276wKHkf_6VFwJObfosdHLo5_hCuf94,8055
44
+ docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
45
+ docling/pipeline/base_pipeline.py,sha256=J0ZjtincsJr-BbRgqoQozxIhDWxWFlWaS9CTPwypJFk,8621
46
+ docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
47
+ docling/pipeline/standard_pdf_pipeline.py,sha256=Qefg1JSiFwipypi8TZPJ50WgXTLjwkC0wvYAl02RM2o,10480
48
+ docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
49
+ docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
50
+ docling/utils/accelerator_utils.py,sha256=ZjULCn-qhxqx3frF-rJmAlWdzqgUMxH5utLHbSPev80,1367
51
+ docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
52
+ docling/utils/glm_utils.py,sha256=Nfxdx0W-sl1owYncTeJmZdiPcn-jpTqK8f8TeQlDOMY,11683
53
+ docling/utils/layout_postprocessor.py,sha256=urRzeF9PrKiMBvA6DdHHwyLxG06CMhelgJeV5B1l6l0,24258
54
+ docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
55
+ docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
56
+ docling/utils/visualization.py,sha256=4pn-80fVuE04ken7hUg5Ar47ndRSL9MWBgdHM-1g1zU,2735
57
+ docling-2.16.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
58
+ docling-2.16.0.dist-info/METADATA,sha256=wJgRO2R9Szl69jFE8gj-VGIBpkwwMWPfgytz9nDsT_E,7780
59
+ docling-2.16.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
60
+ docling-2.16.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
61
+ docling-2.16.0.dist-info/RECORD,,
@@ -1,56 +0,0 @@
1
- docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- docling/backend/abstract_backend.py,sha256=-or6kWVV7egQeyIuN-vI0Tr7Q1htalBZSlhgq_G2RdU,1678
4
- docling/backend/asciidoc_backend.py,sha256=kXZxOLk_LvLFVZwnJVVwjmvc3QWZ0iiG7VnwjgtC3hI,14051
5
- docling/backend/docling_parse_backend.py,sha256=cJLkuOmfCtshRrwsv7WWayRNeMQASZv76v3nUHucqgM,7636
6
- docling/backend/docling_parse_v2_backend.py,sha256=-lLsorxhK_Awrql_zXPen2LX0Gt9UvcDLMcmXf7_LKc,8642
7
- docling/backend/html_backend.py,sha256=O8qXaw7MzOIdaxbBcjHieM9Ce4GEdtBj9YW0vpJspuA,15560
8
- docling/backend/md_backend.py,sha256=tmuSCghjor9PqKIiVieCuZ4_t5JEjZMy3cq7u3yTgyU,14032
9
- docling/backend/msexcel_backend.py,sha256=23qUEScqr5GhY06xiqg-eBQ_JlAqO0FkPEmX6554sVA,12040
10
- docling/backend/mspowerpoint_backend.py,sha256=kOGawhcn0BFq4M_C6kW0mY8vMIB24_6R6q6GaszbSt0,15957
11
- docling/backend/msword_backend.py,sha256=K1D_h0ulLA6KQsPe62327cDVkQqV1f7EetCHo66wCKw,19233
12
- docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
13
- docling/backend/pypdfium2_backend.py,sha256=Exb3NBp3x2YSLoNfmXq4NefShgooJXsxTXrJ4JbTzcc,9001
14
- docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
- docling/backend/xml/pubmed_backend.py,sha256=LMnpowjnxa5SydfNC00Ll840BYraL8dCJu-FfC9iSKk,20447
16
- docling/backend/xml/uspto_backend.py,sha256=2YsnB-WRARIAaHPL6gxHePP24GQGi-Up2_K8ZapD3k4,70974
17
- docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
18
- docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
- docling/cli/main.py,sha256=NR7NEt8Sf3FE9D7sHpEmABM9mFMTMO5w0VPwYIIvVsk,15481
20
- docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
- docling/datamodel/base_models.py,sha256=50Jf5zk9c4-zmnOzZLoPBnHQhTX0_OFQzIkKgnKK1o4,6229
22
- docling/datamodel/document.py,sha256=OHM6bm0a-62xnAZ8DFlMHzATmbgNcfMxQoQO2udaW5Q,13071
23
- docling/datamodel/pipeline_options.py,sha256=wKFzw8sAim6emQGsjuS12n7FfpMo8HVNoMOPhkXTkVo,7734
24
- docling/datamodel/settings.py,sha256=Sw0rN_f8rdLV1eNvVeKiyET2Oe6oz9jtW3lJzniW9Do,1302
25
- docling/document_converter.py,sha256=_pk0sHuPXJ14NEutatf5bK2VyNiU5cvYsVbh1HIgrIw,12431
26
- docling/exceptions.py,sha256=-FoP46rFJgz_jn5uDv2V052udEEg8gckk6uhoItchXc,85
27
- docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
- docling/models/base_model.py,sha256=Yq_-FmUhqhE20vXYG3WiQXDRTIPjik1CyuEZ8iYTGAY,701
29
- docling/models/base_ocr_model.py,sha256=YiUMvdjnHw9SHjnfJKT5INrPMoIGEf_Z2OApfl_VRTE,6919
30
- docling/models/ds_glm_model.py,sha256=CkhsP0cEWwm4wb1g3cLFriVGpVtELiUK3REDMkPwAMw,13028
31
- docling/models/easyocr_model.py,sha256=Kakb20ioBxDmNsIqoGvSSs_vbqAWN3QQNHYtEi-eErg,4990
32
- docling/models/layout_model.py,sha256=Xo8sclRTOO_V8Cr4RwuxB67vSWKF0LZ5nJRYU1WI--k,9063
33
- docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpan0,4502
34
- docling/models/page_assemble_model.py,sha256=qdEX0AIb76ZOqJV6O9j-7r67WmuIkUlwbb2PsL7eFK4,7608
35
- docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
36
- docling/models/rapid_ocr_model.py,sha256=LOIvczJs3_db2o8mtrKk-pIXgC-xqWqRLu2cjA3wvy4,4980
37
- docling/models/table_structure_model.py,sha256=fUpCHthO4Uk3BhA99a85BHBm51fmdE9kfqhAk3WjuBw,9392
38
- docling/models/tesseract_ocr_cli_model.py,sha256=aKQBaty4cYu6zG_C5uy6Zm3eeRQo5fxIierbKixa2kc,6622
39
- docling/models/tesseract_ocr_model.py,sha256=RDf6iV1q-oXaGfZXv0bW6SqjHNKQvBUDlUsOkuz0neY,6095
40
- docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
41
- docling/pipeline/base_pipeline.py,sha256=W1HgNp0vPp8vVY0vTX47Xe4m7j0lWT8XroCkTx5PyMw,7949
42
- docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
43
- docling/pipeline/standard_pdf_pipeline.py,sha256=iXjVLy-9q82jrU_0AZTkbz3ccrqz4WiRLYD-epxG5BQ,9174
44
- docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
45
- docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
46
- docling/utils/accelerator_utils.py,sha256=ZjULCn-qhxqx3frF-rJmAlWdzqgUMxH5utLHbSPev80,1367
47
- docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
48
- docling/utils/glm_utils.py,sha256=IB19wToGath97gD3jAA3G_rQSptnZKhQCWLvPUCnkww,11551
49
- docling/utils/layout_postprocessor.py,sha256=urRzeF9PrKiMBvA6DdHHwyLxG06CMhelgJeV5B1l6l0,24258
50
- docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
51
- docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
52
- docling-2.15.1.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
53
- docling-2.15.1.dist-info/METADATA,sha256=6WRzA633us43nw7RHwhX_jwizh2JSpGWxNh0pJq2ZYs,7739
54
- docling-2.15.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
55
- docling-2.15.1.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
56
- docling-2.15.1.dist-info/RECORD,,