docling 1.5.0__py3-none-any.whl → 1.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -18,6 +18,10 @@ class PdfPageBackend(ABC):
18
18
  def get_text_cells(self) -> Iterable["Cell"]:
19
19
  pass
20
20
 
21
+ @abstractmethod
22
+ def get_bitmap_rects(self, scale: int = 1) -> Iterable["BoundingBox"]:
23
+ pass
24
+
21
25
  @abstractmethod
22
26
  def get_page_image(
23
27
  self, scale: int = 1, cropbox: Optional["BoundingBox"] = None
@@ -3,7 +3,7 @@ import random
3
3
  import time
4
4
  from io import BytesIO
5
5
  from pathlib import Path
6
- from typing import Iterable, List, Optional, Union
6
+ from typing import Iterable, Optional, Union
7
7
 
8
8
  import pypdfium2 as pdfium
9
9
  from docling_parse.docling_parse import pdf_parser
@@ -43,7 +43,7 @@ class DoclingParsePageBackend(PdfPageBackend):
43
43
  r=x1 * scale * page_size.width / parser_width,
44
44
  t=y1 * scale * page_size.height / parser_height,
45
45
  coord_origin=CoordOrigin.BOTTOMLEFT,
46
- ).to_top_left_origin(page_size.height * scale)
46
+ ).to_top_left_origin(page_height=page_size.height * scale)
47
47
 
48
48
  overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
49
49
 
@@ -66,6 +66,12 @@ class DoclingParsePageBackend(PdfPageBackend):
66
66
  for i in range(len(self._dpage["cells"])):
67
67
  rect = self._dpage["cells"][i]["box"]["device"]
68
68
  x0, y0, x1, y1 = rect
69
+
70
+ if x1 < x0:
71
+ x0, x1 = x1, x0
72
+ if y1 < y0:
73
+ y0, y1 = y1, y0
74
+
69
75
  text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
70
76
  cells.append(
71
77
  Cell(
@@ -108,6 +114,20 @@ class DoclingParsePageBackend(PdfPageBackend):
108
114
 
109
115
  return cells
110
116
 
117
+ def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
118
+ AREA_THRESHOLD = 32 * 32
119
+
120
+ for i in range(len(self._dpage["images"])):
121
+ bitmap = self._dpage["images"][i]
122
+ cropbox = BoundingBox.from_tuple(
123
+ bitmap["box"], origin=CoordOrigin.BOTTOMLEFT
124
+ ).to_top_left_origin(self.get_size().height)
125
+
126
+ if cropbox.area() > AREA_THRESHOLD:
127
+ cropbox = cropbox.scaled(scale=scale)
128
+
129
+ yield cropbox
130
+
111
131
  def get_page_image(
112
132
  self, scale: int = 1, cropbox: Optional[BoundingBox] = None
113
133
  ) -> Image.Image:
@@ -173,7 +193,7 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
173
193
  def page_count(self) -> int:
174
194
  return len(self._parser_doc["pages"])
175
195
 
176
- def load_page(self, page_no: int) -> PdfPage:
196
+ def load_page(self, page_no: int) -> DoclingParsePageBackend:
177
197
  return DoclingParsePageBackend(
178
198
  self._pdoc[page_no], self._parser_doc["pages"][page_no]
179
199
  )
@@ -4,6 +4,7 @@ from pathlib import Path
4
4
  from typing import Iterable, List, Optional, Union
5
5
 
6
6
  import pypdfium2 as pdfium
7
+ import pypdfium2.raw as pdfium_c
7
8
  from PIL import Image, ImageDraw
8
9
  from pypdfium2 import PdfPage
9
10
 
@@ -17,6 +18,19 @@ class PyPdfiumPageBackend(PdfPageBackend):
17
18
  self._ppage = page_obj
18
19
  self.text_page = None
19
20
 
21
+ def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
22
+ AREA_THRESHOLD = 32 * 32
23
+ for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
24
+ pos = obj.get_pos()
25
+ cropbox = BoundingBox.from_tuple(
26
+ pos, origin=CoordOrigin.BOTTOMLEFT
27
+ ).to_top_left_origin(page_height=self.get_size().height)
28
+
29
+ if cropbox.area() > AREA_THRESHOLD:
30
+ cropbox = cropbox.scaled(scale=scale)
31
+
32
+ yield cropbox
33
+
20
34
  def get_text_in_rect(self, bbox: BoundingBox) -> str:
21
35
  if not self.text_page:
22
36
  self.text_page = self._ppage.get_textpage()
@@ -208,7 +222,7 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend):
208
222
  def page_count(self) -> int:
209
223
  return len(self._pdoc)
210
224
 
211
- def load_page(self, page_no: int) -> PdfPage:
225
+ def load_page(self, page_no: int) -> PyPdfiumPageBackend:
212
226
  return PyPdfiumPageBackend(self._pdoc[page_no])
213
227
 
214
228
  def is_valid(self) -> bool:
@@ -68,13 +68,21 @@ class BoundingBox(BaseModel):
68
68
  @classmethod
69
69
  def from_tuple(cls, coord: Tuple[float], origin: CoordOrigin):
70
70
  if origin == CoordOrigin.TOPLEFT:
71
- return BoundingBox(
72
- l=coord[0], t=coord[1], r=coord[2], b=coord[3], coord_origin=origin
73
- )
71
+ l, t, r, b = coord[0], coord[1], coord[2], coord[3]
72
+ if r < l:
73
+ l, r = r, l
74
+ if b < t:
75
+ b, t = t, b
76
+
77
+ return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
74
78
  elif origin == CoordOrigin.BOTTOMLEFT:
75
- return BoundingBox(
76
- l=coord[0], b=coord[1], r=coord[2], t=coord[3], coord_origin=origin
77
- )
79
+ l, b, r, t = coord[0], coord[1], coord[2], coord[3]
80
+ if r < l:
81
+ l, r = r, l
82
+ if b > t:
83
+ b, t = t, b
84
+
85
+ return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
78
86
 
79
87
  def area(self) -> float:
80
88
  return (self.r - self.l) * (self.b - self.t)
@@ -280,7 +288,7 @@ class TableStructureOptions(BaseModel):
280
288
 
281
289
  class PipelineOptions(BaseModel):
282
290
  do_table_structure: bool = True # True: perform table structure extraction
283
- do_ocr: bool = False # True: perform OCR, replace programmatic PDF text
291
+ do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
284
292
 
285
293
  table_structure_options: TableStructureOptions = TableStructureOptions()
286
294
 
@@ -35,8 +35,6 @@ _log = logging.getLogger(__name__)
35
35
 
36
36
 
37
37
  class DocumentConverter:
38
- _layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
39
- _table_model_path = "model_artifacts/tableformer"
40
38
  _default_download_filename = "file.pdf"
41
39
 
42
40
  def __init__(
@@ -0,0 +1,124 @@
1
+ import copy
2
+ import logging
3
+ from abc import abstractmethod
4
+ from typing import Iterable, List, Tuple
5
+
6
+ import numpy
7
+ import numpy as np
8
+ from PIL import Image, ImageDraw
9
+ from rtree import index
10
+ from scipy.ndimage import find_objects, label
11
+
12
+ from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
13
+
14
+ _log = logging.getLogger(__name__)
15
+
16
+
17
+ class BaseOcrModel:
18
+ def __init__(self, config):
19
+ self.config = config
20
+ self.enabled = config["enabled"]
21
+
22
+ # Computes the optimum amount and coordinates of rectangles to OCR on a given page
23
+ def get_ocr_rects(self, page: Page) -> Tuple[bool, List[BoundingBox]]:
24
+ BITMAP_COVERAGE_TRESHOLD = 0.75
25
+
26
+ def find_ocr_rects(size, bitmap_rects):
27
+ image = Image.new(
28
+ "1", (round(size.width), round(size.height))
29
+ ) # '1' mode is binary
30
+
31
+ # Draw all bitmap rects into a binary image
32
+ draw = ImageDraw.Draw(image)
33
+ for rect in bitmap_rects:
34
+ x0, y0, x1, y1 = rect.as_tuple()
35
+ x0, y0, x1, y1 = round(x0), round(y0), round(x1), round(y1)
36
+ draw.rectangle([(x0, y0), (x1, y1)], fill=1)
37
+
38
+ np_image = np.array(image)
39
+
40
+ # Find the connected components
41
+ labeled_image, num_features = label(
42
+ np_image > 0
43
+ ) # Label black (0 value) regions
44
+
45
+ # Find enclosing bounding boxes for each connected component.
46
+ slices = find_objects(labeled_image)
47
+ bounding_boxes = [
48
+ BoundingBox(
49
+ l=slc[1].start,
50
+ t=slc[0].start,
51
+ r=slc[1].stop - 1,
52
+ b=slc[0].stop - 1,
53
+ coord_origin=CoordOrigin.TOPLEFT,
54
+ )
55
+ for slc in slices
56
+ ]
57
+
58
+ # Compute area fraction on page covered by bitmaps
59
+ area_frac = np.sum(np_image > 0) / (size.width * size.height)
60
+
61
+ return (area_frac, bounding_boxes) # fraction covered # boxes
62
+
63
+ bitmap_rects = page._backend.get_bitmap_rects()
64
+ coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
65
+
66
+ # return full-page rectangle if sufficiently covered with bitmaps
67
+ if coverage > BITMAP_COVERAGE_TRESHOLD:
68
+ return [
69
+ BoundingBox(
70
+ l=0,
71
+ t=0,
72
+ r=page.size.width,
73
+ b=page.size.height,
74
+ coord_origin=CoordOrigin.TOPLEFT,
75
+ )
76
+ ]
77
+ # return individual rectangles if the bitmap coverage is smaller
78
+ elif coverage < BITMAP_COVERAGE_TRESHOLD:
79
+ return ocr_rects
80
+
81
+ # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
82
+ def filter_ocr_cells(self, ocr_cells, programmatic_cells):
83
+ # Create R-tree index for programmatic cells
84
+ p = index.Property()
85
+ p.dimension = 2
86
+ idx = index.Index(properties=p)
87
+ for i, cell in enumerate(programmatic_cells):
88
+ idx.insert(i, cell.bbox.as_tuple())
89
+
90
+ def is_overlapping_with_existing_cells(ocr_cell):
91
+ # Query the R-tree to get overlapping rectangles
92
+ possible_matches_index = list(idx.intersection(ocr_cell.bbox.as_tuple()))
93
+
94
+ return (
95
+ len(possible_matches_index) > 0
96
+ ) # this is a weak criterion but it works.
97
+
98
+ filtered_ocr_cells = [
99
+ rect for rect in ocr_cells if not is_overlapping_with_existing_cells(rect)
100
+ ]
101
+ return filtered_ocr_cells
102
+
103
+ def draw_ocr_rects_and_cells(self, page, ocr_rects):
104
+ image = copy.deepcopy(page.image)
105
+ draw = ImageDraw.Draw(image, "RGBA")
106
+
107
+ # Draw OCR rectangles as yellow filled rect
108
+ for rect in ocr_rects:
109
+ x0, y0, x1, y1 = rect.as_tuple()
110
+ shade_color = (255, 255, 0, 40) # transparent yellow
111
+ draw.rectangle([(x0, y0), (x1, y1)], fill=shade_color, outline=None)
112
+
113
+ # Draw OCR and programmatic cells
114
+ for tc in page.cells:
115
+ x0, y0, x1, y1 = tc.bbox.as_tuple()
116
+ color = "red"
117
+ if isinstance(tc, OcrCell):
118
+ color = "magenta"
119
+ draw.rectangle([(x0, y0), (x1, y1)], outline=color)
120
+ image.show()
121
+
122
+ @abstractmethod
123
+ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
124
+ pass
@@ -1,20 +1,18 @@
1
- import copy
2
1
  import logging
3
- import random
4
2
  from typing import Iterable
5
3
 
6
4
  import numpy
7
- from PIL import ImageDraw
8
5
 
9
6
  from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
7
+ from docling.models.base_ocr_model import BaseOcrModel
10
8
 
11
9
  _log = logging.getLogger(__name__)
12
10
 
13
11
 
14
- class EasyOcrModel:
12
+ class EasyOcrModel(BaseOcrModel):
15
13
  def __init__(self, config):
16
- self.config = config
17
- self.enabled = config["enabled"]
14
+ super().__init__(config)
15
+
18
16
  self.scale = 3 # multiplier for 72 dpi == 216 dpi.
19
17
 
20
18
  if self.enabled:
@@ -29,49 +27,44 @@ class EasyOcrModel:
29
27
  return
30
28
 
31
29
  for page in page_batch:
32
- # rects = page._fpage.
33
- high_res_image = page.get_image(scale=self.scale)
34
- im = numpy.array(high_res_image)
35
- result = self.reader.readtext(im)
36
-
37
- del high_res_image
38
- del im
39
-
40
- cells = [
41
- OcrCell(
42
- id=ix,
43
- text=line[1],
44
- confidence=line[2],
45
- bbox=BoundingBox.from_tuple(
46
- coord=(
47
- line[0][0][0] / self.scale,
48
- line[0][0][1] / self.scale,
49
- line[0][2][0] / self.scale,
50
- line[0][2][1] / self.scale,
51
- ),
52
- origin=CoordOrigin.TOPLEFT,
53
- ),
30
+ ocr_rects = self.get_ocr_rects(page)
31
+
32
+ all_ocr_cells = []
33
+ for ocr_rect in ocr_rects:
34
+ high_res_image = page._backend.get_page_image(
35
+ scale=self.scale, cropbox=ocr_rect
54
36
  )
55
- for ix, line in enumerate(result)
56
- ]
37
+ im = numpy.array(high_res_image)
38
+ result = self.reader.readtext(im)
39
+
40
+ del high_res_image
41
+ del im
42
+
43
+ cells = [
44
+ OcrCell(
45
+ id=ix,
46
+ text=line[1],
47
+ confidence=line[2],
48
+ bbox=BoundingBox.from_tuple(
49
+ coord=(
50
+ (line[0][0][0] / self.scale) + ocr_rect.l,
51
+ (line[0][0][1] / self.scale) + ocr_rect.t,
52
+ (line[0][2][0] / self.scale) + ocr_rect.l,
53
+ (line[0][2][1] / self.scale) + ocr_rect.t,
54
+ ),
55
+ origin=CoordOrigin.TOPLEFT,
56
+ ),
57
+ )
58
+ for ix, line in enumerate(result)
59
+ ]
60
+ all_ocr_cells.extend(cells)
57
61
 
58
- page.cells = cells # For now, just overwrites all digital cells.
62
+ ## Remove OCR cells which overlap with programmatic cells.
63
+ filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
59
64
 
60
- # DEBUG code:
61
- def draw_clusters_and_cells():
62
- image = copy.deepcopy(page.image)
63
- draw = ImageDraw.Draw(image)
64
-
65
- cell_color = (
66
- random.randint(30, 140),
67
- random.randint(30, 140),
68
- random.randint(30, 140),
69
- )
70
- for tc in cells:
71
- x0, y0, x1, y1 = tc.bbox.as_tuple()
72
- draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
73
- image.show()
65
+ page.cells.extend(filtered_ocr_cells)
74
66
 
75
- # draw_clusters_and_cells()
67
+ # DEBUG code:
68
+ # self.draw_ocr_rects_and_cells(page, ocr_rects)
76
69
 
77
70
  yield page
@@ -1,5 +1,4 @@
1
1
  import copy
2
- import random
3
2
  from typing import Iterable, List
4
3
 
5
4
  import numpy
@@ -1,4 +1,3 @@
1
- from abc import abstractmethod
2
1
  from pathlib import Path
3
2
  from typing import Iterable
4
3
 
@@ -1,10 +1,8 @@
1
1
  from pathlib import Path
2
- from typing import Iterable
3
2
 
4
- from docling.datamodel.base_models import Page, PipelineOptions
3
+ from docling.datamodel.base_models import PipelineOptions
5
4
  from docling.models.easyocr_model import EasyOcrModel
6
5
  from docling.models.layout_model import LayoutModel
7
- from docling.models.page_assemble_model import PageAssembleModel
8
6
  from docling.models.table_structure_model import TableStructureModel
9
7
  from docling.pipeline.base_model_pipeline import BaseModelPipeline
10
8
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 1.5.0
3
+ Version: 1.6.1
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -19,20 +19,21 @@ Classifier: Programming Language :: Python :: 3.10
19
19
  Classifier: Programming Language :: Python :: 3.11
20
20
  Classifier: Programming Language :: Python :: 3.12
21
21
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
- Provides-Extra: easyocr
23
22
  Provides-Extra: ocr
24
23
  Requires-Dist: certifi (>=2024.7.4)
25
24
  Requires-Dist: deepsearch-glm (>=0.19.0,<1)
26
25
  Requires-Dist: docling-core (>=1.1.2,<2.0.0)
27
- Requires-Dist: docling-ibm-models (>=1.1.1,<2.0.0)
26
+ Requires-Dist: docling-ibm-models (>=1.1.2,<2.0.0)
28
27
  Requires-Dist: docling-parse (>=0.2.0,<0.3.0)
29
- Requires-Dist: easyocr (>=1.7,<2.0) ; extra == "easyocr" or extra == "ocr"
28
+ Requires-Dist: easyocr (>=1.7,<2.0) ; extra == "ocr"
30
29
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
31
30
  Requires-Dist: huggingface_hub (>=0.23,<1)
32
31
  Requires-Dist: pydantic (>=2.0.0,<3.0.0)
33
32
  Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
34
33
  Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
35
34
  Requires-Dist: requests (>=2.32.3,<3.0.0)
35
+ Requires-Dist: rtree (>=1.3.0,<2.0.0)
36
+ Requires-Dist: scipy (>=1.14.1,<2.0.0)
36
37
  Project-URL: Repository, https://github.com/DS4SD/docling
37
38
  Description-Content-Type: text/markdown
38
39
 
@@ -0,0 +1,27 @@
1
+ docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ docling/backend/abstract_backend.py,sha256=ZfEHaBPGM1cmqrhaEoU3MHhnHU11NhOnhtFEIbVMYDo,1221
4
+ docling/backend/docling_parse_backend.py,sha256=TN7Ln3Lkc8k0v6HzxA2iUGc8f2iqMw0I-3eryLQkpdw,6924
5
+ docling/backend/pypdfium2_backend.py,sha256=xUiIYgd7i22YDx4-W2hfPUaQFszW0gcT6pavG5qZ8LE,8062
6
+ docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ docling/datamodel/base_models.py,sha256=5VHit5h7OleKnbhvy-sWDxQLizEdNrGUBrypyzwHyAE,8604
8
+ docling/datamodel/document.py,sha256=Dgi9pSwXCgIoR26MKiRDiVMyMaFKdvGSKq2Fm5Lef9M,13173
9
+ docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
10
+ docling/document_converter.py,sha256=UFSELvUSWsr8s0VByu4lNuzu7bn7zZauJTL3FTSLSBg,10371
11
+ docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
+ docling/models/base_ocr_model.py,sha256=Ipl82a3AV2OsgMQSMEMpnWJ6MXcmyIQzmp52PmTaB0g,4465
13
+ docling/models/ds_glm_model.py,sha256=wmb--2JKFQby-kvidw6PyM8wURPXYPQ_Z_eKKCBAdYQ,3192
14
+ docling/models/easyocr_model.py,sha256=ABIqALvtNNrDQ47fXaZ0lDFhOwKsYGUUlAPnIsFZgZA,2232
15
+ docling/models/layout_model.py,sha256=ZFmaLXlRWUfsT1pJCiYVxhQFrBBsiz6Aw0m9GM3UvVM,11249
16
+ docling/models/page_assemble_model.py,sha256=8eoG2WiFxPxq9TPvM-wkngb2gkr0tdtCRVXg1JcTETo,5550
17
+ docling/models/table_structure_model.py,sha256=5jzTlpM-GdCSq4l0vD1W6aSPTJXeTcXEnNuPxnw-DlA,5437
18
+ docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
+ docling/pipeline/base_model_pipeline.py,sha256=AC5NTR0xLy5JIZqsTINkKEHeCPqpyvJpuE_bcnZhyvI,529
20
+ docling/pipeline/standard_model_pipeline.py,sha256=UTjyaEXvz9htYZz-IMTkn11cZwNjgvo_Fl2dfBVnRQs,1442
21
+ docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
+ docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
23
+ docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
24
+ docling-1.6.1.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
25
+ docling-1.6.1.dist-info/METADATA,sha256=5ML-S0PmaQqA1SMYhaZrNIL3RzU6FcwfAnzXprKf6Oc,7266
26
+ docling-1.6.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
27
+ docling-1.6.1.dist-info/RECORD,,
@@ -1,26 +0,0 @@
1
- docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- docling/backend/abstract_backend.py,sha256=swwmXzNueZSHqEOvw4j-IFhP2OUJhBeB--gV7NtzKgo,1112
4
- docling/backend/docling_parse_backend.py,sha256=-bIjYJ-80R2SArAEw_lAyzgW5_BFEoX83n1oBMmUGF4,6284
5
- docling/backend/pypdfium2_backend.py,sha256=3Qeeal8z6DunUe4S10Z2TXrdeucanCpa8evt6SQtpKQ,7496
6
- docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- docling/datamodel/base_models.py,sha256=uOq0zjUS60aIkROREiypp3Jn1yqQTlWEf34jXTT43ls,8391
8
- docling/datamodel/document.py,sha256=Dgi9pSwXCgIoR26MKiRDiVMyMaFKdvGSKq2Fm5Lef9M,13173
9
- docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
10
- docling/document_converter.py,sha256=r9z48VjL_hkq-rbAgyZ135njzUGBJ5AnhEH6-1zfyCA,10490
11
- docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
- docling/models/ds_glm_model.py,sha256=wmb--2JKFQby-kvidw6PyM8wURPXYPQ_Z_eKKCBAdYQ,3192
13
- docling/models/easyocr_model.py,sha256=Y-RWolIFE3By6gk8dnb2qFy7Cr9qcHs6eo65fWPT0Nc,2276
14
- docling/models/layout_model.py,sha256=ZFmaLXlRWUfsT1pJCiYVxhQFrBBsiz6Aw0m9GM3UvVM,11249
15
- docling/models/page_assemble_model.py,sha256=8eoG2WiFxPxq9TPvM-wkngb2gkr0tdtCRVXg1JcTETo,5550
16
- docling/models/table_structure_model.py,sha256=lKsodvfZaGwxOHp-CbRW5nzCKZYMwf770h0Ka6Bdbgw,5451
17
- docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
- docling/pipeline/base_model_pipeline.py,sha256=ozHdJak0yQAxQf7pQN_C480vI35A2e5KL5Qq1xSkq5c,560
19
- docling/pipeline/standard_model_pipeline.py,sha256=UTwodKUKrisLoVcntbNUBDhjzRyFvpdUvyVw-gNmBlM,1541
20
- docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
- docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
22
- docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
23
- docling-1.5.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
24
- docling-1.5.0.dist-info/METADATA,sha256=jWcjsrdfYcpeYFCRQ1h5C1b8MyaKsJWyUhGheXQEGvY,7235
25
- docling-1.5.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
26
- docling-1.5.0.dist-info/RECORD,,