docling 1.19.0__py3-none-any.whl → 1.19.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/models/tesseract_ocr_cli_model.py +2 -2
- {docling-1.19.0.dist-info → docling-1.19.1.dist-info}/METADATA +11 -11
- {docling-1.19.0.dist-info → docling-1.19.1.dist-info}/RECORD +6 -6
- {docling-1.19.0.dist-info → docling-1.19.1.dist-info}/LICENSE +0 -0
- {docling-1.19.0.dist-info → docling-1.19.1.dist-info}/WHEEL +0 -0
- {docling-1.19.0.dist-info → docling-1.19.1.dist-info}/entry_points.txt +0 -0
@@ -1,7 +1,7 @@
|
|
1
1
|
import io
|
2
2
|
import logging
|
3
3
|
import tempfile
|
4
|
-
from subprocess import PIPE, Popen
|
4
|
+
from subprocess import DEVNULL, PIPE, Popen
|
5
5
|
from typing import Iterable, Tuple
|
6
6
|
|
7
7
|
import pandas as pd
|
@@ -81,7 +81,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
81
81
|
cmd += [ifilename, "stdout", "tsv"]
|
82
82
|
_log.info("command: {}".format(" ".join(cmd)))
|
83
83
|
|
84
|
-
proc = Popen(cmd, stdout=PIPE)
|
84
|
+
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
|
85
85
|
output, _ = proc.communicate()
|
86
86
|
|
87
87
|
# _log.info(output)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 1.19.0
|
3
|
+
Version: 1.19.1
|
4
4
|
Summary: Docling PDF conversion package
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -22,12 +22,13 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
22
|
Provides-Extra: tesserocr
|
23
23
|
Requires-Dist: certifi (>=2024.7.4)
|
24
24
|
Requires-Dist: deepsearch-glm (>=0.22.0,<0.23.0)
|
25
|
-
Requires-Dist: docling-core (>=1.
|
25
|
+
Requires-Dist: docling-core (>=1.7.1,<2.0.0)
|
26
26
|
Requires-Dist: docling-ibm-models (>=2.0.0,<3.0.0)
|
27
27
|
Requires-Dist: docling-parse (>=1.4.1,<2.0.0)
|
28
28
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
29
29
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
30
30
|
Requires-Dist: huggingface_hub (>=0.23,<1)
|
31
|
+
Requires-Dist: pandas (>=2.1.4,<3.0.0)
|
31
32
|
Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
|
32
33
|
Requires-Dist: pydantic (>=2.0.0,<3.0.0)
|
33
34
|
Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
|
@@ -335,15 +336,14 @@ from docling_core.transforms.chunker import HierarchicalChunker
|
|
335
336
|
|
336
337
|
doc = DocumentConverter().convert_single("https://arxiv.org/pdf/2206.01062").output
|
337
338
|
chunks = list(HierarchicalChunker().chunk(doc))
|
338
|
-
|
339
|
-
#
|
340
|
-
#
|
341
|
-
#
|
342
|
-
#
|
343
|
-
#
|
344
|
-
#
|
345
|
-
#
|
346
|
-
# > ]
|
339
|
+
print(chunks[0])
|
340
|
+
# ChunkWithMetadata(
|
341
|
+
# path='#/main-text/1',
|
342
|
+
# text='DocLayNet: A Large Human-Annotated Dataset [...]',
|
343
|
+
# page=1,
|
344
|
+
# bbox=[107.30, 672.38, 505.19, 709.08],
|
345
|
+
# [...]
|
346
|
+
# )
|
347
347
|
```
|
348
348
|
|
349
349
|
|
@@ -18,7 +18,7 @@ docling/models/easyocr_model.py,sha256=fmfTvOfqo97n_xYQDPRMKlvMOs9QqgLgSTqwBDrjK
|
|
18
18
|
docling/models/layout_model.py,sha256=5wCohyzGK1p5F5cTRY5QWbW2AI4eevXobJDJdSLhX7k,11272
|
19
19
|
docling/models/page_assemble_model.py,sha256=8eoG2WiFxPxq9TPvM-wkngb2gkr0tdtCRVXg1JcTETo,5550
|
20
20
|
docling/models/table_structure_model.py,sha256=iHJjWdKCpTcH3l_ElMWnC5pt6tkUpIuByed304Fdq9w,6009
|
21
|
-
docling/models/tesseract_ocr_cli_model.py,sha256=
|
21
|
+
docling/models/tesseract_ocr_cli_model.py,sha256=B7zPkpKgpfTTUWXm-_zvw7x0yvzVd85WguawFVDwdqI,5529
|
22
22
|
docling/models/tesseract_ocr_model.py,sha256=PqQv1Hv3GC1FByjegWvjNFw15Jcw-mT25_MvFr3hAHQ,4575
|
23
23
|
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
24
24
|
docling/pipeline/base_model_pipeline.py,sha256=rrMog3EuiR5Gx9OWtfMj24rQvHCrWkxZ3g9OIr7LPSQ,607
|
@@ -27,8 +27,8 @@ docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
27
27
|
docling/utils/export.py,sha256=bKLdbeUcR-rQsGPV1IqJkCHKMCv7X2QOHyxmjNuH3HE,4655
|
28
28
|
docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
|
29
29
|
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
30
|
-
docling-1.19.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
31
|
-
docling-1.19.
|
32
|
-
docling-1.19.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
33
|
-
docling-1.19.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
|
34
|
-
docling-1.19.0.dist-info/RECORD,,
|
30
|
+
docling-1.19.1.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
31
|
+
docling-1.19.1.dist-info/METADATA,sha256=hCQeq3JVB16CfTwtjjwnX5u9bWYjD0CsSbn9h1tZZTM,16800
|
32
|
+
docling-1.19.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
33
|
+
docling-1.19.1.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
|
34
|
+
docling-1.19.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|