docling-2.17.0-py3-none-any.whl → docling-2.18.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/html_backend.py +18 -18
- docling/backend/md_backend.py +88 -35
- docling/backend/mspowerpoint_backend.py +39 -27
- docling/backend/msword_backend.py +172 -130
- docling/datamodel/document.py +2 -0
- docling/datamodel/settings.py +16 -1
- docling/document_converter.py +12 -2
- docling/models/table_structure_model.py +9 -5
- docling/pipeline/base_pipeline.py +3 -1
- docling/utils/glm_utils.py +4 -0
- {docling-2.17.0.dist-info → docling-2.18.0.dist-info}/METADATA +8 -3
- {docling-2.17.0.dist-info → docling-2.18.0.dist-info}/RECORD +15 -15
- {docling-2.17.0.dist-info → docling-2.18.0.dist-info}/WHEEL +1 -1
- {docling-2.17.0.dist-info → docling-2.18.0.dist-info}/LICENSE +0 -0
- {docling-2.17.0.dist-info → docling-2.18.0.dist-info}/entry_points.txt +0 -0
docling/pipeline/base_pipeline.py
CHANGED
@@ -141,7 +141,9 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|
141
141
|
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
|
142
142
|
|
143
143
|
for i in range(0, conv_res.input.page_count):
|
144
|
-
conv_res.pages.append(Page(page_no=i))
|
144
|
+
start_page, end_page = conv_res.input.limits.page_range
|
145
|
+
if (start_page - 1) <= i <= (end_page - 1):
|
146
|
+
conv_res.pages.append(Page(page_no=i))
|
145
147
|
|
146
148
|
try:
|
147
149
|
# Iterate batches of pages (page_batch_size) in the doc
|
docling/utils/glm_utils.py
CHANGED
@@ -307,6 +307,10 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
|
|
307
307
|
current_list = None
|
308
308
|
|
309
309
|
doc.add_code(text=text, prov=prov)
|
310
|
+
elif label == DocItemLabel.FORMULA:
|
311
|
+
current_list = None
|
312
|
+
|
313
|
+
doc.add_text(label=DocItemLabel.FORMULA, text="", orig=text, prov=prov)
|
310
314
|
else:
|
311
315
|
current_list = None
|
312
316
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.17.0
|
3
|
+
Version: 2.18.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -19,6 +19,7 @@ Classifier: Programming Language :: Python :: 3.9
|
|
19
19
|
Classifier: Programming Language :: Python :: 3.10
|
20
20
|
Classifier: Programming Language :: Python :: 3.11
|
21
21
|
Classifier: Programming Language :: Python :: 3.12
|
22
|
+
Classifier: Programming Language :: Python :: 3.13
|
22
23
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
23
24
|
Provides-Extra: ocrmac
|
24
25
|
Provides-Extra: rapidocr
|
@@ -26,7 +27,7 @@ Provides-Extra: tesserocr
|
|
26
27
|
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
27
28
|
Requires-Dist: certifi (>=2024.7.4)
|
28
29
|
Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
|
29
|
-
Requires-Dist: docling-core[chunking] (>=2.
|
30
|
+
Requires-Dist: docling-core[chunking] (>=2.17.0,<3.0.0)
|
30
31
|
Requires-Dist: docling-ibm-models (>=3.3.0,<4.0.0)
|
31
32
|
Requires-Dist: docling-parse (>=3.1.0,<4.0.0)
|
32
33
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
@@ -48,7 +49,8 @@ Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
|
|
48
49
|
Requires-Dist: rapidocr-onnxruntime (>=1.4.0,<2.0.0) ; (python_version < "3.13") and (extra == "rapidocr")
|
49
50
|
Requires-Dist: requests (>=2.32.2,<3.0.0)
|
50
51
|
Requires-Dist: rtree (>=1.3.0,<2.0.0)
|
51
|
-
Requires-Dist: scipy (>=1.6.0,<
|
52
|
+
Requires-Dist: scipy (>=1.6.0,<1.14.0) ; python_version < "3.10"
|
53
|
+
Requires-Dist: scipy (>=1.6.0,<2.0.0) ; python_version >= "3.10"
|
52
54
|
Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
|
53
55
|
Requires-Dist: typer (>=0.12.5,<0.13.0)
|
54
56
|
Project-URL: Repository, https://github.com/DS4SD/docling
|
@@ -94,6 +96,9 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
|
94
96
|
### Coming soon
|
95
97
|
|
96
98
|
* 📝 Metadata extraction, including title, authors, references & language
|
99
|
+
* 📝 Inclusion of Visual Language Models ([SmolDocling](https://huggingface.co/blog/smolervlm#smoldocling))
|
100
|
+
* 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
|
101
|
+
* 📝 Complex chemistry understanding (Molecular structures)
|
97
102
|
|
98
103
|
## Installation
|
99
104
|
|
@@ -4,13 +4,13 @@ docling/backend/abstract_backend.py,sha256=1lNxzwDTn303aXduPDVmTyXn-5ZIoWMLYqNxA
|
|
4
4
|
docling/backend/asciidoc_backend.py,sha256=zyHxlG_BvlLwvpdNca3P6aopxOJZw8wbDFkJQQknNXk,14050
|
5
5
|
docling/backend/docling_parse_backend.py,sha256=hEEJibI1oJS0LAnFoIs6gMshS3bCqGtVxHnDNvBGZuA,7649
|
6
6
|
docling/backend/docling_parse_v2_backend.py,sha256=IpwrBrtLGwNRl5AYO-o3NjEfNRsAkuMhzvDt2HXb9Ko,8655
|
7
|
-
docling/backend/html_backend.py,sha256=
|
7
|
+
docling/backend/html_backend.py,sha256=YTPLZiEEEuGaP6G62skK3wXJ0KftuqBCl8erNXeJyoE,15893
|
8
8
|
docling/backend/json/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
9
9
|
docling/backend/json/docling_json_backend.py,sha256=LlFMVoZrrCfVwbDuRbNN4Xg96Lujh4xxrTBt9jGhY9I,1984
|
10
|
-
docling/backend/md_backend.py,sha256=
|
10
|
+
docling/backend/md_backend.py,sha256=d7XAFHzFO9qhrCJA3raWEmZ8WXSYyy3KOE57oMeqKGc,16502
|
11
11
|
docling/backend/msexcel_backend.py,sha256=lyJc4ShJGAN2ZfNTTuhdYTF-44cZsGyn_8Djstp3IEU,12700
|
12
|
-
docling/backend/mspowerpoint_backend.py,sha256=
|
13
|
-
docling/backend/msword_backend.py,sha256=
|
12
|
+
docling/backend/mspowerpoint_backend.py,sha256=esAyaaQe17BQFweGAGJHvImKETefY0BpvfpUSECC49w,16424
|
13
|
+
docling/backend/msword_backend.py,sha256=0iR1l3eLplPv3CPT7iGwQb50LIVf3C32KZFzwAkARrc,20573
|
14
14
|
docling/backend/pdf_backend.py,sha256=17Pr8dWsD1C4FYUprrwMM9trDGW-JYLjrcScx1Ul4io,2048
|
15
15
|
docling/backend/pypdfium2_backend.py,sha256=QSPfp903ZtSpoNqPmcIek0HmvETrJ1kkwrdxnF5pjS0,9014
|
16
16
|
docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -21,10 +21,10 @@ docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
21
21
|
docling/cli/main.py,sha256=K5C2yQIoM40_W3YU8a7SmneY-hWbNp_JOFPLk0NPcDI,16098
|
22
22
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
23
23
|
docling/datamodel/base_models.py,sha256=vewP1X99qfAwiUsiC2m8CBDGiQPsGyp_WkKJHYPoYn4,7026
|
24
|
-
docling/datamodel/document.py,sha256=
|
24
|
+
docling/datamodel/document.py,sha256=HkmvQKW3QSx3tAqPTnXiJbD_y1EVwR-WE3n6Gq8g1NY,13428
|
25
25
|
docling/datamodel/pipeline_options.py,sha256=f9-VQFgOdahyclGQgH_T8ZYBopkWsF_fbWbxo39ux3g,7888
|
26
|
-
docling/datamodel/settings.py,sha256=
|
27
|
-
docling/document_converter.py,sha256=
|
26
|
+
docling/datamodel/settings.py,sha256=uN9jeXMwx--tJb-DFU7nr77g0Iou13YAVDzsymTvbHg,1759
|
27
|
+
docling/document_converter.py,sha256=qaldb7Thqk59RdE-RTGtj1M7l5UzaBdnxIvGoQ7lTeo,12876
|
28
28
|
docling/exceptions.py,sha256=-FoP46rFJgz_jn5uDv2V052udEEg8gckk6uhoItchXc,85
|
29
29
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
30
30
|
docling/models/base_model.py,sha256=H5X-exVaAN-XMTzxpgUc-rwH-D8Uk7-VuZtq2soNGXI,2567
|
@@ -38,25 +38,25 @@ docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpa
|
|
38
38
|
docling/models/page_assemble_model.py,sha256=c5KLKwkUIdW0JcDHizWsqrpb5x_3DK28x82Q8o-3VJM,5968
|
39
39
|
docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
|
40
40
|
docling/models/rapid_ocr_model.py,sha256=2HXmurNRPP6qyqn7U5h9NQIs8zi0TMHf56CpcKQk0fU,5038
|
41
|
-
docling/models/table_structure_model.py,sha256=
|
41
|
+
docling/models/table_structure_model.py,sha256=qZgoBrBh7H-RJGCTtaRGcj79g2WzZiUBTPnHqJZ-bLA,9557
|
42
42
|
docling/models/tesseract_ocr_cli_model.py,sha256=b2Is5x2gZLS6mQWnKe0y7p6UU6hRTHDfoH4D2RQ5mx0,9310
|
43
43
|
docling/models/tesseract_ocr_model.py,sha256=BN85u-4a-xzUY7Iw21Ib8L8kx4mgbDGiUtxBelLiJm8,8513
|
44
44
|
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
45
|
-
docling/pipeline/base_pipeline.py,sha256=
|
45
|
+
docling/pipeline/base_pipeline.py,sha256=lK8PQiydWJ9M16kIVL7U1A2iryTRFrN5WSucVo2ohFQ,8757
|
46
46
|
docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
|
47
47
|
docling/pipeline/standard_pdf_pipeline.py,sha256=Qefg1JSiFwipypi8TZPJ50WgXTLjwkC0wvYAl02RM2o,10480
|
48
48
|
docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
49
49
|
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
50
50
|
docling/utils/accelerator_utils.py,sha256=ZjULCn-qhxqx3frF-rJmAlWdzqgUMxH5utLHbSPev80,1367
|
51
51
|
docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
|
52
|
-
docling/utils/glm_utils.py,sha256=
|
52
|
+
docling/utils/glm_utils.py,sha256=uyCoFTX9FbS1Ke0aSlkdzGLUt08dZfkgriWadkyLiiA,11856
|
53
53
|
docling/utils/layout_postprocessor.py,sha256=urRzeF9PrKiMBvA6DdHHwyLxG06CMhelgJeV5B1l6l0,24258
|
54
54
|
docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,263
|
55
55
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
56
56
|
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
57
57
|
docling/utils/visualization.py,sha256=4pn-80fVuE04ken7hUg5Ar47ndRSL9MWBgdHM-1g1zU,2735
|
58
|
-
docling-2.
|
59
|
-
docling-2.
|
60
|
-
docling-2.
|
61
|
-
docling-2.
|
62
|
-
docling-2.
|
58
|
+
docling-2.18.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
59
|
+
docling-2.18.0.dist-info/METADATA,sha256=rBP1Z7m0HMpC-HjR360i2JNuIA9lqknRPjUab1mtVic,8403
|
60
|
+
docling-2.18.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
61
|
+
docling-2.18.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
|
62
|
+
docling-2.18.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|