docling 2.17.0__py3-none-any.whl → 2.18.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -141,7 +141,9 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
141
141
  with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
142
142
 
143
143
  for i in range(0, conv_res.input.page_count):
144
- conv_res.pages.append(Page(page_no=i))
144
+ start_page, end_page = conv_res.input.limits.page_range
145
+ if (start_page - 1) <= i <= (end_page - 1):
146
+ conv_res.pages.append(Page(page_no=i))
145
147
 
146
148
  try:
147
149
  # Iterate batches of pages (page_batch_size) in the doc
@@ -307,6 +307,10 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
307
307
  current_list = None
308
308
 
309
309
  doc.add_code(text=text, prov=prov)
310
+ elif label == DocItemLabel.FORMULA:
311
+ current_list = None
312
+
313
+ doc.add_text(label=DocItemLabel.FORMULA, text="", orig=text, prov=prov)
310
314
  else:
311
315
  current_list = None
312
316
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.17.0
3
+ Version: 2.18.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -19,6 +19,7 @@ Classifier: Programming Language :: Python :: 3.9
19
19
  Classifier: Programming Language :: Python :: 3.10
20
20
  Classifier: Programming Language :: Python :: 3.11
21
21
  Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Programming Language :: Python :: 3.13
22
23
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
24
  Provides-Extra: ocrmac
24
25
  Provides-Extra: rapidocr
@@ -26,7 +27,7 @@ Provides-Extra: tesserocr
26
27
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
27
28
  Requires-Dist: certifi (>=2024.7.4)
28
29
  Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
29
- Requires-Dist: docling-core[chunking] (>=2.15.1,<3.0.0)
30
+ Requires-Dist: docling-core[chunking] (>=2.17.0,<3.0.0)
30
31
  Requires-Dist: docling-ibm-models (>=3.3.0,<4.0.0)
31
32
  Requires-Dist: docling-parse (>=3.1.0,<4.0.0)
32
33
  Requires-Dist: easyocr (>=1.7,<2.0)
@@ -48,7 +49,8 @@ Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
48
49
  Requires-Dist: rapidocr-onnxruntime (>=1.4.0,<2.0.0) ; (python_version < "3.13") and (extra == "rapidocr")
49
50
  Requires-Dist: requests (>=2.32.2,<3.0.0)
50
51
  Requires-Dist: rtree (>=1.3.0,<2.0.0)
51
- Requires-Dist: scipy (>=1.6.0,<2.0.0)
52
+ Requires-Dist: scipy (>=1.6.0,<1.14.0) ; python_version < "3.10"
53
+ Requires-Dist: scipy (>=1.6.0,<2.0.0) ; python_version >= "3.10"
52
54
  Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
53
55
  Requires-Dist: typer (>=0.12.5,<0.13.0)
54
56
  Project-URL: Repository, https://github.com/DS4SD/docling
@@ -94,6 +96,9 @@ Docling simplifies document processing, parsing diverse formats — including ad
94
96
  ### Coming soon
95
97
 
96
98
  * 📝 Metadata extraction, including title, authors, references & language
99
+ * 📝 Inclusion of Visual Language Models ([SmolDocling](https://huggingface.co/blog/smolervlm#smoldocling))
100
+ * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
101
+ * 📝 Complex chemistry understanding (Molecular structures)
97
102
 
98
103
  ## Installation
99
104
 
@@ -4,13 +4,13 @@ docling/backend/abstract_backend.py,sha256=1lNxzwDTn303aXduPDVmTyXn-5ZIoWMLYqNxA
4
4
  docling/backend/asciidoc_backend.py,sha256=zyHxlG_BvlLwvpdNca3P6aopxOJZw8wbDFkJQQknNXk,14050
5
5
  docling/backend/docling_parse_backend.py,sha256=hEEJibI1oJS0LAnFoIs6gMshS3bCqGtVxHnDNvBGZuA,7649
6
6
  docling/backend/docling_parse_v2_backend.py,sha256=IpwrBrtLGwNRl5AYO-o3NjEfNRsAkuMhzvDt2HXb9Ko,8655
7
- docling/backend/html_backend.py,sha256=DDfQ84VQB4nF_0wgGtbYUA9luVumB5bjjoWHjESa6Tk,15596
7
+ docling/backend/html_backend.py,sha256=YTPLZiEEEuGaP6G62skK3wXJ0KftuqBCl8erNXeJyoE,15893
8
8
  docling/backend/json/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
9
  docling/backend/json/docling_json_backend.py,sha256=LlFMVoZrrCfVwbDuRbNN4Xg96Lujh4xxrTBt9jGhY9I,1984
10
- docling/backend/md_backend.py,sha256=PicGKM2cg4r1lztr46eC4sKbFLvGnqzrEcLTE5fW1zc,14426
10
+ docling/backend/md_backend.py,sha256=d7XAFHzFO9qhrCJA3raWEmZ8WXSYyy3KOE57oMeqKGc,16502
11
11
  docling/backend/msexcel_backend.py,sha256=lyJc4ShJGAN2ZfNTTuhdYTF-44cZsGyn_8Djstp3IEU,12700
12
- docling/backend/mspowerpoint_backend.py,sha256=kOGawhcn0BFq4M_C6kW0mY8vMIB24_6R6q6GaszbSt0,15957
13
- docling/backend/msword_backend.py,sha256=WcQmRYmpH8o2snGoWGxNRkCtUI3mf2JL3-9CxAfDAJg,19232
12
+ docling/backend/mspowerpoint_backend.py,sha256=esAyaaQe17BQFweGAGJHvImKETefY0BpvfpUSECC49w,16424
13
+ docling/backend/msword_backend.py,sha256=0iR1l3eLplPv3CPT7iGwQb50LIVf3C32KZFzwAkARrc,20573
14
14
  docling/backend/pdf_backend.py,sha256=17Pr8dWsD1C4FYUprrwMM9trDGW-JYLjrcScx1Ul4io,2048
15
15
  docling/backend/pypdfium2_backend.py,sha256=QSPfp903ZtSpoNqPmcIek0HmvETrJ1kkwrdxnF5pjS0,9014
16
16
  docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -21,10 +21,10 @@ docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
21
  docling/cli/main.py,sha256=K5C2yQIoM40_W3YU8a7SmneY-hWbNp_JOFPLk0NPcDI,16098
22
22
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
23
23
  docling/datamodel/base_models.py,sha256=vewP1X99qfAwiUsiC2m8CBDGiQPsGyp_WkKJHYPoYn4,7026
24
- docling/datamodel/document.py,sha256=vuY8S9n-_w5UQl-7C_wasrW4bSHPQeAeH4RR-MWrGW4,13315
24
+ docling/datamodel/document.py,sha256=HkmvQKW3QSx3tAqPTnXiJbD_y1EVwR-WE3n6Gq8g1NY,13428
25
25
  docling/datamodel/pipeline_options.py,sha256=f9-VQFgOdahyclGQgH_T8ZYBopkWsF_fbWbxo39ux3g,7888
26
- docling/datamodel/settings.py,sha256=Sw0rN_f8rdLV1eNvVeKiyET2Oe6oz9jtW3lJzniW9Do,1302
27
- docling/document_converter.py,sha256=qtYPEkWuMUUGmFko2in38iSHdYrjAFf_GHNoXRRvEVs,12631
26
+ docling/datamodel/settings.py,sha256=uN9jeXMwx--tJb-DFU7nr77g0Iou13YAVDzsymTvbHg,1759
27
+ docling/document_converter.py,sha256=qaldb7Thqk59RdE-RTGtj1M7l5UzaBdnxIvGoQ7lTeo,12876
28
28
  docling/exceptions.py,sha256=-FoP46rFJgz_jn5uDv2V052udEEg8gckk6uhoItchXc,85
29
29
  docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
30
  docling/models/base_model.py,sha256=H5X-exVaAN-XMTzxpgUc-rwH-D8Uk7-VuZtq2soNGXI,2567
@@ -38,25 +38,25 @@ docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpa
38
38
  docling/models/page_assemble_model.py,sha256=c5KLKwkUIdW0JcDHizWsqrpb5x_3DK28x82Q8o-3VJM,5968
39
39
  docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
40
40
  docling/models/rapid_ocr_model.py,sha256=2HXmurNRPP6qyqn7U5h9NQIs8zi0TMHf56CpcKQk0fU,5038
41
- docling/models/table_structure_model.py,sha256=fUpCHthO4Uk3BhA99a85BHBm51fmdE9kfqhAk3WjuBw,9392
41
+ docling/models/table_structure_model.py,sha256=qZgoBrBh7H-RJGCTtaRGcj79g2WzZiUBTPnHqJZ-bLA,9557
42
42
  docling/models/tesseract_ocr_cli_model.py,sha256=b2Is5x2gZLS6mQWnKe0y7p6UU6hRTHDfoH4D2RQ5mx0,9310
43
43
  docling/models/tesseract_ocr_model.py,sha256=BN85u-4a-xzUY7Iw21Ib8L8kx4mgbDGiUtxBelLiJm8,8513
44
44
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
45
- docling/pipeline/base_pipeline.py,sha256=J0ZjtincsJr-BbRgqoQozxIhDWxWFlWaS9CTPwypJFk,8621
45
+ docling/pipeline/base_pipeline.py,sha256=lK8PQiydWJ9M16kIVL7U1A2iryTRFrN5WSucVo2ohFQ,8757
46
46
  docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
47
47
  docling/pipeline/standard_pdf_pipeline.py,sha256=Qefg1JSiFwipypi8TZPJ50WgXTLjwkC0wvYAl02RM2o,10480
48
48
  docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
49
49
  docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
50
50
  docling/utils/accelerator_utils.py,sha256=ZjULCn-qhxqx3frF-rJmAlWdzqgUMxH5utLHbSPev80,1367
51
51
  docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
52
- docling/utils/glm_utils.py,sha256=Nfxdx0W-sl1owYncTeJmZdiPcn-jpTqK8f8TeQlDOMY,11683
52
+ docling/utils/glm_utils.py,sha256=uyCoFTX9FbS1Ke0aSlkdzGLUt08dZfkgriWadkyLiiA,11856
53
53
  docling/utils/layout_postprocessor.py,sha256=urRzeF9PrKiMBvA6DdHHwyLxG06CMhelgJeV5B1l6l0,24258
54
54
  docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,263
55
55
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
56
56
  docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
57
57
  docling/utils/visualization.py,sha256=4pn-80fVuE04ken7hUg5Ar47ndRSL9MWBgdHM-1g1zU,2735
58
- docling-2.17.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
59
- docling-2.17.0.dist-info/METADATA,sha256=BkpXBck-2EjuYUsn_2aAGftgdbf260baqAb9P8ZixSM,8025
60
- docling-2.17.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
61
- docling-2.17.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
62
- docling-2.17.0.dist-info/RECORD,,
58
+ docling-2.18.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
59
+ docling-2.18.0.dist-info/METADATA,sha256=rBP1Z7m0HMpC-HjR360i2JNuIA9lqknRPjUab1mtVic,8403
60
+ docling-2.18.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
61
+ docling-2.18.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
62
+ docling-2.18.0.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: poetry-core 1.9.0
2
+ Generator: poetry-core 1.9.1
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any