docling 1.1.2__tar.gz → 2.64.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185) hide show
  1. {docling-1.1.2 → docling-2.64.1}/LICENSE +1 -1
  2. docling-2.64.1/PKG-INFO +237 -0
  3. docling-2.64.1/README.md +161 -0
  4. docling-2.64.1/docling/backend/abstract_backend.py +84 -0
  5. docling-2.64.1/docling/backend/asciidoc_backend.py +443 -0
  6. docling-2.64.1/docling/backend/csv_backend.py +125 -0
  7. docling-2.64.1/docling/backend/docling_parse_backend.py +237 -0
  8. docling-2.64.1/docling/backend/docling_parse_v2_backend.py +276 -0
  9. docling-2.64.1/docling/backend/docling_parse_v4_backend.py +260 -0
  10. docling-2.64.1/docling/backend/docx/drawingml/utils.py +131 -0
  11. docling-2.64.1/docling/backend/docx/latex/latex_dict.py +274 -0
  12. docling-2.64.1/docling/backend/docx/latex/omml.py +459 -0
  13. docling-2.64.1/docling/backend/html_backend.py +1499 -0
  14. docling-2.64.1/docling/backend/image_backend.py +188 -0
  15. docling-2.64.1/docling/backend/json/docling_json_backend.py +58 -0
  16. docling-2.64.1/docling/backend/md_backend.py +614 -0
  17. docling-2.64.1/docling/backend/mets_gbs_backend.py +399 -0
  18. docling-2.64.1/docling/backend/msexcel_backend.py +686 -0
  19. docling-2.64.1/docling/backend/mspowerpoint_backend.py +398 -0
  20. docling-2.64.1/docling/backend/msword_backend.py +1646 -0
  21. docling-2.64.1/docling/backend/noop_backend.py +51 -0
  22. docling-2.64.1/docling/backend/pdf_backend.py +82 -0
  23. docling-2.64.1/docling/backend/pypdfium2_backend.py +408 -0
  24. docling-2.64.1/docling/backend/webvtt_backend.py +572 -0
  25. docling-2.64.1/docling/backend/xml/jats_backend.py +819 -0
  26. docling-2.64.1/docling/backend/xml/uspto_backend.py +1905 -0
  27. docling-2.64.1/docling/chunking/__init__.py +12 -0
  28. docling-2.64.1/docling/cli/__init__.py +0 -0
  29. docling-2.64.1/docling/cli/main.py +893 -0
  30. docling-2.64.1/docling/cli/models.py +196 -0
  31. docling-2.64.1/docling/cli/tools.py +17 -0
  32. docling-2.64.1/docling/datamodel/__init__.py +0 -0
  33. docling-2.64.1/docling/datamodel/accelerator_options.py +68 -0
  34. docling-2.64.1/docling/datamodel/asr_model_specs.py +494 -0
  35. docling-2.64.1/docling/datamodel/backend_options.py +96 -0
  36. docling-2.64.1/docling/datamodel/base_models.py +492 -0
  37. docling-2.64.1/docling/datamodel/document.py +699 -0
  38. docling-2.64.1/docling/datamodel/extraction.py +39 -0
  39. docling-2.64.1/docling/datamodel/layout_model_specs.py +90 -0
  40. docling-2.64.1/docling/datamodel/pipeline_options.py +397 -0
  41. docling-2.64.1/docling/datamodel/pipeline_options_asr_model.py +77 -0
  42. docling-2.64.1/docling/datamodel/pipeline_options_vlm_model.py +134 -0
  43. docling-2.64.1/docling/datamodel/settings.py +65 -0
  44. docling-2.64.1/docling/datamodel/vlm_model_specs.py +314 -0
  45. docling-2.64.1/docling/document_converter.py +456 -0
  46. docling-2.64.1/docling/document_extractor.py +327 -0
  47. docling-2.64.1/docling/exceptions.py +10 -0
  48. docling-2.64.1/docling/experimental/__init__.py +5 -0
  49. docling-2.64.1/docling/experimental/datamodel/__init__.py +1 -0
  50. docling-2.64.1/docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  51. docling-2.64.1/docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  52. docling-2.64.1/docling/experimental/models/__init__.py +3 -0
  53. docling-2.64.1/docling/experimental/models/table_crops_layout_model.py +114 -0
  54. docling-2.64.1/docling/experimental/pipeline/__init__.py +1 -0
  55. docling-2.64.1/docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  56. docling-2.64.1/docling/models/__init__.py +0 -0
  57. docling-2.64.1/docling/models/api_vlm_model.py +180 -0
  58. docling-2.64.1/docling/models/auto_ocr_model.py +132 -0
  59. docling-2.64.1/docling/models/base_layout_model.py +39 -0
  60. docling-2.64.1/docling/models/base_model.py +230 -0
  61. docling-2.64.1/docling/models/base_ocr_model.py +241 -0
  62. docling-2.64.1/docling/models/base_table_model.py +45 -0
  63. docling-2.64.1/docling/models/code_formula_model.py +337 -0
  64. docling-2.64.1/docling/models/document_picture_classifier.py +185 -0
  65. docling-2.64.1/docling/models/easyocr_model.py +200 -0
  66. docling-2.64.1/docling/models/factories/__init__.py +47 -0
  67. docling-2.64.1/docling/models/factories/base_factory.py +122 -0
  68. docling-2.64.1/docling/models/factories/layout_factory.py +7 -0
  69. docling-2.64.1/docling/models/factories/ocr_factory.py +11 -0
  70. docling-2.64.1/docling/models/factories/picture_description_factory.py +11 -0
  71. docling-2.64.1/docling/models/factories/table_factory.py +7 -0
  72. docling-2.64.1/docling/models/layout_model.py +249 -0
  73. docling-2.64.1/docling/models/ocr_mac_model.py +145 -0
  74. docling-2.64.1/docling/models/page_assemble_model.py +156 -0
  75. docling-2.64.1/docling/models/page_preprocessing_model.py +145 -0
  76. docling-2.64.1/docling/models/picture_description_api_model.py +66 -0
  77. docling-2.64.1/docling/models/picture_description_base_model.py +91 -0
  78. docling-2.64.1/docling/models/picture_description_vlm_model.py +120 -0
  79. docling-2.64.1/docling/models/plugins/__init__.py +0 -0
  80. docling-2.64.1/docling/models/plugins/defaults.py +54 -0
  81. docling-2.64.1/docling/models/rapid_ocr_model.py +328 -0
  82. docling-2.64.1/docling/models/readingorder_model.py +431 -0
  83. docling-2.64.1/docling/models/table_structure_model.py +305 -0
  84. docling-2.64.1/docling/models/tesseract_ocr_cli_model.py +331 -0
  85. docling-2.64.1/docling/models/tesseract_ocr_model.py +262 -0
  86. docling-2.64.1/docling/models/utils/__init__.py +0 -0
  87. docling-2.64.1/docling/models/utils/generation_utils.py +157 -0
  88. docling-2.64.1/docling/models/utils/hf_model_download.py +45 -0
  89. docling-2.64.1/docling/models/vlm_models_inline/__init__.py +1 -0
  90. docling-2.64.1/docling/models/vlm_models_inline/hf_transformers_model.py +391 -0
  91. docling-2.64.1/docling/models/vlm_models_inline/mlx_model.py +330 -0
  92. docling-2.64.1/docling/models/vlm_models_inline/nuextract_transformers_model.py +305 -0
  93. docling-2.64.1/docling/models/vlm_models_inline/vllm_model.py +344 -0
  94. docling-2.64.1/docling/pipeline/__init__.py +0 -0
  95. docling-2.64.1/docling/pipeline/asr_pipeline.py +431 -0
  96. docling-2.64.1/docling/pipeline/base_extraction_pipeline.py +72 -0
  97. docling-2.64.1/docling/pipeline/base_pipeline.py +326 -0
  98. docling-2.64.1/docling/pipeline/extraction_vlm_pipeline.py +207 -0
  99. docling-2.64.1/docling/pipeline/legacy_standard_pdf_pipeline.py +253 -0
  100. docling-2.64.1/docling/pipeline/simple_pipeline.py +55 -0
  101. docling-2.64.1/docling/pipeline/standard_pdf_pipeline.py +843 -0
  102. docling-2.64.1/docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  103. docling-2.64.1/docling/pipeline/vlm_pipeline.py +411 -0
  104. docling-2.64.1/docling/py.typed +1 -0
  105. docling-2.64.1/docling/utils/__init__.py +0 -0
  106. docling-2.64.1/docling/utils/accelerator_utils.py +83 -0
  107. docling-2.64.1/docling/utils/api_image_request.py +205 -0
  108. docling-2.64.1/docling/utils/export.py +146 -0
  109. docling-2.64.1/docling/utils/glm_utils.py +361 -0
  110. docling-2.64.1/docling/utils/layout_postprocessor.py +683 -0
  111. docling-2.64.1/docling/utils/locks.py +3 -0
  112. docling-2.64.1/docling/utils/model_downloader.py +158 -0
  113. docling-2.64.1/docling/utils/ocr_utils.py +69 -0
  114. docling-2.64.1/docling/utils/orientation.py +65 -0
  115. docling-2.64.1/docling/utils/profiling.py +62 -0
  116. {docling-1.1.2 → docling-2.64.1}/docling/utils/utils.py +27 -3
  117. docling-2.64.1/docling/utils/visualization.py +85 -0
  118. docling-2.64.1/docling.egg-info/PKG-INFO +237 -0
  119. docling-2.64.1/docling.egg-info/SOURCES.txt +166 -0
  120. docling-2.64.1/docling.egg-info/dependency_links.txt +1 -0
  121. docling-2.64.1/docling.egg-info/entry_points.txt +6 -0
  122. docling-2.64.1/docling.egg-info/requires.txt +66 -0
  123. docling-2.64.1/docling.egg-info/top_level.txt +1 -0
  124. docling-2.64.1/pyproject.toml +283 -0
  125. docling-2.64.1/setup.cfg +4 -0
  126. docling-2.64.1/tests/test_asr_mlx_whisper.py +340 -0
  127. docling-2.64.1/tests/test_asr_pipeline.py +404 -0
  128. docling-2.64.1/tests/test_backend_asciidoc.py +72 -0
  129. docling-2.64.1/tests/test_backend_csv.py +87 -0
  130. docling-2.64.1/tests/test_backend_docling_json.py +58 -0
  131. docling-2.64.1/tests/test_backend_docling_parse.py +77 -0
  132. docling-2.64.1/tests/test_backend_docling_parse_v2.py +76 -0
  133. docling-2.64.1/tests/test_backend_docling_parse_v4.py +93 -0
  134. docling-2.64.1/tests/test_backend_html.py +561 -0
  135. docling-2.64.1/tests/test_backend_image_native.py +218 -0
  136. docling-2.64.1/tests/test_backend_jats.py +62 -0
  137. docling-2.64.1/tests/test_backend_markdown.py +111 -0
  138. docling-2.64.1/tests/test_backend_mets_gbs.py +77 -0
  139. docling-2.64.1/tests/test_backend_msexcel.py +314 -0
  140. docling-2.64.1/tests/test_backend_msword.py +239 -0
  141. docling-2.64.1/tests/test_backend_patent_uspto.py +466 -0
  142. docling-2.64.1/tests/test_backend_pdfium.py +109 -0
  143. docling-2.64.1/tests/test_backend_pptx.py +55 -0
  144. docling-2.64.1/tests/test_backend_vtt.py +232 -0
  145. docling-2.64.1/tests/test_backend_webp.py +86 -0
  146. docling-2.64.1/tests/test_cli.py +92 -0
  147. docling-2.64.1/tests/test_code_formula.py +84 -0
  148. docling-2.64.1/tests/test_conversion_result_json.py +44 -0
  149. docling-2.64.1/tests/test_data_gen_flag.py +9 -0
  150. docling-2.64.1/tests/test_document_picture_classifier.py +79 -0
  151. docling-2.64.1/tests/test_e2e_conversion.py +63 -0
  152. docling-2.64.1/tests/test_e2e_ocr_conversion.py +117 -0
  153. docling-2.64.1/tests/test_extraction.py +108 -0
  154. docling-2.64.1/tests/test_input_doc.py +282 -0
  155. docling-2.64.1/tests/test_interfaces.py +138 -0
  156. docling-2.64.1/tests/test_invalid_input.py +44 -0
  157. docling-2.64.1/tests/test_legacy_format_transform.py +53 -0
  158. docling-2.64.1/tests/test_ocr_utils.py +80 -0
  159. docling-2.64.1/tests/test_options.py +200 -0
  160. docling-2.64.1/tests/test_pdf_password.py +63 -0
  161. docling-2.64.1/tests/test_settings_load.py +29 -0
  162. docling-2.64.1/tests/test_threaded_pipeline.py +198 -0
  163. docling-1.1.2/PKG-INFO +0 -183
  164. docling-1.1.2/README.md +0 -146
  165. docling-1.1.2/docling/backend/abstract_backend.py +0 -55
  166. docling-1.1.2/docling/backend/pypdfium2_backend.py +0 -217
  167. docling-1.1.2/docling/datamodel/base_models.py +0 -267
  168. docling-1.1.2/docling/datamodel/document.py +0 -348
  169. docling-1.1.2/docling/datamodel/settings.py +0 -32
  170. docling-1.1.2/docling/document_converter.py +0 -263
  171. docling-1.1.2/docling/models/ds_glm_model.py +0 -82
  172. docling-1.1.2/docling/models/easyocr_model.py +0 -77
  173. docling-1.1.2/docling/models/layout_model.py +0 -318
  174. docling-1.1.2/docling/models/page_assemble_model.py +0 -148
  175. docling-1.1.2/docling/models/table_structure_model.py +0 -149
  176. docling-1.1.2/docling/pipeline/base_model_pipeline.py +0 -18
  177. docling-1.1.2/docling/pipeline/standard_model_pipeline.py +0 -40
  178. docling-1.1.2/docling/utils/layout_utils.py +0 -806
  179. docling-1.1.2/pyproject.toml +0 -89
  180. {docling-1.1.2 → docling-2.64.1}/docling/__init__.py +0 -0
  181. {docling-1.1.2 → docling-2.64.1}/docling/backend/__init__.py +0 -0
  182. {docling-1.1.2/docling/datamodel → docling-2.64.1/docling/backend/docx}/__init__.py +0 -0
  183. {docling-1.1.2/docling/models → docling-2.64.1/docling/backend/docx/latex}/__init__.py +0 -0
  184. {docling-1.1.2/docling/pipeline → docling-2.64.1/docling/backend/json}/__init__.py +0 -0
  185. {docling-1.1.2/docling/utils → docling-2.64.1/docling/backend/xml}/__init__.py +0 -0
@@ -1,6 +1,6 @@
1
1
  MIT License
2
2
 
3
- Copyright (c) [year] [fullname]
3
+ Copyright (c) 2024 International Business Machines
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
@@ -0,0 +1,237 @@
1
+ Metadata-Version: 2.4
2
+ Name: docling
3
+ Version: 2.64.1
4
+ Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
+ Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
+ License-Expression: MIT
7
+ Project-URL: homepage, https://github.com/docling-project/docling
8
+ Project-URL: repository, https://github.com/docling-project/docling
9
+ Project-URL: issues, https://github.com/docling-project/docling/issues
10
+ Project-URL: changelog, https://github.com/docling-project/docling/blob/main/CHANGELOG.md
11
+ Keywords: docling,convert,document,pdf,docx,html,markdown,layout model,segmentation,table structure,table former
12
+ Classifier: Operating System :: MacOS :: MacOS X
13
+ Classifier: Operating System :: POSIX :: Linux
14
+ Classifier: Operating System :: Microsoft :: Windows
15
+ Classifier: Development Status :: 5 - Production/Stable
16
+ Classifier: Intended Audience :: Developers
17
+ Classifier: Intended Audience :: Science/Research
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Classifier: Programming Language :: Python :: 3
20
+ Classifier: Programming Language :: Python :: 3.9
21
+ Classifier: Programming Language :: Python :: 3.10
22
+ Classifier: Programming Language :: Python :: 3.11
23
+ Classifier: Programming Language :: Python :: 3.12
24
+ Classifier: Programming Language :: Python :: 3.13
25
+ Classifier: Programming Language :: Python :: 3.14
26
+ Requires-Python: <4.0,>=3.9
27
+ Description-Content-Type: text/markdown
28
+ License-File: LICENSE
29
+ Requires-Dist: pydantic<3.0.0,>=2.0.0
30
+ Requires-Dist: docling-core[chunking]<3.0.0,>=2.50.1
31
+ Requires-Dist: docling-parse<5.0.0,>=4.7.0
32
+ Requires-Dist: docling-ibm-models<4,>=3.9.1
33
+ Requires-Dist: filetype<2.0.0,>=1.2.0
34
+ Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
35
+ Requires-Dist: pydantic-settings<3.0.0,>=2.3.0
36
+ Requires-Dist: huggingface_hub<1,>=0.23
37
+ Requires-Dist: requests<3.0.0,>=2.32.2
38
+ Requires-Dist: ocrmac<2.0.0,>=1.0.0; sys_platform == "darwin"
39
+ Requires-Dist: rapidocr<4.0.0,>=3.3
40
+ Requires-Dist: certifi>=2024.7.4
41
+ Requires-Dist: rtree<2.0.0,>=1.3.0
42
+ Requires-Dist: typer<0.20.0,>=0.12.5
43
+ Requires-Dist: python-docx<2.0.0,>=1.1.2
44
+ Requires-Dist: python-pptx<2.0.0,>=1.0.2
45
+ Requires-Dist: beautifulsoup4<5.0.0,>=4.12.3
46
+ Requires-Dist: pandas<3.0.0,>=2.1.4
47
+ Requires-Dist: marko<3.0.0,>=2.1.2
48
+ Requires-Dist: openpyxl<4.0.0,>=3.1.5
49
+ Requires-Dist: lxml<7.0.0,>=4.0.0
50
+ Requires-Dist: pillow<12.0.0,>=10.0.0
51
+ Requires-Dist: tqdm<5.0.0,>=4.65.0
52
+ Requires-Dist: pluggy<2.0.0,>=1.0.0
53
+ Requires-Dist: pylatexenc<3.0,>=2.10
54
+ Requires-Dist: scipy<2.0.0,>=1.6.0
55
+ Requires-Dist: accelerate<2,>=1.0.0
56
+ Requires-Dist: polyfactory>=2.22.2
57
+ Provides-Extra: easyocr
58
+ Requires-Dist: easyocr<2.0,>=1.7; extra == "easyocr"
59
+ Provides-Extra: tesserocr
60
+ Requires-Dist: tesserocr<3.0.0,>=2.7.1; extra == "tesserocr"
61
+ Provides-Extra: ocrmac
62
+ Requires-Dist: ocrmac<2.0.0,>=1.0.0; sys_platform == "darwin" and extra == "ocrmac"
63
+ Provides-Extra: vlm
64
+ Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
65
+ Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
66
+ Requires-Dist: mlx-vlm<1.0.0,>=0.3.0; (python_version >= "3.10" and python_version < "3.14" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
67
+ Requires-Dist: vllm<1.0.0,>=0.10.0; (python_version >= "3.10" and python_version < "3.14" and sys_platform == "linux" and platform_machine == "x86_64") and extra == "vlm"
68
+ Requires-Dist: qwen-vl-utils>=0.0.11; extra == "vlm"
69
+ Provides-Extra: rapidocr
70
+ Requires-Dist: rapidocr<4.0.0,>=3.3; extra == "rapidocr"
71
+ Requires-Dist: onnxruntime<2.0.0,>=1.7.0; python_version < "3.14" and extra == "rapidocr"
72
+ Provides-Extra: asr
73
+ Requires-Dist: mlx-whisper>=0.4.3; (python_version >= "3.10" and python_version < "3.14" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "asr"
74
+ Requires-Dist: openai-whisper>=20250625; python_version < "3.14" and extra == "asr"
75
+ Dynamic: license-file
76
+
77
+ <p align="center">
78
+ <a href="https://github.com/docling-project/docling">
79
+ <img loading="lazy" alt="Docling" src="https://github.com/docling-project/docling/raw/main/docs/assets/docling_processing.png" width="100%"/>
80
+ </a>
81
+ </p>
82
+
83
+ # Docling
84
+
85
+ <p align="center">
86
+ <a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
87
+ </p>
88
+
89
+ [![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
90
+ [![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://docling-project.github.io/docling/)
91
+ [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
92
+ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/docling)](https://pypi.org/project/docling/)
93
+ [![uv](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/uv/main/assets/badge/v0.json)](https://github.com/astral-sh/uv)
94
+ [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
95
+ [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
96
+ [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
97
+ [![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT)
98
+ [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
99
+ [![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling)
100
+ [![Chat with Dosu](https://dosu.dev/dosu-chat-badge.svg)](https://app.dosu.dev/097760a8-135e-4789-8234-90c8837d7f1c/ask?utm_source=github)
101
+ [![Discord](https://img.shields.io/discord/1399788921306746971?color=6A7EC2&logo=discord&logoColor=ffffff)](https://docling.ai/discord)
102
+ [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/10101/badge)](https://www.bestpractices.dev/projects/10101)
103
+ [![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/)
104
+
105
+ Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.
106
+
107
+ ## Features
108
+
109
+ * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, VTT, images (PNG, TIFF, JPEG, ...), and more
110
+ * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
111
+ * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
112
+ * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
113
+ * 🔒 Local execution capabilities for sensitive data and air-gapped environments
114
+ * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
115
+ * 🔍 Extensive OCR support for scanned PDFs and images
116
+ * 👓 Support of several Visual Language Models ([GraniteDocling](https://huggingface.co/ibm-granite/granite-docling-258M))
117
+ * 🎙️ Audio support with Automatic Speech Recognition (ASR) models
118
+ * 🔌 Connect to any agent using the [MCP server](https://docling-project.github.io/docling/usage/mcp/)
119
+ * 💻 Simple and convenient CLI
120
+
121
+ ### What's new
122
+ * 📤 Structured [information extraction][extraction] \[🧪 beta\]
123
+ * 📑 New layout model (**Heron**) by default, for faster PDF parsing
124
+ * 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
125
+ * 💬 Parsing of Web Video Text Tracks (WebVTT) files
126
+
127
+ ### Coming soon
128
+
129
+ * 📝 Metadata extraction, including title, authors, references & language
130
+ * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc.)
131
+ * 📝 Complex chemistry understanding (Molecular structures)
132
+
133
+ ## Installation
134
+
135
+ To use Docling, simply install `docling` from your package manager, e.g., with pip:
136
+ ```bash
137
+ pip install docling
138
+ ```
139
+
140
+ Works on macOS, Linux, and Windows environments, on both x86_64 and arm64 architectures.
141
+
142
+ More [detailed installation instructions](https://docling-project.github.io/docling/installation/) are available in the docs.
143
+
144
+ ## Getting started
145
+
146
+ To convert individual documents with Python, use `convert()`. For example:
147
+
148
+ ```python
149
+ from docling.document_converter import DocumentConverter
150
+
151
+ source = "https://arxiv.org/pdf/2408.09869" # document per local path or URL
152
+ converter = DocumentConverter()
153
+ result = converter.convert(source)
154
+ print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
155
+ ```
156
+
157
+ More [advanced usage options](https://docling-project.github.io/docling/usage/advanced_options/) are available in
158
+ the docs.
159
+
160
+ ## CLI
161
+
162
+ Docling has a built-in CLI to run conversions.
163
+
164
+ ```bash
165
+ docling https://arxiv.org/pdf/2206.01062
166
+ ```
167
+
168
+ You can also use 🥚[GraniteDocling](https://huggingface.co/ibm-granite/granite-docling-258M) and other VLMs via Docling CLI:
169
+ ```bash
170
+ docling --pipeline vlm --vlm-model granite_docling https://arxiv.org/pdf/2206.01062
171
+ ```
172
+ This will use MLX acceleration on supported Apple Silicon hardware.
173
+
174
+ Read more [here](https://docling-project.github.io/docling/usage/)
175
+
176
+ ## Documentation
177
+
178
+ Check out Docling's [documentation](https://docling-project.github.io/docling/), for details on
179
+ installation, usage, concepts, recipes, extensions, and more.
180
+
181
+ ## Examples
182
+
183
+ Go hands-on with our [examples](https://docling-project.github.io/docling/examples/),
184
+ demonstrating how to address different application use cases with Docling.
185
+
186
+ ## Integrations
187
+
188
+ To further accelerate your AI application development, check out Docling's native
189
+ [integrations](https://docling-project.github.io/docling/integrations/) with popular frameworks
190
+ and tools.
191
+
192
+ ## Get help and support
193
+
194
+ Please feel free to connect with us using the [discussion section](https://github.com/docling-project/docling/discussions).
195
+
196
+ ## Technical report
197
+
198
+ For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
199
+
200
+ ## Contributing
201
+
202
+ Please read [Contributing to Docling](https://github.com/docling-project/docling/blob/main/CONTRIBUTING.md) for details.
203
+
204
+ ## References
205
+
206
+ If you use Docling in your projects, please consider citing the following:
207
+
208
+ ```bib
209
+ @techreport{Docling,
210
+ author = {Deep Search Team},
211
+ month = {8},
212
+ title = {Docling Technical Report},
213
+ url = {https://arxiv.org/abs/2408.09869},
214
+ eprint = {2408.09869},
215
+ doi = {10.48550/arXiv.2408.09869},
216
+ version = {1.0.0},
217
+ year = {2024}
218
+ }
219
+ ```
220
+
221
+ ## License
222
+
223
+ The Docling codebase is under MIT license.
224
+ For individual model usage, please refer to the model licenses found in the original packages.
225
+
226
+ ## LF AI & Data
227
+
228
+ Docling is hosted as a project in the [LF AI & Data Foundation](https://lfaidata.foundation/projects/).
229
+
230
+ ### IBM ❤️ Open Source AI
231
+
232
+ The project was started by the AI for knowledge team at IBM Research Zurich.
233
+
234
+ [supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
235
+ [docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
236
+ [integrations]: https://docling-project.github.io/docling/integrations/
237
+ [extraction]: https://docling-project.github.io/docling/examples/extraction/
@@ -0,0 +1,161 @@
1
+ <p align="center">
2
+ <a href="https://github.com/docling-project/docling">
3
+ <img loading="lazy" alt="Docling" src="https://github.com/docling-project/docling/raw/main/docs/assets/docling_processing.png" width="100%"/>
4
+ </a>
5
+ </p>
6
+
7
+ # Docling
8
+
9
+ <p align="center">
10
+ <a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
11
+ </p>
12
+
13
+ [![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
14
+ [![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://docling-project.github.io/docling/)
15
+ [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
16
+ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/docling)](https://pypi.org/project/docling/)
17
+ [![uv](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/uv/main/assets/badge/v0.json)](https://github.com/astral-sh/uv)
18
+ [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
19
+ [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
20
+ [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
21
+ [![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT)
22
+ [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
23
+ [![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling)
24
+ [![Chat with Dosu](https://dosu.dev/dosu-chat-badge.svg)](https://app.dosu.dev/097760a8-135e-4789-8234-90c8837d7f1c/ask?utm_source=github)
25
+ [![Discord](https://img.shields.io/discord/1399788921306746971?color=6A7EC2&logo=discord&logoColor=ffffff)](https://docling.ai/discord)
26
+ [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/10101/badge)](https://www.bestpractices.dev/projects/10101)
27
+ [![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/)
28
+
29
+ Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.
30
+
31
+ ## Features
32
+
33
+ * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, VTT, images (PNG, TIFF, JPEG, ...), and more
34
+ * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
35
+ * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
36
+ * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
37
+ * 🔒 Local execution capabilities for sensitive data and air-gapped environments
38
+ * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
39
+ * 🔍 Extensive OCR support for scanned PDFs and images
40
+ * 👓 Support of several Visual Language Models ([GraniteDocling](https://huggingface.co/ibm-granite/granite-docling-258M))
41
+ * 🎙️ Audio support with Automatic Speech Recognition (ASR) models
42
+ * 🔌 Connect to any agent using the [MCP server](https://docling-project.github.io/docling/usage/mcp/)
43
+ * 💻 Simple and convenient CLI
44
+
45
+ ### What's new
46
+ * 📤 Structured [information extraction][extraction] \[🧪 beta\]
47
+ * 📑 New layout model (**Heron**) by default, for faster PDF parsing
48
+ * 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
49
+ * 💬 Parsing of Web Video Text Tracks (WebVTT) files
50
+
51
+ ### Coming soon
52
+
53
+ * 📝 Metadata extraction, including title, authors, references & language
54
+ * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc.)
55
+ * 📝 Complex chemistry understanding (Molecular structures)
56
+
57
+ ## Installation
58
+
59
+ To use Docling, simply install `docling` from your package manager, e.g., with pip:
60
+ ```bash
61
+ pip install docling
62
+ ```
63
+
64
+ Works on macOS, Linux, and Windows environments, on both x86_64 and arm64 architectures.
65
+
66
+ More [detailed installation instructions](https://docling-project.github.io/docling/installation/) are available in the docs.
67
+
68
+ ## Getting started
69
+
70
+ To convert individual documents with Python, use `convert()`. For example:
71
+
72
+ ```python
73
+ from docling.document_converter import DocumentConverter
74
+
75
+ source = "https://arxiv.org/pdf/2408.09869" # document per local path or URL
76
+ converter = DocumentConverter()
77
+ result = converter.convert(source)
78
+ print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
79
+ ```
80
+
81
+ More [advanced usage options](https://docling-project.github.io/docling/usage/advanced_options/) are available in
82
+ the docs.
83
+
84
+ ## CLI
85
+
86
+ Docling has a built-in CLI to run conversions.
87
+
88
+ ```bash
89
+ docling https://arxiv.org/pdf/2206.01062
90
+ ```
91
+
92
+ You can also use 🥚[GraniteDocling](https://huggingface.co/ibm-granite/granite-docling-258M) and other VLMs via Docling CLI:
93
+ ```bash
94
+ docling --pipeline vlm --vlm-model granite_docling https://arxiv.org/pdf/2206.01062
95
+ ```
96
+ This will use MLX acceleration on supported Apple Silicon hardware.
97
+
98
+ Read more [here](https://docling-project.github.io/docling/usage/)
99
+
100
+ ## Documentation
101
+
102
+ Check out Docling's [documentation](https://docling-project.github.io/docling/), for details on
103
+ installation, usage, concepts, recipes, extensions, and more.
104
+
105
+ ## Examples
106
+
107
+ Go hands-on with our [examples](https://docling-project.github.io/docling/examples/),
108
+ demonstrating how to address different application use cases with Docling.
109
+
110
+ ## Integrations
111
+
112
+ To further accelerate your AI application development, check out Docling's native
113
+ [integrations](https://docling-project.github.io/docling/integrations/) with popular frameworks
114
+ and tools.
115
+
116
+ ## Get help and support
117
+
118
+ Please feel free to connect with us using the [discussion section](https://github.com/docling-project/docling/discussions).
119
+
120
+ ## Technical report
121
+
122
+ For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
123
+
124
+ ## Contributing
125
+
126
+ Please read [Contributing to Docling](https://github.com/docling-project/docling/blob/main/CONTRIBUTING.md) for details.
127
+
128
+ ## References
129
+
130
+ If you use Docling in your projects, please consider citing the following:
131
+
132
+ ```bib
133
+ @techreport{Docling,
134
+ author = {Deep Search Team},
135
+ month = {8},
136
+ title = {Docling Technical Report},
137
+ url = {https://arxiv.org/abs/2408.09869},
138
+ eprint = {2408.09869},
139
+ doi = {10.48550/arXiv.2408.09869},
140
+ version = {1.0.0},
141
+ year = {2024}
142
+ }
143
+ ```
144
+
145
+ ## License
146
+
147
+ The Docling codebase is under MIT license.
148
+ For individual model usage, please refer to the model licenses found in the original packages.
149
+
150
+ ## LF AI & Data
151
+
152
+ Docling is hosted as a project in the [LF AI & Data Foundation](https://lfaidata.foundation/projects/).
153
+
154
+ ### IBM ❤️ Open Source AI
155
+
156
+ The project was started by the AI for knowledge team at IBM Research Zurich.
157
+
158
+ [supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
159
+ [docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
160
+ [integrations]: https://docling-project.github.io/docling/integrations/
161
+ [extraction]: https://docling-project.github.io/docling/examples/extraction/
@@ -0,0 +1,84 @@
1
+ from abc import ABC, abstractmethod
2
+ from io import BytesIO
3
+ from pathlib import Path
4
+ from typing import TYPE_CHECKING, Union
5
+
6
+ from docling_core.types.doc import DoclingDocument
7
+
8
+ from docling.datamodel.backend_options import (
9
+ BackendOptions,
10
+ BaseBackendOptions,
11
+ DeclarativeBackendOptions,
12
+ )
13
+
14
+ if TYPE_CHECKING:
15
+ from docling.datamodel.base_models import InputFormat
16
+ from docling.datamodel.document import InputDocument
17
+
18
+
19
class AbstractDocumentBackend(ABC):
    """Common base class for document backends.

    On construction the backend records the source file, the raw path or
    in-memory stream, the document hash and the input format taken from the
    input document, together with the backend options. Subclasses provide
    the validity check and declare pagination support and accepted formats.
    """

    @abstractmethod
    def __init__(
        self,
        in_doc: "InputDocument",
        path_or_stream: Union[BytesIO, Path],
        options: BaseBackendOptions = BaseBackendOptions(),
    ):
        """Capture the document metadata and the source handle.

        Args:
            in_doc: Input document whose ``file``, ``document_hash`` and
                ``format`` attributes are copied onto the backend.
            path_or_stream: Filesystem path or in-memory byte stream that
                holds the document content.
            options: Backend options. The default instance is created once,
                when the function is defined, so every call that omits this
                argument receives the same object.
        """
        # Metadata copied from the input document, read in a fixed order.
        self.file, self.document_hash, self.input_format = (
            in_doc.file,
            in_doc.document_hash,
            in_doc.format,
        )
        self.path_or_stream = path_or_stream
        self.options = options

    @abstractmethod
    def is_valid(self) -> bool:
        """Return whether the backend is valid; criteria are subclass-defined."""

    @classmethod
    @abstractmethod
    def supports_pagination(cls) -> bool:
        """Return whether this backend class supports pagination."""

    def unload(self):
        """Release the source handle held by the backend.

        An in-memory stream is closed first; in every case the reference to
        the path or stream is cleared afterwards.
        """
        source = self.path_or_stream
        if isinstance(source, BytesIO):
            source.close()

        self.path_or_stream = None

    @classmethod
    @abstractmethod
    def supported_formats(cls) -> set["InputFormat"]:
        """Return the set of input formats this backend class supports."""
52
+
53
+
54
class PaginatedDocumentBackend(AbstractDocumentBackend):
    """PaginatedDocumentBackend.

    A paginated document backend is a backend whose document is organised in
    pages; concrete subclasses must report how many pages it contains.
    """

    @abstractmethod
    def page_count(self) -> int:
        """Return the number of pages of the document handled by this backend."""
64
+
65
+
66
class DeclarativeDocumentBackend(AbstractDocumentBackend):
    """Backend that produces a DoclingDocument directly.

    Declarative backends transform their input straight into a
    ``DoclingDocument``; no recognition pipeline is involved.
    """

    @abstractmethod
    def __init__(
        self,
        in_doc: "InputDocument",
        path_or_stream: Union[BytesIO, Path],
        options: BackendOptions = DeclarativeBackendOptions(),
    ) -> None:
        """Initialise the backend by delegating to the base class.

        Args:
            in_doc: Input document passed through to the base initialiser.
            path_or_stream: Filesystem path or in-memory byte stream that
                holds the document content.
            options: Backend options. The default instance is created once,
                when the function is defined, so every call that omits this
                argument receives the same object.
        """
        super().__init__(in_doc, path_or_stream, options=options)

    @abstractmethod
    def convert(self) -> DoclingDocument:
        """Convert the input and return the resulting ``DoclingDocument``."""