docling 2.39.0__tar.gz → 2.64.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (189) hide show
  1. {docling-2.39.0 → docling-2.64.1}/PKG-INFO +38 -19
  2. {docling-2.39.0 → docling-2.64.1}/README.md +17 -7
  3. {docling-2.39.0 → docling-2.64.1}/docling/backend/abstract_backend.py +24 -3
  4. {docling-2.39.0 → docling-2.64.1}/docling/backend/asciidoc_backend.py +4 -4
  5. {docling-2.39.0 → docling-2.64.1}/docling/backend/docling_parse_v4_backend.py +103 -36
  6. docling-2.64.1/docling/backend/docx/drawingml/utils.py +131 -0
  7. {docling-2.39.0 → docling-2.64.1}/docling/backend/docx/latex/latex_dict.py +5 -0
  8. {docling-2.39.0 → docling-2.64.1}/docling/backend/docx/latex/omml.py +11 -2
  9. docling-2.64.1/docling/backend/html_backend.py +1499 -0
  10. docling-2.64.1/docling/backend/image_backend.py +188 -0
  11. {docling-2.39.0 → docling-2.64.1}/docling/backend/md_backend.py +78 -18
  12. docling-2.64.1/docling/backend/mets_gbs_backend.py +399 -0
  13. {docling-2.39.0 → docling-2.64.1}/docling/backend/msexcel_backend.py +252 -85
  14. {docling-2.39.0 → docling-2.64.1}/docling/backend/mspowerpoint_backend.py +2 -2
  15. {docling-2.39.0 → docling-2.64.1}/docling/backend/msword_backend.py +628 -145
  16. {docling-2.39.0 → docling-2.64.1}/docling/backend/pdf_backend.py +14 -14
  17. {docling-2.39.0 → docling-2.64.1}/docling/backend/pypdfium2_backend.py +39 -8
  18. docling-2.64.1/docling/backend/webvtt_backend.py +572 -0
  19. {docling-2.39.0 → docling-2.64.1}/docling/backend/xml/jats_backend.py +123 -11
  20. {docling-2.39.0 → docling-2.64.1}/docling/backend/xml/uspto_backend.py +1 -1
  21. {docling-2.39.0 → docling-2.64.1}/docling/cli/main.py +246 -64
  22. {docling-2.39.0 → docling-2.64.1}/docling/cli/models.py +63 -1
  23. docling-2.64.1/docling/datamodel/asr_model_specs.py +494 -0
  24. docling-2.64.1/docling/datamodel/backend_options.py +96 -0
  25. {docling-2.39.0 → docling-2.64.1}/docling/datamodel/base_models.py +102 -26
  26. {docling-2.39.0 → docling-2.64.1}/docling/datamodel/document.py +292 -52
  27. docling-2.64.1/docling/datamodel/extraction.py +39 -0
  28. docling-2.64.1/docling/datamodel/layout_model_specs.py +90 -0
  29. {docling-2.39.0 → docling-2.64.1}/docling/datamodel/pipeline_options.py +106 -22
  30. {docling-2.39.0 → docling-2.64.1}/docling/datamodel/pipeline_options_asr_model.py +21 -1
  31. {docling-2.39.0 → docling-2.64.1}/docling/datamodel/pipeline_options_vlm_model.py +56 -3
  32. {docling-2.39.0 → docling-2.64.1}/docling/datamodel/settings.py +7 -12
  33. docling-2.64.1/docling/datamodel/vlm_model_specs.py +314 -0
  34. {docling-2.39.0 → docling-2.64.1}/docling/document_converter.py +152 -80
  35. docling-2.64.1/docling/document_extractor.py +327 -0
  36. docling-2.64.1/docling/experimental/__init__.py +5 -0
  37. docling-2.64.1/docling/experimental/datamodel/__init__.py +1 -0
  38. docling-2.64.1/docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  39. docling-2.64.1/docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  40. docling-2.64.1/docling/experimental/models/__init__.py +3 -0
  41. docling-2.64.1/docling/experimental/models/table_crops_layout_model.py +114 -0
  42. docling-2.64.1/docling/experimental/pipeline/__init__.py +1 -0
  43. docling-2.64.1/docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  44. docling-2.64.1/docling/models/api_vlm_model.py +180 -0
  45. docling-2.64.1/docling/models/auto_ocr_model.py +132 -0
  46. docling-2.64.1/docling/models/base_layout_model.py +39 -0
  47. docling-2.64.1/docling/models/base_model.py +230 -0
  48. {docling-2.39.0 → docling-2.64.1}/docling/models/base_ocr_model.py +20 -2
  49. docling-2.64.1/docling/models/base_table_model.py +45 -0
  50. {docling-2.39.0 → docling-2.64.1}/docling/models/code_formula_model.py +87 -76
  51. {docling-2.39.0 → docling-2.64.1}/docling/models/document_picture_classifier.py +14 -15
  52. {docling-2.39.0 → docling-2.64.1}/docling/models/easyocr_model.py +19 -9
  53. {docling-2.39.0 → docling-2.64.1}/docling/models/factories/__init__.py +20 -0
  54. docling-2.64.1/docling/models/factories/layout_factory.py +7 -0
  55. docling-2.64.1/docling/models/factories/table_factory.py +7 -0
  56. docling-2.64.1/docling/models/layout_model.py +249 -0
  57. {docling-2.39.0 → docling-2.64.1}/docling/models/page_preprocessing_model.py +6 -2
  58. {docling-2.39.0 → docling-2.64.1}/docling/models/picture_description_api_model.py +3 -1
  59. {docling-2.39.0 → docling-2.64.1}/docling/models/picture_description_vlm_model.py +23 -11
  60. docling-2.64.1/docling/models/plugins/defaults.py +54 -0
  61. docling-2.64.1/docling/models/rapid_ocr_model.py +328 -0
  62. {docling-2.39.0 → docling-2.64.1}/docling/models/readingorder_model.py +71 -14
  63. docling-2.64.1/docling/models/table_structure_model.py +305 -0
  64. {docling-2.39.0 → docling-2.64.1}/docling/models/tesseract_ocr_cli_model.py +8 -2
  65. {docling-2.39.0 → docling-2.64.1}/docling/models/tesseract_ocr_model.py +23 -9
  66. docling-2.64.1/docling/models/utils/generation_utils.py +157 -0
  67. {docling-2.39.0 → docling-2.64.1}/docling/models/utils/hf_model_download.py +6 -1
  68. docling-2.64.1/docling/models/vlm_models_inline/hf_transformers_model.py +391 -0
  69. docling-2.64.1/docling/models/vlm_models_inline/mlx_model.py +330 -0
  70. docling-2.64.1/docling/models/vlm_models_inline/nuextract_transformers_model.py +305 -0
  71. docling-2.64.1/docling/models/vlm_models_inline/vllm_model.py +344 -0
  72. {docling-2.39.0 → docling-2.64.1}/docling/pipeline/asr_pipeline.py +203 -25
  73. docling-2.64.1/docling/pipeline/base_extraction_pipeline.py +72 -0
  74. {docling-2.39.0 → docling-2.64.1}/docling/pipeline/base_pipeline.py +107 -18
  75. docling-2.64.1/docling/pipeline/extraction_vlm_pipeline.py +207 -0
  76. docling-2.39.0/docling/pipeline/standard_pdf_pipeline.py → docling-2.64.1/docling/pipeline/legacy_standard_pdf_pipeline.py +31 -69
  77. {docling-2.39.0 → docling-2.64.1}/docling/pipeline/simple_pipeline.py +6 -6
  78. docling-2.64.1/docling/pipeline/standard_pdf_pipeline.py +843 -0
  79. docling-2.64.1/docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  80. {docling-2.39.0 → docling-2.64.1}/docling/pipeline/vlm_pipeline.py +72 -47
  81. docling-2.64.1/docling/py.typed +1 -0
  82. {docling-2.39.0 → docling-2.64.1}/docling/utils/accelerator_utils.py +2 -2
  83. docling-2.64.1/docling/utils/api_image_request.py +205 -0
  84. {docling-2.39.0 → docling-2.64.1}/docling/utils/layout_postprocessor.py +79 -61
  85. {docling-2.39.0 → docling-2.64.1}/docling/utils/model_downloader.py +38 -2
  86. {docling-2.39.0 → docling-2.64.1}/docling/utils/ocr_utils.py +1 -1
  87. {docling-2.39.0 → docling-2.64.1}/docling/utils/orientation.py +22 -28
  88. {docling-2.39.0 → docling-2.64.1}/docling.egg-info/PKG-INFO +38 -19
  89. {docling-2.39.0 → docling-2.64.1}/docling.egg-info/SOURCES.txt +38 -1
  90. docling-2.64.1/docling.egg-info/requires.txt +66 -0
  91. {docling-2.39.0 → docling-2.64.1}/pyproject.toml +29 -17
  92. docling-2.64.1/tests/test_asr_mlx_whisper.py +340 -0
  93. docling-2.64.1/tests/test_asr_pipeline.py +404 -0
  94. {docling-2.39.0 → docling-2.64.1}/tests/test_backend_docling_parse_v4.py +17 -0
  95. docling-2.64.1/tests/test_backend_html.py +561 -0
  96. docling-2.64.1/tests/test_backend_image_native.py +218 -0
  97. {docling-2.39.0 → docling-2.64.1}/tests/test_backend_jats.py +14 -14
  98. docling-2.64.1/tests/test_backend_markdown.py +111 -0
  99. docling-2.64.1/tests/test_backend_mets_gbs.py +77 -0
  100. docling-2.64.1/tests/test_backend_msexcel.py +314 -0
  101. docling-2.64.1/tests/test_backend_msword.py +239 -0
  102. {docling-2.39.0 → docling-2.64.1}/tests/test_backend_patent_uspto.py +11 -3
  103. {docling-2.39.0 → docling-2.64.1}/tests/test_backend_pdfium.py +19 -0
  104. docling-2.64.1/tests/test_backend_vtt.py +232 -0
  105. {docling-2.39.0 → docling-2.64.1}/tests/test_backend_webp.py +7 -3
  106. docling-2.64.1/tests/test_cli.py +92 -0
  107. docling-2.64.1/tests/test_conversion_result_json.py +44 -0
  108. {docling-2.39.0 → docling-2.64.1}/tests/test_document_picture_classifier.py +2 -1
  109. {docling-2.39.0 → docling-2.64.1}/tests/test_e2e_conversion.py +11 -8
  110. {docling-2.39.0 → docling-2.64.1}/tests/test_e2e_ocr_conversion.py +28 -15
  111. docling-2.64.1/tests/test_extraction.py +108 -0
  112. {docling-2.39.0 → docling-2.64.1}/tests/test_input_doc.py +71 -34
  113. docling-2.64.1/tests/test_interfaces.py +138 -0
  114. {docling-2.39.0 → docling-2.64.1}/tests/test_legacy_format_transform.py +1 -0
  115. docling-2.64.1/tests/test_ocr_utils.py +80 -0
  116. {docling-2.39.0 → docling-2.64.1}/tests/test_options.py +28 -0
  117. docling-2.64.1/tests/test_pdf_password.py +63 -0
  118. {docling-2.39.0 → docling-2.64.1}/tests/test_settings_load.py +1 -1
  119. docling-2.64.1/tests/test_threaded_pipeline.py +198 -0
  120. docling-2.39.0/docling/backend/html_backend.py +0 -577
  121. docling-2.39.0/docling/datamodel/asr_model_specs.py +0 -92
  122. docling-2.39.0/docling/datamodel/vlm_model_specs.py +0 -144
  123. docling-2.39.0/docling/models/api_vlm_model.py +0 -73
  124. docling-2.39.0/docling/models/base_model.py +0 -93
  125. docling-2.39.0/docling/models/layout_model.py +0 -210
  126. docling-2.39.0/docling/models/plugins/defaults.py +0 -28
  127. docling-2.39.0/docling/models/rapid_ocr_model.py +0 -147
  128. docling-2.39.0/docling/models/table_structure_model.py +0 -302
  129. docling-2.39.0/docling/models/vlm_models_inline/hf_transformers_model.py +0 -197
  130. docling-2.39.0/docling/models/vlm_models_inline/mlx_model.py +0 -149
  131. docling-2.39.0/docling/utils/__init__.py +0 -0
  132. docling-2.39.0/docling/utils/api_image_request.py +0 -61
  133. docling-2.39.0/docling.egg-info/requires.txt +0 -49
  134. docling-2.39.0/tests/test_asr_pipeline.py +0 -59
  135. docling-2.39.0/tests/test_backend_html.py +0 -149
  136. docling-2.39.0/tests/test_backend_markdown.py +0 -52
  137. docling-2.39.0/tests/test_backend_msexcel.py +0 -99
  138. docling-2.39.0/tests/test_backend_msword.py +0 -173
  139. docling-2.39.0/tests/test_cli.py +0 -27
  140. docling-2.39.0/tests/test_interfaces.py +0 -67
  141. {docling-2.39.0 → docling-2.64.1}/LICENSE +0 -0
  142. {docling-2.39.0 → docling-2.64.1}/docling/__init__.py +0 -0
  143. {docling-2.39.0 → docling-2.64.1}/docling/backend/__init__.py +0 -0
  144. {docling-2.39.0 → docling-2.64.1}/docling/backend/csv_backend.py +0 -0
  145. {docling-2.39.0 → docling-2.64.1}/docling/backend/docling_parse_backend.py +0 -0
  146. {docling-2.39.0 → docling-2.64.1}/docling/backend/docling_parse_v2_backend.py +0 -0
  147. {docling-2.39.0 → docling-2.64.1}/docling/backend/docx/__init__.py +0 -0
  148. {docling-2.39.0 → docling-2.64.1}/docling/backend/docx/latex/__init__.py +0 -0
  149. {docling-2.39.0 → docling-2.64.1}/docling/backend/json/__init__.py +0 -0
  150. {docling-2.39.0 → docling-2.64.1}/docling/backend/json/docling_json_backend.py +0 -0
  151. {docling-2.39.0 → docling-2.64.1}/docling/backend/noop_backend.py +0 -0
  152. {docling-2.39.0 → docling-2.64.1}/docling/backend/xml/__init__.py +0 -0
  153. {docling-2.39.0 → docling-2.64.1}/docling/chunking/__init__.py +0 -0
  154. {docling-2.39.0 → docling-2.64.1}/docling/cli/__init__.py +0 -0
  155. {docling-2.39.0 → docling-2.64.1}/docling/cli/tools.py +0 -0
  156. {docling-2.39.0 → docling-2.64.1}/docling/datamodel/__init__.py +0 -0
  157. {docling-2.39.0 → docling-2.64.1}/docling/datamodel/accelerator_options.py +0 -0
  158. {docling-2.39.0 → docling-2.64.1}/docling/exceptions.py +0 -0
  159. {docling-2.39.0 → docling-2.64.1}/docling/models/__init__.py +0 -0
  160. {docling-2.39.0 → docling-2.64.1}/docling/models/factories/base_factory.py +0 -0
  161. {docling-2.39.0 → docling-2.64.1}/docling/models/factories/ocr_factory.py +0 -0
  162. {docling-2.39.0 → docling-2.64.1}/docling/models/factories/picture_description_factory.py +0 -0
  163. {docling-2.39.0 → docling-2.64.1}/docling/models/ocr_mac_model.py +0 -0
  164. {docling-2.39.0 → docling-2.64.1}/docling/models/page_assemble_model.py +0 -0
  165. {docling-2.39.0 → docling-2.64.1}/docling/models/picture_description_base_model.py +0 -0
  166. {docling-2.39.0 → docling-2.64.1}/docling/models/plugins/__init__.py +0 -0
  167. {docling-2.39.0 → docling-2.64.1}/docling/models/utils/__init__.py +0 -0
  168. /docling-2.39.0/docling/py.typed → /docling-2.64.1/docling/models/vlm_models_inline/__init__.py +0 -0
  169. {docling-2.39.0/docling/models/vlm_models_inline → docling-2.64.1/docling/pipeline}/__init__.py +0 -0
  170. {docling-2.39.0/docling/pipeline → docling-2.64.1/docling/utils}/__init__.py +0 -0
  171. {docling-2.39.0 → docling-2.64.1}/docling/utils/export.py +0 -0
  172. {docling-2.39.0 → docling-2.64.1}/docling/utils/glm_utils.py +0 -0
  173. {docling-2.39.0 → docling-2.64.1}/docling/utils/locks.py +0 -0
  174. {docling-2.39.0 → docling-2.64.1}/docling/utils/profiling.py +0 -0
  175. {docling-2.39.0 → docling-2.64.1}/docling/utils/utils.py +0 -0
  176. {docling-2.39.0 → docling-2.64.1}/docling/utils/visualization.py +0 -0
  177. {docling-2.39.0 → docling-2.64.1}/docling.egg-info/dependency_links.txt +0 -0
  178. {docling-2.39.0 → docling-2.64.1}/docling.egg-info/entry_points.txt +0 -0
  179. {docling-2.39.0 → docling-2.64.1}/docling.egg-info/top_level.txt +0 -0
  180. {docling-2.39.0 → docling-2.64.1}/setup.cfg +0 -0
  181. {docling-2.39.0 → docling-2.64.1}/tests/test_backend_asciidoc.py +0 -0
  182. {docling-2.39.0 → docling-2.64.1}/tests/test_backend_csv.py +0 -0
  183. {docling-2.39.0 → docling-2.64.1}/tests/test_backend_docling_json.py +0 -0
  184. {docling-2.39.0 → docling-2.64.1}/tests/test_backend_docling_parse.py +0 -0
  185. {docling-2.39.0 → docling-2.64.1}/tests/test_backend_docling_parse_v2.py +0 -0
  186. {docling-2.39.0 → docling-2.64.1}/tests/test_backend_pptx.py +0 -0
  187. {docling-2.39.0 → docling-2.64.1}/tests/test_code_formula.py +0 -0
  188. {docling-2.39.0 → docling-2.64.1}/tests/test_data_gen_flag.py +0 -0
  189. {docling-2.39.0 → docling-2.64.1}/tests/test_invalid_input.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.39.0
3
+ Version: 2.64.1
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -22,34 +22,40 @@ Classifier: Programming Language :: Python :: 3.10
22
22
  Classifier: Programming Language :: Python :: 3.11
23
23
  Classifier: Programming Language :: Python :: 3.12
24
24
  Classifier: Programming Language :: Python :: 3.13
25
+ Classifier: Programming Language :: Python :: 3.14
25
26
  Requires-Python: <4.0,>=3.9
26
27
  Description-Content-Type: text/markdown
27
28
  License-File: LICENSE
28
29
  Requires-Dist: pydantic<3.0.0,>=2.0.0
29
- Requires-Dist: docling-core[chunking]<3.0.0,>=2.39.0
30
- Requires-Dist: docling-ibm-models<4.0.0,>=3.4.4
31
- Requires-Dist: docling-parse<5.0.0,>=4.0.0
30
+ Requires-Dist: docling-core[chunking]<3.0.0,>=2.50.1
31
+ Requires-Dist: docling-parse<5.0.0,>=4.7.0
32
+ Requires-Dist: docling-ibm-models<4,>=3.9.1
32
33
  Requires-Dist: filetype<2.0.0,>=1.2.0
33
- Requires-Dist: pypdfium2<5.0.0,>=4.30.0
34
+ Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
34
35
  Requires-Dist: pydantic-settings<3.0.0,>=2.3.0
35
36
  Requires-Dist: huggingface_hub<1,>=0.23
36
37
  Requires-Dist: requests<3.0.0,>=2.32.2
37
- Requires-Dist: easyocr<2.0,>=1.7
38
+ Requires-Dist: ocrmac<2.0.0,>=1.0.0; sys_platform == "darwin"
39
+ Requires-Dist: rapidocr<4.0.0,>=3.3
38
40
  Requires-Dist: certifi>=2024.7.4
39
41
  Requires-Dist: rtree<2.0.0,>=1.3.0
40
- Requires-Dist: typer<0.17.0,>=0.12.5
42
+ Requires-Dist: typer<0.20.0,>=0.12.5
41
43
  Requires-Dist: python-docx<2.0.0,>=1.1.2
42
44
  Requires-Dist: python-pptx<2.0.0,>=1.0.2
43
45
  Requires-Dist: beautifulsoup4<5.0.0,>=4.12.3
44
46
  Requires-Dist: pandas<3.0.0,>=2.1.4
45
47
  Requires-Dist: marko<3.0.0,>=2.1.2
46
48
  Requires-Dist: openpyxl<4.0.0,>=3.1.5
47
- Requires-Dist: lxml<6.0.0,>=4.0.0
49
+ Requires-Dist: lxml<7.0.0,>=4.0.0
48
50
  Requires-Dist: pillow<12.0.0,>=10.0.0
49
51
  Requires-Dist: tqdm<5.0.0,>=4.65.0
50
52
  Requires-Dist: pluggy<2.0.0,>=1.0.0
51
53
  Requires-Dist: pylatexenc<3.0,>=2.10
52
54
  Requires-Dist: scipy<2.0.0,>=1.6.0
55
+ Requires-Dist: accelerate<2,>=1.0.0
56
+ Requires-Dist: polyfactory>=2.22.2
57
+ Provides-Extra: easyocr
58
+ Requires-Dist: easyocr<2.0,>=1.7; extra == "easyocr"
53
59
  Provides-Extra: tesserocr
54
60
  Requires-Dist: tesserocr<3.0.0,>=2.7.1; extra == "tesserocr"
55
61
  Provides-Extra: ocrmac
@@ -57,12 +63,15 @@ Requires-Dist: ocrmac<2.0.0,>=1.0.0; sys_platform == "darwin" and extra == "ocrm
57
63
  Provides-Extra: vlm
58
64
  Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
59
65
  Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
60
- Requires-Dist: mlx-vlm>=0.1.22; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
66
+ Requires-Dist: mlx-vlm<1.0.0,>=0.3.0; (python_version >= "3.10" and python_version < "3.14" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
67
+ Requires-Dist: vllm<1.0.0,>=0.10.0; (python_version >= "3.10" and python_version < "3.14" and sys_platform == "linux" and platform_machine == "x86_64") and extra == "vlm"
68
+ Requires-Dist: qwen-vl-utils>=0.0.11; extra == "vlm"
61
69
  Provides-Extra: rapidocr
62
- Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
63
- Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
70
+ Requires-Dist: rapidocr<4.0.0,>=3.3; extra == "rapidocr"
71
+ Requires-Dist: onnxruntime<2.0.0,>=1.7.0; python_version < "3.14" and extra == "rapidocr"
64
72
  Provides-Extra: asr
65
- Requires-Dist: openai-whisper>=20240930; extra == "asr"
73
+ Requires-Dist: mlx-whisper>=0.4.3; (python_version >= "3.10" and python_version < "3.14" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "asr"
74
+ Requires-Dist: openai-whisper>=20250625; python_version < "3.14" and extra == "asr"
66
75
  Dynamic: license-file
67
76
 
68
77
  <p align="center">
@@ -88,6 +97,8 @@ Dynamic: license-file
88
97
  [![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT)
89
98
  [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
90
99
  [![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling)
100
+ [![Chat with Dosu](https://dosu.dev/dosu-chat-badge.svg)](https://app.dosu.dev/097760a8-135e-4789-8234-90c8837d7f1c/ask?utm_source=github)
101
+ [![Discord](https://img.shields.io/discord/1399788921306746971?color=6A7EC2&logo=discord&logoColor=ffffff)](https://docling.ai/discord)
91
102
  [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/10101/badge)](https://www.bestpractices.dev/projects/10101)
92
103
  [![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/)
93
104
 
@@ -95,17 +106,24 @@ Docling simplifies document processing, parsing diverse formats — including ad
95
106
 
96
107
  ## Features
97
108
 
98
- * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
109
+ * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, VTT, images (PNG, TIFF, JPEG, ...), and more
99
110
  * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
100
111
  * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
101
- * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
112
+ * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
102
113
  * 🔒 Local execution capabilities for sensitive data and air-gapped environments
103
114
  * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
104
115
  * 🔍 Extensive OCR support for scanned PDFs and images
105
- * 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
106
- * 🎙️ Support for Audio with Automatic Speech Recognition (ASR) models
116
+ * 👓 Support of several Visual Language Models ([GraniteDocling](https://huggingface.co/ibm-granite/granite-docling-258M))
117
+ * 🎙️ Audio support with Automatic Speech Recognition (ASR) models
118
+ * 🔌 Connect to any agent using the [MCP server](https://docling-project.github.io/docling/usage/mcp/)
107
119
  * 💻 Simple and convenient CLI
108
120
 
121
+ ### What's new
122
+ * 📤 Structured [information extraction][extraction] \[🧪 beta\]
123
+ * 📑 New layout model (**Heron**) by default, for faster PDF parsing
124
+ * 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
125
+ * 💬 Parsing of Web Video Text Tracks (WebVTT) files
126
+
109
127
  ### Coming soon
110
128
 
111
129
  * 📝 Metadata extraction, including title, authors, references & language
@@ -136,7 +154,7 @@ result = converter.convert(source)
136
154
  print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
137
155
  ```
138
156
 
139
- More [advanced usage options](https://docling-project.github.io/docling/usage/) are available in
157
+ More [advanced usage options](https://docling-project.github.io/docling/usage/advanced_options/) are available in
140
158
  the docs.
141
159
 
142
160
  ## CLI
@@ -147,9 +165,9 @@ Docling has a built-in CLI to run conversions.
147
165
  docling https://arxiv.org/pdf/2206.01062
148
166
  ```
149
167
 
150
- You can also use 🥚[SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview) and other VLMs via Docling CLI:
168
+ You can also use 🥚[GraniteDocling](https://huggingface.co/ibm-granite/granite-docling-258M) and other VLMs via Docling CLI:
151
169
  ```bash
152
- docling --pipeline vlm --vlm-model smoldocling https://arxiv.org/pdf/2206.01062
170
+ docling --pipeline vlm --vlm-model granite_docling https://arxiv.org/pdf/2206.01062
153
171
  ```
154
172
  This will use MLX acceleration on supported Apple Silicon hardware.
155
173
 
@@ -216,3 +234,4 @@ The project was started by the AI for knowledge team at IBM Research Zurich.
216
234
  [supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
217
235
  [docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
218
236
  [integrations]: https://docling-project.github.io/docling/integrations/
237
+ [extraction]: https://docling-project.github.io/docling/examples/extraction/
@@ -21,6 +21,8 @@
21
21
  [![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT)
22
22
  [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
23
23
  [![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling)
24
+ [![Chat with Dosu](https://dosu.dev/dosu-chat-badge.svg)](https://app.dosu.dev/097760a8-135e-4789-8234-90c8837d7f1c/ask?utm_source=github)
25
+ [![Discord](https://img.shields.io/discord/1399788921306746971?color=6A7EC2&logo=discord&logoColor=ffffff)](https://docling.ai/discord)
24
26
  [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/10101/badge)](https://www.bestpractices.dev/projects/10101)
25
27
  [![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/)
26
28
 
@@ -28,17 +30,24 @@ Docling simplifies document processing, parsing diverse formats — including ad
28
30
 
29
31
  ## Features
30
32
 
31
- * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
33
+ * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, VTT, images (PNG, TIFF, JPEG, ...), and more
32
34
  * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
33
35
  * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
34
- * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
36
+ * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
35
37
  * 🔒 Local execution capabilities for sensitive data and air-gapped environments
36
38
  * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
37
39
  * 🔍 Extensive OCR support for scanned PDFs and images
38
- * 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
39
- * 🎙️ Support for Audio with Automatic Speech Recognition (ASR) models
40
+ * 👓 Support of several Visual Language Models ([GraniteDocling](https://huggingface.co/ibm-granite/granite-docling-258M))
41
+ * 🎙️ Audio support with Automatic Speech Recognition (ASR) models
42
+ * 🔌 Connect to any agent using the [MCP server](https://docling-project.github.io/docling/usage/mcp/)
40
43
  * 💻 Simple and convenient CLI
41
44
 
45
+ ### What's new
46
+ * 📤 Structured [information extraction][extraction] \[🧪 beta\]
47
+ * 📑 New layout model (**Heron**) by default, for faster PDF parsing
48
+ * 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
49
+ * 💬 Parsing of Web Video Text Tracks (WebVTT) files
50
+
42
51
  ### Coming soon
43
52
 
44
53
  * 📝 Metadata extraction, including title, authors, references & language
@@ -69,7 +78,7 @@ result = converter.convert(source)
69
78
  print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
70
79
  ```
71
80
 
72
- More [advanced usage options](https://docling-project.github.io/docling/usage/) are available in
81
+ More [advanced usage options](https://docling-project.github.io/docling/usage/advanced_options/) are available in
73
82
  the docs.
74
83
 
75
84
  ## CLI
@@ -80,9 +89,9 @@ Docling has a built-in CLI to run conversions.
80
89
  docling https://arxiv.org/pdf/2206.01062
81
90
  ```
82
91
 
83
- You can also use 🥚[SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview) and other VLMs via Docling CLI:
92
+ You can also use 🥚[GraniteDocling](https://huggingface.co/ibm-granite/granite-docling-258M) and other VLMs via Docling CLI:
84
93
  ```bash
85
- docling --pipeline vlm --vlm-model smoldocling https://arxiv.org/pdf/2206.01062
94
+ docling --pipeline vlm --vlm-model granite_docling https://arxiv.org/pdf/2206.01062
86
95
  ```
87
96
  This will use MLX acceleration on supported Apple Silicon hardware.
88
97
 
@@ -149,3 +158,4 @@ The project was started by the AI for knowledge team at IBM Research Zurich.
149
158
  [supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
150
159
  [docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
151
160
  [integrations]: https://docling-project.github.io/docling/integrations/
161
+ [extraction]: https://docling-project.github.io/docling/examples/extraction/
@@ -1,10 +1,16 @@
1
1
  from abc import ABC, abstractmethod
2
2
  from io import BytesIO
3
3
  from pathlib import Path
4
- from typing import TYPE_CHECKING, Set, Union
4
+ from typing import TYPE_CHECKING, Union
5
5
 
6
6
  from docling_core.types.doc import DoclingDocument
7
7
 
8
+ from docling.datamodel.backend_options import (
9
+ BackendOptions,
10
+ BaseBackendOptions,
11
+ DeclarativeBackendOptions,
12
+ )
13
+
8
14
  if TYPE_CHECKING:
9
15
  from docling.datamodel.base_models import InputFormat
10
16
  from docling.datamodel.document import InputDocument
@@ -12,11 +18,17 @@ if TYPE_CHECKING:
12
18
 
13
19
  class AbstractDocumentBackend(ABC):
14
20
  @abstractmethod
15
- def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
21
+ def __init__(
22
+ self,
23
+ in_doc: "InputDocument",
24
+ path_or_stream: Union[BytesIO, Path],
25
+ options: BaseBackendOptions = BaseBackendOptions(),
26
+ ):
16
27
  self.file = in_doc.file
17
28
  self.path_or_stream = path_or_stream
18
29
  self.document_hash = in_doc.document_hash
19
30
  self.input_format = in_doc.format
31
+ self.options = options
20
32
 
21
33
  @abstractmethod
22
34
  def is_valid(self) -> bool:
@@ -35,7 +47,7 @@ class AbstractDocumentBackend(ABC):
35
47
 
36
48
  @classmethod
37
49
  @abstractmethod
38
- def supported_formats(cls) -> Set["InputFormat"]:
50
+ def supported_formats(cls) -> set["InputFormat"]:
39
51
  pass
40
52
 
41
53
 
@@ -58,6 +70,15 @@ class DeclarativeDocumentBackend(AbstractDocumentBackend):
58
70
  straight without a recognition pipeline.
59
71
  """
60
72
 
73
+ @abstractmethod
74
+ def __init__(
75
+ self,
76
+ in_doc: "InputDocument",
77
+ path_or_stream: Union[BytesIO, Path],
78
+ options: BackendOptions = DeclarativeBackendOptions(),
79
+ ) -> None:
80
+ super().__init__(in_doc, path_or_stream, options)
81
+
61
82
  @abstractmethod
62
83
  def convert(self) -> DoclingDocument:
63
84
  pass
@@ -2,7 +2,7 @@ import logging
2
2
  import re
3
3
  from io import BytesIO
4
4
  from pathlib import Path
5
- from typing import Final, Set, Union
5
+ from typing import Final, Union
6
6
 
7
7
  from docling_core.types.doc import (
8
8
  DocItemLabel,
@@ -27,7 +27,7 @@ DEFAULT_IMAGE_HEIGHT: Final = 128
27
27
 
28
28
 
29
29
  class AsciiDocBackend(DeclarativeDocumentBackend):
30
- def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
30
+ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
31
31
  super().__init__(in_doc, path_or_stream)
32
32
 
33
33
  self.path_or_stream = path_or_stream
@@ -58,7 +58,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
58
58
  return
59
59
 
60
60
  @classmethod
61
- def supported_formats(cls) -> Set[InputFormat]:
61
+ def supported_formats(cls) -> set[InputFormat]:
62
62
  return {InputFormat.ASCIIDOC}
63
63
 
64
64
  def convert(self) -> DoclingDocument:
@@ -78,7 +78,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
78
78
 
79
79
  return doc
80
80
 
81
- def _parse(self, doc: DoclingDocument): # noqa: C901
81
+ def _parse(self, doc: DoclingDocument):
82
82
  """
83
83
  Main function that orchestrates the parsing by yielding components:
84
84
  title, section headers, text, lists, and tables.
@@ -12,6 +12,7 @@ from PIL import Image
12
12
  from pypdfium2 import PdfPage
13
13
 
14
14
  from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
15
+ from docling.datamodel.backend_options import PdfBackendOptions
15
16
  from docling.datamodel.base_models import Size
16
17
  from docling.utils.locks import pypdfium2_lock
17
18
 
@@ -22,15 +23,64 @@ _log = logging.getLogger(__name__)
22
23
 
23
24
 
24
25
  class DoclingParseV4PageBackend(PdfPageBackend):
25
- def __init__(self, parsed_page: SegmentedPdfPage, page_obj: PdfPage):
26
+ def __init__(
27
+ self,
28
+ *,
29
+ dp_doc: PdfDocument,
30
+ page_obj: PdfPage,
31
+ page_no: int,
32
+ create_words: bool = True,
33
+ create_textlines: bool = True,
34
+ keep_chars: bool = False,
35
+ keep_lines: bool = False,
36
+ keep_images: bool = True,
37
+ ):
26
38
  self._ppage = page_obj
27
- self._dpage = parsed_page
28
- self.valid = parsed_page is not None
39
+ self._dp_doc = dp_doc
40
+ self._page_no = page_no
41
+
42
+ self._create_words = create_words
43
+ self._create_textlines = create_textlines
44
+
45
+ self._keep_chars = keep_chars
46
+ self._keep_lines = keep_lines
47
+ self._keep_images = keep_images
48
+
49
+ self._dpage: Optional[SegmentedPdfPage] = None
50
+ self._unloaded = False
51
+ self.valid = (self._ppage is not None) and (self._dp_doc is not None)
52
+
53
+ def _ensure_parsed(self) -> None:
54
+ if self._dpage is not None:
55
+ return
56
+
57
+ seg_page = self._dp_doc.get_page(
58
+ self._page_no + 1,
59
+ keep_chars=self._keep_chars,
60
+ keep_lines=self._keep_lines,
61
+ keep_bitmaps=self._keep_images,
62
+ create_words=self._create_words,
63
+ create_textlines=self._create_textlines,
64
+ enforce_same_font=True,
65
+ )
66
+
67
+ # In Docling, all TextCell instances are expected with top-left origin.
68
+ [
69
+ tc.to_top_left_origin(seg_page.dimension.height)
70
+ for tc in seg_page.textline_cells
71
+ ]
72
+ [tc.to_top_left_origin(seg_page.dimension.height) for tc in seg_page.char_cells]
73
+ [tc.to_top_left_origin(seg_page.dimension.height) for tc in seg_page.word_cells]
74
+
75
+ self._dpage = seg_page
29
76
 
30
77
  def is_valid(self) -> bool:
31
78
  return self.valid
32
79
 
33
80
  def get_text_in_rect(self, bbox: BoundingBox) -> str:
81
+ self._ensure_parsed()
82
+ assert self._dpage is not None
83
+
34
84
  # Find intersecting cells on the page
35
85
  text_piece = ""
36
86
  page_size = self.get_size()
@@ -56,12 +106,19 @@ class DoclingParseV4PageBackend(PdfPageBackend):
56
106
  return text_piece
57
107
 
58
108
  def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
109
+ self._ensure_parsed()
59
110
  return self._dpage
60
111
 
61
112
  def get_text_cells(self) -> Iterable[TextCell]:
113
+ self._ensure_parsed()
114
+ assert self._dpage is not None
115
+
62
116
  return self._dpage.textline_cells
63
117
 
64
118
  def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
119
+ self._ensure_parsed()
120
+ assert self._dpage is not None
121
+
65
122
  AREA_THRESHOLD = 0 # 32 * 32
66
123
 
67
124
  images = self._dpage.bitmap_resources
@@ -123,18 +180,33 @@ class DoclingParseV4PageBackend(PdfPageBackend):
123
180
  # )
124
181
 
125
182
  def unload(self):
183
+ if not self._unloaded and self._dp_doc is not None:
184
+ self._dp_doc.unload_pages((self._page_no + 1, self._page_no + 2))
185
+ self._unloaded = True
186
+
126
187
  self._ppage = None
127
188
  self._dpage = None
189
+ self._dp_doc = None
128
190
 
129
191
 
130
192
  class DoclingParseV4DocumentBackend(PdfDocumentBackend):
131
- def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
132
- super().__init__(in_doc, path_or_stream)
133
-
193
+ def __init__(
194
+ self,
195
+ in_doc: "InputDocument",
196
+ path_or_stream: Union[BytesIO, Path],
197
+ options: PdfBackendOptions = PdfBackendOptions(),
198
+ ):
199
+ super().__init__(in_doc, path_or_stream, options)
200
+
201
+ password = (
202
+ self.options.password.get_secret_value() if self.options.password else None
203
+ )
134
204
  with pypdfium2_lock:
135
- self._pdoc = pdfium.PdfDocument(self.path_or_stream)
205
+ self._pdoc = pdfium.PdfDocument(self.path_or_stream, password=password)
136
206
  self.parser = DoclingPdfParser(loglevel="fatal")
137
- self.dp_doc: PdfDocument = self.parser.load(path_or_stream=self.path_or_stream)
207
+ self.dp_doc: PdfDocument = self.parser.load(
208
+ path_or_stream=self.path_or_stream, password=password
209
+ )
138
210
  success = self.dp_doc is not None
139
211
 
140
212
  if not success:
@@ -157,37 +229,32 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
157
229
  self, page_no: int, create_words: bool = True, create_textlines: bool = True
158
230
  ) -> DoclingParseV4PageBackend:
159
231
  with pypdfium2_lock:
160
- seg_page = self.dp_doc.get_page(
161
- page_no + 1,
162
- create_words=create_words,
163
- create_textlines=create_textlines,
164
- )
165
-
166
- # In Docling, all TextCell instances are expected with top-left origin.
167
- [
168
- tc.to_top_left_origin(seg_page.dimension.height)
169
- for tc in seg_page.textline_cells
170
- ]
171
- [
172
- tc.to_top_left_origin(seg_page.dimension.height)
173
- for tc in seg_page.char_cells
174
- ]
175
- [
176
- tc.to_top_left_origin(seg_page.dimension.height)
177
- for tc in seg_page.word_cells
178
- ]
179
-
180
- return DoclingParseV4PageBackend(
181
- seg_page,
182
- self._pdoc[page_no],
183
- )
232
+ ppage = self._pdoc[page_no]
233
+
234
+ return DoclingParseV4PageBackend(
235
+ dp_doc=self.dp_doc,
236
+ page_obj=ppage,
237
+ page_no=page_no,
238
+ create_words=create_words,
239
+ create_textlines=create_textlines,
240
+ )
184
241
 
185
242
  def is_valid(self) -> bool:
186
243
  return self.page_count() > 0
187
244
 
188
245
  def unload(self):
189
246
  super().unload()
190
- self.dp_doc.unload()
191
- with pypdfium2_lock:
192
- self._pdoc.close()
193
- self._pdoc = None
247
+ # Unload docling-parse document first
248
+ if self.dp_doc is not None:
249
+ self.dp_doc.unload()
250
+ self.dp_doc = None
251
+
252
+ # Then close pypdfium2 document with proper locking
253
+ if self._pdoc is not None:
254
+ with pypdfium2_lock:
255
+ try:
256
+ self._pdoc.close()
257
+ except Exception:
258
+ # Ignore cleanup errors
259
+ pass
260
+ self._pdoc = None
@@ -0,0 +1,131 @@
1
+ import os
2
+ import shutil
3
+ import subprocess
4
+ from pathlib import Path
5
+ from tempfile import mkdtemp
6
+ from typing import Callable, Optional
7
+
8
+ import pypdfium2
9
+ from docx.document import Document
10
+ from PIL import Image, ImageChops
11
+
12
+
13
+ def get_libreoffice_cmd(raise_if_unavailable: bool = False) -> Optional[str]:
14
+ """Return the libreoffice cmd and optionally test it."""
15
+
16
+ libreoffice_cmd = (
17
+ shutil.which("libreoffice")
18
+ or shutil.which("soffice")
19
+ or (
20
+ "/Applications/LibreOffice.app/Contents/MacOS/soffice"
21
+ if os.path.isfile("/Applications/LibreOffice.app/Contents/MacOS/soffice")
22
+ else None
23
+ )
24
+ )
25
+
26
+ if raise_if_unavailable:
27
+ if libreoffice_cmd is None:
28
+ raise RuntimeError("Libreoffice not found")
29
+
30
+ # The following test will raise if the libreoffice_cmd cannot be used
31
+ subprocess.run(
32
+ [
33
+ libreoffice_cmd,
34
+ "-h",
35
+ ],
36
+ stdout=subprocess.DEVNULL,
37
+ stderr=subprocess.DEVNULL,
38
+ check=True,
39
+ )
40
+
41
+ return libreoffice_cmd
42
+
43
+
44
+ def get_docx_to_pdf_converter() -> Optional[Callable]:
45
+ """
46
+ Detects the best available DOCX to PDF tool and returns a conversion function.
47
+ The returned function accepts (input_path, output_path).
48
+ Returns None if no tool is available.
49
+ """
50
+
51
+ # Try LibreOffice
52
+ libreoffice_cmd = get_libreoffice_cmd()
53
+
54
+ if libreoffice_cmd:
55
+
56
+ def convert_with_libreoffice(input_path, output_path):
57
+ subprocess.run(
58
+ [
59
+ libreoffice_cmd,
60
+ "--headless",
61
+ "--convert-to",
62
+ "pdf",
63
+ "--outdir",
64
+ os.path.dirname(output_path),
65
+ input_path,
66
+ ],
67
+ stdout=subprocess.DEVNULL,
68
+ stderr=subprocess.DEVNULL,
69
+ check=True,
70
+ )
71
+
72
+ expected_output = os.path.join(
73
+ os.path.dirname(output_path),
74
+ os.path.splitext(os.path.basename(input_path))[0] + ".pdf",
75
+ )
76
+ if expected_output != output_path:
77
+ os.rename(expected_output, output_path)
78
+
79
+ return convert_with_libreoffice
80
+
81
+ ## Space for other DOCX to PDF converters if available
82
+
83
+ # No tools found
84
+ return None
85
+
86
+
87
def crop_whitespace(image: Image.Image, bg_color=None, padding=0) -> Image.Image:
    """Crop away the uniform background surrounding the content of ``image``.

    Args:
        image: The image to crop.
        bg_color: Colour treated as background. When ``None``, the colour of
            the pixel at (0, 0) is used.
        padding: Number of pixels kept around the detected content; the
            resulting box is clamped to the image bounds.

    Returns:
        The cropped image, or ``image`` itself when no bounding box of
        pixels differing from the background is found.
    """
    background_color = image.getpixel((0, 0)) if bg_color is None else bg_color

    # Every pixel equal to the background becomes zero in the difference
    # image, so its bounding box is used as the extent of the content.
    background = Image.new(image.mode, image.size, background_color)
    content_box = ImageChops.difference(image, background).getbbox()

    if not content_box:
        return image

    x0, y0, x1, y1 = content_box
    crop_box = (
        max(0, x0 - padding),
        max(0, y0 - padding),
        min(image.width, x1 + padding),
        min(image.height, y1 + padding),
    )
    return image.crop(crop_box)
104
+
105
+
106
def get_pil_from_dml_docx(
    docx: Document, converter: Optional[Callable]
) -> Optional[Image.Image]:
    """Render the first page of ``docx`` to a whitespace-cropped PIL image.

    The document is saved into a fresh temporary directory, converted to PDF
    with ``converter`` and the first PDF page is rendered at scale 2.

    Args:
        docx: Document to render; it must provide ``save(path)``.
        converter: Callable taking ``(input_path, output_path)`` that writes
            the PDF, or ``None`` when no conversion tool is available.

    Returns:
        The cropped image, or ``None`` when ``converter`` is ``None``.

    Raises:
        Any exception raised while saving, converting or rendering is
        propagated; the temporary directory and the PDF handles are released
        in every case.
    """
    if converter is None:
        return None

    temp_dir = Path(mkdtemp())
    try:
        temp_docx = temp_dir / "drawing_only.docx"
        temp_pdf = temp_dir / "drawing_only.pdf"

        # 1) Save docx temporarily
        docx.save(str(temp_docx))

        # 2) Export to PDF
        converter(temp_docx, temp_pdf)

        # 3) Render the first PDF page to a PIL image
        pdf = pypdfium2.PdfDocument(temp_pdf)
        try:
            page = pdf[0]
            try:
                # The image is produced before the page and document are
                # closed, in the same order as the straight-line version.
                return crop_whitespace(page.render(scale=2).to_pil())
            finally:
                page.close()
        finally:
            pdf.close()
    finally:
        # Always remove the scratch directory, even when a step above failed.
        shutil.rmtree(temp_dir, ignore_errors=True)
@@ -65,6 +65,11 @@ CHR_BO = {
65
65
  "\u2210": "\\coprod",
66
66
  "\u2211": "\\sum",
67
67
  "\u222b": "\\int",
68
+ "\u222c": "\\iint",
69
+ "\u222d": "\\iiint",
70
+ "\u222e": "\\oint",
71
+ "\u222f": "\\oiint",
72
+ "\u2230": "\\oiiint",
68
73
  "\u22c0": "\\bigwedge",
69
74
  "\u22c1": "\\bigvee",
70
75
  "\u22c2": "\\bigcap",
@@ -260,7 +260,15 @@ class oMath2Latex(Tag2Method):
260
260
  the fraction object
261
261
  """
262
262
  c_dict = self.process_children_dict(elm)
263
- pr = c_dict["fPr"]
263
+ pr = c_dict.get("fPr")
264
+ if pr is None:
265
+ # Handle missing fPr element gracefully
266
+ _log.debug("Missing fPr element in fraction, using default formatting")
267
+ latex_s = F_DEFAULT
268
+ return latex_s.format(
269
+ num=c_dict.get("num"),
270
+ den=c_dict.get("den"),
271
+ )
264
272
  latex_s = get_val(pr.type, default=F_DEFAULT, store=F)
265
273
  return pr.text + latex_s.format(num=c_dict.get("num"), den=c_dict.get("den"))
266
274
 
@@ -373,7 +381,8 @@ class oMath2Latex(Tag2Method):
373
381
  bo = ""
374
382
  for stag, t, e in self.process_children_list(elm):
375
383
  if stag == "naryPr":
376
- bo = get_val(t.chr, store=CHR_BO)
384
+ # if <m:naryPr> contains no <m:chr>, the n-ary represents an integral
385
+ bo = get_val(t.chr, default="\\int", store=CHR_BO)
377
386
  else:
378
387
  res.append(t)
379
388
  return bo + BLANK.join(res)