docling-1.9.0.tar.gz → docling-1.11.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. {docling-1.9.0 → docling-1.11.0}/LICENSE +1 -1
  2. {docling-1.9.0 → docling-1.11.0}/PKG-INFO +15 -11
  3. {docling-1.9.0 → docling-1.11.0}/README.md +9 -5
  4. {docling-1.9.0 → docling-1.11.0}/docling/datamodel/document.py +72 -3
  5. {docling-1.9.0 → docling-1.11.0}/docling/models/ds_glm_model.py +1 -1
  6. {docling-1.9.0 → docling-1.11.0}/docling/utils/export.py +5 -1
  7. {docling-1.9.0 → docling-1.11.0}/pyproject.toml +6 -6
  8. {docling-1.9.0 → docling-1.11.0}/docling/__init__.py +0 -0
  9. {docling-1.9.0 → docling-1.11.0}/docling/backend/__init__.py +0 -0
  10. {docling-1.9.0 → docling-1.11.0}/docling/backend/abstract_backend.py +0 -0
  11. {docling-1.9.0 → docling-1.11.0}/docling/backend/docling_parse_backend.py +0 -0
  12. {docling-1.9.0 → docling-1.11.0}/docling/backend/pypdfium2_backend.py +0 -0
  13. {docling-1.9.0 → docling-1.11.0}/docling/datamodel/__init__.py +0 -0
  14. {docling-1.9.0 → docling-1.11.0}/docling/datamodel/base_models.py +0 -0
  15. {docling-1.9.0 → docling-1.11.0}/docling/datamodel/settings.py +0 -0
  16. {docling-1.9.0 → docling-1.11.0}/docling/document_converter.py +0 -0
  17. {docling-1.9.0 → docling-1.11.0}/docling/models/__init__.py +0 -0
  18. {docling-1.9.0 → docling-1.11.0}/docling/models/base_ocr_model.py +0 -0
  19. {docling-1.9.0 → docling-1.11.0}/docling/models/easyocr_model.py +0 -0
  20. {docling-1.9.0 → docling-1.11.0}/docling/models/layout_model.py +0 -0
  21. {docling-1.9.0 → docling-1.11.0}/docling/models/page_assemble_model.py +0 -0
  22. {docling-1.9.0 → docling-1.11.0}/docling/models/table_structure_model.py +0 -0
  23. {docling-1.9.0 → docling-1.11.0}/docling/pipeline/__init__.py +0 -0
  24. {docling-1.9.0 → docling-1.11.0}/docling/pipeline/base_model_pipeline.py +0 -0
  25. {docling-1.9.0 → docling-1.11.0}/docling/pipeline/standard_model_pipeline.py +0 -0
  26. {docling-1.9.0 → docling-1.11.0}/docling/utils/__init__.py +0 -0
  27. {docling-1.9.0 → docling-1.11.0}/docling/utils/layout_utils.py +0 -0
  28. {docling-1.9.0 → docling-1.11.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  MIT License
2
2
 
3
- Copyright (c) [year] [fullname]
3
+ Copyright (c) 2024 International Business Machines
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 1.9.0
3
+ Version: 1.11.0
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -20,14 +20,14 @@ Classifier: Programming Language :: Python :: 3.11
20
20
  Classifier: Programming Language :: Python :: 3.12
21
21
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
22
  Requires-Dist: certifi (>=2024.7.4)
23
- Requires-Dist: deepsearch-glm (>=0.19.1,<0.20.0)
24
- Requires-Dist: docling-core (>=1.1.3,<2.0.0)
25
- Requires-Dist: docling-ibm-models (>=1.1.3,<2.0.0)
26
- Requires-Dist: docling-parse (>=1.1.3,<2.0.0)
23
+ Requires-Dist: deepsearch-glm (>=0.21.0,<0.22.0)
24
+ Requires-Dist: docling-core (>=1.2.0,<2.0.0)
25
+ Requires-Dist: docling-ibm-models (>=1.1.7,<2.0.0)
26
+ Requires-Dist: docling-parse (>=1.2.0,<2.0.0)
27
27
  Requires-Dist: easyocr (>=1.7,<2.0)
28
28
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
29
29
  Requires-Dist: huggingface_hub (>=0.23,<1)
30
- Requires-Dist: pyarrow (>=17.0.0,<18.0.0)
30
+ Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
31
31
  Requires-Dist: pydantic (>=2.0.0,<3.0.0)
32
32
  Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
33
33
  Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
@@ -63,7 +63,7 @@ Docling bundles PDF document conversion to JSON and Markdown in an easy, self-co
63
63
  * 📝 Extracts metadata from the document, such as title, authors, references and language
64
64
  * 🔍 Optionally applies OCR (use with scanned PDFs)
65
65
 
66
- Doing RAG or Q/A? Also consider [Quackling](https://github.com/DS4SD/quackling) to get the most out of your documents.
66
+ For RAG, check out [Quackling](https://github.com/DS4SD/quackling) to get the most out of your docs, be it using LlamaIndex, LangChain or your pipeline.
67
67
 
68
68
  ## Installation
69
69
 
@@ -183,6 +183,10 @@ results = doc_converter.convert(conv_input)
183
183
  You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads.
184
184
 
185
185
 
186
+ ## Technical report
187
+
188
+ For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
189
+
186
190
  ## Contributing
187
191
 
188
192
  Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
@@ -196,10 +200,10 @@ If you use Docling in your projects, please consider citing the following:
196
200
  @techreport{Docling,
197
201
  author = {Deep Search Team},
198
202
  month = {8},
199
- title = {{Docling Technical Report}},
200
- url={https://arxiv.org/abs/2408.09869},
201
- eprint={2408.09869},
202
- doi = "10.48550/arXiv.2408.09869",
203
+ title = {Docling Technical Report},
204
+ url = {https://arxiv.org/abs/2408.09869},
205
+ eprint = {2408.09869},
206
+ doi = {10.48550/arXiv.2408.09869},
203
207
  version = {1.0.0},
204
208
  year = {2024}
205
209
  }
@@ -24,7 +24,7 @@ Docling bundles PDF document conversion to JSON and Markdown in an easy, self-co
24
24
  * 📝 Extracts metadata from the document, such as title, authors, references and language
25
25
  * 🔍 Optionally applies OCR (use with scanned PDFs)
26
26
 
27
- Doing RAG or Q/A? Also consider [Quackling](https://github.com/DS4SD/quackling) to get the most out of your documents.
27
+ For RAG, check out [Quackling](https://github.com/DS4SD/quackling) to get the most out of your docs, be it using LlamaIndex, LangChain or your pipeline.
28
28
 
29
29
  ## Installation
30
30
 
@@ -144,6 +144,10 @@ results = doc_converter.convert(conv_input)
144
144
  You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads.
145
145
 
146
146
 
147
+ ## Technical report
148
+
149
+ For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
150
+
147
151
  ## Contributing
148
152
 
149
153
  Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
@@ -157,10 +161,10 @@ If you use Docling in your projects, please consider citing the following:
157
161
  @techreport{Docling,
158
162
  author = {Deep Search Team},
159
163
  month = {8},
160
- title = {{Docling Technical Report}},
161
- url={https://arxiv.org/abs/2408.09869},
162
- eprint={2408.09869},
163
- doi = "10.48550/arXiv.2408.09869",
164
+ title = {Docling Technical Report},
165
+ url = {https://arxiv.org/abs/2408.09869},
166
+ eprint = {2408.09869},
167
+ doi = {10.48550/arXiv.2408.09869},
164
168
  version = {1.0.0},
165
169
  year = {2024}
166
170
  }
@@ -11,6 +11,7 @@ from docling_core.types import FileInfoObject as DsFileInfoObject
11
11
  from docling_core.types import PageDimensions, PageReference, Prov, Ref
12
12
  from docling_core.types import Table as DsSchemaTable
13
13
  from docling_core.types import TableCell
14
+ from docling_core.types.doc.base import Figure
14
15
  from pydantic import BaseModel
15
16
  from typing_extensions import deprecated
16
17
 
@@ -279,7 +280,7 @@ class ConvertedDocument(BaseModel):
279
280
  ),
280
281
  )
281
282
  figures.append(
282
- BaseCell(
283
+ Figure(
283
284
  prov=[
284
285
  Prov(
285
286
  bbox=target_bbox,
@@ -312,8 +313,76 @@ class ConvertedDocument(BaseModel):
312
313
  def render_as_dict(self):
313
314
  return self.output.model_dump(by_alias=True, exclude_none=True)
314
315
 
315
- def render_as_markdown(self):
316
- return self.output.export_to_markdown()
316
+ def render_as_markdown(
317
+ self,
318
+ delim: str = "\n\n",
319
+ main_text_start: int = 0,
320
+ main_text_stop: Optional[int] = None,
321
+ main_text_labels: list[str] = [
322
+ "title",
323
+ "subtitle-level-1",
324
+ "paragraph",
325
+ "caption",
326
+ "table",
327
+ ],
328
+ strict_text: bool = False,
329
+ ):
330
+ return self.output.export_to_markdown(
331
+ delim=delim,
332
+ main_text_start=main_text_start,
333
+ main_text_stop=main_text_stop,
334
+ main_text_labels=main_text_labels,
335
+ strict_text=strict_text,
336
+ )
337
+
338
+ def render_as_text(
339
+ self,
340
+ delim: str = "\n\n",
341
+ main_text_start: int = 0,
342
+ main_text_stop: Optional[int] = None,
343
+ main_text_labels: list[str] = [
344
+ "title",
345
+ "subtitle-level-1",
346
+ "paragraph",
347
+ "caption",
348
+ ],
349
+ ):
350
+ return self.output.export_to_markdown(
351
+ delim=delim,
352
+ main_text_start=main_text_start,
353
+ main_text_stop=main_text_stop,
354
+ main_text_labels=main_text_labels,
355
+ strict_text=True,
356
+ )
357
+
358
+ def render_as_doctags(
359
+ self,
360
+ delim: str = "\n\n",
361
+ main_text_start: int = 0,
362
+ main_text_stop: Optional[int] = None,
363
+ main_text_labels: list[str] = [
364
+ "title",
365
+ "subtitle-level-1",
366
+ "paragraph",
367
+ "caption",
368
+ "table",
369
+ "figure",
370
+ ],
371
+ page_tagging: bool = True,
372
+ location_tagging: bool = True,
373
+ location_dimensions: Tuple[int, int] = (100, 100),
374
+ add_new_line: bool = True,
375
+ ) -> str:
376
+ return self.output.export_to_document_tokens(
377
+ delim=delim,
378
+ main_text_start=main_text_start,
379
+ main_text_stop=main_text_stop,
380
+ main_text_labels=main_text_labels,
381
+ page_tagging=page_tagging,
382
+ location_tagging=location_tagging,
383
+ location_dimensions=location_dimensions,
384
+ add_new_line=add_new_line,
385
+ )
317
386
 
318
387
  def render_element_images(
319
388
  self, element_types: Tuple[PageElement] = (FigureElement,)
@@ -2,7 +2,7 @@ import copy
2
2
  import random
3
3
 
4
4
  from deepsearch_glm.nlp_utils import init_nlp_model
5
- from deepsearch_glm.utils.ds_utils import to_legacy_document_format
5
+ from deepsearch_glm.utils.doc_utils import to_legacy_document_format
6
6
  from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
7
7
  from docling_core.types import BaseText
8
8
  from docling_core.types import Document as DsDocument
@@ -163,8 +163,12 @@ def generate_multimodal_pages(
163
163
  content_md = doc.export_to_markdown(
164
164
  main_text_start=start_ix, main_text_stop=end_ix
165
165
  )
166
+ # No page-tagging since we only do 1 page at the time
167
+ content_dt = doc.export_to_document_tokens(
168
+ main_text_start=start_ix, main_text_stop=end_ix, page_tagging=False
169
+ )
166
170
 
167
- return content_text, content_md, page_cells, page_segments, page
171
+ return content_text, content_md, content_dt, page_cells, page_segments, page
168
172
 
169
173
  for ix, orig_item in enumerate(doc.main_text):
170
174
 
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "1.9.0" # DO NOT EDIT, updated automatically
3
+ version = "1.11.0" # DO NOT EDIT, updated automatically
4
4
  description = "Docling PDF conversion package"
5
5
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
@@ -23,20 +23,20 @@ packages = [{include = "docling"}]
23
23
  [tool.poetry.dependencies]
24
24
  python = "^3.10"
25
25
  pydantic = "^2.0.0"
26
- docling-core = "^1.1.3"
27
- docling-ibm-models = "^1.1.3"
28
- deepsearch-glm = "^0.19.1"
26
+ docling-core = "^1.2.0"
27
+ docling-ibm-models = "^1.1.7"
28
+ deepsearch-glm = "^0.21.0"
29
29
  filetype = "^1.2.0"
30
30
  pypdfium2 = "^4.30.0"
31
31
  pydantic-settings = "^2.3.0"
32
32
  huggingface_hub = ">=0.23,<1"
33
33
  requests = "^2.32.3"
34
34
  easyocr = "^1.7"
35
- docling-parse = "^1.1.3"
35
+ docling-parse = "^1.2.0"
36
36
  certifi = ">=2024.7.4"
37
37
  rtree = "^1.3.0"
38
38
  scipy = "^1.14.1"
39
- pyarrow = "^17.0.0"
39
+ pyarrow = "^16.1.0"
40
40
 
41
41
  [tool.poetry.group.dev.dependencies]
42
42
  black = {extras = ["jupyter"], version = "^24.4.2"}
File without changes
File without changes