docling 1.2.0__tar.gz → 1.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. {docling-1.2.0 → docling-1.2.1}/PKG-INFO +8 -3
  2. {docling-1.2.0 → docling-1.2.1}/README.md +6 -2
  3. {docling-1.2.0 → docling-1.2.1}/docling/backend/abstract_backend.py +1 -1
  4. {docling-1.2.0 → docling-1.2.1}/docling/backend/docling_parse_backend.py +3 -2
  5. {docling-1.2.0 → docling-1.2.1}/docling/backend/pypdfium2_backend.py +1 -1
  6. {docling-1.2.0 → docling-1.2.1}/pyproject.toml +2 -1
  7. {docling-1.2.0 → docling-1.2.1}/LICENSE +0 -0
  8. {docling-1.2.0 → docling-1.2.1}/docling/__init__.py +0 -0
  9. {docling-1.2.0 → docling-1.2.1}/docling/backend/__init__.py +0 -0
  10. {docling-1.2.0 → docling-1.2.1}/docling/datamodel/__init__.py +0 -0
  11. {docling-1.2.0 → docling-1.2.1}/docling/datamodel/base_models.py +0 -0
  12. {docling-1.2.0 → docling-1.2.1}/docling/datamodel/document.py +0 -0
  13. {docling-1.2.0 → docling-1.2.1}/docling/datamodel/settings.py +0 -0
  14. {docling-1.2.0 → docling-1.2.1}/docling/document_converter.py +0 -0
  15. {docling-1.2.0 → docling-1.2.1}/docling/models/__init__.py +0 -0
  16. {docling-1.2.0 → docling-1.2.1}/docling/models/ds_glm_model.py +0 -0
  17. {docling-1.2.0 → docling-1.2.1}/docling/models/easyocr_model.py +0 -0
  18. {docling-1.2.0 → docling-1.2.1}/docling/models/layout_model.py +0 -0
  19. {docling-1.2.0 → docling-1.2.1}/docling/models/page_assemble_model.py +0 -0
  20. {docling-1.2.0 → docling-1.2.1}/docling/models/table_structure_model.py +0 -0
  21. {docling-1.2.0 → docling-1.2.1}/docling/pipeline/__init__.py +0 -0
  22. {docling-1.2.0 → docling-1.2.1}/docling/pipeline/base_model_pipeline.py +0 -0
  23. {docling-1.2.0 → docling-1.2.1}/docling/pipeline/standard_model_pipeline.py +0 -0
  24. {docling-1.2.0 → docling-1.2.1}/docling/utils/__init__.py +0 -0
  25. {docling-1.2.0 → docling-1.2.1}/docling/utils/layout_utils.py +0 -0
  26. {docling-1.2.0 → docling-1.2.1}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 1.2.0
3
+ Version: 1.2.1
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -21,6 +21,7 @@ Classifier: Programming Language :: Python :: 3.12
21
21
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
22
  Provides-Extra: easyocr
23
23
  Provides-Extra: ocr
24
+ Requires-Dist: certifi (>=2024.7.4)
24
25
  Requires-Dist: deepsearch-glm (>=0.19.0,<1)
25
26
  Requires-Dist: docling-core (>=1.1.2,<2.0.0)
26
27
  Requires-Dist: docling-ibm-models (>=1.1.0,<2.0.0)
@@ -93,17 +94,21 @@ print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotate
93
94
 
94
95
  ### Convert a batch of documents
95
96
 
96
- For an example of batch-converting documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).
97
+ For an example of batch-converting documents, see [batch_convert.py](https://github.com/DS4SD/docling/blob/main/examples/batch_convert.py).
97
98
 
98
99
  From a local repo clone, you can run it with:
99
100
 
100
101
  ```
101
- python examples/convert.py
102
+ python examples/batch_convert.py
102
103
  ```
103
104
  The output of the above command will be written to `./scratch`.
104
105
 
105
106
  ### Adjust pipeline features
106
107
 
108
+ The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) contains multiple ways
109
+ one can adjust the conversion pipeline and features.
110
+
111
+
107
112
  #### Control pipeline options
108
113
 
109
114
  You can control if table structure recognition or OCR should be performed by arguments passed to `DocumentConverter`:
@@ -56,17 +56,21 @@ print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotate
56
56
 
57
57
  ### Convert a batch of documents
58
58
 
59
- For an example of batch-converting documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).
59
+ For an example of batch-converting documents, see [batch_convert.py](https://github.com/DS4SD/docling/blob/main/examples/batch_convert.py).
60
60
 
61
61
  From a local repo clone, you can run it with:
62
62
 
63
63
  ```
64
- python examples/convert.py
64
+ python examples/batch_convert.py
65
65
  ```
66
66
  The output of the above command will be written to `./scratch`.
67
67
 
68
68
  ### Adjust pipeline features
69
69
 
70
+ The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) contains multiple ways
71
+ one can adjust the conversion pipeline and features.
72
+
73
+
70
74
  #### Control pipeline options
71
75
 
72
76
  You can control if table structure recognition or OCR should be performed by arguments passed to `DocumentConverter`:
@@ -35,7 +35,7 @@ class PdfPageBackend(ABC):
35
35
 
36
36
  class PdfDocumentBackend(ABC):
37
37
  @abstractmethod
38
- def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
38
+ def __init__(self, path_or_stream: Union[BytesIO, Path]):
39
39
  pass
40
40
 
41
41
  @abstractmethod
@@ -146,11 +146,12 @@ class DoclingParsePageBackend(PdfPageBackend):
146
146
 
147
147
 
148
148
  class DoclingParseDocumentBackend(PdfDocumentBackend):
149
- def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
149
+ def __init__(self, path_or_stream: Union[BytesIO, Path]):
150
150
  super().__init__(path_or_stream)
151
151
  self._pdoc = pdfium.PdfDocument(path_or_stream)
152
152
  # Parsing cells with docling_parser call
153
- print("PARSING WITH DOCLING PARSE")
153
+ if isinstance(path_or_stream, BytesIO):
154
+ raise NotImplemented("This backend does not support byte streams yet.")
154
155
  parser = pdf_parser()
155
156
  self._parser_doc = parser.find_cells(str(path_or_stream))
156
157
 
@@ -199,7 +199,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
199
199
 
200
200
 
201
201
  class PyPdfiumDocumentBackend(PdfDocumentBackend):
202
- def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
202
+ def __init__(self, path_or_stream: Union[BytesIO, Path]):
203
203
  super().__init__(path_or_stream)
204
204
  self._pdoc = pdfium.PdfDocument(path_or_stream)
205
205
 
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "1.2.0" # DO NOT EDIT, updated automatically
3
+ version = "1.2.1" # DO NOT EDIT, updated automatically
4
4
  description = "Docling PDF conversion package"
5
5
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
@@ -33,6 +33,7 @@ huggingface_hub = ">=0.23,<1"
33
33
  requests = "^2.32.3"
34
34
  easyocr = { version = "^1.7", optional = true }
35
35
  docling-parse = "^0.0.1"
36
+ certifi = ">=2024.7.4"
36
37
 
37
38
  [tool.poetry.group.dev.dependencies]
38
39
  black = {extras = ["jupyter"], version = "^24.4.2"}
File without changes
File without changes
File without changes