docling-jobkit 1.7.1__py3-none-any.whl → 1.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling_jobkit/convert/manager.py +74 -4
- {docling_jobkit-1.7.1.dist-info → docling_jobkit-1.8.1.dist-info}/METADATA +3 -3
- {docling_jobkit-1.7.1.dist-info → docling_jobkit-1.8.1.dist-info}/RECORD +6 -6
- {docling_jobkit-1.7.1.dist-info → docling_jobkit-1.8.1.dist-info}/WHEEL +1 -1
- {docling_jobkit-1.7.1.dist-info → docling_jobkit-1.8.1.dist-info}/entry_points.txt +0 -0
- {docling_jobkit-1.7.1.dist-info → docling_jobkit-1.8.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -31,7 +31,12 @@ from docling.datamodel.pipeline_options import (
|
|
|
31
31
|
VlmPipelineOptions,
|
|
32
32
|
)
|
|
33
33
|
from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions, InlineVlmOptions
|
|
34
|
-
from docling.document_converter import
|
|
34
|
+
from docling.document_converter import (
|
|
35
|
+
DocumentConverter,
|
|
36
|
+
FormatOption,
|
|
37
|
+
ImageFormatOption,
|
|
38
|
+
PdfFormatOption,
|
|
39
|
+
)
|
|
35
40
|
from docling.models.factories import get_ocr_factory
|
|
36
41
|
from docling.pipeline.vlm_pipeline import VlmPipeline
|
|
37
42
|
from docling_core.types.doc import ImageRefMode
|
|
@@ -50,6 +55,13 @@ class DoclingConverterManagerConfig(BaseModel):
|
|
|
50
55
|
max_num_pages: int = sys.maxsize
|
|
51
56
|
max_file_size: int = sys.maxsize
|
|
52
57
|
|
|
58
|
+
# Threading pipeline
|
|
59
|
+
queue_max_size: Optional[int] = None
|
|
60
|
+
ocr_batch_size: Optional[int] = None
|
|
61
|
+
layout_batch_size: Optional[int] = None
|
|
62
|
+
table_batch_size: Optional[int] = None
|
|
63
|
+
batch_polling_interval_seconds: Optional[float] = None
|
|
64
|
+
|
|
53
65
|
|
|
54
66
|
# Custom serializer for PdfFormatOption
|
|
55
67
|
# (model_dump_json does not work with some classes)
|
|
@@ -61,12 +73,28 @@ def _hash_pdf_format_option(pdf_format_option: PdfFormatOption) -> bytes:
|
|
|
61
73
|
data["pipeline_options"] = pdf_format_option.pipeline_options.model_dump(
|
|
62
74
|
serialize_as_any=True, mode="json"
|
|
63
75
|
)
|
|
76
|
+
data["pipeline_options_type"] = (
|
|
77
|
+
f"{pdf_format_option.pipeline_options.__class__.__module__}."
|
|
78
|
+
f"{pdf_format_option.pipeline_options.__class__.__qualname__}"
|
|
79
|
+
)
|
|
80
|
+
else:
|
|
81
|
+
data["pipeline_options_type"] = None
|
|
64
82
|
|
|
65
83
|
# Replace `pipeline_cls` with a string representation
|
|
66
|
-
|
|
84
|
+
pipeline_cls = pdf_format_option.pipeline_cls
|
|
85
|
+
data["pipeline_cls"] = (
|
|
86
|
+
f"{pipeline_cls.__module__}.{pipeline_cls.__qualname__}"
|
|
87
|
+
if pipeline_cls is not None
|
|
88
|
+
else "None"
|
|
89
|
+
)
|
|
67
90
|
|
|
68
91
|
# Replace `backend` with a string representation
|
|
69
|
-
|
|
92
|
+
backend = pdf_format_option.backend
|
|
93
|
+
data["backend"] = (
|
|
94
|
+
f"{backend.__module__}.{backend.__qualname__}"
|
|
95
|
+
if backend is not None
|
|
96
|
+
else "None"
|
|
97
|
+
)
|
|
70
98
|
|
|
71
99
|
# Serialize the dictionary to JSON with sorted keys to have consistent hashes
|
|
72
100
|
serialized_data = json.dumps(data, sort_keys=True)
|
|
@@ -114,9 +142,19 @@ class DoclingConverterManager:
|
|
|
114
142
|
@lru_cache(maxsize=cache_size)
|
|
115
143
|
def _get_converter_from_hash(options_hash: bytes) -> DocumentConverter:
|
|
116
144
|
pdf_format_option = self._options_map[options_hash]
|
|
145
|
+
image_format_option: FormatOption = pdf_format_option
|
|
146
|
+
if isinstance(pdf_format_option.pipeline_cls, type) and issubclass(
|
|
147
|
+
pdf_format_option.pipeline_cls, VlmPipeline
|
|
148
|
+
):
|
|
149
|
+
image_format_option = ImageFormatOption(
|
|
150
|
+
pipeline_cls=pdf_format_option.pipeline_cls,
|
|
151
|
+
pipeline_options=pdf_format_option.pipeline_options,
|
|
152
|
+
backend_options=pdf_format_option.backend_options,
|
|
153
|
+
)
|
|
154
|
+
|
|
117
155
|
format_options: dict[InputFormat, FormatOption] = {
|
|
118
156
|
InputFormat.PDF: pdf_format_option,
|
|
119
|
-
InputFormat.IMAGE:
|
|
157
|
+
InputFormat.IMAGE: image_format_option,
|
|
120
158
|
}
|
|
121
159
|
|
|
122
160
|
return DocumentConverter(format_options=format_options)
|
|
@@ -202,6 +240,17 @@ class DoclingConverterManager:
|
|
|
202
240
|
request.picture_description_area_threshold
|
|
203
241
|
)
|
|
204
242
|
|
|
243
|
+
# Forward the following config attributes to the pipeline options when set (None and other falsy values are skipped)
|
|
244
|
+
for attr in (
|
|
245
|
+
"queue_max_size",
|
|
246
|
+
"ocr_batch_size",
|
|
247
|
+
"layout_batch_size",
|
|
248
|
+
"table_batch_size",
|
|
249
|
+
"batch_polling_interval_seconds",
|
|
250
|
+
):
|
|
251
|
+
if value := getattr(self.config, attr):
|
|
252
|
+
setattr(pipeline_options, attr, value)
|
|
253
|
+
|
|
205
254
|
return pipeline_options
|
|
206
255
|
|
|
207
256
|
def _parse_backend(
|
|
@@ -264,6 +313,27 @@ class DoclingConverterManager:
|
|
|
264
313
|
request.vlm_pipeline_model_api.model_dump()
|
|
265
314
|
)
|
|
266
315
|
|
|
316
|
+
pipeline_options.do_picture_classification = request.do_picture_classification
|
|
317
|
+
pipeline_options.do_picture_description = request.do_picture_description
|
|
318
|
+
|
|
319
|
+
if request.picture_description_local is not None:
|
|
320
|
+
pipeline_options.picture_description_options = (
|
|
321
|
+
PictureDescriptionVlmOptions.model_validate(
|
|
322
|
+
request.picture_description_local.model_dump()
|
|
323
|
+
)
|
|
324
|
+
)
|
|
325
|
+
|
|
326
|
+
if request.picture_description_api is not None:
|
|
327
|
+
pipeline_options.picture_description_options = (
|
|
328
|
+
PictureDescriptionApiOptions.model_validate(
|
|
329
|
+
request.picture_description_api.model_dump()
|
|
330
|
+
)
|
|
331
|
+
)
|
|
332
|
+
|
|
333
|
+
pipeline_options.picture_description_options.picture_area_threshold = (
|
|
334
|
+
request.picture_description_area_threshold
|
|
335
|
+
)
|
|
336
|
+
|
|
267
337
|
return pipeline_options
|
|
268
338
|
|
|
269
339
|
# Computes the PDF pipeline options and returns the PdfFormatOption and its hash
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-jobkit
|
|
3
|
-
Version: 1.7.1
|
|
3
|
+
Version: 1.8.1
|
|
4
4
|
Summary: Running a distributed job processing documents with Docling.
|
|
5
5
|
Project-URL: Homepage, https://github.com/docling-project/docling-jobkit
|
|
6
6
|
Project-URL: Repository, https://github.com/docling-project/docling-jobkit
|
|
@@ -25,7 +25,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
|
25
25
|
Classifier: Typing :: Typed
|
|
26
26
|
Requires-Python: >=3.10
|
|
27
27
|
Requires-Dist: boto3~=1.35
|
|
28
|
-
Requires-Dist: docling~=2.
|
|
28
|
+
Requires-Dist: docling~=2.60
|
|
29
29
|
Requires-Dist: fastparquet~=2024.11
|
|
30
30
|
Requires-Dist: httpx~=0.28
|
|
31
31
|
Requires-Dist: pandas~=2.2
|
|
@@ -45,7 +45,7 @@ Provides-Extra: rq
|
|
|
45
45
|
Requires-Dist: msgpack~=1.1; extra == 'rq'
|
|
46
46
|
Requires-Dist: rq~=2.4; extra == 'rq'
|
|
47
47
|
Provides-Extra: vlm
|
|
48
|
-
Requires-Dist: docling[vlm]~=2.
|
|
48
|
+
Requires-Dist: docling[vlm]~=2.60; extra == 'vlm'
|
|
49
49
|
Description-Content-Type: text/markdown
|
|
50
50
|
|
|
51
51
|
# Docling Jobkit
|
|
@@ -16,7 +16,7 @@ docling_jobkit/connectors/target_processor.py,sha256=2iIJE7Ip_-1dxJGt02_ALwDC2BP
|
|
|
16
16
|
docling_jobkit/connectors/target_processor_factory.py,sha256=b_Q3L_mlvfQlZG7A2cskzf6-LzQ1G_seGd2vLT51b5o,688
|
|
17
17
|
docling_jobkit/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
18
18
|
docling_jobkit/convert/chunking.py,sha256=jFl7g8rGFmIBV_-0rxfyp6970N_0dqLehLgDSHKOM-o,11933
|
|
19
|
-
docling_jobkit/convert/manager.py,sha256=
|
|
19
|
+
docling_jobkit/convert/manager.py,sha256=M6kB5hFzQpcDCrdQRXxImdadbYJuwoLel7fTDeXWRtw,15949
|
|
20
20
|
docling_jobkit/convert/results.py,sha256=vQvOuXIdlmPskHwUJlXX2zyJSb2k20ip5TfzuyPH5mU,9053
|
|
21
21
|
docling_jobkit/convert/results_processor.py,sha256=TtiN6hqcUriEYMsEiyAutrgpMIz78D4pf-1HtiSjrXQ,16558
|
|
22
22
|
docling_jobkit/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -51,8 +51,8 @@ docling_jobkit/orchestrators/rq/orchestrator.py,sha256=MFINSpXb5tQh0PKk3xK10eZ8c
|
|
|
51
51
|
docling_jobkit/orchestrators/rq/worker.py,sha256=P9rXhH9k814sNPGgjY24CATwQSzL6Hfd5Th5d4I3ejs,6591
|
|
52
52
|
docling_jobkit/ray_job/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
53
53
|
docling_jobkit/ray_job/main.py,sha256=o52gLtEdGqyxa9XcVmwN55bCSVsIUq8zRm-FjgRlYN8,13465
|
|
54
|
-
docling_jobkit-1.
|
|
55
|
-
docling_jobkit-1.
|
|
56
|
-
docling_jobkit-1.
|
|
57
|
-
docling_jobkit-1.
|
|
58
|
-
docling_jobkit-1.
|
|
54
|
+
docling_jobkit-1.8.1.dist-info/METADATA,sha256=i1VO-KS2Ub3H8mbnZi4rGJpp_noOrrZ1MOq6XTz874g,8105
|
|
55
|
+
docling_jobkit-1.8.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
56
|
+
docling_jobkit-1.8.1.dist-info/entry_points.txt,sha256=QWmq6d0B14If8Zshc7pRnBs6zO1e9vhEnrMHOgBfzj8,121
|
|
57
|
+
docling_jobkit-1.8.1.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
|
58
|
+
docling_jobkit-1.8.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|