docling-jobkit 1.8.0__py3-none-any.whl → 1.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling_jobkit/convert/manager.py +56 -4
- {docling_jobkit-1.8.0.dist-info → docling_jobkit-1.8.1.dist-info}/METADATA +1 -1
- {docling_jobkit-1.8.0.dist-info → docling_jobkit-1.8.1.dist-info}/RECORD +6 -6
- {docling_jobkit-1.8.0.dist-info → docling_jobkit-1.8.1.dist-info}/WHEEL +1 -1
- {docling_jobkit-1.8.0.dist-info → docling_jobkit-1.8.1.dist-info}/entry_points.txt +0 -0
- {docling_jobkit-1.8.0.dist-info → docling_jobkit-1.8.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -31,7 +31,12 @@ from docling.datamodel.pipeline_options import (
|
|
|
31
31
|
VlmPipelineOptions,
|
|
32
32
|
)
|
|
33
33
|
from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions, InlineVlmOptions
|
|
34
|
-
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
|
34
|
+
from docling.document_converter import (
|
|
35
|
+
DocumentConverter,
|
|
36
|
+
FormatOption,
|
|
37
|
+
ImageFormatOption,
|
|
38
|
+
PdfFormatOption,
|
|
39
|
+
)
|
|
35
40
|
from docling.models.factories import get_ocr_factory
|
|
36
41
|
from docling.pipeline.vlm_pipeline import VlmPipeline
|
|
37
42
|
from docling_core.types.doc import ImageRefMode
|
|
@@ -68,12 +73,28 @@ def _hash_pdf_format_option(pdf_format_option: PdfFormatOption) -> bytes:
|
|
|
68
73
|
data["pipeline_options"] = pdf_format_option.pipeline_options.model_dump(
|
|
69
74
|
serialize_as_any=True, mode="json"
|
|
70
75
|
)
|
|
76
|
+
data["pipeline_options_type"] = (
|
|
77
|
+
f"{pdf_format_option.pipeline_options.__class__.__module__}."
|
|
78
|
+
f"{pdf_format_option.pipeline_options.__class__.__qualname__}"
|
|
79
|
+
)
|
|
80
|
+
else:
|
|
81
|
+
data["pipeline_options_type"] = None
|
|
71
82
|
|
|
72
83
|
# Replace `pipeline_cls` with a string representation
|
|
73
|
-
|
|
84
|
+
pipeline_cls = pdf_format_option.pipeline_cls
|
|
85
|
+
data["pipeline_cls"] = (
|
|
86
|
+
f"{pipeline_cls.__module__}.{pipeline_cls.__qualname__}"
|
|
87
|
+
if pipeline_cls is not None
|
|
88
|
+
else "None"
|
|
89
|
+
)
|
|
74
90
|
|
|
75
91
|
# Replace `backend` with a string representation
|
|
76
|
-
|
|
92
|
+
backend = pdf_format_option.backend
|
|
93
|
+
data["backend"] = (
|
|
94
|
+
f"{backend.__module__}.{backend.__qualname__}"
|
|
95
|
+
if backend is not None
|
|
96
|
+
else "None"
|
|
97
|
+
)
|
|
77
98
|
|
|
78
99
|
# Serialize the dictionary to JSON with sorted keys to have consistent hashes
|
|
79
100
|
serialized_data = json.dumps(data, sort_keys=True)
|
|
@@ -121,9 +142,19 @@ class DoclingConverterManager:
|
|
|
121
142
|
@lru_cache(maxsize=cache_size)
|
|
122
143
|
def _get_converter_from_hash(options_hash: bytes) -> DocumentConverter:
|
|
123
144
|
pdf_format_option = self._options_map[options_hash]
|
|
145
|
+
image_format_option: FormatOption = pdf_format_option
|
|
146
|
+
if isinstance(pdf_format_option.pipeline_cls, type) and issubclass(
|
|
147
|
+
pdf_format_option.pipeline_cls, VlmPipeline
|
|
148
|
+
):
|
|
149
|
+
image_format_option = ImageFormatOption(
|
|
150
|
+
pipeline_cls=pdf_format_option.pipeline_cls,
|
|
151
|
+
pipeline_options=pdf_format_option.pipeline_options,
|
|
152
|
+
backend_options=pdf_format_option.backend_options,
|
|
153
|
+
)
|
|
154
|
+
|
|
124
155
|
format_options: dict[InputFormat, FormatOption] = {
|
|
125
156
|
InputFormat.PDF: pdf_format_option,
|
|
126
|
-
InputFormat.IMAGE: pdf_format_option,
|
|
157
|
+
InputFormat.IMAGE: image_format_option,
|
|
127
158
|
}
|
|
128
159
|
|
|
129
160
|
return DocumentConverter(format_options=format_options)
|
|
@@ -282,6 +313,27 @@ class DoclingConverterManager:
|
|
|
282
313
|
request.vlm_pipeline_model_api.model_dump()
|
|
283
314
|
)
|
|
284
315
|
|
|
316
|
+
pipeline_options.do_picture_classification = request.do_picture_classification
|
|
317
|
+
pipeline_options.do_picture_description = request.do_picture_description
|
|
318
|
+
|
|
319
|
+
if request.picture_description_local is not None:
|
|
320
|
+
pipeline_options.picture_description_options = (
|
|
321
|
+
PictureDescriptionVlmOptions.model_validate(
|
|
322
|
+
request.picture_description_local.model_dump()
|
|
323
|
+
)
|
|
324
|
+
)
|
|
325
|
+
|
|
326
|
+
if request.picture_description_api is not None:
|
|
327
|
+
pipeline_options.picture_description_options = (
|
|
328
|
+
PictureDescriptionApiOptions.model_validate(
|
|
329
|
+
request.picture_description_api.model_dump()
|
|
330
|
+
)
|
|
331
|
+
)
|
|
332
|
+
|
|
333
|
+
pipeline_options.picture_description_options.picture_area_threshold = (
|
|
334
|
+
request.picture_description_area_threshold
|
|
335
|
+
)
|
|
336
|
+
|
|
285
337
|
return pipeline_options
|
|
286
338
|
|
|
287
339
|
# Computes the PDF pipeline options and returns the PdfFormatOption and its hash
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-jobkit
|
|
3
|
-
Version: 1.8.0
|
|
3
|
+
Version: 1.8.1
|
|
4
4
|
Summary: Running a distributed job processing documents with Docling.
|
|
5
5
|
Project-URL: Homepage, https://github.com/docling-project/docling-jobkit
|
|
6
6
|
Project-URL: Repository, https://github.com/docling-project/docling-jobkit
|
|
@@ -16,7 +16,7 @@ docling_jobkit/connectors/target_processor.py,sha256=2iIJE7Ip_-1dxJGt02_ALwDC2BP
|
|
|
16
16
|
docling_jobkit/connectors/target_processor_factory.py,sha256=b_Q3L_mlvfQlZG7A2cskzf6-LzQ1G_seGd2vLT51b5o,688
|
|
17
17
|
docling_jobkit/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
18
18
|
docling_jobkit/convert/chunking.py,sha256=jFl7g8rGFmIBV_-0rxfyp6970N_0dqLehLgDSHKOM-o,11933
|
|
19
|
-
docling_jobkit/convert/manager.py,sha256=
|
|
19
|
+
docling_jobkit/convert/manager.py,sha256=M6kB5hFzQpcDCrdQRXxImdadbYJuwoLel7fTDeXWRtw,15949
|
|
20
20
|
docling_jobkit/convert/results.py,sha256=vQvOuXIdlmPskHwUJlXX2zyJSb2k20ip5TfzuyPH5mU,9053
|
|
21
21
|
docling_jobkit/convert/results_processor.py,sha256=TtiN6hqcUriEYMsEiyAutrgpMIz78D4pf-1HtiSjrXQ,16558
|
|
22
22
|
docling_jobkit/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -51,8 +51,8 @@ docling_jobkit/orchestrators/rq/orchestrator.py,sha256=MFINSpXb5tQh0PKk3xK10eZ8c
|
|
|
51
51
|
docling_jobkit/orchestrators/rq/worker.py,sha256=P9rXhH9k814sNPGgjY24CATwQSzL6Hfd5Th5d4I3ejs,6591
|
|
52
52
|
docling_jobkit/ray_job/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
53
53
|
docling_jobkit/ray_job/main.py,sha256=o52gLtEdGqyxa9XcVmwN55bCSVsIUq8zRm-FjgRlYN8,13465
|
|
54
|
-
docling_jobkit-1.8.
|
|
55
|
-
docling_jobkit-1.8.
|
|
56
|
-
docling_jobkit-1.8.0.dist-info/entry_points.txt,sha256=QWmq6d0B14If8Zshc7pRnBs6zO1e9vhEnrMHOgBfzj8,121
|
|
57
|
-
docling_jobkit-1.8.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
|
58
|
-
docling_jobkit-1.8.0.dist-info/RECORD,,
|
|
54
|
+
docling_jobkit-1.8.1.dist-info/METADATA,sha256=i1VO-KS2Ub3H8mbnZi4rGJpp_noOrrZ1MOq6XTz874g,8105
|
|
55
|
+
docling_jobkit-1.8.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
56
|
+
docling_jobkit-1.8.1.dist-info/entry_points.txt,sha256=QWmq6d0B14If8Zshc7pRnBs6zO1e9vhEnrMHOgBfzj8,121
|
|
57
|
+
docling_jobkit-1.8.1.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
|
58
|
+
docling_jobkit-1.8.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|