docling-jobkit 1.7.1__py3-none-any.whl → 1.8.1__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
@@ -31,7 +31,12 @@ from docling.datamodel.pipeline_options import (
31
31
  VlmPipelineOptions,
32
32
  )
33
33
  from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions, InlineVlmOptions
34
- from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
34
+ from docling.document_converter import (
35
+ DocumentConverter,
36
+ FormatOption,
37
+ ImageFormatOption,
38
+ PdfFormatOption,
39
+ )
35
40
  from docling.models.factories import get_ocr_factory
36
41
  from docling.pipeline.vlm_pipeline import VlmPipeline
37
42
  from docling_core.types.doc import ImageRefMode
@@ -50,6 +55,13 @@ class DoclingConverterManagerConfig(BaseModel):
50
55
  max_num_pages: int = sys.maxsize
51
56
  max_file_size: int = sys.maxsize
52
57
 
58
+ # Threading pipeline
59
+ queue_max_size: Optional[int] = None
60
+ ocr_batch_size: Optional[int] = None
61
+ layout_batch_size: Optional[int] = None
62
+ table_batch_size: Optional[int] = None
63
+ batch_polling_interval_seconds: Optional[float] = None
64
+
53
65
 
54
66
  # Custom serializer for PdfFormatOption
55
67
  # (model_dump_json does not work with some classes)
@@ -61,12 +73,28 @@ def _hash_pdf_format_option(pdf_format_option: PdfFormatOption) -> bytes:
61
73
  data["pipeline_options"] = pdf_format_option.pipeline_options.model_dump(
62
74
  serialize_as_any=True, mode="json"
63
75
  )
76
+ data["pipeline_options_type"] = (
77
+ f"{pdf_format_option.pipeline_options.__class__.__module__}."
78
+ f"{pdf_format_option.pipeline_options.__class__.__qualname__}"
79
+ )
80
+ else:
81
+ data["pipeline_options_type"] = None
64
82
 
65
83
  # Replace `pipeline_cls` with a string representation
66
- data["pipeline_cls"] = repr(data["pipeline_cls"])
84
+ pipeline_cls = pdf_format_option.pipeline_cls
85
+ data["pipeline_cls"] = (
86
+ f"{pipeline_cls.__module__}.{pipeline_cls.__qualname__}"
87
+ if pipeline_cls is not None
88
+ else "None"
89
+ )
67
90
 
68
91
  # Replace `backend` with a string representation
69
- data["backend"] = repr(data["backend"])
92
+ backend = pdf_format_option.backend
93
+ data["backend"] = (
94
+ f"{backend.__module__}.{backend.__qualname__}"
95
+ if backend is not None
96
+ else "None"
97
+ )
70
98
 
71
99
  # Serialize the dictionary to JSON with sorted keys to have consistent hashes
72
100
  serialized_data = json.dumps(data, sort_keys=True)
@@ -114,9 +142,19 @@ class DoclingConverterManager:
114
142
  @lru_cache(maxsize=cache_size)
115
143
  def _get_converter_from_hash(options_hash: bytes) -> DocumentConverter:
116
144
  pdf_format_option = self._options_map[options_hash]
145
+ image_format_option: FormatOption = pdf_format_option
146
+ if isinstance(pdf_format_option.pipeline_cls, type) and issubclass(
147
+ pdf_format_option.pipeline_cls, VlmPipeline
148
+ ):
149
+ image_format_option = ImageFormatOption(
150
+ pipeline_cls=pdf_format_option.pipeline_cls,
151
+ pipeline_options=pdf_format_option.pipeline_options,
152
+ backend_options=pdf_format_option.backend_options,
153
+ )
154
+
117
155
  format_options: dict[InputFormat, FormatOption] = {
118
156
  InputFormat.PDF: pdf_format_option,
119
- InputFormat.IMAGE: pdf_format_option,
157
+ InputFormat.IMAGE: image_format_option,
120
158
  }
121
159
 
122
160
  return DocumentConverter(format_options=format_options)
@@ -202,6 +240,17 @@ class DoclingConverterManager:
202
240
  request.picture_description_area_threshold
203
241
  )
204
242
 
243
+ # Forward the definition of the following attributes, if they are not none
244
+ for attr in (
245
+ "queue_max_size",
246
+ "ocr_batch_size",
247
+ "layout_batch_size",
248
+ "table_batch_size",
249
+ "batch_polling_interval_seconds",
250
+ ):
251
+ if value := getattr(self.config, attr):
252
+ setattr(pipeline_options, attr, value)
253
+
205
254
  return pipeline_options
206
255
 
207
256
  def _parse_backend(
@@ -264,6 +313,27 @@ class DoclingConverterManager:
264
313
  request.vlm_pipeline_model_api.model_dump()
265
314
  )
266
315
 
316
+ pipeline_options.do_picture_classification = request.do_picture_classification
317
+ pipeline_options.do_picture_description = request.do_picture_description
318
+
319
+ if request.picture_description_local is not None:
320
+ pipeline_options.picture_description_options = (
321
+ PictureDescriptionVlmOptions.model_validate(
322
+ request.picture_description_local.model_dump()
323
+ )
324
+ )
325
+
326
+ if request.picture_description_api is not None:
327
+ pipeline_options.picture_description_options = (
328
+ PictureDescriptionApiOptions.model_validate(
329
+ request.picture_description_api.model_dump()
330
+ )
331
+ )
332
+
333
+ pipeline_options.picture_description_options.picture_area_threshold = (
334
+ request.picture_description_area_threshold
335
+ )
336
+
267
337
  return pipeline_options
268
338
 
269
339
  # Computes the PDF pipeline options and returns the PdfFormatOption and its hash
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling-jobkit
3
- Version: 1.7.1
3
+ Version: 1.8.1
4
4
  Summary: Running a distributed job processing documents with Docling.
5
5
  Project-URL: Homepage, https://github.com/docling-project/docling-jobkit
6
6
  Project-URL: Repository, https://github.com/docling-project/docling-jobkit
@@ -25,7 +25,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
25
25
  Classifier: Typing :: Typed
26
26
  Requires-Python: >=3.10
27
27
  Requires-Dist: boto3~=1.35
28
- Requires-Dist: docling~=2.56
28
+ Requires-Dist: docling~=2.60
29
29
  Requires-Dist: fastparquet~=2024.11
30
30
  Requires-Dist: httpx~=0.28
31
31
  Requires-Dist: pandas~=2.2
@@ -45,7 +45,7 @@ Provides-Extra: rq
45
45
  Requires-Dist: msgpack~=1.1; extra == 'rq'
46
46
  Requires-Dist: rq~=2.4; extra == 'rq'
47
47
  Provides-Extra: vlm
48
- Requires-Dist: docling[vlm]~=2.53; extra == 'vlm'
48
+ Requires-Dist: docling[vlm]~=2.60; extra == 'vlm'
49
49
  Description-Content-Type: text/markdown
50
50
 
51
51
  # Docling Jobkit
@@ -16,7 +16,7 @@ docling_jobkit/connectors/target_processor.py,sha256=2iIJE7Ip_-1dxJGt02_ALwDC2BP
16
16
  docling_jobkit/connectors/target_processor_factory.py,sha256=b_Q3L_mlvfQlZG7A2cskzf6-LzQ1G_seGd2vLT51b5o,688
17
17
  docling_jobkit/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
18
  docling_jobkit/convert/chunking.py,sha256=jFl7g8rGFmIBV_-0rxfyp6970N_0dqLehLgDSHKOM-o,11933
19
- docling_jobkit/convert/manager.py,sha256=Ir7lHzDPi-NdlYtBbTqb-PcKVM4kefLU9xU7TXB_7jM,13341
19
+ docling_jobkit/convert/manager.py,sha256=M6kB5hFzQpcDCrdQRXxImdadbYJuwoLel7fTDeXWRtw,15949
20
20
  docling_jobkit/convert/results.py,sha256=vQvOuXIdlmPskHwUJlXX2zyJSb2k20ip5TfzuyPH5mU,9053
21
21
  docling_jobkit/convert/results_processor.py,sha256=TtiN6hqcUriEYMsEiyAutrgpMIz78D4pf-1HtiSjrXQ,16558
22
22
  docling_jobkit/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -51,8 +51,8 @@ docling_jobkit/orchestrators/rq/orchestrator.py,sha256=MFINSpXb5tQh0PKk3xK10eZ8c
51
51
  docling_jobkit/orchestrators/rq/worker.py,sha256=P9rXhH9k814sNPGgjY24CATwQSzL6Hfd5Th5d4I3ejs,6591
52
52
  docling_jobkit/ray_job/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
53
53
  docling_jobkit/ray_job/main.py,sha256=o52gLtEdGqyxa9XcVmwN55bCSVsIUq8zRm-FjgRlYN8,13465
54
- docling_jobkit-1.7.1.dist-info/METADATA,sha256=9IBxO3tGluaxQnTG8lyEwMYr8tbSn-oq3gedyoq438E,8105
55
- docling_jobkit-1.7.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
56
- docling_jobkit-1.7.1.dist-info/entry_points.txt,sha256=QWmq6d0B14If8Zshc7pRnBs6zO1e9vhEnrMHOgBfzj8,121
57
- docling_jobkit-1.7.1.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
58
- docling_jobkit-1.7.1.dist-info/RECORD,,
54
+ docling_jobkit-1.8.1.dist-info/METADATA,sha256=i1VO-KS2Ub3H8mbnZi4rGJpp_noOrrZ1MOq6XTz874g,8105
55
+ docling_jobkit-1.8.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
56
+ docling_jobkit-1.8.1.dist-info/entry_points.txt,sha256=QWmq6d0B14If8Zshc7pRnBs6zO1e9vhEnrMHOgBfzj8,121
57
+ docling_jobkit-1.8.1.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
58
+ docling_jobkit-1.8.1.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.27.0
2
+ Generator: hatchling 1.28.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any