docling-jobkit 1.8.1__py3-none-any.whl → 1.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,8 @@
1
+ import sys
2
+
3
+ if sys.version_info >= (3, 14):
4
+ raise ImportError("ray support is not yet available for Python 3.14.")
5
+
1
6
  import argparse
2
7
  import json
3
8
  import os
@@ -16,7 +21,11 @@ from ray._raylet import ObjectRefGenerator # type: ignore
16
21
  from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
17
22
  from docling.datamodel.base_models import ConversionStatus, InputFormat
18
23
  from docling.datamodel.document import ConversionResult
19
- from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
24
+ from docling.datamodel.pipeline_options import (
25
+ PdfPipelineOptions,
26
+ TableFormerMode,
27
+ TableStructureOptions,
28
+ )
20
29
  from docling.document_converter import DocumentConverter, PdfFormatOption
21
30
 
22
31
  # Load credentials
@@ -224,8 +233,8 @@ class DoclingConvert:
224
233
  pipeline_options = PdfPipelineOptions()
225
234
  pipeline_options.do_ocr = do_ocr
226
235
  pipeline_options.do_table_structure = do_table_structure
227
- pipeline_options.table_structure_options.mode = TableFormerMode(
228
- table_structure_mode
236
+ pipeline_options.table_structure_options = TableStructureOptions(
237
+ mode=TableFormerMode(table_structure_mode)
229
238
  )
230
239
  pipeline_options.generate_page_images = generate_page_images
231
240
 
@@ -1,8 +1,9 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling-jobkit
3
- Version: 1.8.1
3
+ Version: 1.9.0
4
4
  Summary: Running a distributed job processing documents with Docling.
5
5
  Project-URL: Homepage, https://github.com/docling-project/docling-jobkit
6
+ Project-URL: Documentation, https://docling-project.github.io/docling/usage/jobkit/
6
7
  Project-URL: Repository, https://github.com/docling-project/docling-jobkit
7
8
  Project-URL: Issues, https://github.com/docling-project/docling-jobkit/issues
8
9
  Project-URL: Changelog, https://github.com/docling-project/docling-jobkit/blob/main/CHANGELOG.md
@@ -18,6 +19,7 @@ Classifier: Programming Language :: Python :: 3.10
18
19
  Classifier: Programming Language :: Python :: 3.11
19
20
  Classifier: Programming Language :: Python :: 3.12
20
21
  Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Programming Language :: Python :: 3.14
21
23
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
24
  Classifier: Topic :: Scientific/Engineering :: Information Analysis
23
25
  Classifier: Topic :: Software Development :: Build Tools
@@ -26,21 +28,18 @@ Classifier: Typing :: Typed
26
28
  Requires-Python: >=3.10
27
29
  Requires-Dist: boto3~=1.35
28
30
  Requires-Dist: docling~=2.60
29
- Requires-Dist: fastparquet~=2024.11
30
- Requires-Dist: httpx~=0.28
31
+ Requires-Dist: httpx<1,>=0.28
31
32
  Requires-Dist: pandas~=2.2
32
- Requires-Dist: pyarrow~=19.0
33
33
  Requires-Dist: pydantic-settings~=2.4
34
34
  Requires-Dist: pydantic~=2.10
35
- Requires-Dist: typer>=0.12.5
36
- Requires-Dist: typer~=0.12
35
+ Requires-Dist: typer<1,>=0.12.5
37
36
  Provides-Extra: gdrive
38
37
  Requires-Dist: google-api-python-client>=2.183.0; extra == 'gdrive'
39
38
  Requires-Dist: google-auth-oauthlib>=1.2.2; extra == 'gdrive'
40
39
  Provides-Extra: kfp
41
40
  Requires-Dist: kfp[kubernetes]>=2.10.0; extra == 'kfp'
42
41
  Provides-Extra: ray
43
- Requires-Dist: ray~=2.30; extra == 'ray'
42
+ Requires-Dist: ray~=2.30; (python_version < '3.14') and extra == 'ray'
44
43
  Provides-Extra: rq
45
44
  Requires-Dist: msgpack~=1.1; extra == 'rq'
46
45
  Requires-Dist: rq~=2.4; extra == 'rq'
@@ -55,6 +54,77 @@ Running a distributed job processing documents with Docling.
55
54
 
56
55
  ## How to use it
57
56
 
57
+ ### Local Multiprocessing CLI
58
+
59
+ The `docling-jobkit-multiproc` CLI enables parallel batch processing of documents using Python's multiprocessing. Each batch of documents is processed in a separate subprocess, allowing efficient parallel processing on a single machine.
60
+
61
+ #### Usage
62
+
63
+ ```bash
64
+ # Basic usage with default settings (batch_size=10, num_processes=CPU count)
65
+ docling-jobkit-multiproc config.yaml
66
+
67
+ # Custom batch size and number of processes
68
+ docling-jobkit-multiproc config.yaml --batch-size 20 --num-processes 4
69
+
70
+ # With model artifacts
71
+ docling-jobkit-multiproc config.yaml --artifacts-path /path/to/models
72
+
73
+ # Quiet mode (suppress progress bar)
74
+ docling-jobkit-multiproc config.yaml --quiet
75
+
76
+ # Full options
77
+ docling-jobkit-multiproc config.yaml \
78
+ --batch-size 30 \
79
+ --num-processes 8 \
80
+ --artifacts-path /path/to/models \
81
+ --enable-remote-services \
82
+ --allow-external-plugins
83
+ ```
84
+
85
+ #### Configuration
86
+
87
+ The configuration file format is the same as the one used by `docling-jobkit-local`. See the example configurations:
88
+ - S3 source/target: `dev/configs/run_multiproc_s3_example.yaml`
89
+ - Local path source/target: `dev/configs/run_local_folder_example.yaml`
90
+
91
+ **Note:** Only S3, Google Drive, and `local_path` sources support batch processing. File and HTTP sources do not support chunking into batches.
92
+
93
+ #### CLI Options
94
+
95
+ - `--batch-size, -b`: Number of documents to process in each batch (default: 10)
96
+ - `--num-processes, -n`: Number of parallel processes (default: CPU count)
97
+ - `--artifacts-path`: Path to model artifacts directory
98
+ - `--enable-remote-services`: Enable models connecting to remote services
99
+ - `--allow-external-plugins`: Enable loading modules from third-party plugins
100
+ - `--quiet, -q`: Suppress progress bar and detailed output
101
+
102
+ ### Local Sequential CLI
103
+
104
+ The `docling-jobkit-local` CLI processes documents sequentially in a single process.
105
+
106
+ ```bash
107
+ docling-jobkit-local config.yaml
108
+ ```
109
+
110
+ ### Using Local Path Sources and Targets
111
+
112
+ Both CLIs support local file system sources and targets. Example configuration:
113
+
114
+ ```yaml
115
+ sources:
116
+ - kind: local_path
117
+ path: ./input_documents/
118
+ recursive: true # optional, default true
119
+ pattern: "*.pdf" # optional glob pattern
120
+
121
+ target:
122
+ kind: local_path
123
+ path: ./output_documents/
124
+ ```
125
+
126
+ See `dev/configs/run_local_folder_example.yaml` for a complete example.
127
+
58
128
  ## Kubeflow pipeline with Docling Jobkit
59
129
 
60
130
  ### Using Kubeflow pipeline web dashboard UI
@@ -1,22 +1,25 @@
1
1
  docling_jobkit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  docling_jobkit/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  docling_jobkit/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- docling_jobkit/cli/local.py,sha256=EzI8TXxzZDdxtbeT0plpHKAOoK7gbR6K_IhI2KN0N8c,3665
4
+ docling_jobkit/cli/local.py,sha256=DZZvzwcxjsm-q7AD2EsjpP-B7nCw1RBY9yXNc4ifeiY,3789
5
+ docling_jobkit/cli/multiproc.py,sha256=X8M16Alv_eQ7QPB0u9NHpSPtI6nMlRzSbD_0hcRTwMw,16633
5
6
  docling_jobkit/connectors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- docling_jobkit/connectors/google_drive_helper.py,sha256=8e0loUWxbhA7oaZc_Dapb-vRIu6WMn18bZDKVN77YGA,10489
7
- docling_jobkit/connectors/google_drive_source_processor.py,sha256=LxpuJXPiZZQeLNXOg0gyT8nZBA-nZHzfdSbZITEcQEc,1384
7
+ docling_jobkit/connectors/google_drive_helper.py,sha256=YDYLhLD-0Kb8QhRwi35PSbFu8Lc-CBhI9zWE5qyPPv0,10554
8
+ docling_jobkit/connectors/google_drive_source_processor.py,sha256=0bl7mEpF-wRMmJMyi6e2mcDNrqgBxWQzU4rwi81ER8I,2384
8
9
  docling_jobkit/connectors/google_drive_target_processor.py,sha256=sp54RIsH5DEfW54LaOGkCoEUJvYP3BYGcOo955aQs10,1884
9
- docling_jobkit/connectors/http_source_processor.py,sha256=F5Z8IzqY9aWCZ0Ak2HCgKHD4tbk51dY4FDNc4I3bBNw,797
10
+ docling_jobkit/connectors/http_source_processor.py,sha256=i8gYWdPGE19cMoPAPwKSKe46fCm1B1NAQ64YJKyvgI8,1654
11
+ docling_jobkit/connectors/local_path_source_processor.py,sha256=3R-w8HEZAZGPDn_vlkpdegdK4fiIoBWlsjaV2_NbXpg,4063
12
+ docling_jobkit/connectors/local_path_target_processor.py,sha256=DhGCMLqMV7PuV_3jp7DL2Y9Zp1eNbs8onDAadfwz5XY,3173
10
13
  docling_jobkit/connectors/s3_helper.py,sha256=YIDytDkuEWIeTNRa-k0LsDG3V-tG31mvbAT3JyC1iF4,6320
11
- docling_jobkit/connectors/s3_source_processor.py,sha256=gLsYLksLGvn6pLID4RH0NN1w6nuy8HZVTnmmM6NfF9k,1365
14
+ docling_jobkit/connectors/s3_source_processor.py,sha256=TZ1X0Lfj4eMM_QpOKz9KYF6HSbxDZ8XjtKqPqbxOLJM,2353
12
15
  docling_jobkit/connectors/s3_target_processor.py,sha256=CaX109JhZ8nFlYWcz3Sy0_Kl-pSD6Y9oz5SOPsFe9I4,1998
13
- docling_jobkit/connectors/source_processor.py,sha256=BbbsePSPIkyda73Ezwn29HnZSuerSJ7GrSPyPOOJFpo,1244
14
- docling_jobkit/connectors/source_processor_factory.py,sha256=CLanqD5PwvDqHQ6WP5ivhsXA5xMx-8PFnm8RtmVHvlI,932
16
+ docling_jobkit/connectors/source_processor.py,sha256=HW3u6x_Btl4219LNdSPJhBi-xCGbh6yMf4vA1H7iXz4,2726
17
+ docling_jobkit/connectors/source_processor_factory.py,sha256=FF93NMHPNMvOTdmahVk-tOdDGLJF93j6EVCAMT-YaCw,1155
15
18
  docling_jobkit/connectors/target_processor.py,sha256=2iIJE7Ip_-1dxJGt02_ALwDC2BPBOk0AomYI6EOFTCA,1447
16
- docling_jobkit/connectors/target_processor_factory.py,sha256=b_Q3L_mlvfQlZG7A2cskzf6-LzQ1G_seGd2vLT51b5o,688
19
+ docling_jobkit/connectors/target_processor_factory.py,sha256=A-ZuwQiuYK6HVNHTZomIVBZ3tC9D6femv3-7PSNQkEA,901
17
20
  docling_jobkit/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
- docling_jobkit/convert/chunking.py,sha256=jFl7g8rGFmIBV_-0rxfyp6970N_0dqLehLgDSHKOM-o,11933
19
- docling_jobkit/convert/manager.py,sha256=M6kB5hFzQpcDCrdQRXxImdadbYJuwoLel7fTDeXWRtw,15949
21
+ docling_jobkit/convert/chunking.py,sha256=vvdrvg66JB8Snwnbo7yzYXox6b2rRFDS9R4UyTXvq58,12014
22
+ docling_jobkit/convert/manager.py,sha256=SPsPGf600vXL1H0tcXyQ7L7CCJlQcAz9gsetX4XWSPo,15938
20
23
  docling_jobkit/convert/results.py,sha256=vQvOuXIdlmPskHwUJlXX2zyJSb2k20ip5TfzuyPH5mU,9053
21
24
  docling_jobkit/convert/results_processor.py,sha256=TtiN6hqcUriEYMsEiyAutrgpMIz78D4pf-1HtiSjrXQ,16558
22
25
  docling_jobkit/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -29,8 +32,8 @@ docling_jobkit/datamodel/result.py,sha256=WaasDSc_aAyuN3wfGlWqt1ZksC_5tIDHcafgeV
29
32
  docling_jobkit/datamodel/s3_coords.py,sha256=9MrrtQaLJRBOdqi7bzuT3XVSbXJAdKjPlch0IAXDnfc,1260
30
33
  docling_jobkit/datamodel/task.py,sha256=llcL3G6FM5ktkFSib8WFTuO5c_-KbJHUWugsD9RitbA,3163
31
34
  docling_jobkit/datamodel/task_meta.py,sha256=QB_u9_TZEZlXYFlmmCgsFnsuMjzaO6QNVsnREmDu6hc,389
32
- docling_jobkit/datamodel/task_sources.py,sha256=_aw4ymzC0lkjBRzz3UqrtM76Wu35_nNL704-jY2J0E8,733
33
- docling_jobkit/datamodel/task_targets.py,sha256=P7eEwpalak5E8TiTrYA7Oq9y38qs0KQR06eU0WtROfY,749
35
+ docling_jobkit/datamodel/task_sources.py,sha256=HQVsy6d_qcClL-I9fjo1U9n_EGY2JJVZe91pmILUw9Q,2291
36
+ docling_jobkit/datamodel/task_targets.py,sha256=j31iiZWAkw0viYF7vFuwP7yyorTZEcnlpVzE6RgJ_Y8,1388
34
37
  docling_jobkit/kfp_pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
35
38
  docling_jobkit/kfp_pipeline/docling-s3in-s3out.yaml,sha256=tarhfU24S7eTnuTKSycCqbzLanJCfXb4EHmu_D3oYq8,15207
36
39
  docling_jobkit/kfp_pipeline/docling_s3in_s3out.py,sha256=DbIPQetoBIs9dX-JZszimjTTEEIWwiQOlLyhSt29Ybs,9818
@@ -44,15 +47,15 @@ docling_jobkit/orchestrators/kfp/kfp_pipeline.py,sha256=oglqcFAW2JS9G4Vyfby76T7a
44
47
  docling_jobkit/orchestrators/kfp/notify.py,sha256=uG9c18LJn9T0RuvRZtllWL497Uhv2Qe_whglB_ta8XY,883
45
48
  docling_jobkit/orchestrators/kfp/orchestrator.py,sha256=FkNCb9cscE4vckSYd9sA4_1SmnRJBepV738JeOwygcs,11653
46
49
  docling_jobkit/orchestrators/local/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
47
- docling_jobkit/orchestrators/local/orchestrator.py,sha256=usG5nLEra3GqhfERnw6Pff0aNLWWSqE9T2TOU2jjiUI,4600
48
- docling_jobkit/orchestrators/local/worker.py,sha256=KB-lB4kV2eyWzluGcJCmRq9b3XHzo4i3w1C-VLlrTVU,5696
50
+ docling_jobkit/orchestrators/local/orchestrator.py,sha256=swMw3a-Lm4a13poLV2JE33uF_EeBDyM3VZ71Dhbt-_o,4921
51
+ docling_jobkit/orchestrators/local/worker.py,sha256=v4YNAZsSIcnNzsLDXot_3jdiy_lfIlH4h6E1hLRixS0,5818
49
52
  docling_jobkit/orchestrators/rq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
50
- docling_jobkit/orchestrators/rq/orchestrator.py,sha256=MFINSpXb5tQh0PKk3xK10eZ8cNc3gNflfeqLx5r9ueo,8896
51
- docling_jobkit/orchestrators/rq/worker.py,sha256=P9rXhH9k814sNPGgjY24CATwQSzL6Hfd5Th5d4I3ejs,6591
53
+ docling_jobkit/orchestrators/rq/orchestrator.py,sha256=sP0EzjhKST8R6tPW3cVyaAzGWNDd-7AdaWhWMUi7QY8,9259
54
+ docling_jobkit/orchestrators/rq/worker.py,sha256=tSGQCMFgHZTUqmIVkhVq8bYFdVl0eyhcm7NT22vmsHk,6719
52
55
  docling_jobkit/ray_job/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
53
- docling_jobkit/ray_job/main.py,sha256=o52gLtEdGqyxa9XcVmwN55bCSVsIUq8zRm-FjgRlYN8,13465
54
- docling_jobkit-1.8.1.dist-info/METADATA,sha256=i1VO-KS2Ub3H8mbnZi4rGJpp_noOrrZ1MOq6XTz874g,8105
55
- docling_jobkit-1.8.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
56
- docling_jobkit-1.8.1.dist-info/entry_points.txt,sha256=QWmq6d0B14If8Zshc7pRnBs6zO1e9vhEnrMHOgBfzj8,121
57
- docling_jobkit-1.8.1.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
58
- docling_jobkit-1.8.1.dist-info/RECORD,,
56
+ docling_jobkit/ray_job/main.py,sha256=6VyAsn9wk3v09qH4uQb4u1YnesX_-1DJrEg2MkDXy2k,13648
57
+ docling_jobkit-1.9.0.dist-info/METADATA,sha256=RkesOBVu3Ue73C37am3HArxmYTLjzIhwaCjlJM5LqUs,10475
58
+ docling_jobkit-1.9.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
59
+ docling_jobkit-1.9.0.dist-info/entry_points.txt,sha256=-tTX7hZPMCPZ2zVSUhI2BTPFglsds_A6PbCpPR7gUVM,181
60
+ docling_jobkit-1.9.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
61
+ docling_jobkit-1.9.0.dist-info/RECORD,,
@@ -1,3 +1,4 @@
1
1
  [console_scripts]
2
2
  docling-jobkit-local = docling_jobkit.cli.local:app
3
+ docling-jobkit-multiproc = docling_jobkit.cli.multiproc:app
3
4
  docling-ray-job = docling_jobkit.ray_job.main:main