docling-jobkit 1.8.1__py3-none-any.whl → 1.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling_jobkit/cli/local.py +14 -3
- docling_jobkit/cli/multiproc.py +504 -0
- docling_jobkit/connectors/google_drive_helper.py +5 -5
- docling_jobkit/connectors/google_drive_source_processor.py +30 -1
- docling_jobkit/connectors/http_source_processor.py +23 -3
- docling_jobkit/connectors/local_path_source_processor.py +126 -0
- docling_jobkit/connectors/local_path_target_processor.py +92 -0
- docling_jobkit/connectors/s3_source_processor.py +45 -24
- docling_jobkit/connectors/source_processor.py +52 -2
- docling_jobkit/connectors/source_processor_factory.py +6 -0
- docling_jobkit/connectors/target_processor_factory.py +6 -0
- docling_jobkit/convert/chunking.py +2 -1
- docling_jobkit/convert/manager.py +4 -5
- docling_jobkit/datamodel/task_sources.py +57 -2
- docling_jobkit/datamodel/task_targets.py +28 -1
- docling_jobkit/orchestrators/local/orchestrator.py +8 -0
- docling_jobkit/orchestrators/local/worker.py +6 -5
- docling_jobkit/orchestrators/rq/orchestrator.py +13 -3
- docling_jobkit/orchestrators/rq/worker.py +3 -0
- docling_jobkit/ray_job/main.py +12 -3
- {docling_jobkit-1.8.1.dist-info → docling_jobkit-1.9.0.dist-info}/METADATA +77 -7
- {docling_jobkit-1.8.1.dist-info → docling_jobkit-1.9.0.dist-info}/RECORD +25 -22
- {docling_jobkit-1.8.1.dist-info → docling_jobkit-1.9.0.dist-info}/entry_points.txt +1 -0
- {docling_jobkit-1.8.1.dist-info → docling_jobkit-1.9.0.dist-info}/WHEEL +0 -0
- {docling_jobkit-1.8.1.dist-info → docling_jobkit-1.9.0.dist-info}/licenses/LICENSE +0 -0
docling_jobkit/ray_job/main.py
CHANGED
|
@@ -1,3 +1,8 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
|
|
3
|
+
if sys.version_info >= (3, 14):
|
|
4
|
+
raise ImportError("ray support is not yet available for Python 3.14.")
|
|
5
|
+
|
|
1
6
|
import argparse
|
|
2
7
|
import json
|
|
3
8
|
import os
|
|
@@ -16,7 +21,11 @@ from ray._raylet import ObjectRefGenerator # type: ignore
|
|
|
16
21
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
|
17
22
|
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
|
18
23
|
from docling.datamodel.document import ConversionResult
|
|
19
|
-
from docling.datamodel.pipeline_options import
|
|
24
|
+
from docling.datamodel.pipeline_options import (
|
|
25
|
+
PdfPipelineOptions,
|
|
26
|
+
TableFormerMode,
|
|
27
|
+
TableStructureOptions,
|
|
28
|
+
)
|
|
20
29
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
|
21
30
|
|
|
22
31
|
# Load credentials
|
|
@@ -224,8 +233,8 @@ class DoclingConvert:
|
|
|
224
233
|
pipeline_options = PdfPipelineOptions()
|
|
225
234
|
pipeline_options.do_ocr = do_ocr
|
|
226
235
|
pipeline_options.do_table_structure = do_table_structure
|
|
227
|
-
pipeline_options.table_structure_options
|
|
228
|
-
table_structure_mode
|
|
236
|
+
pipeline_options.table_structure_options = TableStructureOptions(
|
|
237
|
+
mode=TableFormerMode(table_structure_mode)
|
|
229
238
|
)
|
|
230
239
|
pipeline_options.generate_page_images = generate_page_images
|
|
231
240
|
|
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-jobkit
|
|
3
|
-
Version: 1.8.1
|
|
3
|
+
Version: 1.9.0
|
|
4
4
|
Summary: Running a distributed job processing documents with Docling.
|
|
5
5
|
Project-URL: Homepage, https://github.com/docling-project/docling-jobkit
|
|
6
|
+
Project-URL: Documentation, https://docling-project.github.io/docling/usage/jobkit/
|
|
6
7
|
Project-URL: Repository, https://github.com/docling-project/docling-jobkit
|
|
7
8
|
Project-URL: Issues, https://github.com/docling-project/docling-jobkit/issues
|
|
8
9
|
Project-URL: Changelog, https://github.com/docling-project/docling-jobkit/blob/main/CHANGELOG.md
|
|
@@ -18,6 +19,7 @@ Classifier: Programming Language :: Python :: 3.10
|
|
|
18
19
|
Classifier: Programming Language :: Python :: 3.11
|
|
19
20
|
Classifier: Programming Language :: Python :: 3.12
|
|
20
21
|
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
21
23
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
24
|
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
23
25
|
Classifier: Topic :: Software Development :: Build Tools
|
|
@@ -26,21 +28,18 @@ Classifier: Typing :: Typed
|
|
|
26
28
|
Requires-Python: >=3.10
|
|
27
29
|
Requires-Dist: boto3~=1.35
|
|
28
30
|
Requires-Dist: docling~=2.60
|
|
29
|
-
Requires-Dist:
|
|
30
|
-
Requires-Dist: httpx~=0.28
|
|
31
|
+
Requires-Dist: httpx<1,>=0.28
|
|
31
32
|
Requires-Dist: pandas~=2.2
|
|
32
|
-
Requires-Dist: pyarrow~=19.0
|
|
33
33
|
Requires-Dist: pydantic-settings~=2.4
|
|
34
34
|
Requires-Dist: pydantic~=2.10
|
|
35
|
-
Requires-Dist: typer
|
|
36
|
-
Requires-Dist: typer~=0.12
|
|
35
|
+
Requires-Dist: typer<1,>=0.12.5
|
|
37
36
|
Provides-Extra: gdrive
|
|
38
37
|
Requires-Dist: google-api-python-client>=2.183.0; extra == 'gdrive'
|
|
39
38
|
Requires-Dist: google-auth-oauthlib>=1.2.2; extra == 'gdrive'
|
|
40
39
|
Provides-Extra: kfp
|
|
41
40
|
Requires-Dist: kfp[kubernetes]>=2.10.0; extra == 'kfp'
|
|
42
41
|
Provides-Extra: ray
|
|
43
|
-
Requires-Dist: ray~=2.30; extra == 'ray'
|
|
42
|
+
Requires-Dist: ray~=2.30; (python_version < '3.14') and extra == 'ray'
|
|
44
43
|
Provides-Extra: rq
|
|
45
44
|
Requires-Dist: msgpack~=1.1; extra == 'rq'
|
|
46
45
|
Requires-Dist: rq~=2.4; extra == 'rq'
|
|
@@ -55,6 +54,77 @@ Running a distributed job processing documents with Docling.
|
|
|
55
54
|
|
|
56
55
|
## How to use it
|
|
57
56
|
|
|
57
|
+
### Local Multiprocessing CLI
|
|
58
|
+
|
|
59
|
+
The `docling-jobkit-multiproc` CLI enables parallel batch processing of documents using Python's multiprocessing. Each batch of documents is processed in a separate subprocess, allowing efficient parallel processing on a single machine.
|
|
60
|
+
|
|
61
|
+
#### Usage
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
# Basic usage with default settings (batch_size=10, num_processes=CPU count)
|
|
65
|
+
docling-jobkit-multiproc config.yaml
|
|
66
|
+
|
|
67
|
+
# Custom batch size and number of processes
|
|
68
|
+
docling-jobkit-multiproc config.yaml --batch-size 20 --num-processes 4
|
|
69
|
+
|
|
70
|
+
# With model artifacts
|
|
71
|
+
docling-jobkit-multiproc config.yaml --artifacts-path /path/to/models
|
|
72
|
+
|
|
73
|
+
# Quiet mode (suppress progress bar)
|
|
74
|
+
docling-jobkit-multiproc config.yaml --quiet
|
|
75
|
+
|
|
76
|
+
# Full options
|
|
77
|
+
docling-jobkit-multiproc config.yaml \
|
|
78
|
+
--batch-size 30 \
|
|
79
|
+
--num-processes 8 \
|
|
80
|
+
--artifacts-path /path/to/models \
|
|
81
|
+
--enable-remote-services \
|
|
82
|
+
--allow-external-plugins
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
#### Configuration
|
|
86
|
+
|
|
87
|
+
The configuration file format is the same as for `docling-jobkit-local`. See the example configurations:
|
|
88
|
+
- S3 source/target: `dev/configs/run_multiproc_s3_example.yaml`
|
|
89
|
+
- Local path source/target: `dev/configs/run_local_folder_example.yaml`
|
|
90
|
+
|
|
91
|
+
**Note:** Only S3, Google Drive, and local_path sources support batch processing. File and HTTP sources cannot be split into batches (they do not support chunking).
|
|
92
|
+
|
|
93
|
+
#### CLI Options
|
|
94
|
+
|
|
95
|
+
- `--batch-size, -b`: Number of documents to process in each batch (default: 10)
|
|
96
|
+
- `--num-processes, -n`: Number of parallel processes (default: CPU count)
|
|
97
|
+
- `--artifacts-path`: Path to model artifacts directory
|
|
98
|
+
- `--enable-remote-services`: Enable models connecting to remote services
|
|
99
|
+
- `--allow-external-plugins`: Enable loading modules from third-party plugins
|
|
100
|
+
- `--quiet, -q`: Suppress progress bar and detailed output
|
|
101
|
+
|
|
102
|
+
### Local Sequential CLI
|
|
103
|
+
|
|
104
|
+
The `docling-jobkit-local` CLI processes documents sequentially in a single process.
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
docling-jobkit-local config.yaml
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Using Local Path Sources and Targets
|
|
111
|
+
|
|
112
|
+
Both CLIs support local file system sources and targets. Example configuration:
|
|
113
|
+
|
|
114
|
+
```yaml
|
|
115
|
+
sources:
|
|
116
|
+
- kind: local_path
|
|
117
|
+
path: ./input_documents/
|
|
118
|
+
recursive: true # optional, default true
|
|
119
|
+
pattern: "*.pdf" # optional glob pattern
|
|
120
|
+
|
|
121
|
+
target:
|
|
122
|
+
kind: local_path
|
|
123
|
+
path: ./output_documents/
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
See `dev/configs/run_local_folder_example.yaml` for a complete example.
|
|
127
|
+
|
|
58
128
|
## Kubeflow pipeline with Docling Jobkit
|
|
59
129
|
|
|
60
130
|
### Using Kubeflow pipeline web dashboard UI
|
|
@@ -1,22 +1,25 @@
|
|
|
1
1
|
docling_jobkit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
2
|
docling_jobkit/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
3
|
docling_jobkit/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
-
docling_jobkit/cli/local.py,sha256=
|
|
4
|
+
docling_jobkit/cli/local.py,sha256=DZZvzwcxjsm-q7AD2EsjpP-B7nCw1RBY9yXNc4ifeiY,3789
|
|
5
|
+
docling_jobkit/cli/multiproc.py,sha256=X8M16Alv_eQ7QPB0u9NHpSPtI6nMlRzSbD_0hcRTwMw,16633
|
|
5
6
|
docling_jobkit/connectors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
-
docling_jobkit/connectors/google_drive_helper.py,sha256=
|
|
7
|
-
docling_jobkit/connectors/google_drive_source_processor.py,sha256=
|
|
7
|
+
docling_jobkit/connectors/google_drive_helper.py,sha256=YDYLhLD-0Kb8QhRwi35PSbFu8Lc-CBhI9zWE5qyPPv0,10554
|
|
8
|
+
docling_jobkit/connectors/google_drive_source_processor.py,sha256=0bl7mEpF-wRMmJMyi6e2mcDNrqgBxWQzU4rwi81ER8I,2384
|
|
8
9
|
docling_jobkit/connectors/google_drive_target_processor.py,sha256=sp54RIsH5DEfW54LaOGkCoEUJvYP3BYGcOo955aQs10,1884
|
|
9
|
-
docling_jobkit/connectors/http_source_processor.py,sha256=
|
|
10
|
+
docling_jobkit/connectors/http_source_processor.py,sha256=i8gYWdPGE19cMoPAPwKSKe46fCm1B1NAQ64YJKyvgI8,1654
|
|
11
|
+
docling_jobkit/connectors/local_path_source_processor.py,sha256=3R-w8HEZAZGPDn_vlkpdegdK4fiIoBWlsjaV2_NbXpg,4063
|
|
12
|
+
docling_jobkit/connectors/local_path_target_processor.py,sha256=DhGCMLqMV7PuV_3jp7DL2Y9Zp1eNbs8onDAadfwz5XY,3173
|
|
10
13
|
docling_jobkit/connectors/s3_helper.py,sha256=YIDytDkuEWIeTNRa-k0LsDG3V-tG31mvbAT3JyC1iF4,6320
|
|
11
|
-
docling_jobkit/connectors/s3_source_processor.py,sha256=
|
|
14
|
+
docling_jobkit/connectors/s3_source_processor.py,sha256=TZ1X0Lfj4eMM_QpOKz9KYF6HSbxDZ8XjtKqPqbxOLJM,2353
|
|
12
15
|
docling_jobkit/connectors/s3_target_processor.py,sha256=CaX109JhZ8nFlYWcz3Sy0_Kl-pSD6Y9oz5SOPsFe9I4,1998
|
|
13
|
-
docling_jobkit/connectors/source_processor.py,sha256=
|
|
14
|
-
docling_jobkit/connectors/source_processor_factory.py,sha256=
|
|
16
|
+
docling_jobkit/connectors/source_processor.py,sha256=HW3u6x_Btl4219LNdSPJhBi-xCGbh6yMf4vA1H7iXz4,2726
|
|
17
|
+
docling_jobkit/connectors/source_processor_factory.py,sha256=FF93NMHPNMvOTdmahVk-tOdDGLJF93j6EVCAMT-YaCw,1155
|
|
15
18
|
docling_jobkit/connectors/target_processor.py,sha256=2iIJE7Ip_-1dxJGt02_ALwDC2BPBOk0AomYI6EOFTCA,1447
|
|
16
|
-
docling_jobkit/connectors/target_processor_factory.py,sha256=
|
|
19
|
+
docling_jobkit/connectors/target_processor_factory.py,sha256=A-ZuwQiuYK6HVNHTZomIVBZ3tC9D6femv3-7PSNQkEA,901
|
|
17
20
|
docling_jobkit/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
18
|
-
docling_jobkit/convert/chunking.py,sha256=
|
|
19
|
-
docling_jobkit/convert/manager.py,sha256=
|
|
21
|
+
docling_jobkit/convert/chunking.py,sha256=vvdrvg66JB8Snwnbo7yzYXox6b2rRFDS9R4UyTXvq58,12014
|
|
22
|
+
docling_jobkit/convert/manager.py,sha256=SPsPGf600vXL1H0tcXyQ7L7CCJlQcAz9gsetX4XWSPo,15938
|
|
20
23
|
docling_jobkit/convert/results.py,sha256=vQvOuXIdlmPskHwUJlXX2zyJSb2k20ip5TfzuyPH5mU,9053
|
|
21
24
|
docling_jobkit/convert/results_processor.py,sha256=TtiN6hqcUriEYMsEiyAutrgpMIz78D4pf-1HtiSjrXQ,16558
|
|
22
25
|
docling_jobkit/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -29,8 +32,8 @@ docling_jobkit/datamodel/result.py,sha256=WaasDSc_aAyuN3wfGlWqt1ZksC_5tIDHcafgeV
|
|
|
29
32
|
docling_jobkit/datamodel/s3_coords.py,sha256=9MrrtQaLJRBOdqi7bzuT3XVSbXJAdKjPlch0IAXDnfc,1260
|
|
30
33
|
docling_jobkit/datamodel/task.py,sha256=llcL3G6FM5ktkFSib8WFTuO5c_-KbJHUWugsD9RitbA,3163
|
|
31
34
|
docling_jobkit/datamodel/task_meta.py,sha256=QB_u9_TZEZlXYFlmmCgsFnsuMjzaO6QNVsnREmDu6hc,389
|
|
32
|
-
docling_jobkit/datamodel/task_sources.py,sha256=
|
|
33
|
-
docling_jobkit/datamodel/task_targets.py,sha256=
|
|
35
|
+
docling_jobkit/datamodel/task_sources.py,sha256=HQVsy6d_qcClL-I9fjo1U9n_EGY2JJVZe91pmILUw9Q,2291
|
|
36
|
+
docling_jobkit/datamodel/task_targets.py,sha256=j31iiZWAkw0viYF7vFuwP7yyorTZEcnlpVzE6RgJ_Y8,1388
|
|
34
37
|
docling_jobkit/kfp_pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
35
38
|
docling_jobkit/kfp_pipeline/docling-s3in-s3out.yaml,sha256=tarhfU24S7eTnuTKSycCqbzLanJCfXb4EHmu_D3oYq8,15207
|
|
36
39
|
docling_jobkit/kfp_pipeline/docling_s3in_s3out.py,sha256=DbIPQetoBIs9dX-JZszimjTTEEIWwiQOlLyhSt29Ybs,9818
|
|
@@ -44,15 +47,15 @@ docling_jobkit/orchestrators/kfp/kfp_pipeline.py,sha256=oglqcFAW2JS9G4Vyfby76T7a
|
|
|
44
47
|
docling_jobkit/orchestrators/kfp/notify.py,sha256=uG9c18LJn9T0RuvRZtllWL497Uhv2Qe_whglB_ta8XY,883
|
|
45
48
|
docling_jobkit/orchestrators/kfp/orchestrator.py,sha256=FkNCb9cscE4vckSYd9sA4_1SmnRJBepV738JeOwygcs,11653
|
|
46
49
|
docling_jobkit/orchestrators/local/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
47
|
-
docling_jobkit/orchestrators/local/orchestrator.py,sha256=
|
|
48
|
-
docling_jobkit/orchestrators/local/worker.py,sha256=
|
|
50
|
+
docling_jobkit/orchestrators/local/orchestrator.py,sha256=swMw3a-Lm4a13poLV2JE33uF_EeBDyM3VZ71Dhbt-_o,4921
|
|
51
|
+
docling_jobkit/orchestrators/local/worker.py,sha256=v4YNAZsSIcnNzsLDXot_3jdiy_lfIlH4h6E1hLRixS0,5818
|
|
49
52
|
docling_jobkit/orchestrators/rq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
50
|
-
docling_jobkit/orchestrators/rq/orchestrator.py,sha256=
|
|
51
|
-
docling_jobkit/orchestrators/rq/worker.py,sha256=
|
|
53
|
+
docling_jobkit/orchestrators/rq/orchestrator.py,sha256=sP0EzjhKST8R6tPW3cVyaAzGWNDd-7AdaWhWMUi7QY8,9259
|
|
54
|
+
docling_jobkit/orchestrators/rq/worker.py,sha256=tSGQCMFgHZTUqmIVkhVq8bYFdVl0eyhcm7NT22vmsHk,6719
|
|
52
55
|
docling_jobkit/ray_job/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
53
|
-
docling_jobkit/ray_job/main.py,sha256=
|
|
54
|
-
docling_jobkit-1.
|
|
55
|
-
docling_jobkit-1.
|
|
56
|
-
docling_jobkit-1.
|
|
57
|
-
docling_jobkit-1.
|
|
58
|
-
docling_jobkit-1.
|
|
56
|
+
docling_jobkit/ray_job/main.py,sha256=6VyAsn9wk3v09qH4uQb4u1YnesX_-1DJrEg2MkDXy2k,13648
|
|
57
|
+
docling_jobkit-1.9.0.dist-info/METADATA,sha256=RkesOBVu3Ue73C37am3HArxmYTLjzIhwaCjlJM5LqUs,10475
|
|
58
|
+
docling_jobkit-1.9.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
59
|
+
docling_jobkit-1.9.0.dist-info/entry_points.txt,sha256=-tTX7hZPMCPZ2zVSUhI2BTPFglsds_A6PbCpPR7gUVM,181
|
|
60
|
+
docling_jobkit-1.9.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
|
61
|
+
docling_jobkit-1.9.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|