docling-jobkit 1.8.0__tar.gz → 1.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/PKG-INFO +77 -7
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/README.md +71 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/cli/local.py +14 -3
- docling_jobkit-1.9.0/docling_jobkit/cli/multiproc.py +504 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/connectors/google_drive_helper.py +5 -5
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/connectors/google_drive_source_processor.py +30 -1
- docling_jobkit-1.9.0/docling_jobkit/connectors/http_source_processor.py +45 -0
- docling_jobkit-1.9.0/docling_jobkit/connectors/local_path_source_processor.py +126 -0
- docling_jobkit-1.9.0/docling_jobkit/connectors/local_path_target_processor.py +92 -0
- docling_jobkit-1.9.0/docling_jobkit/connectors/s3_source_processor.py +64 -0
- docling_jobkit-1.9.0/docling_jobkit/connectors/source_processor.py +93 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/connectors/source_processor_factory.py +6 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/connectors/target_processor_factory.py +6 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/convert/chunking.py +2 -1
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/convert/manager.py +60 -9
- docling_jobkit-1.9.0/docling_jobkit/datamodel/task_sources.py +84 -0
- docling_jobkit-1.9.0/docling_jobkit/datamodel/task_targets.py +60 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/orchestrators/local/orchestrator.py +8 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/orchestrators/local/worker.py +6 -5
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/orchestrators/rq/orchestrator.py +13 -3
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/orchestrators/rq/worker.py +3 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/ray_job/main.py +12 -3
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/pyproject.toml +14 -9
- docling_jobkit-1.8.0/docling_jobkit/connectors/http_source_processor.py +0 -25
- docling_jobkit-1.8.0/docling_jobkit/connectors/s3_source_processor.py +0 -43
- docling_jobkit-1.8.0/docling_jobkit/connectors/source_processor.py +0 -43
- docling_jobkit-1.8.0/docling_jobkit/datamodel/task_sources.py +0 -29
- docling_jobkit-1.8.0/docling_jobkit/datamodel/task_targets.py +0 -33
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/.gitignore +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/LICENSE +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/__init__.py +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/cli/__init__.py +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/connectors/__init__.py +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/connectors/google_drive_target_processor.py +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/connectors/s3_helper.py +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/connectors/s3_target_processor.py +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/connectors/target_processor.py +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/convert/__init__.py +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/convert/results.py +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/convert/results_processor.py +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/datamodel/__init__.py +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/datamodel/callback.py +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/datamodel/chunking.py +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/datamodel/convert.py +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/datamodel/google_drive_coords.py +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/datamodel/http_inputs.py +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/datamodel/result.py +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/datamodel/s3_coords.py +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/datamodel/task.py +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/datamodel/task_meta.py +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/kfp_pipeline/__init__.py +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/kfp_pipeline/docling-s3in-s3out.yaml +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/kfp_pipeline/docling_s3in_s3out.py +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/kfp_pipeline/docling_s3in_s3out_with_infer.py +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/kfp_pipeline/docling_s3in_s3out_with_infer.yaml +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/orchestrators/__init__.py +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/orchestrators/base_notifier.py +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/orchestrators/base_orchestrator.py +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/orchestrators/kfp/__init__.py +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/orchestrators/kfp/kfp_pipeline.py +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/orchestrators/kfp/notify.py +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/orchestrators/kfp/orchestrator.py +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/orchestrators/local/__init__.py +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/orchestrators/rq/__init__.py +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/py.typed +0 -0
- {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/ray_job/__init__.py +0 -0
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-jobkit
|
|
3
|
-
Version: 1.8.0
|
|
3
|
+
Version: 1.9.0
|
|
4
4
|
Summary: Running a distributed job processing documents with Docling.
|
|
5
5
|
Project-URL: Homepage, https://github.com/docling-project/docling-jobkit
|
|
6
|
+
Project-URL: Documentation, https://docling-project.github.io/docling/usage/jobkit/
|
|
6
7
|
Project-URL: Repository, https://github.com/docling-project/docling-jobkit
|
|
7
8
|
Project-URL: Issues, https://github.com/docling-project/docling-jobkit/issues
|
|
8
9
|
Project-URL: Changelog, https://github.com/docling-project/docling-jobkit/blob/main/CHANGELOG.md
|
|
@@ -18,6 +19,7 @@ Classifier: Programming Language :: Python :: 3.10
|
|
|
18
19
|
Classifier: Programming Language :: Python :: 3.11
|
|
19
20
|
Classifier: Programming Language :: Python :: 3.12
|
|
20
21
|
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
21
23
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
24
|
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
23
25
|
Classifier: Topic :: Software Development :: Build Tools
|
|
@@ -26,21 +28,18 @@ Classifier: Typing :: Typed
|
|
|
26
28
|
Requires-Python: >=3.10
|
|
27
29
|
Requires-Dist: boto3~=1.35
|
|
28
30
|
Requires-Dist: docling~=2.60
|
|
29
|
-
Requires-Dist:
|
|
30
|
-
Requires-Dist: httpx~=0.28
|
|
31
|
+
Requires-Dist: httpx<1,>=0.28
|
|
31
32
|
Requires-Dist: pandas~=2.2
|
|
32
|
-
Requires-Dist: pyarrow~=19.0
|
|
33
33
|
Requires-Dist: pydantic-settings~=2.4
|
|
34
34
|
Requires-Dist: pydantic~=2.10
|
|
35
|
-
Requires-Dist: typer
|
|
36
|
-
Requires-Dist: typer~=0.12
|
|
35
|
+
Requires-Dist: typer<1,>=0.12.5
|
|
37
36
|
Provides-Extra: gdrive
|
|
38
37
|
Requires-Dist: google-api-python-client>=2.183.0; extra == 'gdrive'
|
|
39
38
|
Requires-Dist: google-auth-oauthlib>=1.2.2; extra == 'gdrive'
|
|
40
39
|
Provides-Extra: kfp
|
|
41
40
|
Requires-Dist: kfp[kubernetes]>=2.10.0; extra == 'kfp'
|
|
42
41
|
Provides-Extra: ray
|
|
43
|
-
Requires-Dist: ray~=2.30; extra == 'ray'
|
|
42
|
+
Requires-Dist: ray~=2.30; (python_version < '3.14') and extra == 'ray'
|
|
44
43
|
Provides-Extra: rq
|
|
45
44
|
Requires-Dist: msgpack~=1.1; extra == 'rq'
|
|
46
45
|
Requires-Dist: rq~=2.4; extra == 'rq'
|
|
@@ -55,6 +54,77 @@ Running a distributed job processing documents with Docling.
|
|
|
55
54
|
|
|
56
55
|
## How to use it
|
|
57
56
|
|
|
57
|
+
### Local Multiprocessing CLI
|
|
58
|
+
|
|
59
|
+
The `docling-jobkit-multiproc` CLI enables parallel batch processing of documents using Python's multiprocessing. Each batch of documents is processed in a separate subprocess, allowing efficient parallel processing on a single machine.
|
|
60
|
+
|
|
61
|
+
#### Usage
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
# Basic usage with default settings (batch_size=10, num_processes=CPU count)
|
|
65
|
+
docling-jobkit-multiproc config.yaml
|
|
66
|
+
|
|
67
|
+
# Custom batch size and number of processes
|
|
68
|
+
docling-jobkit-multiproc config.yaml --batch-size 20 --num-processes 4
|
|
69
|
+
|
|
70
|
+
# With model artifacts
|
|
71
|
+
docling-jobkit-multiproc config.yaml --artifacts-path /path/to/models
|
|
72
|
+
|
|
73
|
+
# Quiet mode (suppress progress bar)
|
|
74
|
+
docling-jobkit-multiproc config.yaml --quiet
|
|
75
|
+
|
|
76
|
+
# Full options
|
|
77
|
+
docling-jobkit-multiproc config.yaml \
|
|
78
|
+
--batch-size 30 \
|
|
79
|
+
--num-processes 8 \
|
|
80
|
+
--artifacts-path /path/to/models \
|
|
81
|
+
--enable-remote-services \
|
|
82
|
+
--allow-external-plugins
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
#### Configuration
|
|
86
|
+
|
|
87
|
+
The configuration file format is the same as `docling-jobkit-local`. See example configurations:
|
|
88
|
+
- S3 source/target: `dev/configs/run_multiproc_s3_example.yaml`
|
|
89
|
+
- Local path source/target: `dev/configs/run_local_folder_example.yaml`
|
|
90
|
+
|
|
91
|
+
**Note:** Only S3, Google Drive, and local_path sources support batch processing. File and HTTP sources do not support chunking.
|
|
92
|
+
|
|
93
|
+
#### CLI Options
|
|
94
|
+
|
|
95
|
+
- `--batch-size, -b`: Number of documents to process in each batch (default: 10)
|
|
96
|
+
- `--num-processes, -n`: Number of parallel processes (default: CPU count)
|
|
97
|
+
- `--artifacts-path`: Path to model artifacts directory
|
|
98
|
+
- `--enable-remote-services`: Enable models connecting to remote services
|
|
99
|
+
- `--allow-external-plugins`: Enable loading modules from third-party plugins
|
|
100
|
+
- `--quiet, -q`: Suppress progress bar and detailed output
|
|
101
|
+
|
|
102
|
+
### Local Sequential CLI
|
|
103
|
+
|
|
104
|
+
The `docling-jobkit-local` CLI processes documents sequentially in a single process.
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
docling-jobkit-local config.yaml
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Using Local Path Sources and Targets
|
|
111
|
+
|
|
112
|
+
Both CLIs support local file system sources and targets. Example configuration:
|
|
113
|
+
|
|
114
|
+
```yaml
|
|
115
|
+
sources:
|
|
116
|
+
- kind: local_path
|
|
117
|
+
path: ./input_documents/
|
|
118
|
+
recursive: true # optional, default true
|
|
119
|
+
pattern: "*.pdf" # optional glob pattern
|
|
120
|
+
|
|
121
|
+
target:
|
|
122
|
+
kind: local_path
|
|
123
|
+
path: ./output_documents/
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
See `dev/configs/run_local_folder_example.yaml` for a complete example.
|
|
127
|
+
|
|
58
128
|
## Kubeflow pipeline with Docling Jobkit
|
|
59
129
|
|
|
60
130
|
### Using Kubeflow pipeline web dashboard UI
|
|
@@ -5,6 +5,77 @@ Running a distributed job processing documents with Docling.
|
|
|
5
5
|
|
|
6
6
|
## How to use it
|
|
7
7
|
|
|
8
|
+
### Local Multiprocessing CLI
|
|
9
|
+
|
|
10
|
+
The `docling-jobkit-multiproc` CLI enables parallel batch processing of documents using Python's multiprocessing. Each batch of documents is processed in a separate subprocess, allowing efficient parallel processing on a single machine.
|
|
11
|
+
|
|
12
|
+
#### Usage
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
# Basic usage with default settings (batch_size=10, num_processes=CPU count)
|
|
16
|
+
docling-jobkit-multiproc config.yaml
|
|
17
|
+
|
|
18
|
+
# Custom batch size and number of processes
|
|
19
|
+
docling-jobkit-multiproc config.yaml --batch-size 20 --num-processes 4
|
|
20
|
+
|
|
21
|
+
# With model artifacts
|
|
22
|
+
docling-jobkit-multiproc config.yaml --artifacts-path /path/to/models
|
|
23
|
+
|
|
24
|
+
# Quiet mode (suppress progress bar)
|
|
25
|
+
docling-jobkit-multiproc config.yaml --quiet
|
|
26
|
+
|
|
27
|
+
# Full options
|
|
28
|
+
docling-jobkit-multiproc config.yaml \
|
|
29
|
+
--batch-size 30 \
|
|
30
|
+
--num-processes 8 \
|
|
31
|
+
--artifacts-path /path/to/models \
|
|
32
|
+
--enable-remote-services \
|
|
33
|
+
--allow-external-plugins
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
#### Configuration
|
|
37
|
+
|
|
38
|
+
The configuration file format is the same as `docling-jobkit-local`. See example configurations:
|
|
39
|
+
- S3 source/target: `dev/configs/run_multiproc_s3_example.yaml`
|
|
40
|
+
- Local path source/target: `dev/configs/run_local_folder_example.yaml`
|
|
41
|
+
|
|
42
|
+
**Note:** Only S3, Google Drive, and local_path sources support batch processing. File and HTTP sources do not support chunking.
|
|
43
|
+
|
|
44
|
+
#### CLI Options
|
|
45
|
+
|
|
46
|
+
- `--batch-size, -b`: Number of documents to process in each batch (default: 10)
|
|
47
|
+
- `--num-processes, -n`: Number of parallel processes (default: CPU count)
|
|
48
|
+
- `--artifacts-path`: Path to model artifacts directory
|
|
49
|
+
- `--enable-remote-services`: Enable models connecting to remote services
|
|
50
|
+
- `--allow-external-plugins`: Enable loading modules from third-party plugins
|
|
51
|
+
- `--quiet, -q`: Suppress progress bar and detailed output
|
|
52
|
+
|
|
53
|
+
### Local Sequential CLI
|
|
54
|
+
|
|
55
|
+
The `docling-jobkit-local` CLI processes documents sequentially in a single process.
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
docling-jobkit-local config.yaml
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### Using Local Path Sources and Targets
|
|
62
|
+
|
|
63
|
+
Both CLIs support local file system sources and targets. Example configuration:
|
|
64
|
+
|
|
65
|
+
```yaml
|
|
66
|
+
sources:
|
|
67
|
+
- kind: local_path
|
|
68
|
+
path: ./input_documents/
|
|
69
|
+
recursive: true # optional, default true
|
|
70
|
+
pattern: "*.pdf" # optional glob pattern
|
|
71
|
+
|
|
72
|
+
target:
|
|
73
|
+
kind: local_path
|
|
74
|
+
path: ./output_documents/
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
See `dev/configs/run_local_folder_example.yaml` for a complete example.
|
|
78
|
+
|
|
8
79
|
## Kubeflow pipeline with Docling Jobkit
|
|
9
80
|
|
|
10
81
|
### Using Kubeflow pipeline web dashboard UI
|
|
@@ -18,9 +18,15 @@ from docling_jobkit.datamodel.task_sources import (
|
|
|
18
18
|
TaskFileSource,
|
|
19
19
|
TaskGoogleDriveSource,
|
|
20
20
|
TaskHttpSource,
|
|
21
|
+
TaskLocalPathSource,
|
|
21
22
|
TaskS3Source,
|
|
22
23
|
)
|
|
23
|
-
from docling_jobkit.datamodel.task_targets import GoogleDriveTarget, S3Target, ZipTarget
|
|
24
|
+
from docling_jobkit.datamodel.task_targets import (
|
|
25
|
+
GoogleDriveTarget,
|
|
26
|
+
LocalPathTarget,
|
|
27
|
+
S3Target,
|
|
28
|
+
ZipTarget,
|
|
29
|
+
)
|
|
24
30
|
|
|
25
31
|
console = Console()
|
|
26
32
|
err_console = Console(stderr=True)
|
|
@@ -34,12 +40,17 @@ app = typer.Typer(
|
|
|
34
40
|
)
|
|
35
41
|
|
|
36
42
|
JobTaskSource = Annotated[
|
|
37
|
-
TaskFileSource | TaskHttpSource | TaskS3Source | TaskGoogleDriveSource,
|
|
43
|
+
TaskFileSource
|
|
44
|
+
| TaskHttpSource
|
|
45
|
+
| TaskLocalPathSource
|
|
46
|
+
| TaskS3Source
|
|
47
|
+
| TaskGoogleDriveSource,
|
|
38
48
|
Field(discriminator="kind"),
|
|
39
49
|
]
|
|
40
50
|
|
|
41
51
|
JobTaskTarget = Annotated[
|
|
42
|
-
ZipTarget | S3Target | GoogleDriveTarget,
|
|
52
|
+
ZipTarget | LocalPathTarget | S3Target | GoogleDriveTarget,
|
|
53
|
+
Field(discriminator="kind"),
|
|
43
54
|
]
|
|
44
55
|
|
|
45
56
|
|