docling-jobkit 1.8.0__tar.gz → 1.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/PKG-INFO +77 -7
  2. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/README.md +71 -0
  3. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/cli/local.py +14 -3
  4. docling_jobkit-1.9.0/docling_jobkit/cli/multiproc.py +504 -0
  5. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/connectors/google_drive_helper.py +5 -5
  6. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/connectors/google_drive_source_processor.py +30 -1
  7. docling_jobkit-1.9.0/docling_jobkit/connectors/http_source_processor.py +45 -0
  8. docling_jobkit-1.9.0/docling_jobkit/connectors/local_path_source_processor.py +126 -0
  9. docling_jobkit-1.9.0/docling_jobkit/connectors/local_path_target_processor.py +92 -0
  10. docling_jobkit-1.9.0/docling_jobkit/connectors/s3_source_processor.py +64 -0
  11. docling_jobkit-1.9.0/docling_jobkit/connectors/source_processor.py +93 -0
  12. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/connectors/source_processor_factory.py +6 -0
  13. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/connectors/target_processor_factory.py +6 -0
  14. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/convert/chunking.py +2 -1
  15. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/convert/manager.py +60 -9
  16. docling_jobkit-1.9.0/docling_jobkit/datamodel/task_sources.py +84 -0
  17. docling_jobkit-1.9.0/docling_jobkit/datamodel/task_targets.py +60 -0
  18. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/orchestrators/local/orchestrator.py +8 -0
  19. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/orchestrators/local/worker.py +6 -5
  20. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/orchestrators/rq/orchestrator.py +13 -3
  21. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/orchestrators/rq/worker.py +3 -0
  22. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/ray_job/main.py +12 -3
  23. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/pyproject.toml +14 -9
  24. docling_jobkit-1.8.0/docling_jobkit/connectors/http_source_processor.py +0 -25
  25. docling_jobkit-1.8.0/docling_jobkit/connectors/s3_source_processor.py +0 -43
  26. docling_jobkit-1.8.0/docling_jobkit/connectors/source_processor.py +0 -43
  27. docling_jobkit-1.8.0/docling_jobkit/datamodel/task_sources.py +0 -29
  28. docling_jobkit-1.8.0/docling_jobkit/datamodel/task_targets.py +0 -33
  29. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/.gitignore +0 -0
  30. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/LICENSE +0 -0
  31. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/__init__.py +0 -0
  32. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/cli/__init__.py +0 -0
  33. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/connectors/__init__.py +0 -0
  34. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/connectors/google_drive_target_processor.py +0 -0
  35. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/connectors/s3_helper.py +0 -0
  36. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/connectors/s3_target_processor.py +0 -0
  37. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/connectors/target_processor.py +0 -0
  38. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/convert/__init__.py +0 -0
  39. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/convert/results.py +0 -0
  40. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/convert/results_processor.py +0 -0
  41. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/datamodel/__init__.py +0 -0
  42. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/datamodel/callback.py +0 -0
  43. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/datamodel/chunking.py +0 -0
  44. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/datamodel/convert.py +0 -0
  45. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/datamodel/google_drive_coords.py +0 -0
  46. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/datamodel/http_inputs.py +0 -0
  47. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/datamodel/result.py +0 -0
  48. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/datamodel/s3_coords.py +0 -0
  49. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/datamodel/task.py +0 -0
  50. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/datamodel/task_meta.py +0 -0
  51. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/kfp_pipeline/__init__.py +0 -0
  52. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/kfp_pipeline/docling-s3in-s3out.yaml +0 -0
  53. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/kfp_pipeline/docling_s3in_s3out.py +0 -0
  54. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/kfp_pipeline/docling_s3in_s3out_with_infer.py +0 -0
  55. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/kfp_pipeline/docling_s3in_s3out_with_infer.yaml +0 -0
  56. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/orchestrators/__init__.py +0 -0
  57. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/orchestrators/base_notifier.py +0 -0
  58. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/orchestrators/base_orchestrator.py +0 -0
  59. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/orchestrators/kfp/__init__.py +0 -0
  60. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/orchestrators/kfp/kfp_pipeline.py +0 -0
  61. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/orchestrators/kfp/notify.py +0 -0
  62. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/orchestrators/kfp/orchestrator.py +0 -0
  63. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/orchestrators/local/__init__.py +0 -0
  64. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/orchestrators/rq/__init__.py +0 -0
  65. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/py.typed +0 -0
  66. {docling_jobkit-1.8.0 → docling_jobkit-1.9.0}/docling_jobkit/ray_job/__init__.py +0 -0
@@ -1,8 +1,9 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling-jobkit
3
- Version: 1.8.0
3
+ Version: 1.9.0
4
4
  Summary: Running a distributed job processing documents with Docling.
5
5
  Project-URL: Homepage, https://github.com/docling-project/docling-jobkit
6
+ Project-URL: Documentation, https://docling-project.github.io/docling/usage/jobkit/
6
7
  Project-URL: Repository, https://github.com/docling-project/docling-jobkit
7
8
  Project-URL: Issues, https://github.com/docling-project/docling-jobkit/issues
8
9
  Project-URL: Changelog, https://github.com/docling-project/docling-jobkit/blob/main/CHANGELOG.md
@@ -18,6 +19,7 @@ Classifier: Programming Language :: Python :: 3.10
18
19
  Classifier: Programming Language :: Python :: 3.11
19
20
  Classifier: Programming Language :: Python :: 3.12
20
21
  Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Programming Language :: Python :: 3.14
21
23
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
24
  Classifier: Topic :: Scientific/Engineering :: Information Analysis
23
25
  Classifier: Topic :: Software Development :: Build Tools
@@ -26,21 +28,18 @@ Classifier: Typing :: Typed
26
28
  Requires-Python: >=3.10
27
29
  Requires-Dist: boto3~=1.35
28
30
  Requires-Dist: docling~=2.60
29
- Requires-Dist: fastparquet~=2024.11
30
- Requires-Dist: httpx~=0.28
31
+ Requires-Dist: httpx<1,>=0.28
31
32
  Requires-Dist: pandas~=2.2
32
- Requires-Dist: pyarrow~=19.0
33
33
  Requires-Dist: pydantic-settings~=2.4
34
34
  Requires-Dist: pydantic~=2.10
35
- Requires-Dist: typer>=0.12.5
36
- Requires-Dist: typer~=0.12
35
+ Requires-Dist: typer<1,>=0.12.5
37
36
  Provides-Extra: gdrive
38
37
  Requires-Dist: google-api-python-client>=2.183.0; extra == 'gdrive'
39
38
  Requires-Dist: google-auth-oauthlib>=1.2.2; extra == 'gdrive'
40
39
  Provides-Extra: kfp
41
40
  Requires-Dist: kfp[kubernetes]>=2.10.0; extra == 'kfp'
42
41
  Provides-Extra: ray
43
- Requires-Dist: ray~=2.30; extra == 'ray'
42
+ Requires-Dist: ray~=2.30; (python_version < '3.14') and extra == 'ray'
44
43
  Provides-Extra: rq
45
44
  Requires-Dist: msgpack~=1.1; extra == 'rq'
46
45
  Requires-Dist: rq~=2.4; extra == 'rq'
@@ -55,6 +54,77 @@ Running a distributed job processing documents with Docling.
55
54
 
56
55
  ## How to use it
57
56
 
57
+ ### Local Multiprocessing CLI
58
+
59
+ The `docling-jobkit-multiproc` CLI enables parallel batch processing of documents using Python's multiprocessing. Each batch of documents is processed in a separate subprocess, allowing efficient parallel processing on a single machine.
60
+
61
+ #### Usage
62
+
63
+ ```bash
64
+ # Basic usage with default settings (batch_size=10, num_processes=CPU count)
65
+ docling-jobkit-multiproc config.yaml
66
+
67
+ # Custom batch size and number of processes
68
+ docling-jobkit-multiproc config.yaml --batch-size 20 --num-processes 4
69
+
70
+ # With model artifacts
71
+ docling-jobkit-multiproc config.yaml --artifacts-path /path/to/models
72
+
73
+ # Quiet mode (suppress progress bar)
74
+ docling-jobkit-multiproc config.yaml --quiet
75
+
76
+ # Full options
77
+ docling-jobkit-multiproc config.yaml \
78
+ --batch-size 30 \
79
+ --num-processes 8 \
80
+ --artifacts-path /path/to/models \
81
+ --enable-remote-services \
82
+ --allow-external-plugins
83
+ ```
84
+
85
+ #### Configuration
86
+
87
+ The configuration file format is the same as `docling-jobkit-local`. See example configurations:
88
+ - S3 source/target: `dev/configs/run_multiproc_s3_example.yaml`
89
+ - Local path source/target: `dev/configs/run_local_folder_example.yaml`
90
+
91
+ **Note:** Only S3, Google Drive, and local_path sources support batch processing. File and HTTP sources do not support chunking.
92
+
93
+ #### CLI Options
94
+
95
+ - `--batch-size, -b`: Number of documents to process in each batch (default: 10)
96
+ - `--num-processes, -n`: Number of parallel processes (default: CPU count)
97
+ - `--artifacts-path`: Path to model artifacts directory
98
+ - `--enable-remote-services`: Enable models connecting to remote services
99
+ - `--allow-external-plugins`: Enable loading modules from third-party plugins
100
+ - `--quiet, -q`: Suppress progress bar and detailed output
101
+
102
+ ### Local Sequential CLI
103
+
104
+ The `docling-jobkit-local` CLI processes documents sequentially in a single process.
105
+
106
+ ```bash
107
+ docling-jobkit-local config.yaml
108
+ ```
109
+
110
+ ### Using Local Path Sources and Targets
111
+
112
+ Both CLIs support local file system sources and targets. Example configuration:
113
+
114
+ ```yaml
115
+ sources:
116
+   - kind: local_path
117
+     path: ./input_documents/
118
+     recursive: true # optional, default true
119
+     pattern: "*.pdf" # optional glob pattern
120
+
121
+ target:
122
+   kind: local_path
123
+   path: ./output_documents/
124
+ ```
125
+
126
+ See `dev/configs/run_local_folder_example.yaml` for a complete example.
127
+
58
128
  ## Kubeflow pipeline with Docling Jobkit
59
129
 
60
130
  ### Using Kubeflow pipeline web dashboard UI
@@ -5,6 +5,77 @@ Running a distributed job processing documents with Docling.
5
5
 
6
6
  ## How to use it
7
7
 
8
+ ### Local Multiprocessing CLI
9
+
10
+ The `docling-jobkit-multiproc` CLI enables parallel batch processing of documents using Python's multiprocessing. Each batch of documents is processed in a separate subprocess, allowing efficient parallel processing on a single machine.
11
+
12
+ #### Usage
13
+
14
+ ```bash
15
+ # Basic usage with default settings (batch_size=10, num_processes=CPU count)
16
+ docling-jobkit-multiproc config.yaml
17
+
18
+ # Custom batch size and number of processes
19
+ docling-jobkit-multiproc config.yaml --batch-size 20 --num-processes 4
20
+
21
+ # With model artifacts
22
+ docling-jobkit-multiproc config.yaml --artifacts-path /path/to/models
23
+
24
+ # Quiet mode (suppress progress bar)
25
+ docling-jobkit-multiproc config.yaml --quiet
26
+
27
+ # Full options
28
+ docling-jobkit-multiproc config.yaml \
29
+ --batch-size 30 \
30
+ --num-processes 8 \
31
+ --artifacts-path /path/to/models \
32
+ --enable-remote-services \
33
+ --allow-external-plugins
34
+ ```
35
+
36
+ #### Configuration
37
+
38
+ The configuration file format is the same as `docling-jobkit-local`. See example configurations:
39
+ - S3 source/target: `dev/configs/run_multiproc_s3_example.yaml`
40
+ - Local path source/target: `dev/configs/run_local_folder_example.yaml`
41
+
42
+ **Note:** Only S3, Google Drive, and local_path sources support batch processing. File and HTTP sources do not support chunking.
43
+
44
+ #### CLI Options
45
+
46
+ - `--batch-size, -b`: Number of documents to process in each batch (default: 10)
47
+ - `--num-processes, -n`: Number of parallel processes (default: CPU count)
48
+ - `--artifacts-path`: Path to model artifacts directory
49
+ - `--enable-remote-services`: Enable models connecting to remote services
50
+ - `--allow-external-plugins`: Enable loading modules from third-party plugins
51
+ - `--quiet, -q`: Suppress progress bar and detailed output
52
+
53
+ ### Local Sequential CLI
54
+
55
+ The `docling-jobkit-local` CLI processes documents sequentially in a single process.
56
+
57
+ ```bash
58
+ docling-jobkit-local config.yaml
59
+ ```
60
+
61
+ ### Using Local Path Sources and Targets
62
+
63
+ Both CLIs support local file system sources and targets. Example configuration:
64
+
65
+ ```yaml
66
+ sources:
67
+   - kind: local_path
68
+     path: ./input_documents/
69
+     recursive: true # optional, default true
70
+     pattern: "*.pdf" # optional glob pattern
71
+
72
+ target:
73
+   kind: local_path
74
+   path: ./output_documents/
75
+ ```
76
+
77
+ See `dev/configs/run_local_folder_example.yaml` for a complete example.
78
+
8
79
  ## Kubeflow pipeline with Docling Jobkit
9
80
 
10
81
  ### Using Kubeflow pipeline web dashboard UI
@@ -18,9 +18,15 @@ from docling_jobkit.datamodel.task_sources import (
18
18
  TaskFileSource,
19
19
  TaskGoogleDriveSource,
20
20
  TaskHttpSource,
21
+ TaskLocalPathSource,
21
22
  TaskS3Source,
22
23
  )
23
- from docling_jobkit.datamodel.task_targets import GoogleDriveTarget, S3Target, ZipTarget
24
+ from docling_jobkit.datamodel.task_targets import (
25
+ GoogleDriveTarget,
26
+ LocalPathTarget,
27
+ S3Target,
28
+ ZipTarget,
29
+ )
24
30
 
25
31
  console = Console()
26
32
  err_console = Console(stderr=True)
@@ -34,12 +40,17 @@ app = typer.Typer(
34
40
  )
35
41
 
36
42
  JobTaskSource = Annotated[
37
- TaskFileSource | TaskHttpSource | TaskS3Source | TaskGoogleDriveSource,
43
+ TaskFileSource
44
+ | TaskHttpSource
45
+ | TaskLocalPathSource
46
+ | TaskS3Source
47
+ | TaskGoogleDriveSource,
38
48
  Field(discriminator="kind"),
39
49
  ]
40
50
 
41
51
  JobTaskTarget = Annotated[
42
- ZipTarget | S3Target | GoogleDriveTarget, Field(discriminator="kind")
52
+ ZipTarget | LocalPathTarget | S3Target | GoogleDriveTarget,
53
+ Field(discriminator="kind"),
43
54
  ]
44
55
 
45
56