pipen-cli-gbatch 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pipen-cli-gbatch might be problematic. Click here for more details.

@@ -0,0 +1,480 @@
1
+ """A pipen cli plugin to run command via Google Cloud Batch.
2
+
3
+ The idea is to wrap the command as a single-process pipen (daemon) pipeline and use
4
+ the gbatch scheduler to run it on Google Cloud Batch.
5
+
6
+ For example, to run a command like:
7
+ python myscript.py --input input.txt --output output.txt
8
+
9
+ You can run it with:
10
+ pipen gbatch -- python myscript.py --input input.txt --output output.txt
11
+
12
+ In order to provide configurations like we do for a normal pipen pipeline, you
13
+ can also provide a config file (the [cli-gbatch] section will be used):
14
+ pipen gbatch @config.toml -- \\
15
+ python myscript.py --input input.txt --output output.txt
16
+
17
+ We can also use the --nowait option to run the command in a detached mode:
18
+ pipen gbatch --nowait -- \\
19
+ python myscript.py --input input.txt --output output.txt
20
+
21
+ Or by default, it will wait for the command to complete:
22
+ pipen gbatch -- \\
23
+ python myscript.py --input input.txt --output output.txt
24
+
25
+ While waiting, the running logs will be pulled and shown in the terminal.
26
+
27
+ Because the daemon pipeline is running on Google Cloud Batch, a Google Storage
28
+ Bucket path is required for the workdir. For example: gs://my-bucket/workdir
29
+
30
+ A unique job id will be generated per the name (--name) and workdir, so that if
31
+ the same command is run again with the same name and workdir, it will not start a
32
+ new job, but just attach to the existing job and pull the logs.
33
+
34
+ if `--name` is not provided in the command line or `cli-gbatch.name` is not
35
+ provided from the configuration file, it will try to grab the name (`--name`) from
36
+ the command line arguments after `--`, or else use "name" from the root section
37
+ of the configuration file, with a "GbatchDaemon" suffix. If nothing can be found, a
38
+ default name "PipenCliGbatchDaemon" will be used.
39
+
40
+ When running in the detached mode, one can also pull the logs later by:
41
+ pipen gbatch --view-logs all -- \\
42
+ python myscript.py --input input.txt --output output.txt
43
+
44
+ Then a workdir `{workdir}/<daemon pipeline name>/` will be created to store the
45
+ meta information.
46
+
47
+ One can have some default configuration file for the daemon pipeline in either/both
48
+ the user home directory `~/.pipen.toml` or the current working directory
49
+ `./.pipen.toml`. The configurations in these files will be overridden by
50
+ the command line arguments.
51
+
52
+ The API can also be used to run commands programmatically:
53
+
54
+ >>> from pipen_cli_gbatch import CliGbatchDaemon
55
+ >>> pipe = CliGbatchDaemon(config_for_daemon, command)
56
+ >>> await pipe.run()
57
+
58
+ Note that the daemon pipeline will always be running without caching, so that the
59
+ command will always be executed when the pipeline is run.
60
+ """
61
+
62
+ from __future__ import annotations
63
+
64
+ import sys
65
+ import asyncio
66
+ from pathlib import Path
67
+ from time import sleep
68
+ from diot import Diot
69
+ from argx import Namespace
70
+ from yunpath import AnyPath, GSPath
71
+ from simpleconf import Config, ProfileConfig
72
+ from xqute import Xqute, plugin
73
+ from xqute.utils import logger, RichHandler, DuplicateFilter
74
+ from pipen import __version__ as pipen_version
75
+ from pipen.defaults import CONFIG_FILES
76
+ from pipen.cli import CLIPlugin
77
+ from pipen.scheduler import GbatchScheduler
78
+ from pipen_poplog import LogsPopulator
79
+
80
+ __version__ = "0.0.0"
81
+ __all__ = ("CliGbatchPlugin", "CliGbatchDaemon")
82
+
83
+
84
class XquteCliGbatchPlugin:
    """Xqute plugin that relays the job's stdout/stderr to the local logger.

    Args:
        name: The name of the plugin.
        log_start: When false, `on_job_started` returns without pointing
            the populators at the job's log files.
    """

    def __init__(self, name: str = "logging", log_start: bool = True):
        self.name = name
        self.log_start = log_start
        self.stdout_populator = LogsPopulator()
        self.stderr_populator = LogsPopulator()

    @plugin.impl
    async def on_job_started(self, scheduler, job):
        """Point the populators at the log files of job "0"."""
        if self.log_start:
            job_dir = ("0",)
            self.stdout_populator.logfile = scheduler.workdir.joinpath(
                *job_dir, "job.stdout"
            )
            self.stderr_populator.logfile = scheduler.workdir.joinpath(
                *job_dir, "job.stderr"
            )
            logger.info("Job is picked up by Google Batch, pulling stdout/stderr...")

    @plugin.impl
    async def on_job_polling(self, scheduler, job, counter):
        """Log newly populated stdout/stderr lines on every fifth poll."""
        # Only act when the counter is a multiple of 5, to pull less often
        if counter % 5:
            return

        streams = (
            (self.stdout_populator, logger.info, "/STDOUT"),
            (self.stderr_populator, logger.error, "/STDERR"),
        )
        # stdout is handled completely before stderr
        for populator, emit, tag in streams:
            fresh = populator.populate()
            populator.increment_counter(len(fresh))
            for text in fresh:
                emit(f"{tag} {text}")

    @plugin.impl
    async def on_job_killed(self, scheduler, job):
        """Run one more polling pass when the job is killed."""
        # counter=0 passes the "every fifth poll" check
        await self.on_job_polling.impl(self, scheduler, job, 0)

    @plugin.impl
    async def on_job_failed(self, scheduler, job):
        """Run one more polling pass when the job fails."""
        await self.on_job_polling.impl(self, scheduler, job, 0)

    @plugin.impl
    async def on_job_succeeded(self, scheduler, job):
        """Run one more polling pass when the job succeeds."""
        await self.on_job_polling.impl(self, scheduler, job, 0)

    @plugin.impl
    def on_shutdown(self, xqute, sig):
        """Drop both populators and leave `None` in their place."""
        del self.stdout_populator
        del self.stderr_populator
        self.stdout_populator = None
        self.stderr_populator = None
136
+
137
+
138
class CliGbatchDaemon:
    """Wrap a command as a single xqute job and run it on Google Cloud Batch.

    Depending on the configuration, `run` either waits for the command
    (`_run_wait`), submits it in a detached mode (`nowait`), or only follows
    the logs of an earlier submission (`view_logs`).

    Args:
        config: The daemon options, either a mapping or an argument
            namespace. Every key that is not in `_NON_SCHEDULER_KEYS` is
            passed to the gbatch scheduler as a scheduler option.
        command: The command to run, as a list of arguments.
    """

    # Options consumed by the daemon itself or passed to Xqute by name;
    # every other configuration key is treated as a scheduler option.
    _NON_SCHEDULER_KEYS = frozenset(
        {
            "workdir",
            "error_strategy",
            "num_retries",
            "jobname_prefix",
            "COMMAND",
            "nowait",
            "view_logs",
            "command",
            "name",
            "profile",
            "version",
            "loglevel",
            "mounts",
        }
    )

    def __init__(self, config: "dict | Namespace", command: "list[str]"):
        if isinstance(config, Namespace):
            self.config = Diot(vars(config))
        else:
            self.config = Diot(config)
        self.command = command

    def _get_arg_from_command(self, arg: str) -> "str | None":
        """Get the value of the given argument from the wrapped command.

        The lookup order is `--<arg>=VALUE`, then `--<arg> VALUE`, then the
        `<arg>` key of the first `@configfile` found in the command.

        Args:
            arg: The argument name, without the leading dashes.

        Returns:
            The value found, or None when the argument is not present.

        Raises:
            FileNotFoundError: When an `@configfile` is consulted but the
                file does not exist.
        """
        command = self.command or []
        flag = f"--{arg}"
        prefix = f"{flag}="

        # --arg=value takes precedence over --arg value
        for item in command:
            if item.startswith(prefix):
                return item[len(prefix):]

        # Only the first occurrence of the bare flag counts, and it needs a
        # following item to serve as its value.
        for pos, item in enumerate(command):
            if item == flag:
                if pos + 1 < len(command):
                    return command[pos + 1]
                break

        for item in command:
            if item.startswith("@"):
                config_file = AnyPath(item[1:])
                if not config_file.exists():
                    raise FileNotFoundError(f"Config file not found: {config_file}")

                conf = Config.load_one(config_file)
                # Look up the requested argument, not always "workdir"
                return conf.get(arg, None)

        return None

    def _check_workdir(self):
        """Resolve the workdir and make sure it is a Google Storage path.

        A workdir from the configuration wins; otherwise the one from the
        wrapped command is used. Prints an error and exits with status 1
        when no Google Storage Bucket path can be found.
        """
        # `or` (rather than a .get() default) so that a present-but-empty
        # value falls back to the command, and the command is only inspected
        # when needed.
        workdir = self.config.get("workdir") or self._get_arg_from_command("workdir")

        if not workdir or not isinstance(AnyPath(workdir), GSPath):
            print(
                "\033[1;4mError\033[0m: A Google Storage Bucket path is required for "
                "--workdir.\n"
            )
            sys.exit(1)

        self.config["workdir"] = workdir

    def _infer_name(self):
        """Set `config["name"]`, inferring it from the command if missing."""
        name = self.config.get("name", None)
        if not name:
            command_name = self._get_arg_from_command("name")
            if not command_name:
                name = "PipenCliGbatchDaemon"
            else:
                name = f"{command_name}GbatchDaemon"

        self.config["name"] = name

    def _infer_jobname_prefix(self):
        """Set `config["jobname_prefix"]`, inferring it if missing."""
        prefix = self.config.get("jobname_prefix", None)
        if not prefix:
            command_name = self._get_arg_from_command("name")
            if not command_name:
                prefix = "pipen-gbatch-daemon"
            else:
                prefix = f"{command_name.lower()}-gbatch-daemon"

        self.config["jobname_prefix"] = prefix

    def _setup_mount(self):
        """Add the workdir to the mounts, mounted at the scheduler's metadir."""
        configured = self.config.get("mount") or []
        if isinstance(configured, str):
            configured = [configured]

        metadir_mount = f'{self.config["workdir"]}:{GbatchScheduler.MOUNTED_METADIR}'
        # Build a new list so that a list passed in by the caller is not mutated
        self.config["mount"] = [*configured, metadir_mount]

    def _scheduler_opts(self) -> dict:
        """Collect the configuration items that are scheduler options."""
        return {
            key: val
            for key, val in self.config.items()
            if key not in self._NON_SCHEDULER_KEYS
        }

    def _get_xqute(self) -> "Xqute":
        """Create the Xqute instance that runs the command via gbatch."""
        plugins = ["-xqute.pipen"]
        # Logs are only pulled live when we wait for the job
        if not self.config.get("nowait") and not self.config.get("view_logs"):
            plugins.append(XquteCliGbatchPlugin())

        return Xqute(
            "gbatch",
            error_strategy=self.config.get("error_strategy", "halt"),
            num_retries=self.config.get("num_retries", 0),
            jobname_prefix=self.config.jobname_prefix,
            scheduler_opts=self._scheduler_opts(),
            workdir=(f'{self.config.workdir}/{self.config["name"]}'),
            plugins=plugins,
        )

    def _run_version(self):
        """Print the versions of this package and of pipen."""
        print(f"pipen-cli-gbatch version: v{__version__}")
        print(f"pipen version: v{pipen_version}")

    def _show_scheduler_opts(self):
        """Log the scheduler options at debug level."""
        logger.debug("Scheduler Options:")
        for key, val in self._scheduler_opts().items():
            logger.debug(f"- {key}: {val}")

    def _require_command(self):
        """Print an error and exit with status 1 when no command is given."""
        if not self.command:
            print("\033[1;4mError\033[0m: No command to run is provided.\n")
            sys.exit(1)

    async def _run_wait(self):
        """Run the command and wait for it to complete."""
        self._require_command()

        xqute = self._get_xqute()

        await xqute.put(self.command)
        await xqute.run_until_complete()

    async def _run_nowait(self):
        """Run the pipeline without waiting for completion."""
        self._require_command()

        xqute = self._get_xqute()
        name = self.config["name"]
        workdir = self.config["workdir"]

        try:
            job = xqute.scheduler.create_job(0, self.command)
            if await xqute.scheduler.job_is_running(job):
                logger.info(f"Job is already submitted or running: {job.jid}")
                logger.info("")
                logger.info("To cancel the job, run:")
                logger.info(
                    "> gcloud batch jobs cancel "
                    f"--location {xqute.scheduler.location} {job.jid}"
                )
            else:
                await xqute.scheduler.submit_job_and_update_status(job)
                logger.info(f"Job is running in a detached mode: {job.jid}")

            logger.info("")
            logger.info("To check the job status, run:")
            logger.info(
                "💻> gcloud batch jobs describe"
                f" --location {xqute.scheduler.location} {job.jid}"
            )
            logger.info("")

            log_hints = (
                ("all", "To pull the logs from both stdout and stderr, run:"),
                ("stdout", "To pull the logs from stdout, run:"),
                ("stderr", "To pull the logs from stderr, run:"),
            )
            for source, title in log_hints:
                logger.info(title)
                logger.info(
                    f"💻> pipen gbatch --view-logs {source}"
                    f" --name {name}"
                    f" --workdir {workdir}"
                )

            logger.info("")
            logger.info("To check the meta information of the daemon job, go to:")
            logger.info(f"📁 {workdir}/{name}/0/")
            logger.info("")
        finally:
            # The job was driven by hand rather than through xqute's run
            # loop, so leave the plugin context explicitly.
            if xqute.plugin_context:
                xqute.plugin_context.__exit__()

    def _run_view_logs(self):
        """Follow the stdout/stderr logs of the daemon job until interrupted.

        Prints an error and exits with status 1 when the job directory does
        not exist. Otherwise loops forever, polling every 5 seconds.
        """
        jobdir = AnyPath(self.config["workdir"]) / self.config["name"] / "0"
        if not jobdir.exists():
            print(f"\033[1;4mError\033[0m: Workdir not found: {jobdir}\n")
            sys.exit(1)

        view_logs = self.config.get("view_logs")
        log_source = {}
        # Anything other than "stdout"/"stderr" selects both streams
        if view_logs != "stderr":
            log_source["STDOUT"] = jobdir.joinpath("job.stdout")
        if view_logs != "stdout":
            log_source["STDERR"] = jobdir.joinpath("job.stderr")

        populators = {
            key: LogsPopulator(logfile=val) for key, val in log_source.items()
        }
        tagged = len(log_source) > 1

        logger.info(f"Pulling logs from: {', '.join(log_source.keys())}")
        logger.info("Press Ctrl-C (twice) to stop.")
        print("")
        while True:
            for key, populator in populators.items():
                for line in populator.populate():
                    # Only tag the lines when more than one stream is shown
                    print(f"/{key} {line}" if tagged else line)
            sleep(5)

    def setup(self):
        """Configure logging and fill in the inferred configuration items."""
        logger.addHandler(RichHandler(show_path=False, show_time=False))
        logger.addFilter(DuplicateFilter())
        logger.setLevel((self.config.get("loglevel") or "INFO").upper())

        self._check_workdir()
        self._infer_name()
        self._infer_jobname_prefix()
        self._setup_mount()

    async def run(self):
        """Run the action selected by the configuration."""
        if self.config.get("version"):
            self._run_version()
            return

        self.setup()
        self._show_scheduler_opts()
        if self.config.get("nowait"):
            await self._run_nowait()
        elif self.config.get("view_logs"):
            self._run_view_logs()
        else:
            await self._run_wait()
392
+
393
+
394
+ class CliGbatchPlugin(CLIPlugin):
395
+ """Simplify running commands via Google Cloud Batch."""
396
+
397
+ __version__ = __version__
398
+ name = "gbatch"
399
+
400
@staticmethod
def _get_defaults_from_config(
    config_files: list[str],
    profile: str | None,
) -> dict:
    """Load the `scheduler_opts` of a profile from the config files.

    Args:
        config_files: The configuration files to load.
        profile: The profile to use; nothing is loaded when it is empty.

    Returns:
        The `scheduler_opts` of the profile, or an empty dict.
    """
    if profile:
        loaded = ProfileConfig.load(
            *config_files,
            ignore_nonexist=True,
            base=profile,
            allow_missing_base=True,
        )
        detached = ProfileConfig.detach(loaded)
        return detached.get("scheduler_opts", {})

    return {}
417
+
418
+ def __init__(self, parser, subparser):
# Initialize the base plugin, set the usage examples shown as the epilog
# of the subcommand, then register the arguments declared in the
# daemon_args.toml file that sits next to this module.
419
+ super().__init__(parser, subparser)
420
+ subparser.epilog = """\033[1;4mExamples\033[0m:
421
+
422
+ \u200B
423
+ # Run a command and wait for it to complete
424
+ > pipen gbatch --workdir gs://my-bucket/workdir -- \\
425
+ python myscript.py --input input.txt --output output.txt
426
+
427
+ \u200B
428
+ # Run a command in a detached mode
429
+ > pipen gbatch --nowait --project $PROJECT --location $LOCATION \\
430
+ --workdir gs://my-bucket/workdir -- \\
431
+ python myscript.py --input input.txt --output output.txt
432
+
433
+ \u200B
434
+ # If you have a profile defined in ~/.pipen.toml or ./.pipen.toml
435
+ > pipen gbatch --profile myprofile -- \\
436
+ python myscript.py --input input.txt --output output.txt
437
+
438
+ \u200B
439
+ # View the logs of a previously run command
440
+ > pipen gbatch --view-logs all --name my-daemon-name \\
441
+ --workdir gs://my-bucket/workdir
442
+ """
# The argument definitions are read from the TOML file; its
# "mutually_exclusive_groups", "groups" and "arguments" tables are passed
# to the subparser, each defaulting to an empty list when absent.
443
+ argfile = Path(__file__).parent / "daemon_args.toml"
444
+ args_def = Config.load(argfile, loader="toml")
445
+ mutually_exclusive_groups = args_def.get("mutually_exclusive_groups", [])
446
+ groups = args_def.get("groups", [])
447
+ arguments = args_def.get("arguments", [])
448
+ subparser._add_decedents(mutually_exclusive_groups, groups, [], arguments, [])
449
+
450
def parse_args(self, known_parsed, unparsed_argv: list[str]) -> Namespace:
    """Parse the arguments and fill in the defaults from the profile.

    The command to run has to be introduced by `--`, which is stripped
    from the parsed command. Options that are still None after parsing
    are taken from the scheduler options of the selected profile.
    """
    known_parsed = super().parse_args(known_parsed, unparsed_argv)

    command = known_parsed.command
    if command:
        if command[0] != "--":
            print("\033[1;4mError\033[0m: The command to run must be after '--'.\n")
            sys.exit(1)

        known_parsed.command = command[1:]

    profile_defaults = self.__class__._get_defaults_from_config(
        CONFIG_FILES,
        known_parsed.profile,
    )
    for option, default in profile_defaults.items():
        # The command never comes from a profile, and None is no default
        if option == "command" or default is None:
            continue

        # Only fill in what has not been given on the command line
        if getattr(known_parsed, option, None) is None:
            setattr(known_parsed, option, default)

    return known_parsed
477
+
478
def exec_command(self, args: Namespace) -> None:
    """Build the daemon from the parsed arguments and run it to completion."""
    daemon = CliGbatchDaemon(args, args.command)
    asyncio.run(daemon.run())
@@ -0,0 +1,197 @@
1
+ [[mutually_exclusive_groups]]
2
+
3
+ [[mutually_exclusive_groups.arguments]]
4
+ flags = ["--nowait"]
5
+ action = "store_true"
6
+ default = false
7
+ help = "Run the command in a detached mode without waiting for its completion."
8
+
9
+ [[mutually_exclusive_groups.arguments]]
10
+ flags = ["--view-logs"]
11
+ choices = ["all", "stdout", "stderr"]
12
+ help = "View the logs of a job."
13
+
14
+ [[arguments]]
15
+ flags = ["--name"]
16
+ type = "str"
17
+ help = """The name of the daemon pipeline.
18
+ If not provided, try to generate one from the command to run.
19
+ If the command is also not provided, use 'PipenCliGbatchDaemon' as the name."""
20
+
21
+ [[arguments]]
22
+ flags = ["--profile"]
23
+ type = "str"
24
+ help = """Use the `scheduler_opts` as the Scheduler Options of a given profile from pipen configuration files,
25
+ including ~/.pipen.toml and ./.pipen.toml.
26
+ Note that if not provided, nothing will be loaded from the configuration files.
27
+ """
28
+
29
+ [[arguments]]
30
+ flags = ["--version"]
31
+ action = "store_true"
32
+ default = false
33
+ help = "Show the version of the pipen-cli-gbatch package."
34
+
35
+ [[arguments]]
36
+ flags = ["--loglevel"]
37
+ choices = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL", "debug", "info", "warning", "error", "critical"]
38
+ default = "INFO"
39
+ help = "Set the logging level for the daemon process."
40
+
41
+ [[groups]]
42
+ title = "Key Options"
43
+ description = "The key options to run the command."
44
+
45
+ [[groups.arguments]]
46
+ flags = ["--workdir"]
47
+ type = "str"
48
+ help = """The workdir (a Google Storage Bucket path is required) to store the meta information of the daemon pipeline.
49
+ If not provided, the one from the command will be used."""
50
+
51
+ [[groups.arguments]]
52
+ flags = ["command"]
53
+ nargs = "..."
54
+ help = "The command passed after `--` to run, with all its arguments. Note that the command should be provided after `--`."
55
+
56
+ [[groups]]
57
+ title = "Scheduler Options"
58
+ description = "The options to configure the gbatch scheduler."
59
+
60
+
61
+ [[groups.arguments]]
62
+ flags = ["--error-strategy"]
63
+ choices = ["retry", "halt"]
64
+ default = "halt"
65
+ help = "The strategy when there is error happened"
66
+
67
+ [[groups.arguments]]
68
+ flags = ["--num-retries"]
69
+ type = "int"
70
+ default = 0
71
+ help = "The number of retries when there is error happened. Only valid when --error-strategy is 'retry'."
72
+
73
+ [[groups.arguments]]
74
+ flags = ["--prescript"]
75
+ type = "str"
76
+ help = "The prescript to run before the main command."
77
+
78
+ [[groups.arguments]]
79
+ flags = ["--postscript"]
80
+ type = "str"
81
+ help = "The postscript to run after the main command."
82
+
83
+ [[groups.arguments]]
84
+ flags = ["--jobname-prefix"]
85
+ type = "str"
86
+ help = """The prefix of the name prefix of the daemon job.
87
+ If not provided, try to generate one from the command to run.
88
+ If the command is also not provided, use 'pipen-gbatch-daemon' as the prefix."""
89
+
90
+ [[groups.arguments]]
91
+ flags = ["--recheck-interval"]
92
+ type = "int"
93
+ default = 600
94
+ help = "The interval to recheck the job status, each takes about 0.1 seconds."
95
+
96
+ [[groups.arguments]]
97
+ flags = ["--cwd"]
98
+ type = "str"
99
+ help = "The working directory to run the command. If not provided, the current directory is used. You can pass either a mounted path (inside the VM) or a Google Storage Bucket path (gs://...). If a Google Storage Bucket path is provided, the mounted path will be inferred from the mounted paths of the VM."
100
+
101
+ [[groups.arguments]]
102
+ flags = ["--project"]
103
+ type = "str"
104
+ # required = true
105
+ help = "The Google Cloud project to run the job."
106
+
107
+ [[groups.arguments]]
108
+ flags = ["--location"]
109
+ type = "str"
110
+ # required = true
111
+ help = "The location to run the job."
112
+
113
+ [[groups.arguments]]
114
+ flags = ["--mount"]
115
+ type = "list"
116
+ default = []
117
+ action = "clear_extend"
118
+ help = """The list of mounts to mount to the VM, each in the format of SOURCE:TARGET, where SOURCE must be either a Google Storage Bucket path (gs://...)."""
119
+
120
+ [[groups.arguments]]
121
+ flags = ["--service-account"]
122
+ type = "str"
123
+ help = "The service account to run the job."
124
+
125
+ [[groups.arguments]]
126
+ flags = ["--network"]
127
+ type = "str"
128
+ help = "The network to run the job."
129
+
130
+ [[groups.arguments]]
131
+ flags = ["--subnetwork"]
132
+ type = "str"
133
+ help = "The subnetwork to run the job."
134
+
135
+ [[groups.arguments]]
136
+ flags = ["--no-external-ip-address"]
137
+ action = "store_true"
138
+ help = "Whether to disable external IP address for the VM."
139
+
140
+ [[groups.arguments]]
141
+ flags = ["--machine-type"]
142
+ type = "str"
143
+ help = "The machine type of the VM."
144
+
145
+ [[groups.arguments]]
146
+ flags = ["--provisioning-model"]
147
+ choices = ["STANDARD", "SPOT"]
148
+ help = "The provisioning model of the VM."
149
+
150
+ [[groups.arguments]]
151
+ flags = ["--image-uri"]
152
+ type = "str"
153
+ help = "The custom image URI of the VM."
154
+
155
+ [[groups.arguments]]
156
+ flags = ["--entrypoint"]
157
+ type = "str"
158
+ help = "The entry point of the container to run the command."
159
+
160
+ [[groups.arguments]]
161
+ flags = ["--commands"]
162
+ type = "list"
163
+ default = []
164
+ action = "clear_extend"
165
+ help = "The list of commands to run in the container, each as a separate string."
166
+
167
+ [[groups.arguments]]
168
+ flags = ["--runnables"]
169
+ type = "json"
170
+ help = """The JSON string of extra settings of runnables add to the job.json.
171
+ Refer to https://cloud.google.com/batch/docs/reference/rest/v1/projects.locations.jobs#Runnable for details.
172
+ You can have an extra key 'order' for each runnable, where negative values mean to run before the main command,
173
+ and positive values mean to run after the main command."""
174
+
175
+ [[groups.arguments]]
176
+ flags = ["--allocationPolicy"]
177
+ type = "json"
178
+ default = "{}"
179
+ help = "The JSON string of extra settings of allocationPolicy add to the job.json. Refer to https://cloud.google.com/batch/docs/reference/rest/v1/projects.locations.jobs#AllocationPolicy for details."
180
+
181
+ [[groups.arguments]]
182
+ flags = ["--taskGroups"]
183
+ type = "json"
184
+ default = "[]"
185
+ help = "The JSON string of extra settings of taskGroups add to the job.json. Refer to https://cloud.google.com/batch/docs/reference/rest/v1/projects.locations.jobs#TaskGroup for details."
186
+
187
+ [[groups.arguments]]
188
+ flags = ["--labels"]
189
+ type = "json"
190
+ default = "{}"
191
+ help = "The JSON string of labels to add to the job. Refer to https://cloud.google.com/batch/docs/reference/rest/v1/projects.locations.jobs#Job.FIELDS.labels for details."
192
+
193
+ [[groups.arguments]]
194
+ flags = ["--gcloud"]
195
+ type = "str"
196
+ default = "gcloud"
197
+ help = "The path to the gcloud command."
@@ -0,0 +1,238 @@
1
+ Metadata-Version: 2.3
2
+ Name: pipen-cli-gbatch
3
+ Version: 0.0.0
4
+ Summary: A pipen cli plugin to run command via Google Cloud Batch
5
+ License: MIT
6
+ Author: pwwang
7
+ Author-email: pwwang@pwwang.com
8
+ Requires-Python: >=3.9,<4.0
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.9
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Requires-Dist: google-cloud-storage (>=3.0.0,<4.0.0)
17
+ Requires-Dist: pipen (>=0.17.17,<0.18.0)
18
+ Requires-Dist: pipen-poplog (>=0.3.6,<0.4.0)
19
+ Description-Content-Type: text/markdown
20
+
21
+ # pipen-cli-gbatch
22
+
23
+ A pipen CLI plugin to run commands via Google Cloud Batch.
24
+
25
+ The idea is to submit the command using xqute and use the gbatch scheduler to run it on Google Cloud Batch.
26
+
27
+ ## Installation
28
+
29
+ ```bash
30
+ pip install pipen-cli-gbatch
31
+ ```
32
+
33
+ ## Usage
34
+
35
+ ### Basic Command Execution
36
+
37
+ To run a command like:
38
+
39
+ ```bash
40
+ python myscript.py --input input.txt --output output.txt
41
+ ```
42
+
43
+ You can run it with:
44
+
45
+ ```bash
46
+ pipen gbatch -- python myscript.py --input input.txt --output output.txt
47
+ ```
48
+
49
+ ### With Configuration File
50
+
51
+ In order to provide configurations like we do for a normal pipen pipeline, you can also provide a config file (the `[pipen-cli-gbatch]` section will be used):
52
+
53
+ ```bash
54
+ pipen gbatch @config.toml -- \
55
+ python myscript.py --input input.txt --output output.txt
56
+ ```
57
+
58
+ ### Detached Mode
59
+
60
+ We can also use the `--nowait` option to run the command in a detached mode:
61
+
62
+ ```bash
63
+ pipen gbatch --nowait -- \
64
+ python myscript.py --input input.txt --output output.txt
65
+ ```
66
+
67
+ Or by default, it will wait for the command to complete:
68
+
69
+ ```bash
70
+ pipen gbatch -- \
71
+ python myscript.py --input input.txt --output output.txt
72
+ ```
73
+
74
+ While waiting, the running logs will be pulled and shown in the terminal.
75
+
76
+ ### View Logs
77
+
78
+ When running in detached mode, one can also pull the logs later by:
79
+
80
+ ```bash
81
+ pipen gbatch --view-logs all -- \
82
+ python myscript.py --input input.txt --output output.txt
83
+
84
+ # or just provide the workdir
85
+ pipen gbatch --view-logs all --workdir gs://my-bucket/workdir
86
+ ```
87
+
88
+ ## Configuration
89
+
90
+ Because the daemon pipeline is running on Google Cloud Batch, a Google Storage Bucket path is required for the workdir. For example: `gs://my-bucket/workdir`
91
+
92
+ A unique job ID will be generated per the name (`--name`) and workdir, so that if the same command is run again with the same name and workdir, it will not start a new job, but just attach to the existing job and pull the logs.
93
+
94
+ If `--name` is not provided in the command line, it will try to grab the name (`--name`) from the command line arguments after `--`, or else use "name" from the root section of the configuration file, with a "GbatchDaemon" suffix. If nothing can be found, a default name "PipenCliGbatchDaemon" will be used.
95
+
96
+ Then a workdir `{workdir}/<daemon pipeline name>/` will be created to store the meta information.
97
+
98
+ With `--profile` provided, the scheduler options (`scheduler_opts`) defined in `~/.pipen.toml` and `./.pipen.toml` will be used as default.
99
+
100
+ ## All Options
101
+
102
+ ```bash
103
+ > pipen gbatch --help
104
+ Usage: pipen gbatch [-h] [--nowait | --view-logs {all,stdout,stderr}] [--workdir WORKDIR]
105
+ [--error-strategy {retry,halt}] [--num-retries NUM_RETRIES] [--prescript PRESCRIPT]
106
+ [--postscript POSTSCRIPT] [--jobname-prefix JOBNAME_PREFIX] [--recheck-interval RECHECK_INTERVAL]
107
+ [--cwd CWD] [--project PROJECT] [--location LOCATION] [--mount MOUNT]
108
+ [--service-account SERVICE_ACCOUNT] [--network NETWORK] [--subnetwork SUBNETWORK]
109
+ [--no-external-ip-address] [--machine-type MACHINE_TYPE] [--provisioning-model {STANDARD,SPOT}]
110
+ [--image-uri IMAGE_URI] [--entrypoint ENTRYPOINT] [--commands COMMANDS] [--runnables RUNNABLES]
111
+ [--allocationPolicy ALLOCATIONPOLICY] [--taskGroups TASKGROUPS] [--labels LABELS] [--gcloud GCLOUD]
112
+ [--name NAME] [--profile PROFILE] [--version]
113
+ [--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL,debug,info,warning,error,critical}]
114
+ ...
115
+
116
+ Simplify running commands via Google Cloud Batch.
117
+
118
+ Key Options:
119
+ The key options to run the command.
120
+
121
+ --workdir WORKDIR The workdir (a Google Storage Bucket path is required) to store the meta information of the
122
+ daemon pipeline.
123
+ If not provided, the one from the command will be used.
124
+ command The command passed after `--` to run, with all its arguments. Note that the command should be
125
+ provided after `--`.
126
+
127
+ Scheduler Options:
128
+ The options to configure the gbatch scheduler.
129
+
130
+ --error-strategy {retry,halt}
131
+ The strategy when there is error happened [default: halt]
132
+ --num-retries NUM_RETRIES
133
+ The number of retries when there is error happened. Only valid when --error-strategy is 'retry'.
134
+ [default: 0]
135
+ --prescript PRESCRIPT
136
+ The prescript to run before the main command.
137
+ --postscript POSTSCRIPT
138
+ The postscript to run after the main command.
139
+ --jobname-prefix JOBNAME_PREFIX
140
+ The prefix of the name prefix of the daemon job.
141
+ If not provided, try to generate one from the command to run.
142
+ If the command is also not provided, use 'pipen-gbatch-daemon' as the prefix.
143
+ --recheck-interval RECHECK_INTERVAL
144
+ The interval to recheck the job status, each takes about 0.1 seconds. [default: 600]
145
+ --cwd CWD The working directory to run the command. If not provided, the current directory is used. You
146
+ can pass either a mounted path (inside the VM) or a Google Storage Bucket path (gs://...). If a
147
+ Google Storage Bucket path is provided, the mounted path will be inferred from the mounted paths
148
+ of the VM.
149
+ --project PROJECT The Google Cloud project to run the job.
150
+ --location LOCATION The location to run the job.
151
+ --mount MOUNT The list of mounts to mount to the VM, each in the format of SOURCE:TARGET, where SOURCE must be
152
+ either a Google Storage Bucket path (gs://...). [default: []]
153
+ --service-account SERVICE_ACCOUNT
154
+ The service account to run the job.
155
+ --network NETWORK The network to run the job.
156
+ --subnetwork SUBNETWORK
157
+ The subnetwork to run the job.
158
+ --no-external-ip-address
159
+ Whether to disable external IP address for the VM.
160
+ --machine-type MACHINE_TYPE
161
+ The machine type of the VM.
162
+ --provisioning-model {STANDARD,SPOT}
163
+ The provisioning model of the VM.
164
+ --image-uri IMAGE_URI
165
+ The custom image URI of the VM.
166
+ --entrypoint ENTRYPOINT
167
+ The entry point of the container to run the command.
168
+ --commands COMMANDS The list of commands to run in the container, each as a separate string. [default: []]
169
+ --runnables RUNNABLES
170
+ The JSON string of extra settings of runnables add to the job.json.
171
+ Refer to https://cloud.google.com/batch/docs/reference/rest/v1/projects.locations.jobs#Runnable
172
+ for details.
173
+ You can have an extra key 'order' for each runnable, where negative values mean to run before
174
+ the main command,
175
+ and positive values mean to run after the main command.
176
+ --allocationPolicy ALLOCATIONPOLICY
177
+ The JSON string of extra settings of allocationPolicy add to the job.json. Refer to
178
+ https://cloud.google.com/batch/docs/reference/rest/v1/projects.locations.jobs#AllocationPolicy
179
+ for details. [default: {}]
180
+ --taskGroups TASKGROUPS
181
+ The JSON string of extra settings of taskGroups add to the job.json. Refer to
182
+ https://cloud.google.com/batch/docs/reference/rest/v1/projects.locations.jobs#TaskGroup for
183
+ details. [default: []]
184
+ --labels LABELS The JSON string of labels to add to the job. Refer to
185
+ https://cloud.google.com/batch/docs/reference/rest/v1/projects.locations.jobs#Job.FIELDS.labels
186
+ for details. [default: {}]
187
+ --gcloud GCLOUD The path to the gcloud command. [default: gcloud]
188
+
189
+ Options:
190
+ -h, --help show this help message and exit
191
+ --nowait Run the command in a detached mode without waiting for its completion. [default: False]
192
+ --view-logs {all,stdout,stderr}
193
+ View the logs of a job.
194
+ --name NAME The name of the daemon pipeline.
195
+ If not provided, try to generate one from the command to run.
196
+ If the command is also not provided, use 'PipenCliGbatchDaemon' as the name.
197
+ --profile PROFILE Use the `scheduler_opts` as the Scheduler Options of a given profile from pipen configuration
198
+ files,
199
+ including ~/.pipen.toml and ./.pipen.toml.
200
+ Note that if not provided, nothing will be loaded from the configuration files.
201
+ --version Show the version of the pipen-cli-gbatch package. [default: False]
202
+ --loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL,debug,info,warning,error,critical}
203
+ Set the logging level for the daemon process. [default: INFO]
204
+
205
+ Examples:
206
+
207
+ # Run a command and wait for it to complete
208
+ > pipen gbatch --workdir gs://my-bucket/workdir -- \
209
+ python myscript.py --input input.txt --output output.txt
210
+
211
+ # Run a command in a detached mode
212
+ > pipen gbatch --nowait --project $PROJECT --location $LOCATION \
213
+ --workdir gs://my-bucket/workdir -- \
214
+ python myscript.py --input input.txt --output output.txt
215
+
216
+ # If you have a profile defined in ~/.pipen.toml or ./.pipen.toml
217
+ > pipen gbatch --profile myprofile -- \
218
+ python myscript.py --input input.txt --output output.txt
219
+
220
+ # View the logs of a previously run command
221
+ > pipen gbatch --view-logs all --name my-daemon-name \
222
+ --workdir gs://my-bucket/workdir
223
+ ```
224
+
225
+ ## API
226
+
227
+ The API can also be used to run commands programmatically:
228
+
229
+ ```python
230
+ import asyncio
231
+ from pipen_cli_gbatch import CliGbatchDaemon
232
+
233
+ pipe = CliGbatchDaemon(config_for_daemon, command)
234
+ asyncio.run(pipe.run())
235
+ ```
236
+
237
+ Note that the daemon pipeline will always be running without caching, so that the command will always be executed when the pipeline is run.
238
+
@@ -0,0 +1,6 @@
1
+ pipen_cli_gbatch/__init__.py,sha256=NiVv_1lQLHggyMoh7_BsvR31NenvTtocD4l86hC-1Gg,17175
2
+ pipen_cli_gbatch/daemon_args.toml,sha256=83hHa9K19DGmI-RROjyBVC-nHBIyNLW7NuRbQXFAnak,6051
3
+ pipen_cli_gbatch-0.0.0.dist-info/METADATA,sha256=KL6iEXsC217t_4RUE8uU7jDXYAKE4OMarU9_dpCBx1U,10914
4
+ pipen_cli_gbatch-0.0.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
5
+ pipen_cli_gbatch-0.0.0.dist-info/entry_points.txt,sha256=Z9NLeCpRo-rb8wss5mB5TBcG-_RbdlPA49b8Ma5pvQA,57
6
+ pipen_cli_gbatch-0.0.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: poetry-core 2.1.3
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,3 @@
1
+ [pipen_cli]
2
+ cli-gbatch=pipen_cli_gbatch:CliGbatchPlugin
3
+