datastudio-cli 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) [year] [fullname]
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,46 @@
1
+ Metadata-Version: 2.1
2
+ Name: datastudio-cli
3
+ Version: 0.2.2
4
+ Summary: A command line client for running DataStudio Datakits
5
+ Author-email: Varvara Efremova <varvara@echus.co>, James Wilmot <james.wilmot@pm.me>
6
+ Requires-Python: >=3.11
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE
9
+ Requires-Dist: typer<1,>=0.12
10
+ Requires-Dist: docker<8,>=7.1.0
11
+ Requires-Dist: matplotlib<4,>=3.9.1
12
+ Requires-Dist: tornado
13
+ Requires-Dist: datastudio-lib>=0.2.2
14
+ Requires-Dist: tabulate
15
+ Provides-Extra: development
16
+ Requires-Dist: pre-commit; extra == "development"
17
+ Requires-Dist: build; extra == "development"
18
+ Provides-Extra: all
19
+ Requires-Dist: datastudio-cli[development]; extra == "all"
20
+
21
+ # DataStudio datakit CLI
22
+
23
+ A command line client for running DataStudio datakits.
24
+
25
+ Usage documentation can be found [here](https://docs.datastudioapp.com).
26
+
27
+
28
+ ## Development
29
+
30
+ To install and test locally, navigate to the datakit directory you want to
31
+ test.
32
+ ```
33
+ cd /path/to/datakit
34
+ ```
35
+
36
+ Create a virtualenv and install the CLI via pip in local mode:
37
+ ```
38
+ python -m venv .venv
39
+ source .venv/bin/activate
40
+ pip install -e [/path/to/cli]
41
+ ```
42
+
43
+ You can now run the CLI script with:
44
+ ```
45
+ ds
46
+ ```
@@ -0,0 +1,26 @@
1
+ # DataStudio datakit CLI
2
+
3
+ A command line client for running DataStudio datakits.
4
+
5
+ Usage documentation can be found [here](https://docs.datastudioapp.com).
6
+
7
+
8
+ ## Development
9
+
10
+ To install and test locally, navigate to the datakit directory you want to
11
+ test.
12
+ ```
13
+ cd /path/to/datakit
14
+ ```
15
+
16
+ Create a virtualenv and install the CLI via pip in local mode:
17
+ ```
18
+ python -m venv .venv
19
+ source .venv/bin/activate
20
+ pip install -e [/path/to/cli]
21
+ ```
22
+
23
+ You can now run the CLI script with:
24
+ ```
25
+ ds
26
+ ```
@@ -0,0 +1,890 @@
1
+ import os
2
+ import time
3
+ import shutil
4
+ import json
5
+ import re
6
+ import pickle
7
+ import typer
8
+ import docker
9
+ import matplotlib
10
+ import matplotlib.pyplot as plt
11
+ import pandas as pd
12
+ from ast import literal_eval
13
+ from typing import Optional, Any
14
+ from typing_extensions import Annotated
15
+ from rich import print
16
+ from rich.panel import Panel
17
+ from tabulate import tabulate
18
+ from datakitpy.datakit import (
19
+ ExecutionError,
20
+ ResourceError,
21
+ execute_datakit,
22
+ execute_view,
23
+ init_resource,
24
+ load_resource_by_variable,
25
+ write_resource,
26
+ update_resource,
27
+ load_run_configuration,
28
+ write_run_configuration,
29
+ load_variable,
30
+ load_variable_signature,
31
+ load_datakit_configuration,
32
+ write_datakit_configuration,
33
+ load_algorithm,
34
+ write_algorithm,
35
+ get_algorithm_name,
36
+ RUN_DIR,
37
+ RELATIONSHIPS_FILE,
38
+ VIEW_ARTEFACTS_DIR,
39
+ )
40
+ from datakitpy.helpers import find_by_name, find
41
+
42
+
43
# Typer application object; `no_args_is_help` makes bare "ds" print usage
app = typer.Typer(no_args_is_help=True)


# Docker client used to run algorithm/view containers.
# NOTE(review): constructed at import time — importing this module fails
# if no local Docker daemon is reachable; confirm that is acceptable
client = docker.from_env()


# Assume we are always at the datakit root
# TODO: Validate we actually are, and that this is a datakit
DATAKIT_PATH = os.getcwd()  # Root datakit path
CONFIG_FILE = f"{DATAKIT_PATH}/.datakit"  # CLI state file (active run name)
RUN_EXTENSION = ".run"  # Suffix appended to run names/directories
54
+
55
+
56
+ # Helpers
57
+
58
+
59
def dumb_str_to_type(value) -> Any:
    """Parse a string into the closest matching Python type.

    Tries ``ast.literal_eval`` first (numbers, quoted strings, lists,
    dicts, ...), then falls back to case-insensitive boolean words
    ("true"/"false"), and finally returns the raw string unchanged.

    Stupid workaround for Typer not supporting Union types :<
    """
    try:
        return literal_eval(value)
    except (ValueError, SyntaxError):
        # BUG FIX: literal_eval raises SyntaxError (not ValueError) for
        # many non-literal strings, e.g. "hello world" or "1 +"; catch
        # both so arbitrary strings fall through instead of crashing.
        if value.lower() == "true":
            return True
        elif value.lower() == "false":
            return False
        else:
            return value
71
+
72
+
73
def get_default_algorithm() -> str:
    """Return the first algorithm listed in the datakit configuration."""
    datakit = load_datakit_configuration(base_path=DATAKIT_PATH)
    return datakit["algorithms"][0]
76
+
77
+
78
def load_config():
    """Read and return the CLI configuration from the .datakit file."""
    with open(CONFIG_FILE, "r") as config_file:
        return json.load(config_file)
82
+
83
+
84
def get_active_run():
    """Return the name of the currently active run, exiting if none is set."""
    try:
        config = load_config()
    except FileNotFoundError:
        # No CLI config on disk means no run has been initialised yet
        print('[red]No active run is set. Have you run "ds init"?[/red]')
        exit(1)
    return config["run"]
90
+
91
+
92
def write_config(run_name):
    """Persist the active run name to the CLI configuration file."""
    config = {"run": run_name}
    with open(CONFIG_FILE, "w") as config_file:
        json.dump(config, config_file, indent=2)
96
+
97
+
98
def run_exists(run_name):
    """Return True if a directory for the named run already exists."""
    run_dir = RUN_DIR.format(base_path=DATAKIT_PATH, run_name=run_name)
    # isdir() is False for non-existent paths, so a separate exists()
    # check is unnecessary
    return os.path.isdir(run_dir)
102
+
103
+
104
def get_full_run_name(run_name):
    """Validate a user-supplied run name and return it with the run extension.

    Accepts either "[algorithm]" or "[algorithm].[name]"; falls back to
    the datakit's default algorithm when `run_name` is None.  Exits the
    process with an error message when the name is malformed or refers to
    an algorithm not declared in datakit.json.
    """
    if run_name is None:
        return get_default_algorithm() + RUN_EXTENSION

    # Check the run_name matches the pattern [algorithm].[name] or
    # [algorithm]
    pattern = re.compile(r"^([a-zA-Z0-9_]+)\.([a-zA-Z0-9_]+)$")

    # BUG FIX: the original called load_datakit_configuration() with no
    # base_path here (inconsistent with every other call in this module)
    # and then loaded the configuration a second time below
    datakit_algorithms = load_datakit_configuration(
        base_path=DATAKIT_PATH
    )["algorithms"]

    if not pattern.match(run_name) and run_name not in datakit_algorithms:
        print(f'[red]"{run_name}" is not a valid run name[/red]')
        print(
            "[red]Run names must match the format: "
            r"\[algorithm].\[name][/red]"
        )
        print(
            "[red]Did you forget to add your algorithm to "
            "datakit.json?[/red]"
        )
        exit(1)

    algorithm_name = get_algorithm_name(run_name)

    if algorithm_name not in datakit_algorithms:
        print(
            f'[red]"{algorithm_name}" is not a valid datakit '
            "algorithm[/red]"
        )
        print(
            "[red]Available datakit algorithms: "
            f"{datakit_algorithms}[/red]"
        )
        exit(1)

    return run_name + RUN_EXTENSION
144
+
145
+
146
def execute_relationship(run_name: str, variable_name: str) -> None:
    """Execute any relationships applied to the given source variable.

    Looks up the algorithm's relationships file for a relationship whose
    "source" is `variable_name`, applies each of its rules to the run
    configuration and associated resources, then writes the modified run
    configuration back to disk.  Returns silently when there is no
    relationships file or no relationship for this variable.

    Raises NotImplementedError for rule or target types other than the
    currently supported "change"/"value" rules and "resource"/"value"
    targets.
    """
    # Load run configuration for modification
    # NOTE(review): no base_path is passed here, unlike every other
    # load_run_configuration call in this module — confirm the library's
    # default base path matches DATAKIT_PATH
    run = load_run_configuration(run_name)

    print(
        f"[bold]=>[/bold] Executing relationship for variable {variable_name}"
    )

    # Load associated relationship
    try:
        with open(
            RELATIONSHIPS_FILE.format(
                base_path=DATAKIT_PATH,
                algorithm_name=get_algorithm_name(run_name),
            ),
            "r",
        ) as f:
            relationship = find(
                json.load(f)["relationships"], "source", variable_name
            )
    except FileNotFoundError:
        # No relationships to execute, return
        return

    if relationship is None:
        # No relationship for specified variable found, return
        return

    # Apply relationship rules
    for rule in relationship["rules"]:
        if rule["type"] == "change":
            # Currently the only type of "change" rule we have is one that
            # mirrors the schema from the source to other resources, so assume
            # this is the case here

            # TODO: This will need to change in the future

            source = load_resource_by_variable(
                run_name=run_name,
                variable_name=variable_name,
                base_path=DATAKIT_PATH,
                as_dict=True,
            )

            # Copy the source resource's schema onto every target resource
            for target in rule["targets"]:
                update_resource(
                    run_name=run_name,
                    resource_name=target["name"],
                    schema=source["schema"],
                    base_path=DATAKIT_PATH,
                )

        elif rule["type"] == "value":
            # Check if this rule applies to current run configuration state

            # Get source variable value
            value = load_variable(
                run_name=run_name,
                variable_name=variable_name,
                base_path=DATAKIT_PATH,
            )["value"]

            # If the source variable value matches the rule value, execute
            # the relationship
            if value in rule["values"]:
                for target in rule["targets"]:
                    if "disabled" in target:
                        # Set target variable disabled value
                        # NOTE(review): this target_variable dict is
                        # mutated but never explicitly written back unless
                        # the target["type"] == "value" branch below also
                        # reassigns it — confirm the "disabled" flag is
                        # actually persisted for resource-type targets
                        target_variable = load_variable(
                            run_name=run_name,
                            variable_name=target["name"],
                            base_path=DATAKIT_PATH,
                        )

                        target_variable["disabled"] = target["disabled"]

                    if target["type"] == "resource":
                        # Set target resource data and schema
                        # NOTE(review): this call uses run["name"] while
                        # sibling calls use run_name — same value, but
                        # worth unifying
                        target_resource = load_resource_by_variable(
                            run_name=run["name"],
                            variable_name=target["name"],
                            base_path=DATAKIT_PATH,
                            as_dict=True,
                        )

                        if "data" in target:
                            print(
                                "    [bold]*[/bold] Setting "
                                f"{target_resource['name']} data"
                            )
                            target_resource["data"] = target["data"]

                        if "schema" in target:
                            print(
                                "    [bold]*[/bold] Setting "
                                f"{target_resource['name']} schema"
                            )
                            target_resource["schema"] = target["schema"]

                        write_resource(
                            run_name=run["name"],
                            resource=target_resource,
                            base_path=DATAKIT_PATH,
                        )
                    elif target["type"] == "value":
                        # Set target variable value (mutates the run dict
                        # in place; persisted by write_run_configuration
                        # at the end of this function)
                        target_variable = find_by_name(
                            run["data"]["inputs"] + run["data"]["outputs"],
                            target["name"],
                        )

                        if "value" in target:
                            print(
                                f"    [bold]*[/bold] Setting {target['name']} "
                                f"value from {target_variable['value']} to "
                                f"{target['value']}"
                            )
                            target_variable["value"] = target["value"]

                        if "metaschema" in target:
                            print(
                                f"    [bold]*[/bold] Setting {target['name']} "
                                "metaschema from "
                                f"{target_variable['metaschema']} "
                                f"to {target['metaschema']}"
                            )
                            target_variable["metaschema"] = target[
                                "metaschema"
                            ]
                    else:
                        raise NotImplementedError(
                            (
                                'Only "resource" and "value" type rule '
                                "targets are implemented"
                            )
                        )

        else:
            raise NotImplementedError("Only value-based rules are implemented")

    # Write modified run configuration
    write_run_configuration(run, base_path=DATAKIT_PATH)
289
+
290
+
291
+ # Commands
292
+
293
+
294
def _init_run_variables(run, variables, direction):
    """Append each variable's defaults to run["data"][direction] and
    initialise any associated resources on disk.

    `direction` is "inputs" or "outputs"; it selects the run data key and
    (singularised) labels the progress messages.
    """
    for variable in variables:
        # Add variable defaults to run configuration
        run["data"][direction].append(
            {
                "name": variable["name"],
                **variable["default"],
            }
        )

        # Initialise associated resources
        if variable["type"] == "resource":
            resource_name = variable["default"]["resource"]

            init_resource(
                run_name=run["name"],
                resource_name=resource_name,
                base_path=DATAKIT_PATH,
            )

            # "inputs" -> "input", "outputs" -> "output"
            print(
                f"[bold]=>[/bold] Generated {direction[:-1]} resource: "
                f"{resource_name}"
            )


@app.command()
def init(
    run_name: Annotated[
        Optional[str],
        typer.Argument(
            help=(
                "Name of the run you want to initialise in the format "
                "[algorithm].[run name]"
            )
        ),
    ] = None,
) -> None:
    """Initialise a datakit run"""
    run_name = get_full_run_name(run_name)

    # Check directory doesn't already exist
    if run_exists(run_name):
        print(f"[red]{run_name} already exists[/red]")
        exit(1)

    # Create run directory
    run_dir = RUN_DIR.format(base_path=DATAKIT_PATH, run_name=run_name)
    os.makedirs(f"{run_dir}/resources")
    os.makedirs(f"{run_dir}/views")
    print(f"[bold]=>[/bold] Created run directory: {run_dir}")

    algorithm_name = get_algorithm_name(run_name)
    algorithm = load_algorithm(algorithm_name, base_path=DATAKIT_PATH)

    # Generate default run configuration
    run = {
        "name": run_name,
        "title": f"Run configuration for {algorithm_name}",
        "profile": "datakit-run",
        "algorithm": f"{algorithm_name}",
        "container": f'{algorithm["container"]}',
        "data": {
            "inputs": [],
            "outputs": [],
        },
    }

    # Create run configuration and initialise resources.
    # BUG FIX: the original duplicated this loop for inputs and outputs,
    # and the outputs copy printed "Generated input resource"
    _init_run_variables(run, algorithm["signature"]["inputs"], "inputs")
    _init_run_variables(run, algorithm["signature"]["outputs"], "outputs")

    # Write generated configuration
    write_run_configuration(run, base_path=DATAKIT_PATH)

    print(f"[bold]=>[/bold] Generated default run configuration: {run_name}")

    # Add default run to datakit.json
    datakit = load_datakit_configuration(base_path=DATAKIT_PATH)
    datakit["runs"].append(run_name)
    write_datakit_configuration(datakit, base_path=DATAKIT_PATH)

    # Write current run name to config (makes this the active run)
    write_config(run_name)
391
+
392
+
393
@app.command()
def set_run(
    run_name: Annotated[
        Optional[str],
        typer.Argument(help="Name of the run you want to enable"),
    ] = None,
) -> None:
    """Set the active run"""
    full_name = get_full_run_name(run_name)

    # Guard clause: refuse to activate a run that was never initialised
    if not run_exists(full_name):
        print(f"[red]{full_name} does not exist[/red]")
        return

    # Set to active run
    write_config(full_name)
408
+
409
+
410
@app.command()
def get_run() -> None:
    """Get the active run"""
    active_run = get_active_run()
    print(f"[bold]{active_run}[/bold]")
414
+
415
+
416
@app.command()
def run() -> None:
    """Execute the active run"""
    run_name = get_active_run()

    print(f"[bold]=>[/bold] Executing [bold]{run_name}[/bold]")

    # Run the algorithm container; on failure surface its logs in a panel
    # and abort with a non-zero exit code
    try:
        logs = execute_datakit(client, run_name, base_path=DATAKIT_PATH)
    except ExecutionError as error:
        print(
            Panel(
                error.logs,
                title="[bold red]Execution error[/bold red]",
            )
        )
        print("[red]Container execution failed[/red]")
        exit(1)

    # Show whatever the container printed, if anything
    if logs:
        print(
            Panel(
                logs,
                title="[bold]Execution container output[/bold]",
            )
        )

    print(f"[bold]=>[/bold] Executed [bold]{run_name}[/bold] successfully")
449
+
450
+
451
@app.command()
def show(
    variable_name: Annotated[
        str,
        typer.Argument(
            help="Name of variable to print",
            show_default=False,
        ),
    ],
) -> None:
    """Print a variable value"""
    run_name = get_active_run()

    # Consult the algorithm signature to decide how to render the variable
    signature = load_variable_signature(
        run_name=run_name,
        variable_name=variable_name,
        base_path=DATAKIT_PATH,
    )

    if signature["type"] != "resource":
        # Simple string/number/bool value: render inside a small panel
        variable = load_variable(
            run_name=run_name,
            variable_name=variable_name,
            base_path=DATAKIT_PATH,
        )
        print(
            Panel(
                str(variable["value"]),
                title=f"{variable_name}",
                expand=False,
            )
        )
        return

    # Tabular data resource: render the data as a grid
    resource = load_resource_by_variable(
        run_name=run_name,
        variable_name=variable_name,
        base_path=DATAKIT_PATH,
    )
    print(
        tabulate(
            resource.to_dict()["data"],
            headers="keys",
            tablefmt="rounded_grid",
        )
    )
501
+
502
+
503
@app.command()
def view(
    view_name: Annotated[
        str,
        typer.Argument(
            help="The name of the view to render", show_default=False
        ),
    ],
) -> None:
    """Render a view locally

    Executes the view's container to produce a pickled matplotlib figure,
    then serves it interactively in the browser via the WebAgg backend.
    """
    run_name = get_active_run()

    print(f"[bold]=>[/bold] Generating [bold]{view_name}[/bold] view")

    # Run the view container; it writes the figure artefact to disk
    try:
        logs = execute_view(
            docker_client=client,
            run_name=run_name,
            view_name=view_name,
            base_path=DATAKIT_PATH,
        )
    except ResourceError as e:
        # A required resource is missing or invalid; report and abort
        print("[red]" + e.message + "[/red]")
        exit(1)
    except ExecutionError as e:
        # The container ran but failed; show its logs for debugging
        print(
            Panel(
                e.logs,
                title="[bold red]View execution error[/bold red]",
            )
        )
        print("[red]View execution failed[/red]")
        exit(1)

    if logs:
        print(
            Panel(
                logs,
                title="[bold]View container output[/bold]",
            )
        )

    print(
        f"[bold]=>[/bold] Successfully generated [bold]{view_name}[/bold] view"
    )

    print(
        "[blue][bold]=>[/bold] Loading interactive view in web browser[/blue]"
    )

    # WebAgg serves the figure over a local web server in the browser
    matplotlib.use("WebAgg")

    with open(
        VIEW_ARTEFACTS_DIR.format(base_path=DATAKIT_PATH, run_name=run_name)
        + f"/{view_name}.p",
        "rb",
    ) as f:
        # NOTE: The matplotlib version in CLI must be >= the version of
        # matplotlib used to generate the plot (which is chosen by the user)
        # So the CLI should be kept up to date at all times

        # Load matplotlib figure
        # NOTE(review): the return value is unused — presumably unpickling
        # registers the figure with pyplot as a side effect so plt.show()
        # below can find it; confirm. Also note pickle.load can execute
        # arbitrary code, so only render artefacts from trusted datakits.
        pickle.load(f)

    plt.show()
568
+
569
+
570
@app.command()
def load(
    variable_name: Annotated[
        str,
        typer.Argument(
            help="Name of variable to populate",
            show_default=False,
        ),
    ],
    path: Annotated[
        str,
        typer.Argument(
            # BUG FIX: the help text previously claimed "(xml, csv)" but
            # the implementation only ingests CSV (pd.read_csv below)
            help="Path to data to ingest (csv)", show_default=False
        ),
    ],
) -> None:
    """Load data into configuration variable"""
    run_name = get_active_run()

    # Load resource into TabularDataResource object
    resource = load_resource_by_variable(
        run_name=run_name,
        variable_name=variable_name,
        base_path=DATAKIT_PATH,
    )

    # Read CSV into resource
    print(f"[bold]=>[/bold] Reading {path}")
    resource.data = pd.read_csv(path)

    # Write updated resource back to the run directory
    write_resource(
        run_name=run_name, resource=resource, base_path=DATAKIT_PATH
    )

    # Execute any applicable relationships triggered by this variable
    execute_relationship(
        run_name=run_name,
        variable_name=variable_name,
    )

    print("[bold]=>[/bold] Resource successfully loaded!")
612
+
613
+
614
@app.command()
def set(
    variable_ref: Annotated[
        str,
        typer.Argument(
            help=(
                "Either a variable name, or a table reference in the format "
                "[resource name].[primary key].[column name]"
            ),
            show_default=False,
        ),
    ],
    variable_value: Annotated[
        str,  # Workaround for union types not being supported by Typer yet
        # Union[str, int, float, bool],
        typer.Argument(
            help="Value to set",
            show_default=False,
        ),
    ],
) -> None:
    """Set a variable value

    Two modes, selected by the shape of `variable_ref`:
    a dotted "[resource].[row].[column]" reference sets a single cell in a
    tabular parameter resource; a plain name sets a simple typed variable
    in the run configuration (with type/enum/null validation).
    """
    run_name = get_active_run()

    # Parse value (workaround for Typer not supporting Union types :<)
    variable_value = dumb_str_to_type(variable_value)

    if "." in variable_ref:
        # Variable reference is a table reference

        # Check the variable_ref matches the pattern:
        # [resource].[primary key].[column]
        pattern = re.compile(
            r"^([a-zA-Z0-9_]+)\.([a-zA-Z0-9_]+)\.([a-zA-Z0-9_]+)$"
        )

        if not pattern.match(variable_ref):
            print(
                "[red]Variable name argument must be either a variable name "
                "or a table reference in the format "
                r"\[resource name].\[primary key].\[column name][/red]"
            )
            exit(1)

        # Parse variable and row/col names
        variable_name, row_name, col_name = variable_ref.split(".")

        # Load param resource
        resource = load_resource_by_variable(
            run_name=run_name,
            variable_name=variable_name,
            base_path=DATAKIT_PATH,
        )

        # Check it's a tabular data resource
        if resource.profile != "tabular-data-resource":
            print(
                f"[red]Resource [bold]{resource.name}[/bold] is not of type "
                '"tabular-data-resource"[/red]'
            )
            exit(1)

        # If data is not populated, something has gone wrong
        # NOTE(review): relies on the resource object's truthiness
        # reflecting whether its data is populated — confirm against
        # TabularDataResource.__bool__
        if not resource:
            print(
                f'[red]Parameter resource [bold]{resource.name}[/bold] "data" '
                'field is empty. Try running "ds reset"?[/red]'
            )
            exit(1)

        print(
            f"[bold]=>[/bold] Setting table value at row [bold]{row_name}"
            f"[/bold] and column [bold]{col_name}[/bold] to "
            f"[bold]{variable_value}[/bold]"
        )

        # Set table value
        try:
            # This will generate a key error if row_name doesn't exist
            # The assignment doesn't unfortunately
            resource.data.loc[row_name]  # Ensure row exists
            resource.data.loc[row_name, col_name] = variable_value
        except KeyError:
            print(
                f'[red]Could not find row "{row_name}" or column "{col_name}" '
                f"in resource [bold]{resource.name}[/bold][/red]"
            )
            exit(1)

        # Write resource
        write_resource(
            run_name=run_name, resource=resource, base_path=DATAKIT_PATH
        )

        print(
            f"[bold]=>[/bold] Successfully set table value at row "
            f"[bold]{row_name}[/bold] and column [bold]{col_name}[/bold] to "
            f"[bold]{variable_value}[/bold] in resource [bold]{resource.name}"
            "[/bold]"
        )
    else:
        # Variable reference is a simple variable name
        variable_name = variable_ref

        # Load variable signature
        signature = load_variable_signature(
            run_name, variable_name, base_path=DATAKIT_PATH
        )

        # Convenience dict mapping datakit types to Python types
        type_map = {
            "string": [str],
            "boolean": [bool],
            "number": [float, int],
        }

        # Check the value is of the expected type for this variable
        # Raise some helpful errors
        if signature.get("profile") == "tabular-data-resource":
            print('[red]Use command "load" for tabular data resource[/red]')
            exit(1)
        elif "parameter-tabular-data-resource" in signature.get("profile", ""):
            print('[red]Use command "set-param" for parameter resource[/red]')
            exit(1)
        # Specify False as fallback value here to avoid "None"s leaking through
        elif not (type(variable_value) in type_map.get(signature["type"], [])):
            print(
                f"[red]Variable value must be of type {signature['type']}"
                "[/red]"
            )
            exit(1)

        # If this variable has an enum, check the value is allowed
        if signature.get("enum", False):
            allowed_values = [i["value"] for i in signature["enum"]]
            if variable_value not in allowed_values:
                print(
                    f"[red]Variable value must be one of {allowed_values}"
                    "[/red]"
                )
                exit(1)

        # Check if nullable
        # NOTE(review): this truthiness check also rejects legitimate
        # falsy values (0, 0.0, False, "") for non-nullable variables —
        # confirm this is intended rather than an explicit `is None` test
        if not signature["null"]:
            if not variable_value:
                print("[red]Variable value cannot be null[/red]")
                exit(1)

        # Load run configuration
        run = load_run_configuration(run_name, base_path=DATAKIT_PATH)

        # Set variable value (mutates the run dict in place)
        find_by_name(
            run["data"]["inputs"] + run["data"]["outputs"], variable_name
        )["value"] = variable_value

        # Write configuration
        write_run_configuration(run, base_path=DATAKIT_PATH)

        # Execute any relationships applied to this variable value
        execute_relationship(
            run_name=run_name,
            variable_name=variable_name,
        )

        print(
            f"[bold]=>[/bold] Successfully set [bold]{variable_name}[/bold] "
            "variable"
        )

    # Echo the variable's new state back to the user
    show(variable_name)
785
+
786
+
787
@app.command()
def reset():
    """Reset datakit to clean state

    Removes all run outputs and resets configurations to default
    """
    # Delete every run directory (*.run) under the datakit root
    for entry in os.scandir(DATAKIT_PATH):
        if entry.is_dir() and entry.path.endswith(".run"):
            print(f"[bold]=>[/bold] Deleting [bold]{entry.name}[/bold]")
            shutil.rmtree(entry.path)

    # Clear the run list in datakit.json
    datakit = load_datakit_configuration(base_path=DATAKIT_PATH)
    datakit["runs"] = []
    write_datakit_configuration(datakit, base_path=DATAKIT_PATH)

    # Drop the CLI configuration file (active run pointer) if present
    if os.path.exists(CONFIG_FILE):
        os.remove(CONFIG_FILE)
807
+
808
+
809
@app.command()
def new(
    algorithm_name: Annotated[
        str,
        typer.Argument(
            help="Name of the algorithm to generate",
            show_default=False,
        ),
    ],
) -> None:
    """Generate a new datakit and algorithm scaffold

    Creates "<algorithm_name>-datakit/" under the current directory with a
    default datakit configuration, one algorithm definition, and an
    example algorithm.py implementation.
    """
    # Create new datakit directory
    datakit_name = f"{algorithm_name}-datakit"
    datakit_dir = f"{DATAKIT_PATH}/{datakit_name}"
    algorithm_dir = f"{datakit_dir}/{algorithm_name}"

    if not os.path.exists(datakit_dir):
        os.makedirs(datakit_dir)
        os.makedirs(algorithm_dir)
    else:
        print(f'[red]Directory named "{datakit_name}" already exists[/red]')
        exit(1)

    # Single timestamp used for both "created" and "updated" metadata
    current_time = int(time.time())

    # Minimal datakit configuration referencing the one new algorithm
    datakit = {
        "title": "New datakit",
        "description": "A new datakit",
        "profile": "datakit",
        "algorithms": [algorithm_name],
        "runs": [],
        "repository": {},
        "created": current_time,
        "updated": current_time,
    }

    # Default algorithm: one non-nullable numeric input "x" (default 42)
    # and one nullable numeric output "result"
    algorithm = {
        "name": algorithm_name,
        "title": "New algorithm",
        "profile": "datakit-algorithm",
        "code": "algorithm.py",
        "container": "datakits/python-run-base:latest",
        "signature": {
            "inputs": [
                {
                    "name": "x",
                    "title": "X",
                    "description": "An input variable",
                    "type": "number",
                    "null": False,
                    "default": {"value": 42},
                },
            ],
            "outputs": [
                {
                    "name": "result",
                    "title": "Result",
                    "description": "An output variable",
                    "type": "number",
                    "null": True,
                    "default": {"value": None},
                },
            ],
        },
    }

    # Example implementation written verbatim to algorithm.py
    algorithm_code = '''def main(x):
    """An algorithm that multiplies the input variable by 2"""
    return {
        "result": x*2,
    }'''

    write_datakit_configuration(datakit, base_path=datakit_dir)
    write_algorithm(algorithm, base_path=datakit_dir)
    # Exclusive-create mode ("x"): fail loudly rather than overwrite an
    # existing algorithm.py
    with open(f"{datakit_dir}/{algorithm_name}/algorithm.py", "x") as f:
        f.write(algorithm_code)

    print(f"[bold]=>[/bold] Successfully created [bold]{datakit_name}[/bold]")
887
+
888
+
889
+ if __name__ == "__main__":
890
+ app()
@@ -0,0 +1,46 @@
1
+ Metadata-Version: 2.1
2
+ Name: datastudio-cli
3
+ Version: 0.2.2
4
+ Summary: A command line client for running DataStudio Datakits
5
+ Author-email: Varvara Efremova <varvara@echus.co>, James Wilmot <james.wilmot@pm.me>
6
+ Requires-Python: >=3.11
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE
9
+ Requires-Dist: typer<1,>=0.12
10
+ Requires-Dist: docker<8,>=7.1.0
11
+ Requires-Dist: matplotlib<4,>=3.9.1
12
+ Requires-Dist: tornado
13
+ Requires-Dist: datastudio-lib>=0.2.2
14
+ Requires-Dist: tabulate
15
+ Provides-Extra: development
16
+ Requires-Dist: pre-commit; extra == "development"
17
+ Requires-Dist: build; extra == "development"
18
+ Provides-Extra: all
19
+ Requires-Dist: datastudio-cli[development]; extra == "all"
20
+
21
+ # DataStudio datakit CLI
22
+
23
+ A command line client for running DataStudio datakits.
24
+
25
+ Usage documentation can be found [here](https://docs.datastudioapp.com).
26
+
27
+
28
+ ## Development
29
+
30
+ To install and test locally, navigate to the datakit directory you want to
31
+ test.
32
+ ```
33
+ cd /path/to/datakit
34
+ ```
35
+
36
+ Create a virtualenv and install the CLI via pip in local mode:
37
+ ```
38
+ python -m venv .venv
39
+ source .venv/bin/activate
40
+ pip install -e /path/to/cli
41
+ ```
42
+
43
+ You can now run the CLI script with:
44
+ ```
45
+ ds
46
+ ```
@@ -0,0 +1,10 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ cli/main.py
5
+ datastudio_cli.egg-info/PKG-INFO
6
+ datastudio_cli.egg-info/SOURCES.txt
7
+ datastudio_cli.egg-info/dependency_links.txt
8
+ datastudio_cli.egg-info/entry_points.txt
9
+ datastudio_cli.egg-info/requires.txt
10
+ datastudio_cli.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ ds = datastudio_cli.main:app
@@ -0,0 +1,13 @@
1
+ typer<1,>=0.12
2
+ docker<8,>=7.1.0
3
+ matplotlib<4,>=3.9.1
4
+ tornado
5
+ datastudio-lib>=0.2.2
6
+ tabulate
7
+
8
+ [all]
9
+ datastudio-cli[development]
10
+
11
+ [development]
12
+ pre-commit
13
+ build
@@ -0,0 +1,41 @@
1
+ [project]
2
+ name = "datastudio-cli"
3
+ version = "0.2.2"
4
+ description = "A command line client for running DataStudio Datakits"
5
+ requires-python = ">=3.11"
6
+ authors = [
7
+ { name = "Varvara Efremova", email = "varvara@echus.co" },
8
+ { name = "James Wilmot", email = "james.wilmot@pm.me" },
9
+ ]
10
+ readme = "README.md"
11
+ dependencies = [
12
+ "typer >= 0.12, < 1",
13
+ "docker >= 7.1.0, < 8",
14
+ "matplotlib >= 3.9.1, < 4",
15
+ "tornado", # Required for rendering interactive plots
16
+ "datastudio-lib >= 0.2.2",
17
+ "tabulate",
18
+ ]
19
+
20
+ [project.scripts]
21
+ ds = "datastudio_cli.main:app"
22
+
23
+ [project.optional-dependencies]
24
+ development = [
25
+ "pre-commit",
26
+ "build",
27
+ ]
28
+ all = ["datastudio-cli[development]"]
29
+
30
+ [build-system]
31
+ requires = ["setuptools>=61.0"]
32
+ build-backend = "setuptools.build_meta"
33
+
34
+ [tool.black]
35
+ line-length = 79
36
+ include = '\.pyi?$'
37
+ force-exclude = '''
38
+ /(
39
+ \.git
40
+ )/
41
+ '''
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+