datachain 0.8.2-py3-none-any.whl → 0.8.4-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of datachain might be problematic.

Files changed (44)
  1. datachain/cache.py +4 -2
  2. datachain/catalog/catalog.py +100 -54
  3. datachain/catalog/datasource.py +4 -6
  4. datachain/cli/__init__.py +311 -0
  5. datachain/cli/commands/__init__.py +29 -0
  6. datachain/cli/commands/datasets.py +129 -0
  7. datachain/cli/commands/du.py +14 -0
  8. datachain/cli/commands/index.py +12 -0
  9. datachain/cli/commands/ls.py +169 -0
  10. datachain/cli/commands/misc.py +28 -0
  11. datachain/cli/commands/query.py +53 -0
  12. datachain/cli/commands/show.py +38 -0
  13. datachain/cli/parser/__init__.py +547 -0
  14. datachain/cli/parser/job.py +120 -0
  15. datachain/cli/parser/studio.py +126 -0
  16. datachain/cli/parser/utils.py +63 -0
  17. datachain/{cli_utils.py → cli/utils.py} +27 -1
  18. datachain/client/azure.py +21 -1
  19. datachain/client/fsspec.py +45 -13
  20. datachain/client/gcs.py +10 -2
  21. datachain/client/local.py +4 -4
  22. datachain/client/s3.py +10 -0
  23. datachain/dataset.py +1 -0
  24. datachain/func/__init__.py +2 -2
  25. datachain/func/conditional.py +52 -0
  26. datachain/func/func.py +5 -1
  27. datachain/lib/arrow.py +4 -0
  28. datachain/lib/dc.py +18 -3
  29. datachain/lib/file.py +1 -1
  30. datachain/lib/listing.py +36 -3
  31. datachain/lib/signal_schema.py +89 -27
  32. datachain/listing.py +1 -5
  33. datachain/node.py +27 -1
  34. datachain/progress.py +2 -2
  35. datachain/query/session.py +1 -1
  36. datachain/studio.py +58 -38
  37. datachain/utils.py +1 -1
  38. {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/METADATA +6 -6
  39. {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/RECORD +43 -31
  40. {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/WHEEL +1 -1
  41. datachain/cli.py +0 -1475
  42. {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/LICENSE +0 -0
  43. {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/entry_points.txt +0 -0
  44. {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/top_level.txt +0 -0
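
The bulk of this release is a CLI refactor: the monolithic datachain/cli.py (its deletion is shown below) is split into a datachain/cli package with dedicated commands/ and parser/ submodules, and datachain/cli_utils.py moves to datachain/cli/utils.py. Since entry_points.txt is unchanged, the console script presumably still resolves to the same main() entry point, now exposed from the new package. The snippet below is a minimal sketch of programmatic CLI invocation under that assumption; it is not verified against 0.8.4.

    # Sketch only: assumes `main()` remains importable from `datachain.cli`
    # after the refactor (suggested, but not proven, by the unchanged
    # entry_points.txt in this diff).
    from datachain.cli import main

    # Equivalent to running `datachain datasets ls --local` in a shell;
    # main() parses the argument list and returns an exit code (0 = success).
    exit_code = main(["datasets", "ls", "--local"])
    raise SystemExit(exit_code)
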
datachain/cli.py DELETED
@@ -1,1475 +0,0 @@
1
- import logging
2
- import os
3
- import shlex
4
- import sys
5
- import traceback
6
- from argparse import Action, ArgumentParser, ArgumentTypeError, Namespace
7
- from collections.abc import Iterable, Iterator, Sequence
8
- from importlib.metadata import PackageNotFoundError, version
9
- from itertools import chain
10
- from multiprocessing import freeze_support
11
- from typing import TYPE_CHECKING, Optional, Union
12
-
13
- import shtab
14
- from tabulate import tabulate
15
-
16
- from datachain import Session, utils
17
- from datachain.cli_utils import BooleanOptionalAction, CommaSeparatedArgs, KeyValueArgs
18
- from datachain.config import Config
19
- from datachain.error import DataChainError, DatasetNotFoundError
20
- from datachain.lib.dc import DataChain
21
- from datachain.studio import (
22
- edit_studio_dataset,
23
- list_datasets,
24
- process_studio_cli_args,
25
- remove_studio_dataset,
26
- )
27
- from datachain.telemetry import telemetry
28
-
29
- if TYPE_CHECKING:
30
- from datachain.catalog import Catalog
31
-
32
- logger = logging.getLogger("datachain")
33
-
34
- TTL_HUMAN = "4h"
35
- TTL_INT = 4 * 60 * 60
36
- FIND_COLUMNS = ["du", "name", "path", "size", "type"]
37
-
38
-
39
- def human_time_type(value_str: str, can_be_none: bool = False) -> Optional[int]:
40
- value = utils.human_time_to_int(value_str)
41
-
42
- if value:
43
- return value
44
- if can_be_none:
45
- return None
46
-
47
- raise ArgumentTypeError(
48
- "This option supports only a human-readable time interval like 12h or 4w."
49
- )
50
-
51
-
52
- def parse_find_column(column: str) -> str:
53
- column_lower = column.strip().lower()
54
- if column_lower in FIND_COLUMNS:
55
- return column_lower
56
- raise ArgumentTypeError(
57
- f"Invalid column for find: '{column}' Options are: {','.join(FIND_COLUMNS)}"
58
- )
59
-
60
-
61
- def find_columns_type(
62
- columns_str: str,
63
- default_colums_str: str = "path",
64
- ) -> list[str]:
65
- if not columns_str:
66
- columns_str = default_colums_str
67
-
68
- return [parse_find_column(c) for c in columns_str.split(",")]
69
-
70
-
71
- def add_sources_arg(parser: ArgumentParser, nargs: Union[str, int] = "+") -> Action:
72
- return parser.add_argument(
73
- "sources",
74
- type=str,
75
- nargs=nargs,
76
- help="Data sources - paths to cloud storage dirs",
77
- )
78
-
79
-
80
- def add_show_args(parser: ArgumentParser) -> None:
81
- parser.add_argument(
82
- "--limit",
83
- action="store",
84
- default=10,
85
- type=int,
86
- help="Number of rows to show",
87
- )
88
- parser.add_argument(
89
- "--offset",
90
- action="store",
91
- default=0,
92
- type=int,
93
- help="Number of rows to offset",
94
- )
95
- parser.add_argument(
96
- "--columns",
97
- default=[],
98
- action=CommaSeparatedArgs,
99
- help="Columns to show",
100
- )
101
- parser.add_argument(
102
- "--no-collapse",
103
- action="store_true",
104
- default=False,
105
- help="Do not collapse the columns",
106
- )
107
-
108
-
109
- def add_studio_parser(subparsers, parent_parser) -> None:
110
- studio_help = "Commands to authenticate DataChain with Iterative Studio"
111
- studio_description = (
112
- "Authenticate DataChain with Studio and set the token. "
113
- "Once this token has been properly configured,\n"
114
- "DataChain will utilize it for seamlessly sharing datasets\n"
115
- "and using Studio features from CLI"
116
- )
117
-
118
- studio_parser = subparsers.add_parser(
119
- "studio",
120
- parents=[parent_parser],
121
- description=studio_description,
122
- help=studio_help,
123
- )
124
- studio_subparser = studio_parser.add_subparsers(
125
- dest="cmd",
126
- help="Use `DataChain studio CMD --help` to display command-specific help.",
127
- required=True,
128
- )
129
-
130
- studio_login_help = "Authenticate DataChain with Studio host"
131
- studio_login_description = (
132
- "By default, this command authenticates the DataChain with Studio\n"
133
- "using default scopes and assigns a random name as the token name."
134
- )
135
- login_parser = studio_subparser.add_parser(
136
- "login",
137
- parents=[parent_parser],
138
- description=studio_login_description,
139
- help=studio_login_help,
140
- )
141
-
142
- login_parser.add_argument(
143
- "-H",
144
- "--hostname",
145
- action="store",
146
- default=None,
147
- help="The hostname of the Studio instance to authenticate with.",
148
- )
149
- login_parser.add_argument(
150
- "-s",
151
- "--scopes",
152
- action="store",
153
- default=None,
154
- help="The scopes for the authentication token. ",
155
- )
156
-
157
- login_parser.add_argument(
158
- "-n",
159
- "--name",
160
- action="store",
161
- default=None,
162
- help="The name of the authentication token. It will be used to\n"
163
- "identify token shown in Studio profile.",
164
- )
165
-
166
- login_parser.add_argument(
167
- "--no-open",
168
- action="store_true",
169
- default=False,
170
- help="Use authentication flow based on user code.\n"
171
- "You will be presented with user code to enter in browser.\n"
172
- "DataChain will also use this if it cannot launch browser on your behalf.",
173
- )
174
-
175
- studio_logout_help = "Logout user from Studio"
176
- studio_logout_description = "This removes the studio token from your global config."
177
-
178
- studio_subparser.add_parser(
179
- "logout",
180
- parents=[parent_parser],
181
- description=studio_logout_description,
182
- help=studio_logout_help,
183
- )
184
-
185
- studio_team_help = "Set the default team for DataChain"
186
- studio_team_description = (
187
- "Set the default team for DataChain to use when interacting with Studio."
188
- )
189
-
190
- team_parser = studio_subparser.add_parser(
191
- "team",
192
- parents=[parent_parser],
193
- description=studio_team_description,
194
- help=studio_team_help,
195
- )
196
- team_parser.add_argument(
197
- "team_name",
198
- action="store",
199
- help="The name of the team to set as the default.",
200
- )
201
- team_parser.add_argument(
202
- "--global",
203
- action="store_true",
204
- default=False,
205
- help="Set the team globally for all DataChain projects.",
206
- )
207
-
208
- studio_token_help = "View the token datachain uses to contact Studio" # noqa: S105 # nosec B105
209
-
210
- studio_subparser.add_parser(
211
- "token",
212
- parents=[parent_parser],
213
- description=studio_token_help,
214
- help=studio_token_help,
215
- )
216
-
217
- studio_ls_dataset_help = "List the available datasets from Studio"
218
- studio_ls_dataset_description = (
219
- "This command lists all the datasets available in Studio.\n"
220
- "It will show the dataset name and the number of versions available."
221
- )
222
-
223
- ls_dataset_parser = studio_subparser.add_parser(
224
- "datasets",
225
- parents=[parent_parser],
226
- description=studio_ls_dataset_description,
227
- help=studio_ls_dataset_help,
228
- )
229
- ls_dataset_parser.add_argument(
230
- "--team",
231
- action="store",
232
- default=None,
233
- help="The team to list datasets for. By default, it will use team from config.",
234
- )
235
-
236
- studio_run_help = "Run a job in Studio"
237
- studio_run_description = "This command runs a job in Studio."
238
-
239
- studio_run_parser = studio_subparser.add_parser(
240
- "run",
241
- parents=[parent_parser],
242
- description=studio_run_description,
243
- help=studio_run_help,
244
- )
245
-
246
- studio_run_parser.add_argument(
247
- "query_file",
248
- action="store",
249
- help="The query file to run.",
250
- )
251
-
252
- studio_run_parser.add_argument(
253
- "--team",
254
- action="store",
255
- default=None,
256
- help="The team to run a job for. By default, it will use team from config.",
257
- )
258
- studio_run_parser.add_argument(
259
- "--env-file",
260
- action="store",
261
- help="File containing environment variables to set for the job.",
262
- )
263
-
264
- studio_run_parser.add_argument(
265
- "--env",
266
- nargs="+",
267
- help="Environment variable. Can be specified multiple times. Format: KEY=VALUE",
268
- )
269
-
270
- studio_run_parser.add_argument(
271
- "--workers",
272
- type=int,
273
- help="Number of workers to use for the job.",
274
- )
275
- studio_run_parser.add_argument(
276
- "--files",
277
- nargs="+",
278
- help="Files to include in the job.",
279
- )
280
- studio_run_parser.add_argument(
281
- "--python-version",
282
- action="store",
283
- help="Python version to use for the job (e.g. '3.9', '3.10', '3.11').",
284
- )
285
- studio_run_parser.add_argument(
286
- "--req-file",
287
- action="store",
288
- help="File containing Python package requirements.",
289
- )
290
-
291
- studio_run_parser.add_argument(
292
- "--req",
293
- nargs="+",
294
- help="Python package requirement. Can be specified multiple times.",
295
- )
296
-
297
- studio_cancel_help = "Cancel a job in Studio"
298
- studio_cancel_description = "This command cancels a job in Studio."
299
-
300
- studio_cancel_parser = studio_subparser.add_parser(
301
- "cancel",
302
- parents=[parent_parser],
303
- description=studio_cancel_description,
304
- help=studio_cancel_help,
305
- )
306
-
307
- studio_cancel_parser.add_argument(
308
- "job_id",
309
- action="store",
310
- help="The job ID to cancel.",
311
- )
312
- studio_cancel_parser.add_argument(
313
- "--team",
314
- action="store",
315
- default=None,
316
- help="The team to cancel a job for. By default, it will use team from config.",
317
- )
318
-
319
-
320
- def get_parser() -> ArgumentParser: # noqa: PLR0915
321
- try:
322
- __version__ = version("datachain")
323
- except PackageNotFoundError:
324
- # package is not installed
325
- __version__ = "unknown"
326
-
327
- parser = ArgumentParser(
328
- description="DataChain: Wrangle unstructured AI data at scale", prog="datachain"
329
- )
330
- parser.add_argument("-V", "--version", action="version", version=__version__)
331
-
332
- parent_parser = ArgumentParser(add_help=False)
333
- parent_parser.add_argument(
334
- "--aws-endpoint-url",
335
- type=str,
336
- help="AWS endpoint URL",
337
- )
338
- parent_parser.add_argument(
339
- "--anon",
340
- action="store_true",
341
- help="AWS anon (aka awscli's --no-sign-request)",
342
- )
343
- parent_parser.add_argument(
344
- "-u", "--update", action="count", default=0, help="Update cache"
345
- )
346
- parent_parser.add_argument(
347
- "-v", "--verbose", action="count", default=0, help="Verbose"
348
- )
349
- parent_parser.add_argument(
350
- "-q", "--quiet", action="count", default=0, help="Be quiet"
351
- )
352
- parent_parser.add_argument(
353
- "--debug-sql",
354
- action="store_true",
355
- default=False,
356
- help="Show All SQL Queries (very verbose output, for debugging only)",
357
- )
358
- parent_parser.add_argument(
359
- "--pdb",
360
- action="store_true",
361
- default=False,
362
- help="Drop into the pdb debugger on fatal exception",
363
- )
364
-
365
- subp = parser.add_subparsers(
366
- title="Available Commands",
367
- metavar="command",
368
- dest="command",
369
- help=f"Use `{parser.prog} command --help` for command-specific help.",
370
- required=True,
371
- )
372
- parse_cp = subp.add_parser(
373
- "cp", parents=[parent_parser], description="Copy data files from the cloud"
374
- )
375
- add_sources_arg(parse_cp).complete = shtab.DIR # type: ignore[attr-defined]
376
- parse_cp.add_argument("output", type=str, help="Output")
377
- parse_cp.add_argument(
378
- "-f",
379
- "--force",
380
- default=False,
381
- action="store_true",
382
- help="Force creating outputs",
383
- )
384
- parse_cp.add_argument(
385
- "-r",
386
- "-R",
387
- "--recursive",
388
- default=False,
389
- action="store_true",
390
- help="Copy directories recursively",
391
- )
392
- parse_cp.add_argument(
393
- "--no-glob",
394
- default=False,
395
- action="store_true",
396
- help="Do not expand globs (such as * or ?)",
397
- )
398
-
399
- parse_clone = subp.add_parser(
400
- "clone", parents=[parent_parser], description="Copy data files from the cloud"
401
- )
402
- add_sources_arg(parse_clone).complete = shtab.DIR # type: ignore[attr-defined]
403
- parse_clone.add_argument("output", type=str, help="Output")
404
- parse_clone.add_argument(
405
- "-f",
406
- "--force",
407
- default=False,
408
- action="store_true",
409
- help="Force creating outputs",
410
- )
411
- parse_clone.add_argument(
412
- "-r",
413
- "-R",
414
- "--recursive",
415
- default=False,
416
- action="store_true",
417
- help="Copy directories recursively",
418
- )
419
- parse_clone.add_argument(
420
- "--no-glob",
421
- default=False,
422
- action="store_true",
423
- help="Do not expand globs (such as * or ?)",
424
- )
425
- parse_clone.add_argument(
426
- "--no-cp",
427
- default=False,
428
- action="store_true",
429
- help="Do not copy files, just create a dataset",
430
- )
431
- parse_clone.add_argument(
432
- "--edatachain",
433
- default=False,
434
- action="store_true",
435
- help="Create a .edatachain file",
436
- )
437
- parse_clone.add_argument(
438
- "--edatachain-file",
439
- help="Use a different filename for the resulting .edatachain file",
440
- )
441
-
442
- add_studio_parser(subp, parent_parser)
443
-
444
- datasets_parser = subp.add_parser(
445
- "datasets",
446
- aliases=["ds"],
447
- parents=[parent_parser],
448
- description="Commands for managing datasers",
449
- )
450
- datasets_subparser = datasets_parser.add_subparsers(
451
- dest="datasets_cmd",
452
- help="Use `datachain datasets CMD --help` to display command specific help",
453
- )
454
-
455
- parse_pull = datasets_subparser.add_parser(
456
- "pull",
457
- parents=[parent_parser],
458
- description="Pull specific dataset version from SaaS",
459
- )
460
- parse_pull.add_argument(
461
- "dataset",
462
- type=str,
463
- help="Name and version of remote dataset created in SaaS",
464
- )
465
- parse_pull.add_argument("-o", "--output", type=str, help="Output")
466
- parse_pull.add_argument(
467
- "-f",
468
- "--force",
469
- default=False,
470
- action="store_true",
471
- help="Force creating outputs",
472
- )
473
- parse_pull.add_argument(
474
- "-r",
475
- "-R",
476
- "--recursive",
477
- default=False,
478
- action="store_true",
479
- help="Copy directories recursively",
480
- )
481
- parse_pull.add_argument(
482
- "--cp",
483
- default=False,
484
- action="store_true",
485
- help="Copy actual files after pulling remote dataset into local DB",
486
- )
487
- parse_pull.add_argument(
488
- "--edatachain",
489
- default=False,
490
- action="store_true",
491
- help="Create .edatachain file",
492
- )
493
- parse_pull.add_argument(
494
- "--edatachain-file",
495
- help="Use a different filename for the resulting .edatachain file",
496
- )
497
- parse_pull.add_argument(
498
- "--local-name",
499
- action="store",
500
- default=None,
501
- help="Name of the local dataset",
502
- )
503
- parse_pull.add_argument(
504
- "--local-version",
505
- action="store",
506
- default=None,
507
- help="Version of the local dataset",
508
- )
509
-
510
- parse_edit_dataset = datasets_subparser.add_parser(
511
- "edit", parents=[parent_parser], description="Edit dataset metadata"
512
- )
513
- parse_edit_dataset.add_argument("name", type=str, help="Dataset name")
514
- parse_edit_dataset.add_argument(
515
- "--new-name",
516
- action="store",
517
- help="Dataset new name",
518
- )
519
- parse_edit_dataset.add_argument(
520
- "--description",
521
- action="store",
522
- help="Dataset description",
523
- )
524
- parse_edit_dataset.add_argument(
525
- "--labels",
526
- nargs="+",
527
- help="Dataset labels",
528
- )
529
- parse_edit_dataset.add_argument(
530
- "--studio",
531
- action="store_true",
532
- default=False,
533
- help="Edit dataset from Studio",
534
- )
535
- parse_edit_dataset.add_argument(
536
- "-L",
537
- "--local",
538
- action="store_true",
539
- default=False,
540
- help="Edit local dataset only",
541
- )
542
- parse_edit_dataset.add_argument(
543
- "-a",
544
- "--all",
545
- action="store_true",
546
- default=True,
547
- help="Edit both datasets from studio and local",
548
- )
549
- parse_edit_dataset.add_argument(
550
- "--team",
551
- action="store",
552
- default=None,
553
- help="The team to edit a dataset. By default, it will use team from config.",
554
- )
555
-
556
- datasets_parser = datasets_subparser.add_parser(
557
- "ls", parents=[parent_parser], description="List datasets"
558
- )
559
- datasets_parser.add_argument(
560
- "--studio",
561
- action="store_true",
562
- default=False,
563
- help="List the files in the Studio",
564
- )
565
- datasets_parser.add_argument(
566
- "-L",
567
- "--local",
568
- action="store_true",
569
- default=False,
570
- help="List local files only",
571
- )
572
- datasets_parser.add_argument(
573
- "-a",
574
- "--all",
575
- action="store_true",
576
- default=True,
577
- help="List all files including hidden files",
578
- )
579
- datasets_parser.add_argument(
580
- "--team",
581
- action="store",
582
- default=None,
583
- help="The team to list datasets for. By default, it will use team from config.",
584
- )
585
-
586
- rm_dataset_parser = datasets_subparser.add_parser(
587
- "rm", parents=[parent_parser], description="Removes dataset", aliases=["remove"]
588
- )
589
- rm_dataset_parser.add_argument("name", type=str, help="Dataset name")
590
- rm_dataset_parser.add_argument(
591
- "--version",
592
- action="store",
593
- default=None,
594
- type=int,
595
- help="Dataset version",
596
- )
597
- rm_dataset_parser.add_argument(
598
- "--force",
599
- default=False,
600
- action=BooleanOptionalAction,
601
- help="Force delete registered dataset with all of it's versions",
602
- )
603
- rm_dataset_parser.add_argument(
604
- "--studio",
605
- action="store_true",
606
- default=False,
607
- help="Remove dataset from Studio",
608
- )
609
- rm_dataset_parser.add_argument(
610
- "-L",
611
- "--local",
612
- action="store_true",
613
- default=False,
614
- help="Remove local datasets only",
615
- )
616
- rm_dataset_parser.add_argument(
617
- "-a",
618
- "--all",
619
- action="store_true",
620
- default=True,
621
- help="Remove both local and studio",
622
- )
623
- rm_dataset_parser.add_argument(
624
- "--team",
625
- action="store",
626
- default=None,
627
- help="The team to delete a dataset. By default, it will use team from config.",
628
- )
629
-
630
- dataset_stats_parser = datasets_subparser.add_parser(
631
- "stats",
632
- parents=[parent_parser],
633
- description="Shows basic dataset stats",
634
- )
635
- dataset_stats_parser.add_argument("name", type=str, help="Dataset name")
636
- dataset_stats_parser.add_argument(
637
- "--version",
638
- action="store",
639
- default=None,
640
- type=int,
641
- help="Dataset version",
642
- )
643
- dataset_stats_parser.add_argument(
644
- "-b",
645
- "--bytes",
646
- default=False,
647
- action="store_true",
648
- help="Display size in bytes instead of human-readable size",
649
- )
650
- dataset_stats_parser.add_argument(
651
- "--si",
652
- default=False,
653
- action="store_true",
654
- help="Display size using powers of 1000 not 1024",
655
- )
656
-
657
- parse_ls = subp.add_parser(
658
- "ls", parents=[parent_parser], description="List storage contents"
659
- )
660
- add_sources_arg(parse_ls, nargs="*")
661
- parse_ls.add_argument(
662
- "-l",
663
- "--long",
664
- action="count",
665
- default=0,
666
- help="List files in the long format",
667
- )
668
- parse_ls.add_argument(
669
- "--studio",
670
- action="store_true",
671
- default=False,
672
- help="List the files in the Studio",
673
- )
674
- parse_ls.add_argument(
675
- "-L",
676
- "--local",
677
- action="store_true",
678
- default=False,
679
- help="List local files only",
680
- )
681
- parse_ls.add_argument(
682
- "-a",
683
- "--all",
684
- action="store_true",
685
- default=True,
686
- help="List all files including hidden files",
687
- )
688
- parse_ls.add_argument(
689
- "--team",
690
- action="store",
691
- default=None,
692
- help="The team to list datasets for. By default, it will use team from config.",
693
- )
694
-
695
- parse_du = subp.add_parser(
696
- "du", parents=[parent_parser], description="Display space usage"
697
- )
698
- add_sources_arg(parse_du)
699
- parse_du.add_argument(
700
- "-b",
701
- "--bytes",
702
- default=False,
703
- action="store_true",
704
- help="Display sizes in bytes instead of human-readable sizes",
705
- )
706
- parse_du.add_argument(
707
- "-d",
708
- "--depth",
709
- "--max-depth",
710
- default=0,
711
- type=int,
712
- metavar="N",
713
- help=(
714
- "Display sizes for N directory depths below the given directory, "
715
- "the default is 0 (summarize provided directory only)."
716
- ),
717
- )
718
- parse_du.add_argument(
719
- "--si",
720
- default=False,
721
- action="store_true",
722
- help="Display sizes using powers of 1000 not 1024",
723
- )
724
-
725
- parse_find = subp.add_parser(
726
- "find", parents=[parent_parser], description="Search in a directory hierarchy"
727
- )
728
- add_sources_arg(parse_find)
729
- parse_find.add_argument(
730
- "--name",
731
- type=str,
732
- action="append",
733
- help="Filename to match pattern.",
734
- )
735
- parse_find.add_argument(
736
- "--iname",
737
- type=str,
738
- action="append",
739
- help="Like -name but case insensitive.",
740
- )
741
- parse_find.add_argument(
742
- "--path",
743
- type=str,
744
- action="append",
745
- help="Path to match pattern.",
746
- )
747
- parse_find.add_argument(
748
- "--ipath",
749
- type=str,
750
- action="append",
751
- help="Like -path but case insensitive.",
752
- )
753
- parse_find.add_argument(
754
- "--size",
755
- type=str,
756
- help=(
757
- "Filter by size (+ is greater or equal, - is less or equal). "
758
- "Specified size is in bytes, or use a suffix like K, M, G for "
759
- "kilobytes, megabytes, gigabytes, etc."
760
- ),
761
- )
762
- parse_find.add_argument(
763
- "--type",
764
- type=str,
765
- help='File type: "f" - regular, "d" - directory',
766
- )
767
- parse_find.add_argument(
768
- "-c",
769
- "--columns",
770
- type=find_columns_type,
771
- default=None,
772
- help=(
773
- "A comma-separated list of columns to print for each result. "
774
- f"Options are: {','.join(FIND_COLUMNS)} (Default: path)"
775
- ),
776
- )
777
-
778
- parse_index = subp.add_parser(
779
- "index", parents=[parent_parser], description="Index storage location"
780
- )
781
- add_sources_arg(parse_index)
782
-
783
- show_parser = subp.add_parser(
784
- "show",
785
- parents=[parent_parser],
786
- description="Create a new dataset with a query script",
787
- )
788
- show_parser.add_argument("name", type=str, help="Dataset name")
789
- show_parser.add_argument(
790
- "--version",
791
- action="store",
792
- default=None,
793
- type=int,
794
- help="Dataset version",
795
- )
796
- show_parser.add_argument("--schema", action="store_true", help="Show schema")
797
- add_show_args(show_parser)
798
-
799
- query_parser = subp.add_parser(
800
- "query",
801
- parents=[parent_parser],
802
- description="Create a new dataset with a query script",
803
- )
804
- query_parser.add_argument(
805
- "script", metavar="<script.py>", type=str, help="Filepath for script"
806
- )
807
- query_parser.add_argument(
808
- "--parallel",
809
- nargs="?",
810
- type=int,
811
- const=-1,
812
- default=None,
813
- metavar="N",
814
- help=(
815
- "Use multiprocessing to run any query script UDFs with N worker processes. "
816
- "N defaults to the CPU count."
817
- ),
818
- )
819
- query_parser.add_argument(
820
- "-p",
821
- "--param",
822
- metavar="param=value",
823
- nargs=1,
824
- action=KeyValueArgs,
825
- help="Query parameters",
826
- )
827
-
828
- subp.add_parser(
829
- "clear-cache", parents=[parent_parser], description="Clear the local file cache"
830
- )
831
- subp.add_parser(
832
- "gc", parents=[parent_parser], description="Garbage collect temporary tables"
833
- )
834
-
835
- subp.add_parser("internal-run-udf", parents=[parent_parser])
836
- subp.add_parser("internal-run-udf-worker", parents=[parent_parser])
837
- add_completion_parser(subp, [parent_parser])
838
- return parser
839
-
840
-
841
- def add_completion_parser(subparsers, parents):
842
- parser = subparsers.add_parser(
843
- "completion",
844
- parents=parents,
845
- description="Output shell completion script",
846
- )
847
- parser.add_argument(
848
- "-s",
849
- "--shell",
850
- help="Shell syntax for completions.",
851
- default="bash",
852
- choices=shtab.SUPPORTED_SHELLS,
853
- )
854
-
855
-
856
- def get_logging_level(args: Namespace) -> int:
857
- if args.quiet:
858
- return logging.CRITICAL
859
- if args.verbose:
860
- return logging.DEBUG
861
- return logging.INFO
862
-
863
-
864
- def ls_urls(
865
- sources,
866
- catalog: "Catalog",
867
- long: bool = False,
868
- **kwargs,
869
- ) -> Iterator[tuple[str, Iterator[str]]]:
870
- curr_dir = None
871
- value_iterables = []
872
- for next_dir, values in _ls_urls_flat(sources, long, catalog, **kwargs):
873
- if curr_dir is None or next_dir == curr_dir: # type: ignore[unreachable]
874
- value_iterables.append(values)
875
- else:
876
- yield curr_dir, chain(*value_iterables) # type: ignore[unreachable]
877
- value_iterables = [values]
878
- curr_dir = next_dir
879
- if curr_dir is not None:
880
- yield curr_dir, chain(*value_iterables)
881
-
882
-
883
- def _node_data_to_ls_values(row, long_format=False):
884
- from datachain.node import DirType, long_line_str
885
-
886
- name = row[0]
887
- is_dir = row[1] == DirType.DIR
888
- ending = "/" if is_dir else ""
889
- value = name + ending
890
- if long_format:
891
- last_modified = row[2]
892
- timestamp = last_modified if not is_dir else None
893
- return long_line_str(value, timestamp)
894
- return value
895
-
896
-
897
- def _ls_urls_flat(
898
- sources,
899
- long: bool,
900
- catalog: "Catalog",
901
- **kwargs,
902
- ) -> Iterator[tuple[str, Iterator[str]]]:
903
- from datachain.client import Client
904
- from datachain.node import long_line_str
905
-
906
- for source in sources:
907
- client_cls = Client.get_implementation(source)
908
- if client_cls.is_root_url(source):
909
- buckets = client_cls.ls_buckets(**catalog.client_config)
910
- if long:
911
- values = (long_line_str(b.name, b.created) for b in buckets)
912
- else:
913
- values = (b.name for b in buckets)
914
- yield source, values
915
- else:
916
- found = False
917
- fields = ["name", "dir_type"]
918
- if long:
919
- fields.append("last_modified")
920
- for data_source, results in catalog.ls([source], fields=fields, **kwargs):
921
- values = (_node_data_to_ls_values(r, long) for r in results)
922
- found = True
923
- yield data_source.dirname(), values
924
- if not found:
925
- raise FileNotFoundError(f"No such file or directory: {source}")
926
-
927
-
928
- def ls_local(
929
- sources,
930
- long: bool = False,
931
- catalog: Optional["Catalog"] = None,
932
- client_config=None,
933
- **kwargs,
934
- ):
935
- if catalog is None:
936
- from .catalog import get_catalog
937
-
938
- catalog = get_catalog(client_config=client_config)
939
- if sources:
940
- actual_sources = list(ls_urls(sources, catalog=catalog, long=long, **kwargs))
941
- if len(actual_sources) == 1:
942
- for _, entries in actual_sources:
943
- for entry in entries:
944
- print(format_ls_entry(entry))
945
- else:
946
- first = True
947
- for source, entries in actual_sources:
948
- # print a newline between directory listings
949
- if first:
950
- first = False
951
- else:
952
- print()
953
- if source:
954
- print(f"{source}:")
955
- for entry in entries:
956
- print(format_ls_entry(entry))
957
- else:
958
- chain = DataChain.listings()
959
- for ls in chain.collect("listing"):
960
- print(format_ls_entry(f"{ls.uri}@v{ls.version}")) # type: ignore[union-attr]
961
-
962
-
963
- def format_ls_entry(entry: str) -> str:
964
- if entry.endswith("/") or not entry:
965
- entry = shlex.quote(entry[:-1])
966
- return f"{entry}/"
967
- return shlex.quote(entry)
968
-
969
-
970
- def ls_remote(
971
- paths: Iterable[str],
972
- long: bool = False,
973
- team: Optional[str] = None,
974
- ):
975
- from datachain.node import long_line_str
976
- from datachain.remote.studio import StudioClient
977
-
978
- client = StudioClient(team=team)
979
- first = True
980
- for path, response in client.ls(paths):
981
- if not first:
982
- print()
983
- if not response.ok or response.data is None:
984
- print(f"{path}:\n Error: {response.message}\n")
985
- continue
986
-
987
- print(f"{path}:")
988
- if long:
989
- for row in response.data:
990
- entry = long_line_str(
991
- row["name"] + ("/" if row["dir_type"] else ""),
992
- row["last_modified"],
993
- )
994
- print(format_ls_entry(entry))
995
- else:
996
- for row in response.data:
997
- entry = row["name"] + ("/" if row["dir_type"] else "")
998
- print(format_ls_entry(entry))
999
- first = False
1000
-
1001
-
1002
- def ls(
1003
- sources,
1004
- long: bool = False,
1005
- studio: bool = False,
1006
- local: bool = False,
1007
- all: bool = True,
1008
- team: Optional[str] = None,
1009
- **kwargs,
1010
- ):
1011
- token = Config().read().get("studio", {}).get("token")
1012
- all, local, studio = _determine_flavors(studio, local, all, token)
1013
-
1014
- if all or local:
1015
- ls_local(sources, long=long, **kwargs)
1016
-
1017
- if (all or studio) and token:
1018
- ls_remote(sources, long=long, team=team)
1019
-
1020
-
1021
- def datasets(
1022
- catalog: "Catalog",
1023
- studio: bool = False,
1024
- local: bool = False,
1025
- all: bool = True,
1026
- team: Optional[str] = None,
1027
- ):
1028
- token = Config().read().get("studio", {}).get("token")
1029
- all, local, studio = _determine_flavors(studio, local, all, token)
1030
-
1031
- local_datasets = set(list_datasets_local(catalog)) if all or local else set()
1032
- studio_datasets = (
1033
- set(list_datasets(team=team)) if (all or studio) and token else set()
1034
- )
1035
-
1036
- rows = [
1037
- _datasets_tabulate_row(
1038
- name=name,
1039
- version=version,
1040
- both=(all or (local and studio)) and token,
1041
- local=(name, version) in local_datasets,
1042
- studio=(name, version) in studio_datasets,
1043
- )
1044
- for name, version in local_datasets.union(studio_datasets)
1045
- ]
1046
-
1047
- print(tabulate(rows, headers="keys"))
1048
-
1049
-
1050
- def list_datasets_local(catalog: "Catalog"):
1051
- for d in catalog.ls_datasets():
1052
- for v in d.versions:
1053
- yield (d.name, v.version)
1054
-
1055
-
1056
- def _datasets_tabulate_row(name, version, both, local, studio):
1057
- row = {
1058
- "Name": name,
1059
- "Version": version,
1060
- }
1061
- if both:
1062
- row["Studio"] = "\u2714" if studio else "\u2716"
1063
- row["Local"] = "\u2714" if local else "\u2716"
1064
- return row
1065
-
1066
-
1067
- def rm_dataset(
1068
- catalog: "Catalog",
1069
- name: str,
1070
- version: Optional[int] = None,
1071
- force: Optional[bool] = False,
1072
- studio: bool = False,
1073
- local: bool = False,
1074
- all: bool = True,
1075
- team: Optional[str] = None,
1076
- ):
1077
- token = Config().read().get("studio", {}).get("token")
1078
- all, local, studio = _determine_flavors(studio, local, all, token)
1079
-
1080
- if all or local:
1081
- try:
1082
- catalog.remove_dataset(name, version=version, force=force)
1083
- except DatasetNotFoundError:
1084
- print("Dataset not found in local", file=sys.stderr)
1085
-
1086
- if (all or studio) and token:
1087
- remove_studio_dataset(team, name, version, force)
1088
-
1089
-
1090
- def edit_dataset(
1091
- catalog: "Catalog",
1092
- name: str,
1093
- new_name: Optional[str] = None,
1094
- description: Optional[str] = None,
1095
- labels: Optional[list[str]] = None,
1096
- studio: bool = False,
1097
- local: bool = False,
1098
- all: bool = True,
1099
- team: Optional[str] = None,
1100
- ):
1101
- token = Config().read().get("studio", {}).get("token")
1102
- all, local, studio = _determine_flavors(studio, local, all, token)
1103
-
1104
- if all or local:
1105
- try:
1106
- catalog.edit_dataset(name, new_name, description, labels)
1107
- except DatasetNotFoundError:
1108
- print("Dataset not found in local", file=sys.stderr)
1109
-
1110
- if (all or studio) and token:
1111
- edit_studio_dataset(team, name, new_name, description, labels)
1112
-
1113
-
1114
- def dataset_stats(
1115
- catalog: "Catalog",
1116
- name: str,
1117
- version: int,
1118
- show_bytes=False,
1119
- si=False,
1120
- ):
1121
- stats = catalog.dataset_stats(name, version)
1122
-
1123
- if stats:
1124
- print(f"Number of objects: {stats.num_objects}")
1125
- if show_bytes:
1126
- print(f"Total objects size: {stats.size}")
1127
- else:
1128
- print(f"Total objects size: {utils.sizeof_fmt(stats.size, si=si): >7}")
1129
-
1130
-
1131
- def du(catalog: "Catalog", sources, show_bytes=False, si=False, **kwargs):
1132
- for path, size in catalog.du(sources, **kwargs):
1133
- if show_bytes:
1134
- print(f"{size} {path}")
1135
- else:
1136
- print(f"{utils.sizeof_fmt(size, si=si): >7} {path}")
1137
-
1138
-
1139
- def index(
1140
- catalog: "Catalog",
1141
- sources,
1142
- **kwargs,
1143
- ):
1144
- catalog.index(sources, **kwargs)
1145
-
1146
-
1147
- def show(
1148
- catalog: "Catalog",
1149
- name: str,
1150
- version: Optional[int] = None,
1151
- limit: int = 10,
1152
- offset: int = 0,
1153
- columns: Sequence[str] = (),
1154
- no_collapse: bool = False,
1155
- schema: bool = False,
1156
- ) -> None:
1157
- from datachain.lib.dc import DataChain
1158
- from datachain.query.dataset import DatasetQuery
1159
- from datachain.utils import show_records
1160
-
1161
- dataset = catalog.get_dataset(name)
1162
- dataset_version = dataset.get_version(version or dataset.latest_version)
1163
-
1164
- query = (
1165
- DatasetQuery(name=name, version=version, catalog=catalog)
1166
- .select(*columns)
1167
- .limit(limit)
1168
- .offset(offset)
1169
- )
1170
- records = query.to_db_records()
1171
- show_records(records, collapse_columns=not no_collapse)
1172
- if schema and dataset_version.feature_schema:
1173
- print("\nSchema:")
1174
- session = Session.get(catalog=catalog)
1175
- dc = DataChain.from_dataset(name=name, version=version, session=session)
1176
- dc.print_schema()
1177
-
1178
-
1179
- def query(
1180
- catalog: "Catalog",
1181
- script: str,
1182
- parallel: Optional[int] = None,
1183
- params: Optional[dict[str, str]] = None,
1184
- ) -> None:
1185
- from datachain.data_storage import JobQueryType, JobStatus
1186
-
1187
- with open(script, encoding="utf-8") as f:
1188
- script_content = f.read()
1189
-
1190
- if parallel is not None:
1191
- # This also sets this environment variable for any subprocesses
1192
- os.environ["DATACHAIN_SETTINGS_PARALLEL"] = str(parallel)
1193
-
1194
- python_version = f"{sys.version_info.major}.{sys.version_info.minor}"
1195
- python_executable = sys.executable
1196
-
1197
- job_id = catalog.metastore.create_job(
1198
- name=os.path.basename(script),
1199
- query=script_content,
1200
- query_type=JobQueryType.PYTHON,
1201
- python_version=python_version,
1202
- params=params,
1203
- )
1204
-
1205
- try:
1206
- catalog.query(
1207
- script_content,
1208
- python_executable=python_executable,
1209
- params=params,
1210
- job_id=job_id,
1211
- )
1212
- except Exception as e:
1213
- error_message = str(e)
1214
- error_stack = traceback.format_exc()
1215
- catalog.metastore.set_job_status(
1216
- job_id,
1217
- JobStatus.FAILED,
1218
- error_message=error_message,
1219
- error_stack=error_stack,
1220
- )
1221
- raise
1222
- catalog.metastore.set_job_status(job_id, JobStatus.COMPLETE)
1223
-
1224
-
1225
- def clear_cache(catalog: "Catalog"):
1226
- catalog.cache.clear()
1227
-
1228
-
1229
- def garbage_collect(catalog: "Catalog"):
1230
- temp_tables = catalog.get_temp_table_names()
1231
- if not temp_tables:
1232
- print("Nothing to clean up.")
1233
- else:
1234
- print(f"Garbage collecting {len(temp_tables)} tables.")
1235
- catalog.cleanup_tables(temp_tables)
1236
-
1237
-
1238
- def completion(shell: str) -> str:
1239
- return shtab.complete(
1240
- get_parser(),
1241
- shell=shell,
1242
- )
1243
-
1244
-
1245
- def _determine_flavors(studio: bool, local: bool, all: bool, token: Optional[str]):
1246
- if studio and not token:
1247
- raise DataChainError(
1248
- "Not logged in to Studio. Log in with 'datachain studio login'."
1249
- )
1250
-
1251
- if local or studio:
1252
- all = False
1253
-
1254
- all = all and not (local or studio)
1255
-
1256
- return all, local, studio
1257
-
1258
-
1259
- def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR0915
1260
- # Required for Windows multiprocessing support
1261
- freeze_support()
1262
-
1263
- parser = get_parser()
1264
- args = parser.parse_args(argv)
1265
-
1266
- if args.command == "internal-run-udf":
1267
- from datachain.query.dispatch import udf_entrypoint
1268
-
1269
- return udf_entrypoint()
1270
-
1271
- if args.command == "internal-run-udf-worker":
1272
- from datachain.query.dispatch import udf_worker_entrypoint
1273
-
1274
- return udf_worker_entrypoint()
1275
-
1276
- from .catalog import get_catalog
1277
-
1278
- logger.addHandler(logging.StreamHandler())
1279
- logging_level = get_logging_level(args)
1280
- logger.setLevel(logging_level)
1281
-
1282
- client_config = {
1283
- "aws_endpoint_url": args.aws_endpoint_url,
1284
- "anon": args.anon,
1285
- }
1286
-
1287
- if args.debug_sql:
1288
- # This also sets this environment variable for any subprocesses
1289
- os.environ["DEBUG_SHOW_SQL_QUERIES"] = "True"
1290
-
1291
- error = None
1292
- try:
1293
- catalog = get_catalog(client_config=client_config)
1294
- if args.command == "cp":
1295
- catalog.cp(
1296
- args.sources,
1297
- args.output,
1298
- force=bool(args.force),
1299
- update=bool(args.update),
1300
- recursive=bool(args.recursive),
1301
- edatachain_file=None,
1302
- edatachain_only=False,
1303
- no_edatachain_file=True,
1304
- no_glob=args.no_glob,
1305
- )
1306
- elif args.command == "clone":
1307
- catalog.clone(
1308
- args.sources,
1309
- args.output,
1310
- force=bool(args.force),
1311
- update=bool(args.update),
1312
- recursive=bool(args.recursive),
1313
- no_glob=args.no_glob,
1314
- no_cp=args.no_cp,
1315
- edatachain=args.edatachain,
1316
- edatachain_file=args.edatachain_file,
1317
- )
1318
- elif args.command in ("datasets", "ds"):
1319
- if args.datasets_cmd == "pull":
1320
- catalog.pull_dataset(
1321
- args.dataset,
1322
- args.output,
1323
- local_ds_name=args.local_name,
1324
- local_ds_version=args.local_version,
1325
- cp=args.cp,
1326
- force=bool(args.force),
1327
- edatachain=args.edatachain,
1328
- edatachain_file=args.edatachain_file,
1329
- )
1330
- elif args.datasets_cmd == "edit":
1331
- edit_dataset(
1332
- catalog,
1333
- args.name,
1334
- new_name=args.new_name,
1335
- description=args.description,
1336
- labels=args.labels,
1337
- studio=args.studio,
1338
- local=args.local,
1339
- all=args.all,
1340
- team=args.team,
1341
- )
1342
- elif args.datasets_cmd == "ls":
1343
- datasets(
1344
- catalog=catalog,
1345
- studio=args.studio,
1346
- local=args.local,
1347
- all=args.all,
1348
- team=args.team,
1349
- )
1350
- elif args.datasets_cmd in ("rm", "remove"):
1351
- rm_dataset(
1352
- catalog,
1353
- args.name,
1354
- version=args.version,
1355
- force=args.force,
1356
- studio=args.studio,
1357
- local=args.local,
1358
- all=args.all,
1359
- team=args.team,
1360
- )
1361
- elif args.datasets_cmd == "stats":
1362
- dataset_stats(
1363
- catalog,
1364
- args.name,
1365
- args.version,
1366
- show_bytes=args.bytes,
1367
- si=args.si,
1368
- )
1369
- else:
1370
- raise Exception(f"Unexpected command {args.datasets_cmd}")
1371
- elif args.command == "ls":
1372
- ls(
1373
- args.sources,
1374
- long=bool(args.long),
1375
- studio=args.studio,
1376
- local=args.local,
1377
- all=args.all,
1378
- team=args.team,
1379
- update=bool(args.update),
1380
- client_config=client_config,
1381
- )
1382
-
1383
- elif args.command == "show":
1384
- show(
1385
- catalog,
1386
- args.name,
1387
- args.version,
1388
- limit=args.limit,
1389
- offset=args.offset,
1390
- columns=args.columns,
1391
- no_collapse=args.no_collapse,
1392
- schema=args.schema,
1393
- )
1394
-
1395
- elif args.command == "du":
1396
- du(
1397
- catalog,
1398
- args.sources,
1399
- show_bytes=args.bytes,
1400
- depth=args.depth,
1401
- si=args.si,
1402
- update=bool(args.update),
1403
- client_config=client_config,
1404
- )
1405
- elif args.command == "find":
1406
- results_found = False
1407
- for result in catalog.find(
1408
- args.sources,
1409
- update=bool(args.update),
1410
- names=args.name,
1411
- inames=args.iname,
1412
- paths=args.path,
1413
- ipaths=args.ipath,
1414
- size=args.size,
1415
- typ=args.type,
1416
- columns=args.columns,
1417
- ):
1418
- print(result)
1419
- results_found = True
1420
- if not results_found:
1421
- print("No results")
1422
- elif args.command == "index":
1423
- index(
1424
- catalog,
1425
- args.sources,
1426
- update=bool(args.update),
1427
- )
1428
- elif args.command == "completion":
1429
- print(completion(args.shell))
1430
- elif args.command == "query":
1431
- query(
1432
- catalog,
1433
- args.script,
1434
- parallel=args.parallel,
1435
- params=args.param,
1436
- )
1437
- elif args.command == "clear-cache":
1438
- clear_cache(catalog)
1439
- elif args.command == "gc":
1440
- garbage_collect(catalog)
1441
- elif args.command == "studio":
1442
- process_studio_cli_args(args)
1443
- else:
1444
- print(f"invalid command: {args.command}", file=sys.stderr)
1445
- return 1
1446
- return 0
1447
- except BrokenPipeError as exc:
1448
- # Python flushes standard streams on exit; redirect remaining output
1449
- # to devnull to avoid another BrokenPipeError at shutdown
1450
- # See: https://docs.python.org/3/library/signal.html#note-on-sigpipe
1451
- error = str(exc)
1452
- devnull = os.open(os.devnull, os.O_WRONLY)
1453
- os.dup2(devnull, sys.stdout.fileno())
1454
- return 141 # 128 + 13 (SIGPIPE)
1455
- except (KeyboardInterrupt, Exception) as exc:
1456
- error = str(exc)
1457
- if isinstance(exc, KeyboardInterrupt):
1458
- msg = "Operation cancelled by the user"
1459
- else:
1460
- msg = str(exc)
1461
- print("Error:", msg, file=sys.stderr)
1462
- if logging_level <= logging.DEBUG:
1463
- traceback.print_exception(
1464
- type(exc),
1465
- exc,
1466
- exc.__traceback__,
1467
- file=sys.stderr,
1468
- )
1469
- if args.pdb:
1470
- import pdb # noqa: T100
1471
-
1472
- pdb.post_mortem()
1473
- return 1
1474
- finally:
1475
- telemetry.send_cli_call(args.command, error=error)
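
One behaviour worth calling out from the deleted module (and presumably preserved by the new datachain/cli package) is the flag resolution in _determine_flavors(): passing --local or --studio suppresses the default "all" behaviour, and --studio without a saved Studio token is an error. The standalone sketch below reproduces that logic from the code above, with RuntimeError standing in for DataChainError so it runs without datachain installed.

    # Reproduction of the flag-resolution logic from the deleted
    # _determine_flavors() above; RuntimeError stands in for DataChainError
    # so the sketch has no datachain dependency.
    from typing import Optional

    def determine_flavors(studio: bool, local: bool, all: bool, token: Optional[str]):
        if studio and not token:
            raise RuntimeError(
                "Not logged in to Studio. Log in with 'datachain studio login'."
            )
        if local or studio:
            all = False
        all = all and not (local or studio)
        return all, local, studio

    # Default invocation with a token configured: operate on both local and Studio.
    print(determine_flavors(studio=False, local=False, all=True, token="tok"))  # (True, False, False)
    # --local narrows the scope to local datasets only, even though "all"
    # defaults to True in the argument parser.
    print(determine_flavors(studio=False, local=True, all=True, token=None))    # (False, True, False)
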