datachain 0.8.3__py3-none-any.whl → 0.8.4__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.

@@ -0,0 +1,53 @@
+ import os
+ import sys
+ import traceback
+ from typing import TYPE_CHECKING, Optional
+
+ if TYPE_CHECKING:
+     from datachain.catalog import Catalog
+
+
+ def query(
+     catalog: "Catalog",
+     script: str,
+     parallel: Optional[int] = None,
+     params: Optional[dict[str, str]] = None,
+ ) -> None:
+     from datachain.data_storage import JobQueryType, JobStatus
+
+     with open(script, encoding="utf-8") as f:
+         script_content = f.read()
+
+     if parallel is not None:
+         # This also sets this environment variable for any subprocesses
+         os.environ["DATACHAIN_SETTINGS_PARALLEL"] = str(parallel)
+
+     python_version = f"{sys.version_info.major}.{sys.version_info.minor}"
+     python_executable = sys.executable
+
+     job_id = catalog.metastore.create_job(
+         name=os.path.basename(script),
+         query=script_content,
+         query_type=JobQueryType.PYTHON,
+         python_version=python_version,
+         params=params,
+     )
+
+     try:
+         catalog.query(
+             script_content,
+             python_executable=python_executable,
+             params=params,
+             job_id=job_id,
+         )
+     except Exception as e:
+         error_message = str(e)
+         error_stack = traceback.format_exc()
+         catalog.metastore.set_job_status(
+             job_id,
+             JobStatus.FAILED,
+             error_message=error_message,
+             error_stack=error_stack,
+         )
+         raise
+     catalog.metastore.set_job_status(job_id, JobStatus.COMPLETE)
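For reference, a minimal sketch of how this helper might be invoked (illustrative only; it assumes a catalog obtained via datachain.catalog.get_catalog and a script path, neither of which appears in this diff):

    from datachain.catalog import get_catalog  # assumed entry point, not shown in this diff

    catalog = get_catalog()
    query(
        catalog,
        "my_script.py",           # hypothetical query script path
        parallel=4,               # exported as DATACHAIN_SETTINGS_PARALLEL for subprocesses
        params={"limit": "100"},  # recorded on the job and forwarded to catalog.query
    )

On success the job is marked COMPLETE; on failure the exception message and stack trace are stored on the job before the error is re-raised.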
@@ -0,0 +1,38 @@
+ from collections.abc import Sequence
+ from typing import TYPE_CHECKING, Optional
+
+ if TYPE_CHECKING:
+     from datachain.catalog import Catalog
+
+
+ def show(
+     catalog: "Catalog",
+     name: str,
+     version: Optional[int] = None,
+     limit: int = 10,
+     offset: int = 0,
+     columns: Sequence[str] = (),
+     no_collapse: bool = False,
+     schema: bool = False,
+ ) -> None:
+     from datachain import Session
+     from datachain.lib.dc import DataChain
+     from datachain.query.dataset import DatasetQuery
+     from datachain.utils import show_records
+
+     dataset = catalog.get_dataset(name)
+     dataset_version = dataset.get_version(version or dataset.latest_version)
+
+     query = (
+         DatasetQuery(name=name, version=version, catalog=catalog)
+         .select(*columns)
+         .limit(limit)
+         .offset(offset)
+     )
+     records = query.to_db_records()
+     show_records(records, collapse_columns=not no_collapse)
+     if schema and dataset_version.feature_schema:
+         print("\nSchema:")
+         session = Session.get(catalog=catalog)
+         dc = DataChain.from_dataset(name=name, version=version, session=session)
+         dc.print_schema()
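A similar usage sketch for this helper (hypothetical; the dataset name is made up and the catalog helper is assumed as above):

    from datachain.catalog import get_catalog  # assumed entry point

    show(
        get_catalog(),
        "my_dataset",   # hypothetical dataset name
        limit=5,        # print at most five records
        schema=True,    # also print the feature schema, if one is recorded
    )

With the defaults, the schema lookup resolves version=None to dataset.latest_version, and the empty columns tuple is splatted straight into select().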
@@ -0,0 +1,547 @@
+ from argparse import ArgumentParser
+ from importlib.metadata import PackageNotFoundError, version
+
+ import shtab
+
+ from datachain.cli.utils import BooleanOptionalAction, KeyValueArgs
+
+ from .job import add_jobs_parser
+ from .studio import add_studio_parser
+ from .utils import FIND_COLUMNS, add_show_args, add_sources_arg, find_columns_type
+
+
+ def get_parser() -> ArgumentParser:  # noqa: PLR0915
+     try:
+         __version__ = version("datachain")
+     except PackageNotFoundError:
+         # package is not installed
+         __version__ = "unknown"
+
+     parser = ArgumentParser(
+         description="DataChain: Wrangle unstructured AI data at scale", prog="datachain"
+     )
+     parser.add_argument("-V", "--version", action="version", version=__version__)
+
+     parent_parser = ArgumentParser(add_help=False)
+     parent_parser.add_argument(
+         "--aws-endpoint-url",
+         type=str,
+         help="AWS endpoint URL",
+     )
+     parent_parser.add_argument(
+         "--anon",
+         action="store_true",
+         help="AWS anon (aka awscli's --no-sign-request)",
+     )
+     parent_parser.add_argument(
+         "-u", "--update", action="count", default=0, help="Update cache"
+     )
+     parent_parser.add_argument(
+         "-v", "--verbose", action="count", default=0, help="Verbose"
+     )
+     parent_parser.add_argument(
+         "-q", "--quiet", action="count", default=0, help="Be quiet"
+     )
+     parent_parser.add_argument(
+         "--debug-sql",
+         action="store_true",
+         default=False,
+         help="Show All SQL Queries (very verbose output, for debugging only)",
+     )
+     parent_parser.add_argument(
+         "--pdb",
+         action="store_true",
+         default=False,
+         help="Drop into the pdb debugger on fatal exception",
+     )
+
+     subp = parser.add_subparsers(
+         title="Available Commands",
+         metavar="command",
+         dest="command",
+         help=f"Use `{parser.prog} command --help` for command-specific help.",
+         required=True,
+     )
+     parse_cp = subp.add_parser(
+         "cp", parents=[parent_parser], description="Copy data files from the cloud"
+     )
+     add_sources_arg(parse_cp).complete = shtab.DIR  # type: ignore[attr-defined]
+     parse_cp.add_argument("output", type=str, help="Output")
+     parse_cp.add_argument(
+         "-f",
+         "--force",
+         default=False,
+         action="store_true",
+         help="Force creating outputs",
+     )
+     parse_cp.add_argument(
+         "-r",
+         "-R",
+         "--recursive",
+         default=False,
+         action="store_true",
+         help="Copy directories recursively",
+     )
+     parse_cp.add_argument(
+         "--no-glob",
+         default=False,
+         action="store_true",
+         help="Do not expand globs (such as * or ?)",
+     )
+
+     parse_clone = subp.add_parser(
+         "clone", parents=[parent_parser], description="Copy data files from the cloud"
+     )
+     add_sources_arg(parse_clone).complete = shtab.DIR  # type: ignore[attr-defined]
+     parse_clone.add_argument("output", type=str, help="Output")
+     parse_clone.add_argument(
+         "-f",
+         "--force",
+         default=False,
+         action="store_true",
+         help="Force creating outputs",
+     )
+     parse_clone.add_argument(
+         "-r",
+         "-R",
+         "--recursive",
+         default=False,
+         action="store_true",
+         help="Copy directories recursively",
+     )
+     parse_clone.add_argument(
+         "--no-glob",
+         default=False,
+         action="store_true",
+         help="Do not expand globs (such as * or ?)",
+     )
+     parse_clone.add_argument(
+         "--no-cp",
+         default=False,
+         action="store_true",
+         help="Do not copy files, just create a dataset",
+     )
+     parse_clone.add_argument(
+         "--edatachain",
+         default=False,
+         action="store_true",
+         help="Create a .edatachain file",
+     )
+     parse_clone.add_argument(
+         "--edatachain-file",
+         help="Use a different filename for the resulting .edatachain file",
+     )
+
+     add_studio_parser(subp, parent_parser)
+     add_jobs_parser(subp, parent_parser)
+
+     datasets_parser = subp.add_parser(
+         "dataset",
+         aliases=["ds"],
+         parents=[parent_parser],
+ description="Commands for managing datasers",
+     )
+     datasets_subparser = datasets_parser.add_subparsers(
+         dest="datasets_cmd",
+         help="Use `datachain datasets CMD --help` to display command specific help",
+     )
+
+     parse_pull = datasets_subparser.add_parser(
+         "pull",
+         parents=[parent_parser],
+         description="Pull specific dataset version from SaaS",
+     )
+     parse_pull.add_argument(
+         "dataset",
+         type=str,
+         help="Name and version of remote dataset created in SaaS",
+     )
+     parse_pull.add_argument("-o", "--output", type=str, help="Output")
+     parse_pull.add_argument(
+         "-f",
+         "--force",
+         default=False,
+         action="store_true",
+         help="Force creating outputs",
+     )
+     parse_pull.add_argument(
+         "-r",
+         "-R",
+         "--recursive",
+         default=False,
+         action="store_true",
+         help="Copy directories recursively",
+     )
+     parse_pull.add_argument(
+         "--cp",
+         default=False,
+         action="store_true",
+         help="Copy actual files after pulling remote dataset into local DB",
+     )
+     parse_pull.add_argument(
+         "--edatachain",
+         default=False,
+         action="store_true",
+         help="Create .edatachain file",
+     )
+     parse_pull.add_argument(
+         "--edatachain-file",
+         help="Use a different filename for the resulting .edatachain file",
+     )
+     parse_pull.add_argument(
+         "--local-name",
+         action="store",
+         default=None,
+         help="Name of the local dataset",
+     )
+     parse_pull.add_argument(
+         "--local-version",
+         action="store",
+         default=None,
+         help="Version of the local dataset",
+     )
+
+     parse_edit_dataset = datasets_subparser.add_parser(
+         "edit", parents=[parent_parser], description="Edit dataset metadata"
+     )
+     parse_edit_dataset.add_argument("name", type=str, help="Dataset name")
+     parse_edit_dataset.add_argument(
+         "--new-name",
+         action="store",
+         help="Dataset new name",
+     )
+     parse_edit_dataset.add_argument(
+         "--description",
+         action="store",
+         help="Dataset description",
+     )
+     parse_edit_dataset.add_argument(
+         "--labels",
+         nargs="+",
+         help="Dataset labels",
+     )
+     parse_edit_dataset.add_argument(
+         "--studio",
+         action="store_true",
+         default=False,
+         help="Edit dataset from Studio",
+     )
+     parse_edit_dataset.add_argument(
+         "-L",
+         "--local",
+         action="store_true",
+         default=False,
+         help="Edit local dataset only",
+     )
+     parse_edit_dataset.add_argument(
+         "-a",
+         "--all",
+         action="store_true",
+         default=True,
+         help="Edit both datasets from studio and local",
+     )
+     parse_edit_dataset.add_argument(
+         "--team",
+         action="store",
+         default=None,
+         help="The team to edit a dataset. By default, it will use team from config.",
+     )
+
+     datasets_parser = datasets_subparser.add_parser(
+         "ls", parents=[parent_parser], description="List datasets"
+     )
+     datasets_parser.add_argument(
+         "--studio",
+         action="store_true",
+         default=False,
+         help="List datasets from Studio",
+     )
+     datasets_parser.add_argument(
+         "-L",
+         "--local",
+         action="store_true",
+         default=False,
+         help="List local datasets only",
+     )
+     datasets_parser.add_argument(
+         "-a",
+         "--all",
+         action="store_true",
+         default=True,
+         help="List both local and Studio datasets",
+     )
+     datasets_parser.add_argument(
+         "--team",
+         action="store",
+         default=None,
+         help="The team to list datasets for. By default, it will use team from config.",
+     )
+
+     rm_dataset_parser = datasets_subparser.add_parser(
+         "rm", parents=[parent_parser], description="Removes dataset", aliases=["remove"]
+     )
+     rm_dataset_parser.add_argument("name", type=str, help="Dataset name")
+     rm_dataset_parser.add_argument(
+         "--version",
+         action="store",
+         default=None,
+         type=int,
+         help="Dataset version",
+     )
+     rm_dataset_parser.add_argument(
+         "--force",
+         default=False,
+         action=BooleanOptionalAction,
+ help="Force delete registered dataset with all of it's versions",
+     )
+     rm_dataset_parser.add_argument(
+         "--studio",
+         action="store_true",
+         default=False,
+         help="Remove dataset from Studio",
+     )
+     rm_dataset_parser.add_argument(
+         "-L",
+         "--local",
+         action="store_true",
+         default=False,
+         help="Remove local datasets only",
+     )
+     rm_dataset_parser.add_argument(
+         "-a",
+         "--all",
+         action="store_true",
+         default=True,
+         help="Remove both local and studio",
+     )
+     rm_dataset_parser.add_argument(
+         "--team",
+         action="store",
+         default=None,
+         help="The team to delete a dataset. By default, it will use team from config.",
+     )
+
+     dataset_stats_parser = datasets_subparser.add_parser(
+         "stats",
+         parents=[parent_parser],
+         description="Shows basic dataset stats",
+     )
+     dataset_stats_parser.add_argument("name", type=str, help="Dataset name")
+     dataset_stats_parser.add_argument(
+         "--version",
+         action="store",
+         default=None,
+         type=int,
+         help="Dataset version",
+     )
+     dataset_stats_parser.add_argument(
+         "-b",
+         "--bytes",
+         default=False,
+         action="store_true",
+         help="Display size in bytes instead of human-readable size",
+     )
+     dataset_stats_parser.add_argument(
+         "--si",
+         default=False,
+         action="store_true",
+         help="Display size using powers of 1000 not 1024",
+     )
+
+     parse_ls = subp.add_parser(
+         "ls", parents=[parent_parser], description="List storage contents"
+     )
+     add_sources_arg(parse_ls, nargs="*")
+     parse_ls.add_argument(
+         "-l",
+         "--long",
+         action="count",
+         default=0,
+         help="List files in the long format",
+     )
+     parse_ls.add_argument(
+         "--studio",
+         action="store_true",
+         default=False,
+         help="List the files in the Studio",
+     )
+     parse_ls.add_argument(
+         "-L",
+         "--local",
+         action="store_true",
+         default=False,
+         help="List local files only",
+     )
+     parse_ls.add_argument(
+         "-a",
+         "--all",
+         action="store_true",
+         default=True,
+         help="List all files including hidden files",
+     )
+     parse_ls.add_argument(
+         "--team",
+         action="store",
+         default=None,
+         help="The team to list datasets for. By default, it will use team from config.",
+     )
+
+     parse_du = subp.add_parser(
+         "du", parents=[parent_parser], description="Display space usage"
+     )
+     add_sources_arg(parse_du)
+     parse_du.add_argument(
+         "-b",
+         "--bytes",
+         default=False,
+         action="store_true",
+         help="Display sizes in bytes instead of human-readable sizes",
+     )
+     parse_du.add_argument(
+         "-d",
+         "--depth",
+         "--max-depth",
+         default=0,
+         type=int,
+         metavar="N",
+         help=(
+             "Display sizes for N directory depths below the given directory, "
+             "the default is 0 (summarize provided directory only)."
+         ),
+     )
+     parse_du.add_argument(
+         "--si",
+         default=False,
+         action="store_true",
+         help="Display sizes using powers of 1000 not 1024",
+     )
+
+     parse_find = subp.add_parser(
+         "find", parents=[parent_parser], description="Search in a directory hierarchy"
+     )
+     add_sources_arg(parse_find)
+     parse_find.add_argument(
+         "--name",
+         type=str,
+         action="append",
+         help="Filename pattern to match.",
+     )
+     parse_find.add_argument(
+         "--iname",
+         type=str,
+         action="append",
+         help="Like --name but case-insensitive.",
+     )
+     parse_find.add_argument(
+         "--path",
+         type=str,
+         action="append",
+         help="Path pattern to match.",
+     )
+     parse_find.add_argument(
+         "--ipath",
+         type=str,
+         action="append",
+         help="Like --path but case-insensitive.",
+     )
+     parse_find.add_argument(
+         "--size",
+         type=str,
+         help=(
+             "Filter by size (+ is greater or equal, - is less or equal). "
+             "Specified size is in bytes, or use a suffix like K, M, G for "
+             "kilobytes, megabytes, gigabytes, etc."
+         ),
+     )
+     parse_find.add_argument(
+         "--type",
+         type=str,
+         help='File type: "f" - regular, "d" - directory',
+     )
+     parse_find.add_argument(
+         "-c",
+         "--columns",
+         type=find_columns_type,
+         default=None,
+         help=(
+             "A comma-separated list of columns to print for each result. "
+             f"Options are: {','.join(FIND_COLUMNS)} (Default: path)"
+         ),
+     )
+
+     parse_index = subp.add_parser(
+         "index", parents=[parent_parser], description="Index storage location"
+     )
+     add_sources_arg(parse_index)
+
+     show_parser = subp.add_parser(
+         "show",
+         parents=[parent_parser],
+         description="Show dataset contents",
+     )
+     show_parser.add_argument("name", type=str, help="Dataset name")
+     show_parser.add_argument(
+         "--version",
+         action="store",
+         default=None,
+         type=int,
+         help="Dataset version",
+     )
+     show_parser.add_argument("--schema", action="store_true", help="Show schema")
+     add_show_args(show_parser)
+
+     query_parser = subp.add_parser(
+         "query",
+         parents=[parent_parser],
+         description="Create a new dataset with a query script",
+     )
+     query_parser.add_argument(
+         "script", metavar="<script.py>", type=str, help="Filepath for script"
+     )
+     query_parser.add_argument(
+         "--parallel",
+         nargs="?",
+         type=int,
+         const=-1,
+         default=None,
+         metavar="N",
+         help=(
+             "Use multiprocessing to run any query script UDFs with N worker processes. "
+             "N defaults to the CPU count."
+         ),
+     )
+     query_parser.add_argument(
+         "-p",
+         "--param",
+         metavar="param=value",
+         nargs=1,
+         action=KeyValueArgs,
+         help="Query parameters",
+     )
+
+     subp.add_parser(
+         "clear-cache", parents=[parent_parser], description="Clear the local file cache"
+     )
+     subp.add_parser(
+         "gc", parents=[parent_parser], description="Garbage collect temporary tables"
+     )
+
+     subp.add_parser("internal-run-udf", parents=[parent_parser])
+     subp.add_parser("internal-run-udf-worker", parents=[parent_parser])
+     add_completion_parser(subp, [parent_parser])
+     return parser
+
+
+ def add_completion_parser(subparsers, parents):
+     parser = subparsers.add_parser(
+         "completion",
+         parents=parents,
+         description="Output shell completion script",
+     )
+     parser.add_argument(
+         "-s",
+         "--shell",
+         help="Shell syntax for completions.",
+         default="bash",
+         choices=shtab.SUPPORTED_SHELLS,