datachain 0.8.10__py3-none-any.whl → 0.8.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic.
- datachain/cache.py +4 -4
- datachain/catalog/__init__.py +0 -2
- datachain/catalog/catalog.py +102 -138
- datachain/cli/__init__.py +7 -6
- datachain/cli/parser/__init__.py +27 -16
- datachain/cli/parser/studio.py +7 -6
- datachain/cli/parser/utils.py +18 -0
- datachain/client/fsspec.py +11 -8
- datachain/client/local.py +4 -4
- datachain/data_storage/schema.py +1 -1
- datachain/dataset.py +1 -1
- datachain/error.py +12 -0
- datachain/func/__init__.py +2 -1
- datachain/func/conditional.py +67 -23
- datachain/func/func.py +17 -5
- datachain/lib/dc.py +24 -4
- datachain/lib/file.py +16 -0
- datachain/lib/listing.py +30 -12
- datachain/lib/pytorch.py +1 -1
- datachain/lib/udf.py +1 -1
- datachain/listing.py +1 -13
- datachain/node.py +0 -15
- datachain/nodes_fetcher.py +2 -2
- datachain/remote/studio.py +1 -1
- datachain/studio.py +1 -1
- {datachain-0.8.10.dist-info → datachain-0.8.11.dist-info}/METADATA +3 -7
- {datachain-0.8.10.dist-info → datachain-0.8.11.dist-info}/RECORD +31 -31
- {datachain-0.8.10.dist-info → datachain-0.8.11.dist-info}/LICENSE +0 -0
- {datachain-0.8.10.dist-info → datachain-0.8.11.dist-info}/WHEEL +0 -0
- {datachain-0.8.10.dist-info → datachain-0.8.11.dist-info}/entry_points.txt +0 -0
- {datachain-0.8.10.dist-info → datachain-0.8.11.dist-info}/top_level.txt +0 -0
datachain/client/fsspec.py
CHANGED
@@ -3,6 +3,7 @@ import functools
 import logging
 import multiprocessing
 import os
+import posixpath
 import re
 import sys
 from abc import ABC, abstractmethod
@@ -25,7 +26,7 @@ from fsspec.asyn import get_loop, sync
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
 from tqdm.auto import tqdm

-from datachain.cache import
+from datachain.cache import Cache
 from datachain.client.fileslice import FileWrapper
 from datachain.error import ClientError as DataChainClientError
 from datachain.nodes_fetcher import NodesFetcher
@@ -74,9 +75,7 @@ class Client(ABC):
     PREFIX: ClassVar[str]
     protocol: ClassVar[str]

-    def __init__(
-        self, name: str, fs_kwargs: dict[str, Any], cache: DataChainCache
-    ) -> None:
+    def __init__(self, name: str, fs_kwargs: dict[str, Any], cache: Cache) -> None:
         self.name = name
         self.fs_kwargs = fs_kwargs
         self._fs: Optional[AbstractFileSystem] = None
@@ -122,7 +121,7 @@ class Client(ABC):
         return cls.get_uri(storage_name), rel_path

     @staticmethod
-    def get_client(source: str, cache:
+    def get_client(source: str, cache: Cache, **kwargs) -> "Client":
         cls = Client.get_implementation(source)
         storage_url, _ = cls.split_url(source)
         if os.name == "nt":
@@ -145,7 +144,7 @@ class Client(ABC):
     def from_name(
         cls,
         name: str,
-        cache:
+        cache: Cache,
         kwargs: dict[str, Any],
     ) -> "Client":
         return cls(name, kwargs, cache)
@@ -154,7 +153,7 @@ class Client(ABC):
     def from_source(
         cls,
         uri: "StorageURI",
-        cache:
+        cache: Cache,
         **kwargs,
     ) -> "Client":
         return cls(cls.FS_CLASS._strip_protocol(uri), kwargs, cache)
@@ -390,8 +389,12 @@ class Client(ABC):
             self.fs.open(self.get_full_path(file.path, file.version)), cb
         )  # type: ignore[return-value]

-    def upload(self,
+    def upload(self, data: bytes, path: str) -> "File":
         full_path = self.get_full_path(path)
+
+        parent = posixpath.dirname(full_path)
+        self.fs.makedirs(parent, exist_ok=True)
+
         self.fs.pipe_file(full_path, data)
         file_info = self.fs.info(full_path)
         return self.info_to_file(file_info, path)
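The main behavioral change here is that `upload` now ensures the parent directory exists before writing. A minimal sketch, assuming `client` is an already-constructed `Client` instance and the target path is hypothetical:

    # In 0.8.11 upload() first runs fs.makedirs(parent, exist_ok=True) on the parent
    # of the full path, then writes the bytes with fs.pipe_file().
    file = client.upload(b"hello world", "nested/dir/example.txt")
    # the returned File object describes the object that was just written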
datachain/client/local.py
CHANGED
@@ -12,7 +12,7 @@ from datachain.lib.file import File
 from .fsspec import Client

 if TYPE_CHECKING:
-    from datachain.cache import
+    from datachain.cache import Cache
     from datachain.dataset import StorageURI


@@ -25,7 +25,7 @@ class FileClient(Client):
         self,
         name: str,
         fs_kwargs: dict[str, Any],
-        cache: "
+        cache: "Cache",
         use_symlinks: bool = False,
     ) -> None:
         super().__init__(name, fs_kwargs, cache)
@@ -82,7 +82,7 @@ class FileClient(Client):
         return bucket, path

     @classmethod
-    def from_name(cls, name: str, cache: "
+    def from_name(cls, name: str, cache: "Cache", kwargs) -> "FileClient":
         use_symlinks = kwargs.pop("use_symlinks", False)
         return cls(name, kwargs, cache, use_symlinks=use_symlinks)

@@ -90,7 +90,7 @@ class FileClient(Client):
     def from_source(
         cls,
         uri: str,
-        cache: "
+        cache: "Cache",
         use_symlinks: bool = False,
         **kwargs,
     ) -> "FileClient":
datachain/data_storage/schema.py
CHANGED
@@ -200,7 +200,7 @@ class DataTable:
         columns: Sequence["sa.Column"] = (),
         metadata: Optional["sa.MetaData"] = None,
     ):
-        # copy columns, since
+        # copy columns, since reusing the same objects from another table
         # may raise an error
         columns = cls.sys_columns() + [cls.copy_column(c) for c in columns]
         columns = dedup_columns(columns)
datachain/dataset.py
CHANGED
@@ -91,7 +91,7 @@ class DatasetDependency:
         if self.type == DatasetDependencyType.DATASET:
             return self.name

-        list_dataset_name, _, _ = parse_listing_uri(self.name.strip("/"),
+        list_dataset_name, _, _ = parse_listing_uri(self.name.strip("/"), {})
         assert list_dataset_name
         return list_dataset_name

datachain/error.py
CHANGED
@@ -1,3 +1,15 @@
+import botocore.errorfactory
+import botocore.exceptions
+import gcsfs.retry
+
+REMOTE_ERRORS = (
+    gcsfs.retry.HttpError,  # GCS
+    OSError,  # GCS
+    botocore.exceptions.BotoCoreError,  # S3
+    ValueError,  # Azure
+)
+
+
 class DataChainError(RuntimeError):
     pass

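`REMOTE_ERRORS` groups the provider-specific exception classes (GCS, S3, Azure) into one tuple so callers can catch them uniformly; the listing code further down in this diff uses it exactly this way. A small sketch of the pattern, with the `fs.info` call standing in for any remote filesystem operation:

    from datachain.error import REMOTE_ERRORS, ClientError

    try:
        info = client.fs.info(path)  # any fsspec call against remote storage
    except REMOTE_ERRORS as e:
        # normalize provider-specific failures into datachain's ClientError
        raise ClientError(message=str(e), error_code=getattr(e, "code", None)) from e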
datachain/func/__init__.py
CHANGED
@@ -16,7 +16,7 @@ from .aggregate import (
     sum,
 )
 from .array import cosine_distance, euclidean_distance, length, sip_hash_64
-from .conditional import case, greatest, ifelse, least
+from .conditional import case, greatest, ifelse, isnone, least
 from .numeric import bit_and, bit_hamming_distance, bit_or, bit_xor, int_hash_64
 from .random import rand
 from .string import byte_hamming_distance
@@ -42,6 +42,7 @@ __all__ = [
     "greatest",
     "ifelse",
     "int_hash_64",
+    "isnone",
     "least",
     "length",
     "literal",
|
datachain/func/conditional.py
CHANGED
|
@@ -1,14 +1,15 @@
|
|
|
1
|
-
from typing import Union
|
|
1
|
+
from typing import Optional, Union
|
|
2
2
|
|
|
3
|
+
from sqlalchemy import ColumnElement
|
|
3
4
|
from sqlalchemy import case as sql_case
|
|
4
|
-
from sqlalchemy.sql.elements import BinaryExpression
|
|
5
5
|
|
|
6
6
|
from datachain.lib.utils import DataChainParamsError
|
|
7
|
+
from datachain.query.schema import Column
|
|
7
8
|
from datachain.sql.functions import conditional
|
|
8
9
|
|
|
9
10
|
from .func import ColT, Func
|
|
10
11
|
|
|
11
|
-
CaseT = Union[int, float, complex, bool, str]
|
|
12
|
+
CaseT = Union[int, float, complex, bool, str, Func]
|
|
12
13
|
|
|
13
14
|
|
|
14
15
|
def greatest(*args: Union[ColT, float]) -> Func:
|
|
@@ -87,17 +88,21 @@ def least(*args: Union[ColT, float]) -> Func:
|
|
|
87
88
|
)
|
|
88
89
|
|
|
89
90
|
|
|
90
|
-
def case(
|
|
91
|
+
def case(
|
|
92
|
+
*args: tuple[Union[ColumnElement, Func], CaseT], else_: Optional[CaseT] = None
|
|
93
|
+
) -> Func:
|
|
91
94
|
"""
|
|
92
95
|
Returns the case function that produces case expression which has a list of
|
|
93
|
-
conditions and corresponding results. Results can
|
|
94
|
-
|
|
96
|
+
conditions and corresponding results. Results can be python primitives like string,
|
|
97
|
+
numbers or booleans but can also be other nested function (including case function).
|
|
98
|
+
Result type is inferred from condition results.
|
|
95
99
|
|
|
96
100
|
Args:
|
|
97
|
-
args (tuple(
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
+
args (tuple((ColumnElement, Func), (str | int | float | complex | bool, Func))):
|
|
102
|
+
Tuple of condition and values pair.
|
|
103
|
+
else_ (str | int | float | complex | bool, Func): optional else value in case
|
|
104
|
+
expression. If omitted, and no case conditions are satisfied, the result
|
|
105
|
+
will be None (NULL in DB).
|
|
101
106
|
|
|
102
107
|
Returns:
|
|
103
108
|
Func: A Func object that represents the case function.
|
|
@@ -111,15 +116,24 @@ def case(*args: tuple[BinaryExpression, CaseT], else_=None) -> Func:
|
|
|
111
116
|
"""
|
|
112
117
|
supported_types = [int, float, complex, str, bool]
|
|
113
118
|
|
|
114
|
-
|
|
119
|
+
def _get_type(val):
|
|
120
|
+
if isinstance(val, Func):
|
|
121
|
+
# nested functions
|
|
122
|
+
return val.result_type
|
|
123
|
+
return type(val)
|
|
115
124
|
|
|
116
125
|
if not args:
|
|
117
126
|
raise DataChainParamsError("Missing statements")
|
|
118
127
|
|
|
128
|
+
type_ = _get_type(else_) if else_ is not None else None
|
|
129
|
+
|
|
119
130
|
for arg in args:
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
131
|
+
arg_type = _get_type(arg[1])
|
|
132
|
+
if type_ and arg_type != type_:
|
|
133
|
+
raise DataChainParamsError(
|
|
134
|
+
f"Statement values must be of the same type, got {type_} and {arg_type}"
|
|
135
|
+
)
|
|
136
|
+
type_ = arg_type
|
|
123
137
|
|
|
124
138
|
if type_ not in supported_types:
|
|
125
139
|
raise DataChainParamsError(
|
|
@@ -127,20 +141,25 @@ def case(*args: tuple[BinaryExpression, CaseT], else_=None) -> Func:
|
|
|
127
141
|
)
|
|
128
142
|
|
|
129
143
|
kwargs = {"else_": else_}
|
|
130
|
-
|
|
144
|
+
|
|
145
|
+
return Func("case", inner=sql_case, cols=args, kwargs=kwargs, result_type=type_)
|
|
131
146
|
|
|
132
147
|
|
|
133
|
-
def ifelse(
|
|
148
|
+
def ifelse(
|
|
149
|
+
condition: Union[ColumnElement, Func], if_val: CaseT, else_val: CaseT
|
|
150
|
+
) -> Func:
|
|
134
151
|
"""
|
|
135
152
|
Returns the ifelse function that produces if expression which has a condition
|
|
136
|
-
and values for true and false outcome. Results can
|
|
137
|
-
like string,
|
|
153
|
+
and values for true and false outcome. Results can be one of python primitives
|
|
154
|
+
like string, numbers or booleans, but can also be nested functions.
|
|
155
|
+
Result type is inferred from the values.
|
|
138
156
|
|
|
139
157
|
Args:
|
|
140
|
-
condition
|
|
141
|
-
if_val
|
|
142
|
-
|
|
143
|
-
|
|
158
|
+
condition (ColumnElement, Func): Condition which is evaluated.
|
|
159
|
+
if_val (str | int | float | complex | bool, Func): Value for true
|
|
160
|
+
condition outcome.
|
|
161
|
+
else_val (str | int | float | complex | bool, Func): Value for false condition
|
|
162
|
+
outcome.
|
|
144
163
|
|
|
145
164
|
Returns:
|
|
146
165
|
Func: A Func object that represents the ifelse function.
|
|
@@ -148,8 +167,33 @@ def ifelse(condition: BinaryExpression, if_val: CaseT, else_val: CaseT) -> Func:
|
|
|
148
167
|
Example:
|
|
149
168
|
```py
|
|
150
169
|
dc.mutate(
|
|
151
|
-
res=func.ifelse(
|
|
170
|
+
res=func.ifelse(isnone("col"), "EMPTY", "NOT_EMPTY")
|
|
152
171
|
)
|
|
153
172
|
```
|
|
154
173
|
"""
|
|
155
174
|
return case((condition, if_val), else_=else_val)
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def isnone(col: Union[str, Column]) -> Func:
|
|
178
|
+
"""
|
|
179
|
+
Returns True if column value is None, otherwise False.
|
|
180
|
+
|
|
181
|
+
Args:
|
|
182
|
+
col (str | Column): Column to check if it's None or not.
|
|
183
|
+
If a string is provided, it is assumed to be the name of the column.
|
|
184
|
+
|
|
185
|
+
Returns:
|
|
186
|
+
Func: A Func object that represents the conditional to check if column is None.
|
|
187
|
+
|
|
188
|
+
Example:
|
|
189
|
+
```py
|
|
190
|
+
dc.mutate(test=ifelse(isnone("col"), "EMPTY", "NOT_EMPTY"))
|
|
191
|
+
```
|
|
192
|
+
"""
|
|
193
|
+
from datachain import C
|
|
194
|
+
|
|
195
|
+
if isinstance(col, str):
|
|
196
|
+
# if string, it is assumed to be the name of the column
|
|
197
|
+
col = C(col)
|
|
198
|
+
|
|
199
|
+
return case((col.is_(None) if col is not None else True, True), else_=False)
|
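The new `isnone` helper and the widened `case`/`ifelse` signatures are meant to be used through `mutate`. A short sketch based on the docstrings above, assuming `dc` is an existing DataChain and `col`/`size` are hypothetical columns in it:

    from datachain import C, func

    dc = dc.mutate(
        # boolean signal: True where "col" is NULL
        empty=func.isnone("col"),
        # string label derived from the same check (Func used as a condition)
        label=func.ifelse(func.isnone("col"), "EMPTY", "NOT_EMPTY"),
        # plain case expression with a literal fallback
        size_class=func.case((C("size") > 1000, "big"), else_="small"),
    )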
datachain/func/func.py
CHANGED
@@ -23,7 +23,7 @@ if TYPE_CHECKING:
     from .window import Window


-ColT = Union[str, ColumnElement, "Func"]
+ColT = Union[str, ColumnElement, "Func", tuple]


 class Func(Function):
@@ -78,7 +78,7 @@ class Func(Function):
         return (
             [
                 col
-                if isinstance(col, (Func, BindParameter, Case, Comparator))
+                if isinstance(col, (Func, BindParameter, Case, Comparator, tuple))
                 else ColumnMeta.to_db_name(
                     col.name if isinstance(col, ColumnElement) else col
                 )
@@ -381,17 +381,24 @@
         col_type = self.get_result_type(signals_schema)
         sql_type = python_to_sql(col_type)

-        def get_col(col: ColT) -> ColT:
+        def get_col(col: ColT, string_as_literal=False) -> ColT:
+            # string_as_literal is used only for conditionals like `case()` where
+            # literals are nested inside ColT as we have tuples of condition - values
+            # and if user wants to set some case value as column, explicit `C("col")`
+            # syntax must be used to distinguish from literals
+            if isinstance(col, tuple):
+                return tuple(get_col(x, string_as_literal=True) for x in col)
             if isinstance(col, Func):
                 return col.get_column(signals_schema, table=table)
-            if isinstance(col, str):
+            if isinstance(col, str) and not string_as_literal:
                 column = Column(col, sql_type)
                 column.table = table
                 return column
             return col

         cols = [get_col(col) for col in self._db_cols]
-
+        kwargs = {k: get_col(v, string_as_literal=True) for k, v in self.kwargs.items()}
+        func_col = self.inner(*cols, *self.args, **kwargs)

         if self.is_window:
             if not self.window:
@@ -416,6 +423,11 @@


 def get_db_col_type(signals_schema: "SignalSchema", col: ColT) -> "DataType":
+    if isinstance(col, tuple):
+        raise DataChainParamsError(
+            "Cannot get type from tuple, please provide type hint to the function"
+        )
+
     if isinstance(col, Func):
         return col.get_result_type(signals_schema)

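The `string_as_literal` flag changes how strings are interpreted inside `case()`/`ifelse()` tuples: on the value side they are treated as literal constants, and, per the inline comment, a column used as a value has to be spelled explicitly with the `C("col")` syntax. A brief sketch of the literal behavior, assuming `dc` has a hypothetical numeric `size` column:

    from datachain import C, func

    # "yes" and "no" are stored verbatim as string literals, not looked up as
    # column names, because string values inside conditional tuples are literals.
    dc = dc.mutate(is_large=func.ifelse(C("size") > 1000, "yes", "no"))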
datachain/lib/dc.py
CHANGED
@@ -25,6 +25,7 @@ from sqlalchemy.sql.functions import GenericFunction
 from sqlalchemy.sql.sqltypes import NullType

 from datachain.dataset import DatasetRecord
+from datachain.func import literal
 from datachain.func.base import Function
 from datachain.func.func import Func
 from datachain.lib.convert.python_to_sql import python_to_sql
@@ -1129,8 +1130,12 @@ class DataChain:
             )
             ```
         """
+        primitives = (bool, str, int, float)
+
         for col_name, expr in kwargs.items():
-            if not isinstance(expr, (Column, Func)) and isinstance(
+            if not isinstance(expr, (*primitives, Column, Func)) and isinstance(
+                expr.type, NullType
+            ):
                 raise DataChainColumnError(
                     col_name, f"Cannot infer type with expression {expr}"
                 )
@@ -1145,6 +1150,11 @@ class DataChain:
             elif isinstance(value, Func):
                 # adding new signal
                 mutated[name] = value.get_column(schema)
+            elif isinstance(value, primitives):
+                # adding simple python constant primitives like str, int, float, bool
+                val = literal(value)
+                val.type = python_to_sql(type(value))()
+                mutated[name] = val  # type: ignore[assignment]
             else:
                 # adding new signal
                 mutated[name] = value
@@ -1942,7 +1952,7 @@ class DataChain:
     def from_csv(
         cls,
         path,
-        delimiter: str =
+        delimiter: Optional[str] = None,
         header: bool = True,
         output: OutputType = None,
         object_name: str = "",
@@ -1952,6 +1962,7 @@ class DataChain:
         session: Optional[Session] = None,
         settings: Optional[dict] = None,
         column_types: Optional[dict[str, "Union[str, ArrowDataType]"]] = None,
+        parse_options: Optional[dict[str, "Union[str, Union[bool, Callable]]"]] = None,
         **kwargs,
     ) -> "DataChain":
         """Generate chain from csv files.
@@ -1959,7 +1970,8 @@ class DataChain:
         Parameters:
             path : Storage URI with directory. URI must start with storage prefix such
                 as `s3://`, `gs://`, `az://` or "file:///".
-            delimiter : Character for delimiting columns.
+            delimiter : Character for delimiting columns. Takes precedence if also
+                specified in `parse_options`. Defaults to ",".
             header : Whether the files include a header row.
             output : Dictionary or feature class defining column names and their
                 corresponding types. List of column names is also accepted, in which
@@ -1973,6 +1985,8 @@ class DataChain:
             column_types : Dictionary of column names and their corresponding types.
                 It is passed to CSV reader and for each column specified type auto
                 inference is disabled.
+            parse_options: Tells the parser how to process lines.
+                See https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html

         Example:
             Reading a csv file:
@@ -1990,6 +2004,12 @@
         from pyarrow.dataset import CsvFileFormat
         from pyarrow.lib import type_for_alias

+        parse_options = parse_options or {}
+        if "delimiter" not in parse_options:
+            parse_options["delimiter"] = ","
+        if delimiter:
+            parse_options["delimiter"] = delimiter
+
         if column_types:
             column_types = {
                 name: type_for_alias(typ) if isinstance(typ, str) else typ
@@ -2017,7 +2037,7 @@
             msg = f"error parsing csv - incompatible output type {type(output)}"
             raise DatasetPrepareError(chain.name, msg)

-        parse_options = ParseOptions(
+        parse_options = ParseOptions(**parse_options)
         read_options = ReadOptions(column_names=column_names)
         convert_options = ConvertOptions(
             strings_can_be_null=True,
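Two user-facing additions in this file: `mutate` now accepts plain python constants, and `from_csv` gains a `parse_options` dict that is forwarded to `pyarrow.csv.ParseOptions`, with an explicit `delimiter` argument taking precedence. A sketch with hypothetical paths and column names:

    from datachain.lib.dc import DataChain

    # constants (bool, str, int, float) become typed literal columns
    chain = chain.mutate(origin="manual", revision=2, validated=True)

    # parse_options keys follow pyarrow.csv.ParseOptions; delimiter wins if both given
    reports = DataChain.from_csv(
        "s3://bucket/reports/",
        delimiter=";",
        parse_options={"ignore_empty_lines": True},
    )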
datachain/lib/file.py
CHANGED
@@ -190,6 +190,22 @@ class File(DataModel):
         self._catalog = None
         self._caching_enabled: bool = False

+    @classmethod
+    def upload(
+        cls, data: bytes, path: str, catalog: Optional["Catalog"] = None
+    ) -> "File":
+        if catalog is None:
+            from datachain.catalog.loader import get_catalog
+
+            catalog = get_catalog()
+
+        parent, name = posixpath.split(path)
+
+        client = catalog.get_client(parent)
+        file = client.upload(data, name)
+        file._set_stream(catalog)
+        return file
+
     @classmethod
     def _from_row(cls, row: "RowDict") -> "Self":
         return cls(**{key: row[key] for key in cls._datachain_column_types})
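`File.upload` is a new public entry point that routes bytes through the catalog's client for whatever storage the path points to. A minimal sketch with a hypothetical destination:

    from datachain.lib.file import File

    # splits the path into parent/name, uploads via the matching client,
    # and returns a File already attached to the catalog
    file = File.upload(b"column_a,column_b\n1,2\n", "s3://bucket/uploads/data.csv")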
datachain/lib/listing.py
CHANGED
@@ -1,3 +1,5 @@
+import logging
+import os
 import posixpath
 from collections.abc import Iterator
 from typing import TYPE_CHECKING, Callable, Optional, TypeVar
@@ -7,6 +9,7 @@ from sqlalchemy.sql.expression import true

 from datachain.asyn import iter_over_async
 from datachain.client import Client
+from datachain.error import REMOTE_ERRORS, ClientError
 from datachain.lib.file import File
 from datachain.query.schema import Column
 from datachain.sql.functions import path as pathfunc
@@ -22,6 +25,10 @@ LISTING_PREFIX = "lst__"  # listing datasets start with this name

 D = TypeVar("D", bound="DataChain")

+# Disable warnings for remote errors in clients
+logging.getLogger("aiobotocore.credentials").setLevel(logging.CRITICAL)
+logging.getLogger("gcsfs").setLevel(logging.CRITICAL)
+

 def list_bucket(uri: str, cache, client_config=None) -> Callable:
     """
@@ -90,6 +97,15 @@ def _isfile(client: "Client", path: str) -> bool:
     Returns True if uri points to a file
     """
     try:
+        if "://" in path:
+            # This makes sure that the uppercase scheme is converted to lowercase
+            scheme, path = path.split("://", 1)
+            path = f"{scheme.lower()}://{path}"
+
+        if os.name == "nt" and "*" in path:
+            # On Windows, the glob pattern "*" is not supported
+            return False
+
         info = client.fs.info(path)
         name = info.get("name")
         # case for special simulated directories on some clouds
@@ -99,21 +115,21 @@ def _isfile(client: "Client", path: str) -> bool:
             return False

         return info["type"] == "file"
-    except
+    except FileNotFoundError:
         return False
+    except REMOTE_ERRORS as e:
+        raise ClientError(
+            message=str(e),
+            error_code=getattr(e, "code", None),
+        ) from e


-def parse_listing_uri(uri: str,
+def parse_listing_uri(uri: str, client_config) -> tuple[str, str, str]:
     """
     Parsing uri and returns listing dataset name, listing uri and listing path
     """
     client_config = client_config or {}
-    client = Client.get_client(uri, cache, **client_config)
     storage_uri, path = Client.parse_url(uri)
-    telemetry.log_param("client", client.PREFIX)
-
-    if not uri.endswith("/") and _isfile(client, uri):
-        return None, f"{storage_uri}/{path.lstrip('/')}", path
     if uses_glob(path):
         lst_uri_path = posixpath.dirname(path)
     else:
@@ -157,13 +173,15 @@ def get_listing(
     client_config = catalog.client_config

     client = Client.get_client(uri, cache, **client_config)
-
-    listing = None
+    telemetry.log_param("client", client.PREFIX)

-    #
-    if not
-
+    # we don't want to use cached dataset (e.g. for a single file listing)
+    if not uri.endswith("/") and _isfile(client, uri):
+        storage_uri, path = Client.parse_url(uri)
+        return None, f"{storage_uri}/{path.lstrip('/')}", path, False

+    ds_name, list_uri, list_path = parse_listing_uri(uri, client_config)
+    listing = None
     listings = [
         ls for ls in catalog.listings() if not ls.is_expired and ls.contains(ds_name)
     ]
datachain/lib/pytorch.py
CHANGED
@@ -23,7 +23,7 @@ from datachain.query.dataset import get_download_callback
 if TYPE_CHECKING:
     from torchvision.transforms.v2 import Transform

-    from datachain.cache import
+    from datachain.cache import Cache


 logger = logging.getLogger("datachain")
datachain/lib/udf.py
CHANGED
@@ -32,7 +32,7 @@ if TYPE_CHECKING:

     from typing_extensions import Self

-    from datachain.cache import
+    from datachain.cache import Cache
     from datachain.catalog import Catalog
     from datachain.lib.signal_schema import SignalSchema
     from datachain.lib.udf_signature import UdfSignature
datachain/listing.py
CHANGED
@@ -2,7 +2,6 @@ import glob
 import os
 from collections.abc import Iterable, Iterator
 from functools import cached_property
-from itertools import zip_longest
 from typing import TYPE_CHECKING, Optional

 from sqlalchemy import Column
@@ -101,11 +100,8 @@ class Listing:
         copy_to_filename: Optional[str],
         recursive=False,
         copy_dir_contents=False,
-        relative_path=None,
-        from_edatachain=False,
         from_dataset=False,
     ) -> list[NodeWithPath]:
-        rel_path_elements = relative_path.split("/") if relative_path else []
         all_nodes: list[NodeWithPath] = []
         for src in sources:
             node = src.node
@@ -119,15 +115,7 @@ class Listing:
                 )
             else:
                 node_path = []
-                if
-                    for rpe, npe in zip_longest(
-                        rel_path_elements, node.path.split("/")
-                    ):
-                        if rpe == npe:
-                            continue
-                        if npe:
-                            node_path.append(npe)
-                elif copy_to_filename:
+                if copy_to_filename:
                     node_path = [os.path.basename(copy_to_filename)]
                 elif from_dataset:
                     node_path = [
datachain/node.py
CHANGED
@@ -84,18 +84,6 @@ class Node:
         fd.write(f" size: {self.size}\n")
         return size

-    def get_metafile_data(self, path: str):
-        data: dict[str, Any] = {
-            "name": path,
-            "etag": self.etag,
-        }
-        version = self.version
-        if version:
-            data["version"] = version
-        data["last_modified"] = time_to_str(self.last_modified)
-        data["size"] = self.size
-        return data
-
     @property
     def full_path(self) -> str:
         if self.is_dir and self.path:
@@ -181,9 +169,6 @@ class NodeWithPath:
     def append_to_file(self, fd):
         return self.n.append_to_file(fd, "/".join(self.path))

-    def get_metafile_data(self):
-        return self.n.get_metafile_data("/".join(self.path))
-
     @property
     def full_path(self) -> str:
         path = "/".join(self.path)
datachain/nodes_fetcher.py
CHANGED
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
 from datachain.nodes_thread_pool import NodesThreadPool

 if TYPE_CHECKING:
-    from datachain.cache import
+    from datachain.cache import Cache
     from datachain.client.fsspec import Client
     from datachain.node import Node

@@ -13,7 +13,7 @@ logger = logging.getLogger("datachain")


 class NodesFetcher(NodesThreadPool):
-    def __init__(self, client: "Client", max_threads: int, cache: "
+    def __init__(self, client: "Client", max_threads: int, cache: "Cache"):
         super().__init__(max_threads)
         self.client = client
         self.cache = cache