datachain 0.8.9__py3-none-any.whl → 0.8.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain has been flagged as possibly problematic by the registry diff viewer.
- datachain/cache.py +4 -4
- datachain/catalog/__init__.py +0 -2
- datachain/catalog/catalog.py +102 -138
- datachain/cli/__init__.py +9 -9
- datachain/cli/parser/__init__.py +36 -20
- datachain/cli/parser/job.py +1 -1
- datachain/cli/parser/studio.py +35 -34
- datachain/cli/parser/utils.py +19 -1
- datachain/cli/utils.py +1 -1
- datachain/client/fsspec.py +11 -8
- datachain/client/local.py +4 -4
- datachain/data_storage/schema.py +1 -1
- datachain/data_storage/sqlite.py +38 -7
- datachain/data_storage/warehouse.py +2 -2
- datachain/dataset.py +1 -1
- datachain/error.py +12 -0
- datachain/func/__init__.py +2 -1
- datachain/func/conditional.py +67 -23
- datachain/func/func.py +17 -5
- datachain/lib/convert/python_to_sql.py +15 -3
- datachain/lib/dc.py +27 -5
- datachain/lib/file.py +16 -0
- datachain/lib/listing.py +30 -12
- datachain/lib/pytorch.py +1 -1
- datachain/lib/udf.py +1 -1
- datachain/listing.py +1 -13
- datachain/node.py +0 -15
- datachain/nodes_fetcher.py +2 -2
- datachain/query/dataset.py +8 -4
- datachain/remote/studio.py +3 -3
- datachain/sql/sqlite/base.py +35 -14
- datachain/studio.py +8 -8
- {datachain-0.8.9.dist-info → datachain-0.8.11.dist-info}/METADATA +3 -7
- {datachain-0.8.9.dist-info → datachain-0.8.11.dist-info}/RECORD +38 -38
- {datachain-0.8.9.dist-info → datachain-0.8.11.dist-info}/LICENSE +0 -0
- {datachain-0.8.9.dist-info → datachain-0.8.11.dist-info}/WHEEL +0 -0
- {datachain-0.8.9.dist-info → datachain-0.8.11.dist-info}/entry_points.txt +0 -0
- {datachain-0.8.9.dist-info → datachain-0.8.11.dist-info}/top_level.txt +0 -0
datachain/lib/dc.py
CHANGED
````diff
@@ -25,6 +25,7 @@ from sqlalchemy.sql.functions import GenericFunction
 from sqlalchemy.sql.sqltypes import NullType
 
 from datachain.dataset import DatasetRecord
+from datachain.func import literal
 from datachain.func.base import Function
 from datachain.func.func import Func
 from datachain.lib.convert.python_to_sql import python_to_sql
@@ -1129,8 +1130,12 @@ class DataChain:
             )
             ```
         """
+        primitives = (bool, str, int, float)
+
         for col_name, expr in kwargs.items():
-            if not isinstance(expr, (Column, Func)) and isinstance(
+            if not isinstance(expr, (*primitives, Column, Func)) and isinstance(
+                expr.type, NullType
+            ):
                 raise DataChainColumnError(
                     col_name, f"Cannot infer type with expression {expr}"
                 )
@@ -1145,6 +1150,11 @@ class DataChain:
             elif isinstance(value, Func):
                 # adding new signal
                 mutated[name] = value.get_column(schema)
+            elif isinstance(value, primitives):
+                # adding simple python constant primitives like str, int, float, bool
+                val = literal(value)
+                val.type = python_to_sql(type(value))()
+                mutated[name] = val  # type: ignore[assignment]
             else:
                 # adding new signal
                 mutated[name] = value
@@ -1332,6 +1342,7 @@ class DataChain:
         on: Union[MergeColType, Sequence[MergeColType]],
         right_on: Optional[Union[MergeColType, Sequence[MergeColType]]] = None,
         inner=False,
+        full=False,
         rname="right_",
     ) -> "Self":
         """Merge two chains based on the specified criteria.
@@ -1345,6 +1356,7 @@ class DataChain:
             right_on: Optional predicate or list of Predicates for the `right_ds`
                 to join.
             inner (bool): Whether to run inner join or outer join.
+            full (bool): Whether to run full outer join.
             rname (str): Name prefix for conflicting signal names.
 
         Examples:
@@ -1419,7 +1431,7 @@ class DataChain:
         )
 
         query = self._query.join(
-            right_ds._query, sqlalchemy.and_(*ops), inner, rname + "{name}"
+            right_ds._query, sqlalchemy.and_(*ops), inner, full, rname + "{name}"
         )
         query.feature_schema = None
         ds = self._evolve(query=query)
@@ -1940,7 +1952,7 @@ class DataChain:
     def from_csv(
         cls,
         path,
-        delimiter: str = ",",
+        delimiter: Optional[str] = None,
         header: bool = True,
         output: OutputType = None,
         object_name: str = "",
@@ -1950,6 +1962,7 @@ class DataChain:
         session: Optional[Session] = None,
         settings: Optional[dict] = None,
         column_types: Optional[dict[str, "Union[str, ArrowDataType]"]] = None,
+        parse_options: Optional[dict[str, "Union[str, Union[bool, Callable]]"]] = None,
         **kwargs,
     ) -> "DataChain":
         """Generate chain from csv files.
@@ -1957,7 +1970,8 @@ class DataChain:
         Parameters:
             path : Storage URI with directory. URI must start with storage prefix such
                 as `s3://`, `gs://`, `az://` or "file:///".
-            delimiter : Character for delimiting columns.
+            delimiter : Character for delimiting columns. Takes precedence if also
+                specified in `parse_options`. Defaults to ",".
             header : Whether the files include a header row.
             output : Dictionary or feature class defining column names and their
                 corresponding types. List of column names is also accepted, in which
@@ -1971,6 +1985,8 @@ class DataChain:
             column_types : Dictionary of column names and their corresponding types.
                 It is passed to CSV reader and for each column specified type auto
                 inference is disabled.
+            parse_options: Tells the parser how to process lines.
+                See https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html
 
         Example:
             Reading a csv file:
@@ -1988,6 +2004,12 @@ class DataChain:
         from pyarrow.dataset import CsvFileFormat
         from pyarrow.lib import type_for_alias
 
+        parse_options = parse_options or {}
+        if "delimiter" not in parse_options:
+            parse_options["delimiter"] = ","
+        if delimiter:
+            parse_options["delimiter"] = delimiter
+
         if column_types:
             column_types = {
                 name: type_for_alias(typ) if isinstance(typ, str) else typ
@@ -2015,7 +2037,7 @@ class DataChain:
             msg = f"error parsing csv - incompatible output type {type(output)}"
             raise DatasetPrepareError(chain.name, msg)
 
-        parse_options = ParseOptions(delimiter=delimiter)
+        parse_options = ParseOptions(**parse_options)
         read_options = ReadOptions(column_names=column_names)
         convert_options = ConvertOptions(
             strings_can_be_null=True,
````
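Taken together, the `dc.py` changes surface three user-facing additions: `mutate()` now accepts plain Python constants (`bool`, `str`, `int`, `float`) and wraps them in typed SQL literals, `merge()` gains a `full=` flag that is passed down to the underlying join, and `from_csv()` accepts a `parse_options` dict forwarded to PyArrow's `ParseOptions` (an explicit `delimiter=` still wins). A minimal usage sketch; the column names and the bucket path are made up for illustration:

```python
from datachain import DataChain

# mutate() with constant primitives: each value is turned into a SQL literal
# of the matching type instead of failing with "Cannot infer type".
animals = DataChain.from_values(name=["cat", "dog"], weight=[4.2, 7.9])
labeled = animals.mutate(source="manual", version=2, verified=True)

# merge() with full=True keeps unmatched rows from both sides (full outer join).
left = DataChain.from_values(id=[1, 2], a=["x", "y"])
right = DataChain.from_values(id=[2, 3], b=["u", "v"])
joined = left.merge(right, on="id", full=True)

# from_csv() with parse_options forwarded to pyarrow.csv.ParseOptions.
rows = DataChain.from_csv(
    "s3://my-bucket/data/",  # hypothetical location
    parse_options={"delimiter": ";", "ignore_empty_lines": True},
)
```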
datachain/lib/file.py
CHANGED
```diff
@@ -190,6 +190,22 @@ class File(DataModel):
         self._catalog = None
         self._caching_enabled: bool = False
 
+    @classmethod
+    def upload(
+        cls, data: bytes, path: str, catalog: Optional["Catalog"] = None
+    ) -> "File":
+        if catalog is None:
+            from datachain.catalog.loader import get_catalog
+
+            catalog = get_catalog()
+
+        parent, name = posixpath.split(path)
+
+        client = catalog.get_client(parent)
+        file = client.upload(data, name)
+        file._set_stream(catalog)
+        return file
+
     @classmethod
     def _from_row(cls, row: "RowDict") -> "Self":
         return cls(**{key: row[key] for key in cls._datachain_column_types})
```
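The new `File.upload` classmethod writes raw bytes through the client resolved from the destination path's parent and returns a `File` already bound to a catalog (the default catalog if none is passed). A hedged sketch; the destination URI is illustrative:

```python
from datachain.lib.file import File

payload = b'{"status": "ok"}'

# Upload bytes to a storage location and get back a File object that can be
# used like any other File signal (opened, streamed, cached).
report = File.upload(payload, "s3://my-bucket/reports/status.json")  # hypothetical URI
print(report.path)
```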
datachain/lib/listing.py
CHANGED
```diff
@@ -1,3 +1,5 @@
+import logging
+import os
 import posixpath
 from collections.abc import Iterator
 from typing import TYPE_CHECKING, Callable, Optional, TypeVar
@@ -7,6 +9,7 @@ from sqlalchemy.sql.expression import true
 
 from datachain.asyn import iter_over_async
 from datachain.client import Client
+from datachain.error import REMOTE_ERRORS, ClientError
 from datachain.lib.file import File
 from datachain.query.schema import Column
 from datachain.sql.functions import path as pathfunc
@@ -22,6 +25,10 @@ LISTING_PREFIX = "lst__"  # listing datasets start with this name
 
 D = TypeVar("D", bound="DataChain")
 
+# Disable warnings for remote errors in clients
+logging.getLogger("aiobotocore.credentials").setLevel(logging.CRITICAL)
+logging.getLogger("gcsfs").setLevel(logging.CRITICAL)
+
 
 def list_bucket(uri: str, cache, client_config=None) -> Callable:
     """
@@ -90,6 +97,15 @@ def _isfile(client: "Client", path: str) -> bool:
     Returns True if uri points to a file
     """
     try:
+        if "://" in path:
+            # This makes sure that the uppercase scheme is converted to lowercase
+            scheme, path = path.split("://", 1)
+            path = f"{scheme.lower()}://{path}"
+
+        if os.name == "nt" and "*" in path:
+            # On Windows, the glob pattern "*" is not supported
+            return False
+
         info = client.fs.info(path)
         name = info.get("name")
         # case for special simulated directories on some clouds
@@ -99,21 +115,21 @@ def _isfile(client: "Client", path: str) -> bool:
             return False
 
         return info["type"] == "file"
-    except
+    except FileNotFoundError:
         return False
+    except REMOTE_ERRORS as e:
+        raise ClientError(
+            message=str(e),
+            error_code=getattr(e, "code", None),
+        ) from e
 
 
-def parse_listing_uri(uri: str,
+def parse_listing_uri(uri: str, client_config) -> tuple[str, str, str]:
     """
     Parsing uri and returns listing dataset name, listing uri and listing path
     """
     client_config = client_config or {}
-    client = Client.get_client(uri, cache, **client_config)
     storage_uri, path = Client.parse_url(uri)
-    telemetry.log_param("client", client.PREFIX)
-
-    if not uri.endswith("/") and _isfile(client, uri):
-        return None, f"{storage_uri}/{path.lstrip('/')}", path
     if uses_glob(path):
         lst_uri_path = posixpath.dirname(path)
     else:
@@ -157,13 +173,15 @@ def get_listing(
     client_config = catalog.client_config
 
     client = Client.get_client(uri, cache, **client_config)
-
-    listing = None
+    telemetry.log_param("client", client.PREFIX)
 
-    #
-    if not
-
+    # we don't want to use cached dataset (e.g. for a single file listing)
+    if not uri.endswith("/") and _isfile(client, uri):
+        storage_uri, path = Client.parse_url(uri)
+        return None, f"{storage_uri}/{path.lstrip('/')}", path, False
 
+    ds_name, list_uri, list_path = parse_listing_uri(uri, client_config)
+    listing = None
     listings = [
         ls for ls in catalog.listings() if not ls.is_expired and ls.contains(ds_name)
    ]
```
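With these listing changes, failures coming from the cloud SDKs while checking a single-file URI are no longer treated as "not a file": anything matching `REMOTE_ERRORS` is re-raised as `datachain.error.ClientError` carrying the original message and, when available, an error code. A sketch of catching it around a listing call, assuming `ClientError` exposes the `error_code` it was constructed with and that it propagates out of `from_storage`:

```python
from datachain import DataChain
from datachain.error import ClientError

try:
    chain = DataChain.from_storage("s3://my-bucket/images/")  # hypothetical bucket
except ClientError as exc:
    # e.g. access denied or an expired token reported by the cloud client
    print(f"listing failed: {exc} (code={getattr(exc, 'error_code', None)})")
```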
datachain/lib/pytorch.py
CHANGED
```diff
@@ -23,7 +23,7 @@ from datachain.query.dataset import get_download_callback
 if TYPE_CHECKING:
     from torchvision.transforms.v2 import Transform
 
-    from datachain.cache import
+    from datachain.cache import Cache
 
 
 logger = logging.getLogger("datachain")
```
datachain/lib/udf.py
CHANGED
```diff
@@ -32,7 +32,7 @@ if TYPE_CHECKING:
 
     from typing_extensions import Self
 
-    from datachain.cache import
+    from datachain.cache import Cache
     from datachain.catalog import Catalog
     from datachain.lib.signal_schema import SignalSchema
     from datachain.lib.udf_signature import UdfSignature
```
datachain/listing.py
CHANGED
```diff
@@ -2,7 +2,6 @@ import glob
 import os
 from collections.abc import Iterable, Iterator
 from functools import cached_property
-from itertools import zip_longest
 from typing import TYPE_CHECKING, Optional
 
 from sqlalchemy import Column
@@ -101,11 +100,8 @@ class Listing:
         copy_to_filename: Optional[str],
         recursive=False,
         copy_dir_contents=False,
-        relative_path=None,
-        from_edatachain=False,
         from_dataset=False,
     ) -> list[NodeWithPath]:
-        rel_path_elements = relative_path.split("/") if relative_path else []
         all_nodes: list[NodeWithPath] = []
         for src in sources:
             node = src.node
@@ -119,15 +115,7 @@ class Listing:
                 )
             else:
                 node_path = []
-            if
-                for rpe, npe in zip_longest(
-                    rel_path_elements, node.path.split("/")
-                ):
-                    if rpe == npe:
-                        continue
-                    if npe:
-                        node_path.append(npe)
-            elif copy_to_filename:
+            if copy_to_filename:
                 node_path = [os.path.basename(copy_to_filename)]
             elif from_dataset:
                 node_path = [
```
datachain/node.py
CHANGED
```diff
@@ -84,18 +84,6 @@ class Node:
         fd.write(f" size: {self.size}\n")
         return size
 
-    def get_metafile_data(self, path: str):
-        data: dict[str, Any] = {
-            "name": path,
-            "etag": self.etag,
-        }
-        version = self.version
-        if version:
-            data["version"] = version
-        data["last_modified"] = time_to_str(self.last_modified)
-        data["size"] = self.size
-        return data
-
     @property
     def full_path(self) -> str:
         if self.is_dir and self.path:
@@ -181,9 +169,6 @@ class NodeWithPath:
     def append_to_file(self, fd):
         return self.n.append_to_file(fd, "/".join(self.path))
 
-    def get_metafile_data(self):
-        return self.n.get_metafile_data("/".join(self.path))
-
     @property
     def full_path(self) -> str:
         path = "/".join(self.path)
```
datachain/nodes_fetcher.py
CHANGED
```diff
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
 from datachain.nodes_thread_pool import NodesThreadPool
 
 if TYPE_CHECKING:
-    from datachain.cache import
+    from datachain.cache import Cache
     from datachain.client.fsspec import Client
     from datachain.node import Node
 
@@ -13,7 +13,7 @@ logger = logging.getLogger("datachain")
 
 
 class NodesFetcher(NodesThreadPool):
-    def __init__(self, client: "Client", max_threads: int, cache: "
+    def __init__(self, client: "Client", max_threads: int, cache: "Cache"):
         super().__init__(max_threads)
         self.client = client
         self.cache = cache
```
datachain/query/dataset.py
CHANGED
```diff
@@ -875,6 +875,7 @@ class SQLJoin(Step):
     query2: "DatasetQuery"
     predicates: Union[JoinPredicateType, tuple[JoinPredicateType, ...]]
     inner: bool
+    full: bool
     rname: str
 
     def get_query(self, dq: "DatasetQuery", temp_tables: list[str]) -> sa.Subquery:
@@ -977,14 +978,14 @@ class SQLJoin(Step):
         self.validate_expression(join_expression, q1, q2)
 
         def q(*columns):
-
+            return self.catalog.warehouse.join(
                 q1,
                 q2,
                 join_expression,
                 inner=self.inner,
+                full=self.full,
+                columns=columns,
             )
-            return sqlalchemy.select(*columns).select_from(join_query)
-            # return sqlalchemy.select(*subquery.c).select_from(subquery)
 
         return step_result(
             q,
@@ -1489,6 +1490,7 @@ class DatasetQuery:
         dataset_query: "DatasetQuery",
         predicates: Union[JoinPredicateType, Sequence[JoinPredicateType]],
         inner=False,
+        full=False,
         rname="{name}_right",
     ) -> "Self":
         left = self.clone(new_table=False)
@@ -1504,7 +1506,9 @@ class DatasetQuery:
             if isinstance(predicates, (str, ColumnClause, ColumnElement))
             else tuple(predicates)
         )
-        new_query.steps = [
+        new_query.steps = [
+            SQLJoin(self.catalog, left, right, predicates, inner, full, rname)
+        ]
         return new_query
 
     @detach
```
datachain/remote/studio.py
CHANGED
```diff
@@ -75,7 +75,7 @@ class StudioClient:
 
         if not token:
             raise DataChainError(
-                "Studio token is not set. Use `datachain
+                "Studio token is not set. Use `datachain auth login` "
                 "or environment variable `DVC_STUDIO_TOKEN` to set it."
             )
 
@@ -105,7 +105,7 @@ class StudioClient:
         if not team:
             raise DataChainError(
                 "Studio team is not set. "
-                "Use `datachain
+                "Use `datachain auth team <team_name>` "
                 "or environment variable `DVC_STUDIO_TEAM` to set it."
                 "You can also set it in the config file as team under studio."
             )
@@ -375,7 +375,7 @@ class StudioClient:
             method="GET",
         )
 
-    def upload_file(self,
+    def upload_file(self, content: bytes, file_name: str) -> Response[FileUploadData]:
         data = {
             "file_content": base64.b64encode(content).decode("utf-8"),
             "file_name": file_name,
```
datachain/sql/sqlite/base.py
CHANGED
```diff
@@ -4,6 +4,7 @@ import sqlite3
 import warnings
 from collections.abc import Iterable
 from datetime import MAXYEAR, MINYEAR, datetime, timezone
+from functools import cache
 from types import MappingProxyType
 from typing import Callable, Optional
 
@@ -526,24 +527,44 @@ def compile_collect(element, compiler, **kwargs):
     return compiler.process(func.json_group_array(*element.clauses.clauses), **kwargs)
 
 
-
+@cache
+def usearch_sqlite_path() -> Optional[str]:
     try:
-
-
-
+        import usearch
+    except ImportError:
+        return None
 
-
+    with warnings.catch_warnings():
+        # usearch binary is not available for Windows, see: https://github.com/unum-cloud/usearch/issues/427.
+        # and, sometimes fail to download the binary in other platforms
+        # triggering UserWarning.
 
-
-    # usearch binary is not available for Windows, see: https://github.com/unum-cloud/usearch/issues/427.
-    # and, sometimes fail to download the binary in other platforms
-    # triggering UserWarning.
+        warnings.filterwarnings("ignore", category=UserWarning, module="usearch")
 
-
-
+        try:
+            return usearch.sqlite_path()
+        except FileNotFoundError:
+            return None
 
-    conn.enable_load_extension(False)
-    return True
 
-
+def load_usearch_extension(conn: sqlite3.Connection) -> bool:
+    # usearch is part of the vector optional dependencies
+    # we use the extension's cosine and euclidean distance functions
+    ext_path = usearch_sqlite_path()
+    if ext_path is None:
+        return False
+
+    try:
+        conn.enable_load_extension(True)
+    except AttributeError:
+        # sqlite3 module is not built with loadable extension support by default.
+        return False
+
+    try:
+        conn.load_extension(ext_path)
+    except sqlite3.OperationalError:
         return False
+    else:
+        return True
+    finally:
+        conn.enable_load_extension(False)
```
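The usearch logic is now split in two: `usearch_sqlite_path()` resolves (and caches) the path to the usearch SQLite extension when the optional dependency is importable, while `load_usearch_extension(conn)` enables extension loading, loads the extension, and always turns extension loading back off, returning `False` whenever any step is unavailable. A sketch of how a caller might use the boolean result:

```python
import sqlite3

from datachain.sql.sqlite.base import load_usearch_extension

conn = sqlite3.connect(":memory:")
if load_usearch_extension(conn):
    # usearch-provided distance functions are available to SQL queries
    print("usearch extension loaded")
else:
    # optional dependency missing, no binary for this platform, or the
    # sqlite3 module was built without loadable-extension support
    print("usearch extension not available")
```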
datachain/studio.py
CHANGED
```diff
@@ -47,7 +47,7 @@ def process_jobs_args(args: "Namespace"):
         raise DataChainError(f"Unknown command '{args.cmd}'.")
 
 
-def
+def process_auth_cli_args(args: "Namespace"):
     if args.cmd is None:
         print(
             f"Use 'datachain {args.command} --help' to see available options",
@@ -95,7 +95,7 @@ def login(args: "Namespace"):
         raise DataChainError(
             "Token already exists. "
            "To login with a different token, "
-            "logout using `datachain
+            "logout using `datachain auth logout`."
         )
 
     open_browser = not args.no_open
@@ -121,12 +121,12 @@ def logout():
     token = conf.get("studio", {}).get("token")
     if not token:
         raise DataChainError(
-            "Not logged in to Studio. Log in with 'datachain
+            "Not logged in to Studio. Log in with 'datachain auth login'."
         )
 
     del conf["studio"]["token"]
 
-    print("Logged out from Studio. (you can log back in with 'datachain
+    print("Logged out from Studio. (you can log back in with 'datachain auth login')")
 
 
 def token():
@@ -134,7 +134,7 @@ def token():
     token = config.get("token")
     if not token:
         raise DataChainError(
-            "Not logged in to Studio. Log in with 'datachain
+            "Not logged in to Studio. Log in with 'datachain auth login'."
         )
 
     print(token)
@@ -282,7 +282,7 @@ def upload_files(client: StudioClient, files: list[str]) -> list[str]:
         file_name = os.path.basename(file)
         with open(file, "rb") as f:
             file_content = f.read()
-        response = client.upload_file(
+        response = client.upload_file(file_content, file_name)
         if not response.ok:
             raise_remote_error(response.message)
 
@@ -299,7 +299,7 @@ def cancel_job(job_id: str, team_name: Optional[str]):
     token = Config().read().get("studio", {}).get("token")
     if not token:
         raise DataChainError(
-            "Not logged in to Studio. Log in with 'datachain
+            "Not logged in to Studio. Log in with 'datachain auth login'."
         )
 
     client = StudioClient(team=team_name)
@@ -314,7 +314,7 @@ def show_job_logs(job_id: str, team_name: Optional[str]):
     token = Config().read().get("studio", {}).get("token")
     if not token:
         raise DataChainError(
-            "Not logged in to Studio. Log in with 'datachain
+            "Not logged in to Studio. Log in with 'datachain auth login'."
         )
 
     client = StudioClient(team=team_name)
```
{datachain-0.8.9.dist-info → datachain-0.8.11.dist-info}/METADATA
CHANGED
```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: datachain
-Version: 0.8.9
+Version: 0.8.11
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -78,7 +78,6 @@ Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
 Requires-Dist: virtualenv; extra == "tests"
 Requires-Dist: dulwich; extra == "tests"
 Requires-Dist: hypothesis; extra == "tests"
-Requires-Dist: open_clip_torch; extra == "tests"
 Requires-Dist: aiotools>=1.7.0; extra == "tests"
 Requires-Dist: requests-mock; extra == "tests"
 Requires-Dist: scipy; extra == "tests"
@@ -94,12 +93,9 @@ Provides-Extra: examples
 Requires-Dist: datachain[tests]; extra == "examples"
 Requires-Dist: defusedxml; extra == "examples"
 Requires-Dist: accelerate; extra == "examples"
-Requires-Dist: unstructured_ingest[embed-huggingface]; extra == "examples"
-Requires-Dist: unstructured[pdf]<0.16.12; extra == "examples"
-Requires-Dist: pdfplumber==0.11.5; extra == "examples"
 Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
-Requires-Dist:
-Requires-Dist:
+Requires-Dist: ultralytics==8.3.68; extra == "examples"
+Requires-Dist: open_clip_torch; extra == "examples"
 
 ================
 |logo| DataChain
```