datachain 0.3.15__py3-none-any.whl → 0.3.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic.
- datachain/catalog/catalog.py +13 -37
- datachain/cli.py +0 -25
- datachain/data_storage/metastore.py +7 -66
- datachain/data_storage/sqlite.py +24 -2
- datachain/data_storage/warehouse.py +19 -25
- datachain/lib/dc.py +1 -2
- datachain/lib/listing.py +1 -0
- datachain/lib/tar.py +2 -1
- datachain/node.py +17 -3
- datachain/query/__init__.py +0 -2
- datachain/query/dataset.py +58 -145
- datachain/query/schema.py +23 -12
- datachain/query/udf.py +2 -42
- datachain/utils.py +0 -40
- {datachain-0.3.15.dist-info → datachain-0.3.17.dist-info}/METADATA +1 -1
- {datachain-0.3.15.dist-info → datachain-0.3.17.dist-info}/RECORD +20 -21
- {datachain-0.3.15.dist-info → datachain-0.3.17.dist-info}/WHEEL +1 -1
- datachain/query/builtins.py +0 -96
- {datachain-0.3.15.dist-info → datachain-0.3.17.dist-info}/LICENSE +0 -0
- {datachain-0.3.15.dist-info → datachain-0.3.17.dist-info}/entry_points.txt +0 -0
- {datachain-0.3.15.dist-info → datachain-0.3.17.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED
@@ -68,8 +68,6 @@ from datachain.utils import (
     DataChainDir,
     batched,
     datachain_paths_join,
-    import_object,
-    parse_params_string,
 )

 from .datasource import DataSource
@@ -843,7 +841,7 @@ class Catalog:
         from datachain.query import DatasetQuery

         def _row_to_node(d: dict[str, Any]) -> Node:
-            del d["
+            del d["file__source"]
             return Node.from_dict(d)

         enlisted_sources: list[tuple[bool, bool, Any]] = []
@@ -1148,30 +1146,28 @@ class Catalog:
         if not sources:
             raise ValueError("Sources needs to be non empty list")

-        from datachain.
+        from datachain.lib.dc import DataChain
+        from datachain.query.session import Session
+
+        session = Session.get(catalog=self, client_config=client_config)

-
+        chains = []
         for source in sources:
             if source.startswith(DATASET_PREFIX):
-
-
-                    catalog=self,
-                    client_config=client_config,
+                dc = DataChain.from_dataset(
+                    source[len(DATASET_PREFIX) :], session=session
                 )
             else:
-
-
-                    catalog=self,
-                    client_config=client_config,
-                    recursive=recursive,
+                dc = DataChain.from_storage(
+                    source, session=session, recursive=recursive
                 )

-
+            chains.append(dc)

         # create union of all dataset queries created from sources
-
+        dc = reduce(lambda dc1, dc2: dc1.union(dc2), chains)
         try:
-
+            dc.save(name)
         except Exception as e:  # noqa: BLE001
             try:
                 ds = self.get_dataset(name)
@@ -1731,26 +1727,6 @@ class Catalog:
             output, sources, client_config=client_config, recursive=recursive
         )

-    def apply_udf(
-        self,
-        udf_location: str,
-        source: str,
-        target_name: str,
-        parallel: Optional[int] = None,
-        params: Optional[str] = None,
-    ):
-        from datachain.query import DatasetQuery
-
-        if source.startswith(DATASET_PREFIX):
-            ds = DatasetQuery(name=source[len(DATASET_PREFIX) :], catalog=self)
-        else:
-            ds = DatasetQuery(path=source, catalog=self)
-        udf = import_object(udf_location)
-        if params:
-            args, kwargs = parse_params_string(params)
-            udf = udf(*args, **kwargs)
-        ds.add_signals(udf, parallel=parallel).save(target_name)
-
     def query(
         self,
         query_script: str,
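Note on the reworked source-union logic above: every source now becomes a DataChain, and functools.reduce folds the per-source chains into one dataset via pairwise union. A runnable sketch of that fold shape with a stand-in class (only reduce and the union call shape mirror the diff; Chain is hypothetical):

    from functools import reduce

    class Chain:  # hypothetical stand-in for DataChain, just to show the fold
        def __init__(self, items):
            self.items = items

        def union(self, other):  # same call shape as DataChain.union above
            return Chain(self.items + other.items)

    chains = [Chain(["a"]), Chain(["b"]), Chain(["c"])]
    combined = reduce(lambda dc1, dc2: dc1.union(dc2), chains)
    print(combined.items)  # ['a', 'b', 'c'], i.e. ((a ∪ b) ∪ c)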
datachain/cli.py
CHANGED
@@ -494,27 +494,6 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         help="Query parameters",
     )

-    apply_udf_parser = subp.add_parser(
-        "apply-udf", parents=[parent_parser], description="Apply UDF"
-    )
-    apply_udf_parser.add_argument("udf", type=str, help="UDF location")
-    apply_udf_parser.add_argument("source", type=str, help="Source storage or dataset")
-    apply_udf_parser.add_argument("target", type=str, help="Target dataset name")
-    apply_udf_parser.add_argument(
-        "--parallel",
-        nargs="?",
-        type=int,
-        const=-1,
-        default=None,
-        metavar="N",
-        help=(
-            "Use multiprocessing to run the UDF with N worker processes. "
-            "N defaults to the CPU count."
-        ),
-    )
-    apply_udf_parser.add_argument(
-        "--udf-params", type=str, default=None, help="UDF class parameters"
-    )
     subp.add_parser(
         "clear-cache", parents=[parent_parser], description="Clear the local file cache"
     )
@@ -1016,10 +995,6 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
             parallel=args.parallel,
             params=args.param,
         )
-    elif args.command == "apply-udf":
-        catalog.apply_udf(
-            args.udf, args.source, args.target, args.parallel, args.udf_params
-        )
     elif args.command == "clear-cache":
         clear_cache(catalog)
     elif args.command == "gc":
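For context on the removed --parallel option: argparse's nargs="?" combined with const gives one flag three meanings — absent (default), bare (const), or given with a value. A runnable sketch of that pattern, independent of datachain:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--parallel",
        nargs="?",     # the value itself is optional
        type=int,
        const=-1,      # used when the flag appears with no value
        default=None,  # used when the flag is absent
        metavar="N",
    )

    print(parser.parse_args([]).parallel)                   # None
    print(parser.parse_args(["--parallel"]).parallel)       # -1
    print(parser.parse_args(["--parallel", "4"]).parallel)  # 4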
datachain/data_storage/metastore.py
CHANGED
@@ -297,39 +297,6 @@ class AbstractMetastore(ABC, Serializable):
     #
     # Dataset dependencies
     #
-
-    def add_dependency(
-        self,
-        dependency: DatasetDependency,
-        source_dataset_name: str,
-        source_dataset_version: int,
-    ) -> None:
-        """Add dependency to dataset or storage."""
-        if dependency.is_dataset:
-            self.add_dataset_dependency(
-                source_dataset_name,
-                source_dataset_version,
-                dependency.dataset_name,
-                int(dependency.version),
-            )
-        else:
-            self.add_storage_dependency(
-                source_dataset_name,
-                source_dataset_version,
-                StorageURI(dependency.name),
-                dependency.version,
-            )
-
-    @abstractmethod
-    def add_storage_dependency(
-        self,
-        source_dataset_name: str,
-        source_dataset_version: int,
-        storage_uri: StorageURI,
-        storage_timestamp_str: Optional[str] = None,
-    ) -> None:
-        """Adds storage dependency to dataset."""
-
     @abstractmethod
     def add_dataset_dependency(
         self,
@@ -1268,32 +1235,6 @@ class AbstractDBMetastore(AbstractMetastore):
     #
     # Dataset dependencies
     #
-
-    def _insert_dataset_dependency(self, data: dict[str, Any]) -> None:
-        """Method for inserting dependencies."""
-        self.db.execute(self._datasets_dependencies_insert().values(**data))
-
-    def add_storage_dependency(
-        self,
-        source_dataset_name: str,
-        source_dataset_version: int,
-        storage_uri: StorageURI,
-        storage_timestamp_str: Optional[str] = None,
-    ) -> None:
-        source_dataset = self.get_dataset(source_dataset_name)
-        storage = self.get_storage(storage_uri)
-
-        self._insert_dataset_dependency(
-            {
-                "source_dataset_id": source_dataset.id,
-                "source_dataset_version_id": (
-                    source_dataset.get_version(source_dataset_version).id
-                ),
-                "bucket_id": storage.id,
-                "bucket_version": storage_timestamp_str,
-            }
-        )
-
     def add_dataset_dependency(
         self,
         source_dataset_name: str,
@@ -1305,15 +1246,15 @@ class AbstractDBMetastore(AbstractMetastore):
         source_dataset = self.get_dataset(source_dataset_name)
         dataset = self.get_dataset(dataset_name)

-        self.
-
-
-
+        self.db.execute(
+            self._datasets_dependencies_insert().values(
+                source_dataset_id=source_dataset.id,
+                source_dataset_version_id=(
                     source_dataset.get_version(source_dataset_version).id
                 ),
-
-
-
+                dataset_id=dataset.id,
+                dataset_version_id=dataset.get_version(dataset_version).id,
+            )
         )

     def update_dataset_dependency_source(
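The dependency insert above is now built inline with SQLAlchemy's insert().values(). A self-contained sketch of the same pattern against an in-memory SQLite database (table and column names are illustrative, not datachain's real schema):

    import sqlalchemy as sa

    engine = sa.create_engine("sqlite://")
    meta = sa.MetaData()
    deps = sa.Table(
        "datasets_dependencies",
        meta,
        sa.Column("source_dataset_id", sa.Integer),
        sa.Column("dataset_id", sa.Integer),
    )
    meta.create_all(engine)

    with engine.begin() as conn:
        # insert().values(...) builds the INSERT that self.db.execute() runs
        conn.execute(deps.insert().values(source_dataset_id=1, dataset_id=2))
        print(conn.execute(sa.select(deps)).all())  # [(1, 2)]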
datachain/data_storage/sqlite.py
CHANGED
@@ -40,7 +40,9 @@ if TYPE_CHECKING:
     from sqlalchemy.dialects.sqlite import Insert
     from sqlalchemy.engine.base import Engine
     from sqlalchemy.schema import SchemaItem
+    from sqlalchemy.sql._typing import _FromClauseArgument, _OnClauseArgument
     from sqlalchemy.sql.elements import ColumnElement
+    from sqlalchemy.sql.selectable import Join
     from sqlalchemy.types import TypeEngine

     from datachain.lib.file import File
@@ -649,11 +651,14 @@ class SQLiteWarehouse(AbstractWarehouse):
         self, dataset: DatasetRecord, version: int
     ) -> list[StorageURI]:
         dr = self.dataset_rows(dataset, version)
-        query = dr.select(dr.c.
+        query = dr.select(dr.c.file__source).distinct()
         cur = self.db.cursor()
         cur.row_factory = sqlite3.Row  # type: ignore[assignment]

-        return [
+        return [
+            StorageURI(row["file__source"])
+            for row in self.db.execute(query, cursor=cur)
+        ]

     def merge_dataset_rows(
         self,
@@ -788,6 +793,23 @@ class SQLiteWarehouse(AbstractWarehouse):
         if progress_cb:
             progress_cb(len(batch_ids))

+    def join(
+        self,
+        left: "_FromClauseArgument",
+        right: "_FromClauseArgument",
+        onclause: "_OnClauseArgument",
+        inner: bool = True,
+    ) -> "Join":
+        """
+        Join two tables together.
+        """
+        return sqlalchemy.join(
+            left,
+            right,
+            onclause,
+            isouter=not inner,
+        )
+
     def create_pre_udf_table(self, query: "Select") -> "Table":
         """
         Create a temporary table from a query for use in a UDF.
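The new SQLiteWarehouse.join() maps inner=False onto SQLAlchemy's isouter flag, i.e. a LEFT OUTER JOIN. A small runnable check of that mapping on throwaway tables:

    import sqlalchemy as sa

    meta = sa.MetaData()
    a = sa.Table("a", meta, sa.Column("id", sa.Integer))
    b = sa.Table("b", meta, sa.Column("a_id", sa.Integer))

    inner = sa.join(a, b, a.c.id == b.c.a_id, isouter=False)
    outer = sa.join(a, b, a.c.id == b.c.a_id, isouter=True)  # inner=False above

    print(inner)  # a JOIN b ON a.id = b.a_id
    print(outer)  # a LEFT OUTER JOIN b ON a.id = b.a_id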
datachain/data_storage/warehouse.py
CHANGED
@@ -27,8 +27,12 @@ from datachain.storage import StorageURI
 from datachain.utils import sql_escape_like

 if TYPE_CHECKING:
-    from sqlalchemy.sql._typing import
-
+    from sqlalchemy.sql._typing import (
+        _ColumnsClauseArgument,
+        _FromClauseArgument,
+        _OnClauseArgument,
+    )
+    from sqlalchemy.sql.selectable import Join, Select
     from sqlalchemy.types import TypeEngine

 from datachain.data_storage import AbstractIDGenerator, schema
@@ -894,6 +898,18 @@ class AbstractWarehouse(ABC, Serializable):
         Copy the results of a query into a table.
         """

+    @abstractmethod
+    def join(
+        self,
+        left: "_FromClauseArgument",
+        right: "_FromClauseArgument",
+        onclause: "_OnClauseArgument",
+        inner: bool = True,
+    ) -> "Join":
+        """
+        Join two tables together.
+        """
+
     @abstractmethod
     def create_pre_udf_table(self, query: "Select") -> "Table":
         """
@@ -922,32 +938,10 @@ class AbstractWarehouse(ABC, Serializable):
         are cleaned up as soon as they are no longer needed.
         """
         with tqdm(desc="Cleanup", unit=" tables") as pbar:
-            for name in names:
+            for name in set(names):
                 self.db.drop_table(Table(name, self.db.metadata), if_exists=True)
                 pbar.update(1)

-    def changed_query(
-        self,
-        source_query: sa.sql.selectable.Select,
-        target_query: sa.sql.selectable.Select,
-    ) -> sa.sql.selectable.Select:
-        sq = source_query.alias("source_query")
-        tq = target_query.alias("target_query")
-
-        source_target_join = sa.join(
-            sq, tq, (sq.c.source == tq.c.source) & (sq.c.path == tq.c.path)
-        )
-
-        return (
-            select(*sq.c)
-            .select_from(source_target_join)
-            .where(
-                (sq.c.last_modified > tq.c.last_modified)
-                & (sq.c.is_latest == true())
-                & (tq.c.is_latest == true())
-            )
-        )
-

 def _random_string(length: int) -> str:
     return "".join(
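Declaring join() as an abstractmethod on AbstractWarehouse forces every backend to supply a dialect-appropriate implementation (SQLiteWarehouse's version appears earlier in this diff). A generic sketch of the pattern, not the real class hierarchy:

    from abc import ABC, abstractmethod

    class Warehouse(ABC):
        @abstractmethod
        def join(self, left, right, onclause, inner: bool = True):
            """Join two tables together."""

    class SQLiteBackend(Warehouse):  # hypothetical subclass for illustration
        def join(self, left, right, onclause, inner: bool = True):
            kind = "JOIN" if inner else "LEFT OUTER JOIN"
            return f"{left} {kind} {right} ON {onclause}"

    print(SQLiteBackend().join("a", "b", "a.id = b.a_id"))  # a JOIN b ON a.id = b.a_id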
datachain/lib/dc.py
CHANGED
@@ -1337,8 +1337,7 @@ class DataChain(DatasetQuery):
                 other.signals_schema.resolve(*right_on).db_signals(),
             )  # type: ignore[arg-type]
         )
-
-        return super()._subtract(other, signals)  # type: ignore[arg-type]
+        return super().subtract(other, signals)  # type: ignore[arg-type]

     @classmethod
     def from_values(
datachain/lib/listing.py
CHANGED
@@ -77,6 +77,7 @@ def parse_listing_uri(uri: str, cache, client_config) -> tuple[str, str, str]:
     """
     Parsing uri and returns listing dataset name, listing uri and listing path
     """
+    client_config = client_config or {}
     client = Client.get_client(uri, cache, **client_config)
     storage_uri, path = Client.parse_url(uri)

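The one-line guard above exists because **-unpacking None raises TypeError; "or {}" substitutes an empty mapping first. The guard in isolation (function name is illustrative):

    def get_client_kwargs(client_config=None):
        client_config = client_config or {}  # the added guard
        return dict(**client_config)         # dict(**None) would raise TypeError

    print(get_client_kwargs())                # {}
    print(get_client_kwargs({"anon": True}))  # {'anon': True}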
datachain/lib/tar.py
CHANGED
datachain/node.py
CHANGED
@@ -114,9 +114,23 @@ class Node:
         )

     @classmethod
-    def from_dict(cls, d: dict[str, Any]) -> "Self":
-
-
+    def from_dict(cls, d: dict[str, Any], file_prefix: str = "file") -> "Self":
+        def _dval(field_name: str):
+            return d.get(f"{file_prefix}__{field_name}")
+
+        return cls(
+            sys__id=d["sys__id"],
+            sys__rand=d["sys__rand"],
+            source=_dval("source"),
+            path=_dval("path"),
+            etag=_dval("etag"),
+            is_latest=_dval("is_latest"),
+            size=_dval("size"),
+            last_modified=_dval("last_modified"),
+            version=_dval("version"),
+            location=_dval("location"),
+            dir_type=DirType.FILE,
+        )

     @classmethod
     def from_dir(cls, path, **kwargs) -> "Node":
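Node.from_dict now resolves file fields through a "file__<field>" key, matching the file__source naming used elsewhere in this diff. The lookup helper replayed on a plain dict:

    def _dval(d: dict, field_name: str, file_prefix: str = "file"):
        # mirrors the nested _dval() above: prefix + "__" + field name
        return d.get(f"{file_prefix}__{field_name}")

    row = {"sys__id": 1, "file__path": "dir/cat.jpg", "file__size": 1024}
    print(_dval(row, "path"))  # dir/cat.jpg
    print(_dval(row, "etag"))  # None -- missing fields fall back to None via .get()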
datachain/query/__init__.py
CHANGED
@@ -2,7 +2,6 @@ from .dataset import DatasetQuery
 from .params import param
 from .schema import C, DatasetRow, LocalFilename, Object, Stream
 from .session import Session
-from .udf import udf

 __all__ = [
     "C",
@@ -13,5 +12,4 @@ __all__ = [
     "Session",
     "Stream",
     "param",
-    "udf",
 ]
datachain/query/dataset.py
CHANGED
@@ -3,7 +3,6 @@ import inspect
 import logging
 import os
 import random
-import re
 import string
 import subprocess
 import sys
@@ -33,11 +32,9 @@ from sqlalchemy.sql.elements import ColumnClause, ColumnElement
 from sqlalchemy.sql.expression import label
 from sqlalchemy.sql.schema import TableClause
 from sqlalchemy.sql.selectable import Select
-from tqdm import tqdm

 from datachain.asyn import ASYNC_WORKERS, AsyncMapper, OrderedMapper
 from datachain.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE, get_catalog
-from datachain.client import Client
 from datachain.data_storage.schema import (
     PARTITION_COLUMN_ID,
     partition_col_names,
@@ -47,7 +44,6 @@ from datachain.dataset import DatasetStatus, RowDict
 from datachain.error import DatasetNotFoundError, QueryScriptCancelError
 from datachain.progress import CombinedDownloadCallback
 from datachain.sql.functions import rand
-from datachain.storage import Storage, StorageURI
 from datachain.utils import (
     batched,
     determine_processes,
@@ -78,9 +74,7 @@ INSERT_BATCH_SIZE = 10000

 PartitionByType = Union[ColumnElement, Sequence[ColumnElement]]
 JoinPredicateType = Union[str, ColumnClause, ColumnElement]
-
-# depending what type of dependency we are adding
-DatasetDependencyType = Union[tuple[str, int], StorageURI]
+DatasetDependencyType = tuple[str, int]

 logger = logging.getLogger("datachain")

@@ -186,38 +180,6 @@ class QueryStep(StartingStep):
         )


-@frozen
-class IndexingStep(StartingStep):
-    path: str
-    catalog: "Catalog"
-    kwargs: dict[str, Any]
-    recursive: Optional[bool] = True
-
-    def apply(self):
-        self.catalog.index([self.path], **self.kwargs)
-        uri, path = Client.parse_url(self.path)
-        _partial_id, partial_path = self.catalog.metastore.get_valid_partial_id(
-            uri, path
-        )
-        dataset = self.catalog.get_dataset(Storage.dataset_name(uri, partial_path))
-        dataset_rows = self.catalog.warehouse.dataset_rows(
-            dataset, dataset.latest_version
-        )
-
-        def q(*columns):
-            col_names = [c.name for c in columns]
-            return self.catalog.warehouse.nodes_dataset_query(
-                dataset_rows,
-                column_names=col_names,
-                path=path,
-                recursive=self.recursive,
-            )
-
-        storage = self.catalog.metastore.get_storage(uri)
-
-        return step_result(q, dataset_rows.c, dependencies=[storage.uri])
-
-
 def generator_then_call(generator, func: Callable):
     """
     Yield items from generator then execute a function and yield
@@ -231,7 +193,7 @@ def generator_then_call(generator, func: Callable):
 class DatasetDiffOperation(Step):
     """
     Abstract class for operations that are calculation some kind of diff between
-    datasets queries like subtract
+    datasets queries like subtract etc.
     """

     dq: "DatasetQuery"
@@ -305,28 +267,6 @@ class Subtract(DatasetDiffOperation):
         return sq.select().except_(sq.select().where(where_clause))


-@frozen
-class Changed(DatasetDiffOperation):
-    """
-    Calculates rows that are changed in a source query compared to target query
-    Changed means it has same source + path but different last_modified
-    Example:
-        >>> ds = DatasetQuery(name="dogs_cats")  # some older dataset with embeddings
-        >>> ds_updated = (
-            DatasetQuery("gs://dvcx-datalakes/dogs-and-cats")
-            .filter(C.size > 1000)  # we can also filter out source query
-            .changed(ds)
-            .add_signals(calc_embeddings)  # calculae embeddings only on changed rows
-            .union(ds)  # union with old dataset that's missing updated rows
-            .save("dogs_cats_updated")
-        )
-
-    """
-
-    def query(self, source_query: Select, target_query: Select) -> Select:
-        return self.catalog.warehouse.changed_query(source_query, target_query)
-
-
 def adjust_outputs(
     warehouse: "AbstractWarehouse", row: dict[str, Any], udf_col_types: list[tuple]
 ) -> dict[str, Any]:
@@ -899,12 +839,36 @@ class SQLUnion(Step):

 @frozen
 class SQLJoin(Step):
+    catalog: "Catalog"
     query1: "DatasetQuery"
     query2: "DatasetQuery"
     predicates: Union[JoinPredicateType, tuple[JoinPredicateType, ...]]
     inner: bool
     rname: str

+    def get_query(self, dq: "DatasetQuery", temp_tables: list[str]) -> sa.Subquery:
+        query = dq.apply_steps().select()
+        temp_tables.extend(dq.temp_table_names)
+
+        if not any(isinstance(step, (SQLJoin, SQLUnion)) for step in dq.steps):
+            return query.subquery(dq.table.name)
+
+        warehouse = self.catalog.warehouse
+
+        columns = [
+            c if isinstance(c, Column) else Column(c.name, c.type)
+            for c in query.subquery().columns
+        ]
+        temp_table = warehouse.create_dataset_rows_table(
+            warehouse.temp_table_name(),
+            columns=columns,
+        )
+        temp_tables.append(temp_table.name)
+
+        warehouse.copy_table(temp_table, query)
+
+        return temp_table.select().subquery(dq.table.name)
+
     def validate_expression(self, exp: "ClauseElement", q1, q2):
         """
         Checking if columns used in expression actually exist in left / right
@@ -937,10 +901,8 @@ class SQLJoin(Step):
     def apply(
         self, query_generator: QueryGenerator, temp_tables: list[str]
     ) -> StepResult:
-        q1 = self.
-
-        q2 = self.query2.apply_steps().select().subquery(self.query2.table.name)
-        temp_tables.extend(self.query2.temp_table_names)
+        q1 = self.get_query(self.query1, temp_tables)
+        q2 = self.get_query(self.query2, temp_tables)

         q1_columns = list(q1.c)
         q1_column_names = {c.name for c in q1_columns}
@@ -951,7 +913,12 @@ class SQLJoin(Step):
                 continue

             if c.name in q1_column_names:
-
+                new_name = self.rname.format(name=c.name)
+                new_name_idx = 0
+                while new_name in q1_column_names:
+                    new_name_idx += 1
+                    new_name = self.rname.format(name=f"{c.name}_{new_name_idx}")
+                c = c.label(new_name)
             q2_columns.append(c)

         res_columns = q1_columns + q2_columns
@@ -979,16 +946,14 @@ class SQLJoin(Step):
         self.validate_expression(join_expression, q1, q2)

         def q(*columns):
-            join_query =
+            join_query = self.catalog.warehouse.join(
                 q1,
                 q2,
                 join_expression,
-
+                inner=self.inner,
             )
-
-
-            subquery = res.subquery()
-            return sqlalchemy.select(*subquery.c).select_from(subquery)
+            return sqlalchemy.select(*columns).select_from(join_query)
+            # return sqlalchemy.select(*subquery.c).select_from(subquery)

         return step_result(
             q,
@@ -1072,28 +1037,14 @@ class ResultIter:
 class DatasetQuery:
     def __init__(
         self,
-
-        name: str = "",
+        name: str,
         version: Optional[int] = None,
         catalog: Optional["Catalog"] = None,
-        client_config=None,
-        recursive: Optional[bool] = True,
         session: Optional[Session] = None,
-        anon: bool = False,
-        indexing_feature_schema: Optional[dict] = None,
         indexing_column_types: Optional[dict[str, Any]] = None,
-        update: Optional[bool] = False,
         in_memory: bool = False,
     ):
-
-        client_config = {}
-
-        if anon:
-            client_config["anon"] = True
-
-        self.session = Session.get(
-            session, catalog=catalog, client_config=client_config, in_memory=in_memory
-        )
+        self.session = Session.get(session, catalog=catalog, in_memory=in_memory)
         self.catalog = catalog or self.session.catalog
         self.steps: list[Step] = []
         self._chunk_index: Optional[int] = None
@@ -1107,26 +1058,14 @@ class DatasetQuery:
         self.feature_schema: Optional[dict] = None
         self.column_types: Optional[dict[str, Any]] = None

-
-
-
-
-
-
-            self.
-
-            self.version = version or ds.latest_version
-            self.feature_schema = ds.get_version(self.version).feature_schema
-            self.column_types = copy(ds.schema)
-            if "sys__id" in self.column_types:
-                self.column_types.pop("sys__id")
-            self.starting_step = QueryStep(self.catalog, name, self.version)
-        else:
-            raise ValueError("must provide path or name")
-
-    @staticmethod
-    def is_storage_path(path):
-        return bool(re.compile(r"^[a-zA-Z0-9]+://").match(path))
+        self.name = name
+        ds = self.catalog.get_dataset(name)
+        self.version = version or ds.latest_version
+        self.feature_schema = ds.get_version(self.version).feature_schema
+        self.column_types = copy(ds.schema)
+        if "sys__id" in self.column_types:
+            self.column_types.pop("sys__id")
+        self.starting_step = QueryStep(self.catalog, name, self.version)

     def __iter__(self):
         return iter(self.db_results())
@@ -1511,7 +1450,7 @@ class DatasetQuery:
             if isinstance(predicates, (str, ColumnClause, ColumnElement))
             else tuple(predicates)
         )
-        new_query.steps = [SQLJoin(left, right, predicates, inner, rname)]
+        new_query.steps = [SQLJoin(self.catalog, left, right, predicates, inner, rname)]
         return new_query

     @detach
@@ -1571,21 +1510,11 @@ class DatasetQuery:
         return query

     @detach
-    def subtract(self, dq: "DatasetQuery") -> "Self":
-        return self._subtract(dq, on=[("source", "source"), ("path", "path")])
-
-    @detach
-    def _subtract(self, dq: "DatasetQuery", on: Sequence[tuple[str, str]]) -> "Self":
+    def subtract(self, dq: "DatasetQuery", on: Sequence[tuple[str, str]]) -> "Self":
         query = self.clone()
         query.steps.append(Subtract(dq, self.catalog, on=on))
         return query

-    @detach
-    def changed(self, dq: "DatasetQuery") -> "Self":
-        query = self.clone()
-        query.steps.append(Changed(dq, self.catalog))
-        return query
-
     @detach
     def generate(
         self,
@@ -1616,24 +1545,13 @@ class DatasetQuery:

     def _add_dependencies(self, dataset: "DatasetRecord", version: int):
         for dependency in self.dependencies:
-
-
-
-
-
-
-
-                    ds_dependency_version,
-                )
-            else:
-                # storage dependency - its name is a valid StorageURI
-                storage = self.catalog.metastore.get_storage(dependency)
-                self.catalog.metastore.add_storage_dependency(
-                    StorageURI(dataset.name),
-                    version,
-                    storage.uri,
-                    storage.timestamp_str,
-                )
+            ds_dependency_name, ds_dependency_version = dependency
+            self.catalog.metastore.add_dataset_dependency(
+                dataset.name,
+                version,
+                ds_dependency_name,
+                ds_dependency_version,
+            )

     def exec(self) -> "Self":
         """Execute the query."""
@@ -1687,12 +1605,7 @@ class DatasetQuery:

         dr = self.catalog.warehouse.dataset_rows(dataset)

-
-        self.catalog.warehouse.copy_table(
-            dr.get_table(),
-            query.select(),
-            progress_cb=pbar.update,
-        )
+        self.catalog.warehouse.copy_table(dr.get_table(), query.select())

         self.catalog.metastore.update_dataset_status(
             dataset, DatasetStatus.COMPLETE, version=version
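The new right-column renaming in SQLJoin.apply retries with a numeric suffix until the label stops colliding with a left-side column. The loop extracted into a standalone function (names are illustrative):

    def dedup_name(name: str, taken: set[str], rname: str = "{name}_right") -> str:
        new_name = rname.format(name=name)
        idx = 0
        while new_name in taken:
            idx += 1
            new_name = rname.format(name=f"{name}_{idx}")
        return new_name

    print(dedup_name("size", {"size"}))                # size_right
    print(dedup_name("size", {"size", "size_right"}))  # size_1_right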
datachain/query/schema.py
CHANGED
@@ -19,6 +19,17 @@ if TYPE_CHECKING:
 DEFAULT_DELIMITER = "__"


+def file_signals(row, signal_name="file"):
+    # TODO this is workaround until we decide what to do with these classes
+    prefix = f"{signal_name}{DEFAULT_DELIMITER}"
+    return {
+        c_name.removeprefix(prefix): c_value
+        for c_name, c_value in row.items()
+        if c_name.startswith(prefix)
+        and DEFAULT_DELIMITER not in c_name.removeprefix(prefix)
+    }
+
+
 class ColumnMeta(type):
     @staticmethod
     def to_db_name(name: str) -> str:
@@ -86,8 +97,8 @@ class Object(UDFParameter):
         cb: Callback = DEFAULT_CALLBACK,
         **kwargs,
     ) -> Any:
-        client = catalog.get_client(row["
-        uid = catalog._get_row_uid(row)
+        client = catalog.get_client(row["file__source"])
+        uid = catalog._get_row_uid(file_signals(row))
         if cache:
             client.download(uid, callback=cb)
         with client.open_object(uid, use_cache=cache, cb=cb) as f:
@@ -103,8 +114,8 @@ class Object(UDFParameter):
         cb: Callback = DEFAULT_CALLBACK,
         **kwargs,
     ) -> Any:
-        client = catalog.get_client(row["
-        uid = catalog._get_row_uid(row)
+        client = catalog.get_client(row["file__source"])
+        uid = catalog._get_row_uid(file_signals(row))
         if cache:
             await client._download(uid, callback=cb)
         obj = await mapper.to_thread(
@@ -129,8 +140,8 @@ class Stream(UDFParameter):
         cb: Callback = DEFAULT_CALLBACK,
         **kwargs,
     ) -> Any:
-        client = catalog.get_client(row["
-        uid = catalog._get_row_uid(row)
+        client = catalog.get_client(row["file__source"])
+        uid = catalog._get_row_uid(file_signals(row))
         if cache:
             client.download(uid, callback=cb)
         return client.open_object(uid, use_cache=cache, cb=cb)
@@ -145,8 +156,8 @@ class Stream(UDFParameter):
         cb: Callback = DEFAULT_CALLBACK,
         **kwargs,
     ) -> Any:
-        client = catalog.get_client(row["
-        uid = catalog._get_row_uid(row)
+        client = catalog.get_client(row["file__source"])
+        uid = catalog._get_row_uid(file_signals(row))
         if cache:
             await client._download(uid, callback=cb)
         return await mapper.to_thread(
@@ -178,8 +189,8 @@ class LocalFilename(UDFParameter):
             # If the glob pattern is specified and the row filename
             # does not match it, then return None
             return None
-        client = catalog.get_client(row["
-        uid = catalog._get_row_uid(row)
+        client = catalog.get_client(row["file__source"])
+        uid = catalog._get_row_uid(file_signals(row))
         client.download(uid, callback=cb)
         return client.cache.get_path(uid)

@@ -197,8 +208,8 @@ class LocalFilename(UDFParameter):
             # If the glob pattern is specified and the row filename
            # does not match it, then return None
             return None
-        client = catalog.get_client(row["
-        uid = catalog._get_row_uid(row)
+        client = catalog.get_client(row["file__source"])
+        uid = catalog._get_row_uid(file_signals(row))
         await client._download(uid, callback=cb)
         return client.cache.get_path(uid)

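file_signals() strips one level of prefix and skips nested signals (anything with a further "__" after the prefix). Running the function exactly as added above on a sample row (needs Python 3.9+ for str.removeprefix):

    DEFAULT_DELIMITER = "__"

    def file_signals(row, signal_name="file"):
        prefix = f"{signal_name}{DEFAULT_DELIMITER}"
        return {
            c_name.removeprefix(prefix): c_value
            for c_name, c_value in row.items()
            if c_name.startswith(prefix)
            and DEFAULT_DELIMITER not in c_name.removeprefix(prefix)
        }

    row = {
        "sys__id": 1,                # no "file__" prefix: dropped
        "file__source": "s3://bkt",
        "file__path": "a.jpg",
        "file__loc__offset": 5,      # nested signal: dropped
    }
    print(file_signals(row))  # {'source': 's3://bkt', 'path': 'a.jpg'}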
datachain/query/udf.py
CHANGED
@@ -1,8 +1,7 @@
 import typing
-from collections.abc import Iterable, Iterator,
+from collections.abc import Iterable, Iterator, Sequence
 from dataclasses import dataclass
 from functools import WRAPPER_ASSIGNMENTS
-from inspect import isclass
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -23,11 +22,7 @@ from .batch import (
     RowsOutputBatch,
     UDFInputBatch,
 )
-from .schema import
-    UDFParameter,
-    UDFParamSpec,
-    normalize_param,
-)
+from .schema import UDFParameter

 if TYPE_CHECKING:
     from datachain.catalog import Catalog
@@ -66,41 +61,6 @@ class UDFProperties:
         return self.output.keys()


-def udf(
-    params: Sequence[UDFParamSpec],
-    output: UDFOutputSpec,
-    *,
-    method: Optional[str] = None,  # only used for class-based UDFs
-    batch: int = 1,
-):
-    """
-    Decorate a function or a class to be used as a UDF.
-
-    The decorator expects both the outputs and inputs of the UDF to be specified.
-    The outputs are defined as a collection of tuples containing the signal name
-    and type.
-    Parameters are defined as a list of column objects (e.g. C.name).
-    Optionally, UDFs can be run on batches of rows to improve performance, this
-    is determined by the 'batch' parameter. When operating on batches of inputs,
-    the UDF function will be called with a single argument - a list
-    of tuples containing inputs (e.g. ((input1_a, input1_b), (input2_a, input2b))).
-    """
-    if isinstance(params, str):
-        params = (params,)
-    if not isinstance(output, Mapping):
-        raise TypeError(f"'output' must be a mapping, got {type(output).__name__}")
-
-    properties = UDFProperties([normalize_param(p) for p in params], output, batch)
-
-    def decorator(udf_base: Union[Callable, type]):
-        if isclass(udf_base):
-            return UDFClassWrapper(udf_base, properties, method=method)
-        if callable(udf_base):
-            return UDFWrapper(udf_base, properties)
-
-    return decorator
-
-
 class UDFBase:
     """A base class for implementing stateful UDFs."""

datachain/utils.py
CHANGED
@@ -1,5 +1,4 @@
 import glob
-import importlib.util
 import io
 import json
 import os
@@ -198,45 +197,6 @@ def get_envs_by_prefix(prefix: str) -> dict[str, str]:
     return variables


-def import_object(object_spec):
-    filename, identifier = object_spec.rsplit(":", 1)
-    filename = filename.strip()
-    identifier = identifier.strip()
-
-    if not identifier.isidentifier() or not filename.endswith(".py"):
-        raise ValueError(f"Invalid object spec: {object_spec}")
-
-    modname = os.path.abspath(filename)
-    if modname in sys.modules:
-        module = sys.modules[modname]
-    else:
-        # Use importlib to find and load the module from the given filename
-        spec = importlib.util.spec_from_file_location(modname, filename)
-        module = importlib.util.module_from_spec(spec)
-        sys.modules[modname] = module
-        spec.loader.exec_module(module)
-
-    return getattr(module, identifier)
-
-
-def parse_params_string(params: str):
-    """
-    Parse a string containing UDF class constructor parameters in the form
-    `a, b, key=val` into *args and **kwargs.
-    """
-    args = []
-    kwargs = {}
-    for part in params.split():
-        if "=" in part:
-            key, val = part.split("=")
-            kwargs[key] = val
-        else:
-            args.append(part)
-    if any((args, kwargs)):
-        return args, kwargs
-    return None, None
-
-
 _T_co = TypeVar("_T_co", covariant=True)


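import_object() and parse_params_string() are deleted outright, so code that loaded UDFs from "path/to/file.py:name" specs must vendor its own loader. The removed importlib logic, condensed but functionally the same (validation omitted):

    import importlib.util
    import os
    import sys

    def import_object(object_spec: str):
        # "path/to/file.py:identifier" -> the named object from that module
        filename, identifier = object_spec.rsplit(":", 1)
        modname = os.path.abspath(filename)
        if modname not in sys.modules:
            spec = importlib.util.spec_from_file_location(modname, filename)
            module = importlib.util.module_from_spec(spec)
            sys.modules[modname] = module  # register before exec, as the original did
            spec.loader.exec_module(module)
        return getattr(sys.modules[modname], identifier)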
{datachain-0.3.15.dist-info → datachain-0.3.17.dist-info}/RECORD
CHANGED
@@ -2,22 +2,22 @@ datachain/__init__.py,sha256=GeyhE-5LgfJav2OKYGaieP2lBvf2Gm-ihj7thnK9zjI,800
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=Lg3Ck1PQLjQziMx9KU4atzbEnJXTE0924WMYkhgWtGU,8247
 datachain/cache.py,sha256=WP-ktH_bRn3w2g1JOOQ7rCPsZyR4OM6K1Kb7yZsSSns,4056
-datachain/cli.py,sha256=
+datachain/cli.py,sha256=zObcD5W8dzUJKk2RGQ1MxQLEr3jnox6bybU8WyDaIqE,29941
 datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
 datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
 datachain/dataset.py,sha256=sHnsmKfMg2bK88gZH1izk8jlbmJDEhQpyOemdaPQVFo,14761
 datachain/error.py,sha256=OnZ8OaBtDdTZPy8XQiy29SAjqdQArQeorYbP5ju7ldc,1199
 datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
 datachain/listing.py,sha256=vfjOlcb98A7xkGGKWEYON6l7lfrOqNv6kldmdVnlJn4,8178
-datachain/node.py,sha256
+datachain/node.py,sha256=-Y8O7q7NtIm_jX0HgjhjvdFwm73TrO5QBslxvFVwTJE,5208
 datachain/nodes_fetcher.py,sha256=kca19yvu11JxoVY1t4_ydp1FmchiV88GnNicNBQ9NIA,831
 datachain/nodes_thread_pool.py,sha256=ZyzBvUImIPmi4WlKC2SW2msA0UhtembbTdcs2nx29A0,3191
 datachain/progress.py,sha256=7_8FtJs770ITK9sMq-Lt4k4k18QmYl4yIG_kCoWID3o,4559
 datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
-datachain/utils.py,sha256=
+datachain/utils.py,sha256=VGAcTWjGF0e2qB3Se77shhpiqGMo-ol0QAwf3MH5b7c,11857
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=
+datachain/catalog/catalog.py,sha256=IAaaSVFxtJxVlIGEbu8sHinmYyeDGY6dg7APrtOchVk,68278
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
 datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
@@ -32,28 +32,28 @@ datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZ
 datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
 datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
 datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
-datachain/data_storage/metastore.py,sha256=
+datachain/data_storage/metastore.py,sha256=BePe3bVxo-Zuuccok8TLRo4cMHVnAIa8hfZMadbxzqM,52649
 datachain/data_storage/schema.py,sha256=AGbjyEir5UmRZXI3m0jChZogUh5wd8csj6-YlUWaAxQ,8383
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
-datachain/data_storage/sqlite.py,sha256=
-datachain/data_storage/warehouse.py,sha256=
+datachain/data_storage/sqlite.py,sha256=EBKJncuzcyQfcKFm2mUjvHjHRTODsteM-k_zndunBrw,28834
+datachain/data_storage/warehouse.py,sha256=Vwhu_OfcNAoTtg1BHui80VCzlPeTUjZQL0QWziu8awY,32186
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/arrow.py,sha256=voY9KuJ2uhPxw_DS6rIjwfKjWXi84T3LFJ7kGFcDQuk,7272
 datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
 datachain/lib/data_model.py,sha256=gHIjlow84GMRDa78yLL1Ud-N18or21fnTyPEwsatpXY,2045
 datachain/lib/dataset_info.py,sha256=srPPhI2UHf6hFPBecyFEVw2SS5aPisIIMsvGgKqi7ss,2366
-datachain/lib/dc.py,sha256=
+datachain/lib/dc.py,sha256=DkzuKS14kgAEax47Gi9w_XJXV3dbboW85A0YOxwNjKY,68869
 datachain/lib/file.py,sha256=elQLorLbIkusuQSVfiuC_KrGSZI8cGm-iT8fHmckJlo,13774
 datachain/lib/hf.py,sha256=cPnmLuprr0pYABH7KqA5FARQ1JGlywdDwD3yDzVAm4k,5920
 datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
-datachain/lib/listing.py,sha256=
+datachain/lib/listing.py,sha256=NrKb7_6jwp1vEqp5TavSvx3SbLJdvuBzSEclPvbQr30,4013
 datachain/lib/listing_info.py,sha256=36NZ-tXY5Y118wurkajuWWbcE8UCjkRwZlacDtN9F3g,954
 datachain/lib/meta_formats.py,sha256=3f-0vpMTesagS9iMd3y9-u9r-7g0eqYsxmK4fVfNWlw,6635
 datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
 datachain/lib/pytorch.py,sha256=8LNyFaBrx8zws--MEsFg5g3pb8oLnaQAUlgGvtjKxX4,5960
 datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
 datachain/lib/signal_schema.py,sha256=iqgubjCBRiUJB30miv05qFX4uU04dA_Pzi3DCUsHZGs,24177
-datachain/lib/tar.py,sha256=
+datachain/lib/tar.py,sha256=3WIzao6yD5fbLqXLTt9GhPGNonbFIs_fDRu-9vgLgsA,1038
 datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
 datachain/lib/udf.py,sha256=nG7DDuPgZ5ZuijwvDoCq-OZMxlDM8vFNzyxMmik0Y1c,11716
 datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
@@ -67,17 +67,16 @@ datachain/lib/convert/python_to_sql.py,sha256=40SAOdoOgikZRhn8iomCPDRoxC3RFxjJLi
 datachain/lib/convert/sql_to_python.py,sha256=lGnKzSF_tz9Y_5SSKkrIU95QEjpcDzvOxIRkEKTQag0,443
 datachain/lib/convert/unflatten.py,sha256=Ogvh_5wg2f38_At_1lN0D_e2uZOOpYEvwvB2xdq56Tw,2012
 datachain/lib/convert/values_to_tuples.py,sha256=YOdbjzHq-uj6-cV2Qq43G72eN2avMNDGl4x5t6yQMl8,3931
-datachain/query/__init__.py,sha256=
+datachain/query/__init__.py,sha256=0NBOZVgIDpCcj1Ci883dQ9A0iiwe03xzmotkOCFbxYc,293
 datachain/query/batch.py,sha256=-vlpINJiertlnaoUVv1C95RatU0F6zuhpIYRufJRo1M,3660
-datachain/query/
-datachain/query/dataset.py,sha256=9lhcgccavqypVParE4pvd_Hgg8gmoDAN6m1IkpSwXhE,58219
+datachain/query/dataset.py,sha256=27QCFhwz4hK-pqTY0hvfFqwxNIH5yxuSjWRl1ZfELd0,55004
 datachain/query/dispatch.py,sha256=GBh3EZHDp5AaXxrjOpfrpfsuy7Umnqxu-MAXcK9X3gc,12945
 datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
 datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
-datachain/query/schema.py,sha256=
+datachain/query/schema.py,sha256=A-Btmu-rR00841PNcs9CjcppfTUc176wT6-m3BOaSgo,8049
 datachain/query/session.py,sha256=UPH5Z4fzCDsvj81ji0e8GA6Mgra3bOAEpVq4htqOtis,4317
-datachain/query/udf.py,sha256=
+datachain/query/udf.py,sha256=QRDcSgJ_zrY63dyMVD8nq4ky9Q2kaKSoM6bsDqpXOvQ,6682
 datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/remote/studio.py,sha256=f5s6qSZ9uB4URGUoU_8_W1KZRRQQVSm6cgEBkBUEfuE,7226
 datachain/sql/__init__.py,sha256=A2djrbQwSMUZZEIKGnm-mnRA-NDSbiDJNpAmmwGNyIo,303
@@ -97,9 +96,9 @@ datachain/sql/sqlite/base.py,sha256=WLPHBhZbXbiqPoRV1VgDrXJqku4UuvJpBhYeQ0k5rI8,
 datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
+datachain-0.3.17.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.3.17.dist-info/METADATA,sha256=bPMIQkvQjnflmMQhJa3BH3Mi4DpHTAI8KQ5Vd2ur8Mo,17073
+datachain-0.3.17.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+datachain-0.3.17.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.3.17.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.3.17.dist-info/RECORD,,
datachain/query/builtins.py
DELETED
@@ -1,96 +0,0 @@
-import hashlib
-import tarfile
-from functools import partial
-
-from datachain.sql.types import String
-
-from .schema import C, DatasetRow, Object
-from .udf import udf
-
-md5 = partial(hashlib.md5, usedforsecurity=False)
-
-__all__ = ["checksum", "index_tar"]
-
-
-def load_tar(raw):
-    with tarfile.open(fileobj=raw, mode="r:") as tar:
-        return tar.getmembers()
-
-
-@udf(
-    (
-        C.source,
-        C.path,
-        C.size,
-        C.is_latest,
-        C.last_modified,
-        C.version,
-        C.etag,
-        Object(load_tar),
-    ),
-    DatasetRow.schema,
-)
-def index_tar(
-    source,
-    parent_path,
-    size,
-    is_latest,
-    last_modified,
-    version,
-    etag,
-    tar_entries,
-):
-    # generate original tar files as well, along with subobjects
-    yield DatasetRow.create(
-        source=source,
-        path=parent_path,
-        size=size,
-        is_latest=bool(is_latest),
-        last_modified=last_modified,
-        version=version,
-        etag=etag,
-    )
-
-    for info in tar_entries:
-        if info.isfile():
-            full_path = f"{parent_path}/{info.name}"
-            yield DatasetRow.create(
-                source=source,
-                path=full_path,
-                size=info.size,
-                location={
-                    "vtype": "tar",
-                    "offset": info.offset_data,
-                    "size": info.size,
-                    "parent": {
-                        "source": source,
-                        "path": parent_path,
-                        "version": version,
-                        "size": size,
-                        "etag": etag,
-                        "location": None,
-                    },
-                },
-            )
-
-
-BUFSIZE = 2**18
-
-
-def file_digest(fileobj):
-    """Calculate the digest of a file-like object."""
-    buf = bytearray(BUFSIZE)  # Reusable buffer to reduce allocations.
-    view = memoryview(buf)
-    digestobj = md5()
-    # From 3.11's hashlib.filedigest()
-    while True:
-        size = fileobj.readinto(buf)
-        if size == 0:
-            break  # EOF
-        digestobj.update(view[:size])
-    return digestobj.hexdigest()
-
-
-@udf(params=[Object(file_digest)], output={"checksum": String})
-def checksum(digest):
-    return (digest,)
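The deleted file_digest() replicates the read-into-buffer loop behind Python 3.11's hashlib.file_digest (the original even cites it). On 3.11+ the standard library yields the same checksum directly; a sketch, not a datachain API:

    import hashlib
    import io

    data = io.BytesIO(b"hello world")
    # usedforsecurity=False matches the deleted module's md5 setup
    digest = hashlib.file_digest(data, lambda: hashlib.md5(usedforsecurity=False))
    print(digest.hexdigest())  # 5eb63bbbe01eeed093cb22bb8f5acdc3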
{datachain-0.3.15.dist-info → datachain-0.3.17.dist-info}/LICENSE
File without changes
{datachain-0.3.15.dist-info → datachain-0.3.17.dist-info}/entry_points.txt
File without changes
{datachain-0.3.15.dist-info → datachain-0.3.17.dist-info}/top_level.txt
File without changes