datachain 0.3.15__py3-none-any.whl → 0.3.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.

datachain/catalog/catalog.py CHANGED
@@ -68,8 +68,6 @@ from datachain.utils import (
  DataChainDir,
  batched,
  datachain_paths_join,
- import_object,
- parse_params_string,
  )

  from .datasource import DataSource
@@ -843,7 +841,7 @@ class Catalog:
  from datachain.query import DatasetQuery

  def _row_to_node(d: dict[str, Any]) -> Node:
- del d["source"]
+ del d["file__source"]
  return Node.from_dict(d)

  enlisted_sources: list[tuple[bool, bool, Any]] = []
@@ -1148,30 +1146,28 @@ class Catalog:
  if not sources:
  raise ValueError("Sources needs to be non empty list")

- from datachain.query import DatasetQuery
+ from datachain.lib.dc import DataChain
+ from datachain.query.session import Session
+
+ session = Session.get(catalog=self, client_config=client_config)

- dataset_queries = []
+ chains = []
  for source in sources:
  if source.startswith(DATASET_PREFIX):
- dq = DatasetQuery(
- name=source[len(DATASET_PREFIX) :],
- catalog=self,
- client_config=client_config,
+ dc = DataChain.from_dataset(
+ source[len(DATASET_PREFIX) :], session=session
  )
  else:
- dq = DatasetQuery(
- path=source,
- catalog=self,
- client_config=client_config,
- recursive=recursive,
+ dc = DataChain.from_storage(
+ source, session=session, recursive=recursive
  )

- dataset_queries.append(dq)
+ chains.append(dc)

  # create union of all dataset queries created from sources
- dq = reduce(lambda ds1, ds2: ds1.union(ds2), dataset_queries)
+ dc = reduce(lambda dc1, dc2: dc1.union(dc2), chains)
  try:
- dq.save(name)
+ dc.save(name)
  except Exception as e: # noqa: BLE001
  try:
  ds = self.get_dataset(name)
@@ -1731,26 +1727,6 @@ class Catalog:
  output, sources, client_config=client_config, recursive=recursive
  )

- def apply_udf(
- self,
- udf_location: str,
- source: str,
- target_name: str,
- parallel: Optional[int] = None,
- params: Optional[str] = None,
- ):
- from datachain.query import DatasetQuery
-
- if source.startswith(DATASET_PREFIX):
- ds = DatasetQuery(name=source[len(DATASET_PREFIX) :], catalog=self)
- else:
- ds = DatasetQuery(path=source, catalog=self)
- udf = import_object(udf_location)
- if params:
- args, kwargs = parse_params_string(params)
- udf = udf(*args, **kwargs)
- ds.add_signals(udf, parallel=parallel).save(target_name)
-
  def query(
  self,
  query_script: str,
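
For orientation, the create_dataset_from_sources change above swaps low-level DatasetQuery construction for the DataChain API. A minimal sketch of the equivalent flow, assuming an existing Catalog bound to the names catalog and client_config (both placeholders, not taken from the diff):

from functools import reduce

from datachain.lib.dc import DataChain
from datachain.query.session import Session

# one session is created up front and shared by every chain, as in the new code path
session = Session.get(catalog=catalog, client_config=client_config)
chains = [
    DataChain.from_dataset("existing-dataset", session=session),
    DataChain.from_storage("s3://example-bucket/images/", session=session, recursive=True),
]
# union all chains and persist the result under a single dataset name
reduce(lambda dc1, dc2: dc1.union(dc2), chains).save("combined-dataset")
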
datachain/cli.py CHANGED
@@ -494,27 +494,6 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
  help="Query parameters",
  )

- apply_udf_parser = subp.add_parser(
- "apply-udf", parents=[parent_parser], description="Apply UDF"
- )
- apply_udf_parser.add_argument("udf", type=str, help="UDF location")
- apply_udf_parser.add_argument("source", type=str, help="Source storage or dataset")
- apply_udf_parser.add_argument("target", type=str, help="Target dataset name")
- apply_udf_parser.add_argument(
- "--parallel",
- nargs="?",
- type=int,
- const=-1,
- default=None,
- metavar="N",
- help=(
- "Use multiprocessing to run the UDF with N worker processes. "
- "N defaults to the CPU count."
- ),
- )
- apply_udf_parser.add_argument(
- "--udf-params", type=str, default=None, help="UDF class parameters"
- )
  subp.add_parser(
  "clear-cache", parents=[parent_parser], description="Clear the local file cache"
  )
@@ -1016,10 +995,6 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09
  parallel=args.parallel,
  params=args.param,
  )
- elif args.command == "apply-udf":
- catalog.apply_udf(
- args.udf, args.source, args.target, args.parallel, args.udf_params
- )
  elif args.command == "clear-cache":
  clear_cache(catalog)
  elif args.command == "gc":
datachain/data_storage/metastore.py CHANGED
@@ -297,39 +297,6 @@ class AbstractMetastore(ABC, Serializable):
  #
  # Dataset dependencies
  #
-
- def add_dependency(
- self,
- dependency: DatasetDependency,
- source_dataset_name: str,
- source_dataset_version: int,
- ) -> None:
- """Add dependency to dataset or storage."""
- if dependency.is_dataset:
- self.add_dataset_dependency(
- source_dataset_name,
- source_dataset_version,
- dependency.dataset_name,
- int(dependency.version),
- )
- else:
- self.add_storage_dependency(
- source_dataset_name,
- source_dataset_version,
- StorageURI(dependency.name),
- dependency.version,
- )
-
- @abstractmethod
- def add_storage_dependency(
- self,
- source_dataset_name: str,
- source_dataset_version: int,
- storage_uri: StorageURI,
- storage_timestamp_str: Optional[str] = None,
- ) -> None:
- """Adds storage dependency to dataset."""
-
  @abstractmethod
  def add_dataset_dependency(
  self,
@@ -1268,32 +1235,6 @@ class AbstractDBMetastore(AbstractMetastore):
  #
  # Dataset dependencies
  #
-
- def _insert_dataset_dependency(self, data: dict[str, Any]) -> None:
- """Method for inserting dependencies."""
- self.db.execute(self._datasets_dependencies_insert().values(**data))
-
- def add_storage_dependency(
- self,
- source_dataset_name: str,
- source_dataset_version: int,
- storage_uri: StorageURI,
- storage_timestamp_str: Optional[str] = None,
- ) -> None:
- source_dataset = self.get_dataset(source_dataset_name)
- storage = self.get_storage(storage_uri)
-
- self._insert_dataset_dependency(
- {
- "source_dataset_id": source_dataset.id,
- "source_dataset_version_id": (
- source_dataset.get_version(source_dataset_version).id
- ),
- "bucket_id": storage.id,
- "bucket_version": storage_timestamp_str,
- }
- )
-
  def add_dataset_dependency(
  self,
  source_dataset_name: str,
@@ -1305,15 +1246,15 @@ class AbstractDBMetastore(AbstractMetastore):
  source_dataset = self.get_dataset(source_dataset_name)
  dataset = self.get_dataset(dataset_name)

- self._insert_dataset_dependency(
- {
- "source_dataset_id": source_dataset.id,
- "source_dataset_version_id": (
+ self.db.execute(
+ self._datasets_dependencies_insert().values(
+ source_dataset_id=source_dataset.id,
+ source_dataset_version_id=(
  source_dataset.get_version(source_dataset_version).id
  ),
- "dataset_id": dataset.id,
- "dataset_version_id": dataset.get_version(dataset_version).id,
- }
+ dataset_id=dataset.id,
+ dataset_version_id=dataset.get_version(dataset_version).id,
+ )
  )

  def update_dataset_dependency_source(
datachain/data_storage/sqlite.py CHANGED
@@ -40,7 +40,9 @@ if TYPE_CHECKING:
  from sqlalchemy.dialects.sqlite import Insert
  from sqlalchemy.engine.base import Engine
  from sqlalchemy.schema import SchemaItem
+ from sqlalchemy.sql._typing import _FromClauseArgument, _OnClauseArgument
  from sqlalchemy.sql.elements import ColumnElement
+ from sqlalchemy.sql.selectable import Join
  from sqlalchemy.types import TypeEngine

  from datachain.lib.file import File
@@ -649,11 +651,14 @@ class SQLiteWarehouse(AbstractWarehouse):
  self, dataset: DatasetRecord, version: int
  ) -> list[StorageURI]:
  dr = self.dataset_rows(dataset, version)
- query = dr.select(dr.c.source).distinct()
+ query = dr.select(dr.c.file__source).distinct()
  cur = self.db.cursor()
  cur.row_factory = sqlite3.Row # type: ignore[assignment]

- return [StorageURI(row["source"]) for row in self.db.execute(query, cursor=cur)]
+ return [
+ StorageURI(row["file__source"])
+ for row in self.db.execute(query, cursor=cur)
+ ]

  def merge_dataset_rows(
  self,
@@ -788,6 +793,23 @@ class SQLiteWarehouse(AbstractWarehouse):
  if progress_cb:
  progress_cb(len(batch_ids))

+ def join(
+ self,
+ left: "_FromClauseArgument",
+ right: "_FromClauseArgument",
+ onclause: "_OnClauseArgument",
+ inner: bool = True,
+ ) -> "Join":
+ """
+ Join two tables together.
+ """
+ return sqlalchemy.join(
+ left,
+ right,
+ onclause,
+ isouter=not inner,
+ )
+
  def create_pre_udf_table(self, query: "Select") -> "Table":
  """
  Create a temporary table from a query for use in a UDF.
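
The new SQLiteWarehouse.join() above is a thin wrapper that translates its inner flag into SQLAlchemy's isouter argument. A stand-alone SQLAlchemy sketch of that mapping (table and column names are made up):

import sqlalchemy as sa

meta = sa.MetaData()
left = sa.Table("left_t", meta, sa.Column("id", sa.Integer))
right = sa.Table("right_t", meta, sa.Column("id", sa.Integer))

inner_join = sa.join(left, right, left.c.id == right.c.id, isouter=False)  # inner=True
outer_join = sa.join(left, right, left.c.id == right.c.id, isouter=True)   # inner=False -> LEFT OUTER JOIN
print(sa.select(left.c.id).select_from(outer_join))
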
datachain/data_storage/warehouse.py CHANGED
@@ -27,8 +27,12 @@ from datachain.storage import StorageURI
  from datachain.utils import sql_escape_like

  if TYPE_CHECKING:
- from sqlalchemy.sql._typing import _ColumnsClauseArgument
- from sqlalchemy.sql.selectable import Select
+ from sqlalchemy.sql._typing import (
+ _ColumnsClauseArgument,
+ _FromClauseArgument,
+ _OnClauseArgument,
+ )
+ from sqlalchemy.sql.selectable import Join, Select
  from sqlalchemy.types import TypeEngine

  from datachain.data_storage import AbstractIDGenerator, schema
@@ -894,6 +898,18 @@ class AbstractWarehouse(ABC, Serializable):
  Copy the results of a query into a table.
  """

+ @abstractmethod
+ def join(
+ self,
+ left: "_FromClauseArgument",
+ right: "_FromClauseArgument",
+ onclause: "_OnClauseArgument",
+ inner: bool = True,
+ ) -> "Join":
+ """
+ Join two tables together.
+ """
+
  @abstractmethod
  def create_pre_udf_table(self, query: "Select") -> "Table":
  """
@@ -922,32 +938,10 @@ class AbstractWarehouse(ABC, Serializable):
  are cleaned up as soon as they are no longer needed.
  """
  with tqdm(desc="Cleanup", unit=" tables") as pbar:
- for name in names:
+ for name in set(names):
  self.db.drop_table(Table(name, self.db.metadata), if_exists=True)
  pbar.update(1)

- def changed_query(
- self,
- source_query: sa.sql.selectable.Select,
- target_query: sa.sql.selectable.Select,
- ) -> sa.sql.selectable.Select:
- sq = source_query.alias("source_query")
- tq = target_query.alias("target_query")
-
- source_target_join = sa.join(
- sq, tq, (sq.c.source == tq.c.source) & (sq.c.path == tq.c.path)
- )
-
- return (
- select(*sq.c)
- .select_from(source_target_join)
- .where(
- (sq.c.last_modified > tq.c.last_modified)
- & (sq.c.is_latest == true())
- & (tq.c.is_latest == true())
- )
- )
-

  def _random_string(length: int) -> str:
  return "".join(
datachain/lib/dc.py CHANGED
@@ -1337,8 +1337,7 @@ class DataChain(DatasetQuery):
  other.signals_schema.resolve(*right_on).db_signals(),
  ) # type: ignore[arg-type]
  )
-
- return super()._subtract(other, signals) # type: ignore[arg-type]
+ return super().subtract(other, signals) # type: ignore[arg-type]

  @classmethod
  def from_values(
datachain/lib/listing.py CHANGED
@@ -77,6 +77,7 @@ def parse_listing_uri(uri: str, cache, client_config) -> tuple[str, str, str]:
  """
  Parsing uri and returns listing dataset name, listing uri and listing path
  """
+ client_config = client_config or {}
  client = Client.get_client(uri, cache, **client_config)
  storage_uri, path = Client.parse_url(uri)

datachain/lib/tar.py CHANGED
@@ -30,4 +30,5 @@ def process_tar(file: File) -> Iterator[File]:
  with file.open() as fd:
  with tarfile.open(fileobj=fd) as tar:
  for entry in tar.getmembers():
- yield build_tar_member(file, entry)
+ if entry.isfile():
+ yield build_tar_member(file, entry)
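
The tar.py change filters out non-file archive members (directories, links, devices) before yielding. A small standard-library illustration of the same filter; "example.tar" is a placeholder path:

import tarfile

with tarfile.open("example.tar") as tar:
    # only regular files survive, mirroring the entry.isfile() check in process_tar
    regular_files = [m.name for m in tar.getmembers() if m.isfile()]
print(regular_files)
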
datachain/node.py CHANGED
@@ -114,9 +114,23 @@ class Node:
  )

  @classmethod
- def from_dict(cls, d: dict[str, Any]) -> "Self":
- kw = {f.name: d[f.name] for f in attrs.fields(cls) if f.name in d}
- return cls(**kw)
+ def from_dict(cls, d: dict[str, Any], file_prefix: str = "file") -> "Self":
+ def _dval(field_name: str):
+ return d.get(f"{file_prefix}__{field_name}")
+
+ return cls(
+ sys__id=d["sys__id"],
+ sys__rand=d["sys__rand"],
+ source=_dval("source"),
+ path=_dval("path"),
+ etag=_dval("etag"),
+ is_latest=_dval("is_latest"),
+ size=_dval("size"),
+ last_modified=_dval("last_modified"),
+ version=_dval("version"),
+ location=_dval("location"),
+ dir_type=DirType.FILE,
+ )

  @classmethod
  def from_dir(cls, path, **kwargs) -> "Node":
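
Node.from_dict() now expects warehouse rows whose file fields carry the "file__" prefix while "sys__" fields stay flat. A hypothetical row dict of that shape (all values are made up):

row = {
    "sys__id": 1,
    "sys__rand": 12345,
    "file__source": "s3://example-bucket",
    "file__path": "images/dog.jpg",
    "file__etag": "abc123",
    "file__is_latest": True,
    "file__size": 2048,
    "file__last_modified": None,
    "file__version": "",
    "file__location": None,
}
# Node.from_dict(row) would map each "file__*" key onto the matching Node field
# and always set dir_type=DirType.FILE, per the new implementation above.
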
datachain/query/__init__.py CHANGED
@@ -2,7 +2,6 @@ from .dataset import DatasetQuery
  from .params import param
  from .schema import C, DatasetRow, LocalFilename, Object, Stream
  from .session import Session
- from .udf import udf

  __all__ = [
  "C",
@@ -13,5 +12,4 @@ __all__ = [
  "Session",
  "Stream",
  "param",
- "udf",
  ]
datachain/query/dataset.py CHANGED
@@ -3,7 +3,6 @@ import inspect
  import logging
  import os
  import random
- import re
  import string
  import subprocess
  import sys
@@ -33,11 +32,9 @@ from sqlalchemy.sql.elements import ColumnClause, ColumnElement
  from sqlalchemy.sql.expression import label
  from sqlalchemy.sql.schema import TableClause
  from sqlalchemy.sql.selectable import Select
- from tqdm import tqdm

  from datachain.asyn import ASYNC_WORKERS, AsyncMapper, OrderedMapper
  from datachain.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE, get_catalog
- from datachain.client import Client
  from datachain.data_storage.schema import (
  PARTITION_COLUMN_ID,
  partition_col_names,
@@ -47,7 +44,6 @@ from datachain.dataset import DatasetStatus, RowDict
  from datachain.error import DatasetNotFoundError, QueryScriptCancelError
  from datachain.progress import CombinedDownloadCallback
  from datachain.sql.functions import rand
- from datachain.storage import Storage, StorageURI
  from datachain.utils import (
  batched,
  determine_processes,
@@ -78,9 +74,7 @@ INSERT_BATCH_SIZE = 10000

  PartitionByType = Union[ColumnElement, Sequence[ColumnElement]]
  JoinPredicateType = Union[str, ColumnClause, ColumnElement]
- # dependency can be either dataset_name + dataset_version tuple or just storage uri
- # depending what type of dependency we are adding
- DatasetDependencyType = Union[tuple[str, int], StorageURI]
+ DatasetDependencyType = tuple[str, int]

  logger = logging.getLogger("datachain")

@@ -186,38 +180,6 @@ class QueryStep(StartingStep):
  )


- @frozen
- class IndexingStep(StartingStep):
- path: str
- catalog: "Catalog"
- kwargs: dict[str, Any]
- recursive: Optional[bool] = True
-
- def apply(self):
- self.catalog.index([self.path], **self.kwargs)
- uri, path = Client.parse_url(self.path)
- _partial_id, partial_path = self.catalog.metastore.get_valid_partial_id(
- uri, path
- )
- dataset = self.catalog.get_dataset(Storage.dataset_name(uri, partial_path))
- dataset_rows = self.catalog.warehouse.dataset_rows(
- dataset, dataset.latest_version
- )
-
- def q(*columns):
- col_names = [c.name for c in columns]
- return self.catalog.warehouse.nodes_dataset_query(
- dataset_rows,
- column_names=col_names,
- path=path,
- recursive=self.recursive,
- )
-
- storage = self.catalog.metastore.get_storage(uri)
-
- return step_result(q, dataset_rows.c, dependencies=[storage.uri])
-
-
  def generator_then_call(generator, func: Callable):
  """
  Yield items from generator then execute a function and yield
@@ -231,7 +193,7 @@ def generator_then_call(generator, func: Callable):
  class DatasetDiffOperation(Step):
  """
  Abstract class for operations that are calculation some kind of diff between
- datasets queries like subtract, changed etc.
+ datasets queries like subtract etc.
  """

  dq: "DatasetQuery"
@@ -305,28 +267,6 @@ class Subtract(DatasetDiffOperation):
  return sq.select().except_(sq.select().where(where_clause))


- @frozen
- class Changed(DatasetDiffOperation):
- """
- Calculates rows that are changed in a source query compared to target query
- Changed means it has same source + path but different last_modified
- Example:
- >>> ds = DatasetQuery(name="dogs_cats") # some older dataset with embeddings
- >>> ds_updated = (
- DatasetQuery("gs://dvcx-datalakes/dogs-and-cats")
- .filter(C.size > 1000) # we can also filter out source query
- .changed(ds)
- .add_signals(calc_embeddings) # calculae embeddings only on changed rows
- .union(ds) # union with old dataset that's missing updated rows
- .save("dogs_cats_updated")
- )
-
- """
-
- def query(self, source_query: Select, target_query: Select) -> Select:
- return self.catalog.warehouse.changed_query(source_query, target_query)
-
-
  def adjust_outputs(
  warehouse: "AbstractWarehouse", row: dict[str, Any], udf_col_types: list[tuple]
  ) -> dict[str, Any]:
@@ -899,12 +839,36 @@ class SQLUnion(Step):

  @frozen
  class SQLJoin(Step):
+ catalog: "Catalog"
  query1: "DatasetQuery"
  query2: "DatasetQuery"
  predicates: Union[JoinPredicateType, tuple[JoinPredicateType, ...]]
  inner: bool
  rname: str

+ def get_query(self, dq: "DatasetQuery", temp_tables: list[str]) -> sa.Subquery:
+ query = dq.apply_steps().select()
+ temp_tables.extend(dq.temp_table_names)
+
+ if not any(isinstance(step, (SQLJoin, SQLUnion)) for step in dq.steps):
+ return query.subquery(dq.table.name)
+
+ warehouse = self.catalog.warehouse
+
+ columns = [
+ c if isinstance(c, Column) else Column(c.name, c.type)
+ for c in query.subquery().columns
+ ]
+ temp_table = warehouse.create_dataset_rows_table(
+ warehouse.temp_table_name(),
+ columns=columns,
+ )
+ temp_tables.append(temp_table.name)
+
+ warehouse.copy_table(temp_table, query)
+
+ return temp_table.select().subquery(dq.table.name)
+
  def validate_expression(self, exp: "ClauseElement", q1, q2):
  """
  Checking if columns used in expression actually exist in left / right
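
The get_query() helper added above materializes a side of the join into a temporary table whenever that side itself contains a join or union step, so the ON clause resolves against a concrete table rather than a nested subquery. A sketch of a user-level chain that would hit this path, assuming saved datasets and an "id" signal (all names are placeholders; DataChain.merge is used here as the public entry point that creates SQLJoin steps):

from datachain.lib.dc import DataChain

a = DataChain.from_dataset("ds_a")
b = DataChain.from_dataset("ds_b")
c = DataChain.from_dataset("ds_c")

ab = a.merge(b, on="id")    # first join: plain subqueries are enough
abc = ab.merge(c, on="id")  # left side already holds an SQLJoin step,
                            # so its result is copied into a temp table first
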
@@ -937,10 +901,8 @@ class SQLJoin(Step):
  def apply(
  self, query_generator: QueryGenerator, temp_tables: list[str]
  ) -> StepResult:
- q1 = self.query1.apply_steps().select().subquery(self.query1.table.name)
- temp_tables.extend(self.query1.temp_table_names)
- q2 = self.query2.apply_steps().select().subquery(self.query2.table.name)
- temp_tables.extend(self.query2.temp_table_names)
+ q1 = self.get_query(self.query1, temp_tables)
+ q2 = self.get_query(self.query2, temp_tables)

  q1_columns = list(q1.c)
  q1_column_names = {c.name for c in q1_columns}
@@ -951,7 +913,12 @@ class SQLJoin(Step):
  continue

  if c.name in q1_column_names:
- c = c.label(self.rname.format(name=c.name))
+ new_name = self.rname.format(name=c.name)
+ new_name_idx = 0
+ while new_name in q1_column_names:
+ new_name_idx += 1
+ new_name = self.rname.format(name=f"{c.name}_{new_name_idx}")
+ c = c.label(new_name)
  q2_columns.append(c)

  res_columns = q1_columns + q2_columns
@@ -979,16 +946,14 @@ class SQLJoin(Step):
  self.validate_expression(join_expression, q1, q2)

  def q(*columns):
- join_query = sqlalchemy.join(
+ join_query = self.catalog.warehouse.join(
  q1,
  q2,
  join_expression,
- isouter=not self.inner,
+ inner=self.inner,
  )
-
- res = sqlalchemy.select(*columns).select_from(join_query)
- subquery = res.subquery()
- return sqlalchemy.select(*subquery.c).select_from(subquery)
+ return sqlalchemy.select(*columns).select_from(join_query)
+ # return sqlalchemy.select(*subquery.c).select_from(subquery)

  return step_result(
  q,
@@ -1072,28 +1037,14 @@ class ResultIter:
  class DatasetQuery:
  def __init__(
  self,
- path: str = "",
- name: str = "",
+ name: str,
  version: Optional[int] = None,
  catalog: Optional["Catalog"] = None,
- client_config=None,
- recursive: Optional[bool] = True,
  session: Optional[Session] = None,
- anon: bool = False,
- indexing_feature_schema: Optional[dict] = None,
  indexing_column_types: Optional[dict[str, Any]] = None,
- update: Optional[bool] = False,
  in_memory: bool = False,
  ):
- if client_config is None:
- client_config = {}
-
- if anon:
- client_config["anon"] = True
-
- self.session = Session.get(
- session, catalog=catalog, client_config=client_config, in_memory=in_memory
- )
+ self.session = Session.get(session, catalog=catalog, in_memory=in_memory)
  self.catalog = catalog or self.session.catalog
  self.steps: list[Step] = []
  self._chunk_index: Optional[int] = None
@@ -1107,26 +1058,14 @@ class DatasetQuery:
  self.feature_schema: Optional[dict] = None
  self.column_types: Optional[dict[str, Any]] = None

- if path:
- kwargs = {"update": True} if update else {}
- self.starting_step = IndexingStep(path, self.catalog, kwargs, recursive)
- self.feature_schema = indexing_feature_schema
- self.column_types = indexing_column_types
- elif name:
- self.name = name
- ds = self.catalog.get_dataset(name)
- self.version = version or ds.latest_version
- self.feature_schema = ds.get_version(self.version).feature_schema
- self.column_types = copy(ds.schema)
- if "sys__id" in self.column_types:
- self.column_types.pop("sys__id")
- self.starting_step = QueryStep(self.catalog, name, self.version)
- else:
- raise ValueError("must provide path or name")
-
- @staticmethod
- def is_storage_path(path):
- return bool(re.compile(r"^[a-zA-Z0-9]+://").match(path))
+ self.name = name
+ ds = self.catalog.get_dataset(name)
+ self.version = version or ds.latest_version
+ self.feature_schema = ds.get_version(self.version).feature_schema
+ self.column_types = copy(ds.schema)
+ if "sys__id" in self.column_types:
+ self.column_types.pop("sys__id")
+ self.starting_step = QueryStep(self.catalog, name, self.version)

  def __iter__(self):
  return iter(self.db_results())
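
With the IndexingStep gone, DatasetQuery is now constructed only from a saved dataset name; indexing a storage URI is the job of the higher-level DataChain API. A hedged before/after sketch (the dataset name and URI are placeholders):

from datachain.lib.dc import DataChain
from datachain.query import DatasetQuery

# 0.3.15: DatasetQuery(path="s3://example-bucket/data/") indexed storage directly.
# 0.3.17: list the storage through DataChain, save it, then query by name.
DataChain.from_storage("s3://example-bucket/data/").save("my_listing")
dq = DatasetQuery("my_listing")  # name is now the required first argument
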
@@ -1511,7 +1450,7 @@ class DatasetQuery:
  if isinstance(predicates, (str, ColumnClause, ColumnElement))
  else tuple(predicates)
  )
- new_query.steps = [SQLJoin(left, right, predicates, inner, rname)]
+ new_query.steps = [SQLJoin(self.catalog, left, right, predicates, inner, rname)]
  return new_query

  @detach
@@ -1571,21 +1510,11 @@ class DatasetQuery:
  return query

  @detach
- def subtract(self, dq: "DatasetQuery") -> "Self":
- return self._subtract(dq, on=[("source", "source"), ("path", "path")])
-
- @detach
- def _subtract(self, dq: "DatasetQuery", on: Sequence[tuple[str, str]]) -> "Self":
+ def subtract(self, dq: "DatasetQuery", on: Sequence[tuple[str, str]]) -> "Self":
  query = self.clone()
  query.steps.append(Subtract(dq, self.catalog, on=on))
  return query

- @detach
- def changed(self, dq: "DatasetQuery") -> "Self":
- query = self.clone()
- query.steps.append(Changed(dq, self.catalog))
- return query
-
  @detach
  def generate(
  self,
@@ -1616,24 +1545,13 @@ class DatasetQuery:

  def _add_dependencies(self, dataset: "DatasetRecord", version: int):
  for dependency in self.dependencies:
- if isinstance(dependency, tuple):
- # dataset dependency
- ds_dependency_name, ds_dependency_version = dependency
- self.catalog.metastore.add_dataset_dependency(
- dataset.name,
- version,
- ds_dependency_name,
- ds_dependency_version,
- )
- else:
- # storage dependency - its name is a valid StorageURI
- storage = self.catalog.metastore.get_storage(dependency)
- self.catalog.metastore.add_storage_dependency(
- StorageURI(dataset.name),
- version,
- storage.uri,
- storage.timestamp_str,
- )
+ ds_dependency_name, ds_dependency_version = dependency
+ self.catalog.metastore.add_dataset_dependency(
+ dataset.name,
+ version,
+ ds_dependency_name,
+ ds_dependency_version,
+ )

  def exec(self) -> "Self":
  """Execute the query."""
@@ -1687,12 +1605,7 @@ class DatasetQuery:

  dr = self.catalog.warehouse.dataset_rows(dataset)

- with tqdm(desc="Saving", unit=" rows") as pbar:
- self.catalog.warehouse.copy_table(
- dr.get_table(),
- query.select(),
- progress_cb=pbar.update,
- )
+ self.catalog.warehouse.copy_table(dr.get_table(), query.select())

  self.catalog.metastore.update_dataset_status(
  dataset, DatasetStatus.COMPLETE, version=version
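
subtract() no longer assumes ("source", "path") as the comparison key; callers must pass the column pairs explicitly, which is what DataChain.subtract now does after resolving its signal names. A sketch of the new call shape with placeholder dataset and column names:

from datachain.query import DatasetQuery

dq1 = DatasetQuery("dataset_a")
dq2 = DatasetQuery("dataset_b")
# each tuple maps a column of dq1 to the column of dq2 it is compared against
diff = dq1.subtract(dq2, on=[("file__source", "file__source"), ("file__path", "file__path")])
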
datachain/query/schema.py CHANGED
@@ -19,6 +19,17 @@ if TYPE_CHECKING:
  DEFAULT_DELIMITER = "__"


+ def file_signals(row, signal_name="file"):
+ # TODO this is workaround until we decide what to do with these classes
+ prefix = f"{signal_name}{DEFAULT_DELIMITER}"
+ return {
+ c_name.removeprefix(prefix): c_value
+ for c_name, c_value in row.items()
+ if c_name.startswith(prefix)
+ and DEFAULT_DELIMITER not in c_name.removeprefix(prefix)
+ }
+
+
  class ColumnMeta(type):
  @staticmethod
  def to_db_name(name: str) -> str:
@@ -86,8 +97,8 @@ class Object(UDFParameter):
  cb: Callback = DEFAULT_CALLBACK,
  **kwargs,
  ) -> Any:
- client = catalog.get_client(row["source"])
- uid = catalog._get_row_uid(row)
+ client = catalog.get_client(row["file__source"])
+ uid = catalog._get_row_uid(file_signals(row))
  if cache:
  client.download(uid, callback=cb)
  with client.open_object(uid, use_cache=cache, cb=cb) as f:
@@ -103,8 +114,8 @@ class Object(UDFParameter):
  cb: Callback = DEFAULT_CALLBACK,
  **kwargs,
  ) -> Any:
- client = catalog.get_client(row["source"])
- uid = catalog._get_row_uid(row)
+ client = catalog.get_client(row["file__source"])
+ uid = catalog._get_row_uid(file_signals(row))
  if cache:
  await client._download(uid, callback=cb)
  obj = await mapper.to_thread(
@@ -129,8 +140,8 @@ class Stream(UDFParameter):
  cb: Callback = DEFAULT_CALLBACK,
  **kwargs,
  ) -> Any:
- client = catalog.get_client(row["source"])
- uid = catalog._get_row_uid(row)
+ client = catalog.get_client(row["file__source"])
+ uid = catalog._get_row_uid(file_signals(row))
  if cache:
  client.download(uid, callback=cb)
  return client.open_object(uid, use_cache=cache, cb=cb)
@@ -145,8 +156,8 @@ class Stream(UDFParameter):
  cb: Callback = DEFAULT_CALLBACK,
  **kwargs,
  ) -> Any:
- client = catalog.get_client(row["source"])
- uid = catalog._get_row_uid(row)
+ client = catalog.get_client(row["file__source"])
+ uid = catalog._get_row_uid(file_signals(row))
  if cache:
  await client._download(uid, callback=cb)
  return await mapper.to_thread(
@@ -178,8 +189,8 @@ class LocalFilename(UDFParameter):
  # If the glob pattern is specified and the row filename
  # does not match it, then return None
  return None
- client = catalog.get_client(row["source"])
- uid = catalog._get_row_uid(row)
+ client = catalog.get_client(row["file__source"])
+ uid = catalog._get_row_uid(file_signals(row))
  client.download(uid, callback=cb)
  return client.cache.get_path(uid)

@@ -197,8 +208,8 @@ class LocalFilename(UDFParameter):
  # If the glob pattern is specified and the row filename
  # does not match it, then return None
  return None
- client = catalog.get_client(row["source"])
- uid = catalog._get_row_uid(row)
+ client = catalog.get_client(row["file__source"])
+ uid = catalog._get_row_uid(file_signals(row))
  await client._download(uid, callback=cb)
  return client.cache.get_path(uid)

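
The file_signals() helper added at the top of schema.py strips the "file__" prefix so catalog._get_row_uid() keeps receiving the flat field names it expects. A stand-alone rendering of the same logic with a made-up row, runnable on Python 3.9+:

DEFAULT_DELIMITER = "__"

def file_signals(row, signal_name="file"):
    prefix = f"{signal_name}{DEFAULT_DELIMITER}"
    # keep only top-level "file__*" keys and drop the prefix; nested signals
    # (a second "__" in the remainder) are skipped
    return {
        c_name.removeprefix(prefix): c_value
        for c_name, c_value in row.items()
        if c_name.startswith(prefix)
        and DEFAULT_DELIMITER not in c_name.removeprefix(prefix)
    }

row = {"sys__id": 1, "file__source": "s3://example-bucket", "file__path": "a.jpg", "file__size": 10}
print(file_signals(row))  # {'source': 's3://example-bucket', 'path': 'a.jpg', 'size': 10}
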
datachain/query/udf.py CHANGED
@@ -1,8 +1,7 @@
  import typing
- from collections.abc import Iterable, Iterator, Mapping, Sequence
+ from collections.abc import Iterable, Iterator, Sequence
  from dataclasses import dataclass
  from functools import WRAPPER_ASSIGNMENTS
- from inspect import isclass
  from typing import (
  TYPE_CHECKING,
  Any,
@@ -23,11 +22,7 @@ from .batch import (
  RowsOutputBatch,
  UDFInputBatch,
  )
- from .schema import (
- UDFParameter,
- UDFParamSpec,
- normalize_param,
- )
+ from .schema import UDFParameter

  if TYPE_CHECKING:
  from datachain.catalog import Catalog
@@ -66,41 +61,6 @@ class UDFProperties:
  return self.output.keys()


- def udf(
- params: Sequence[UDFParamSpec],
- output: UDFOutputSpec,
- *,
- method: Optional[str] = None, # only used for class-based UDFs
- batch: int = 1,
- ):
- """
- Decorate a function or a class to be used as a UDF.
-
- The decorator expects both the outputs and inputs of the UDF to be specified.
- The outputs are defined as a collection of tuples containing the signal name
- and type.
- Parameters are defined as a list of column objects (e.g. C.name).
- Optionally, UDFs can be run on batches of rows to improve performance, this
- is determined by the 'batch' parameter. When operating on batches of inputs,
- the UDF function will be called with a single argument - a list
- of tuples containing inputs (e.g. ((input1_a, input1_b), (input2_a, input2b))).
- """
- if isinstance(params, str):
- params = (params,)
- if not isinstance(output, Mapping):
- raise TypeError(f"'output' must be a mapping, got {type(output).__name__}")
-
- properties = UDFProperties([normalize_param(p) for p in params], output, batch)
-
- def decorator(udf_base: Union[Callable, type]):
- if isclass(udf_base):
- return UDFClassWrapper(udf_base, properties, method=method)
- if callable(udf_base):
- return UDFWrapper(udf_base, properties)
-
- return decorator
-
-
  class UDFBase:
  """A base class for implementing stateful UDFs."""

datachain/utils.py CHANGED
@@ -1,5 +1,4 @@
  import glob
- import importlib.util
  import io
  import json
  import os
@@ -198,45 +197,6 @@ def get_envs_by_prefix(prefix: str) -> dict[str, str]:
  return variables


- def import_object(object_spec):
- filename, identifier = object_spec.rsplit(":", 1)
- filename = filename.strip()
- identifier = identifier.strip()
-
- if not identifier.isidentifier() or not filename.endswith(".py"):
- raise ValueError(f"Invalid object spec: {object_spec}")
-
- modname = os.path.abspath(filename)
- if modname in sys.modules:
- module = sys.modules[modname]
- else:
- # Use importlib to find and load the module from the given filename
- spec = importlib.util.spec_from_file_location(modname, filename)
- module = importlib.util.module_from_spec(spec)
- sys.modules[modname] = module
- spec.loader.exec_module(module)
-
- return getattr(module, identifier)
-
-
- def parse_params_string(params: str):
- """
- Parse a string containing UDF class constructor parameters in the form
- `a, b, key=val` into *args and **kwargs.
- """
- args = []
- kwargs = {}
- for part in params.split():
- if "=" in part:
- key, val = part.split("=")
- kwargs[key] = val
- else:
- args.append(part)
- if any((args, kwargs)):
- return args, kwargs
- return None, None
-
-
  _T_co = TypeVar("_T_co", covariant=True)


@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: datachain
- Version: 0.3.15
+ Version: 0.3.17
  Summary: Wrangle unstructured AI data at scale
  Author-email: Dmitry Petrov <support@dvc.org>
  License: Apache-2.0
@@ -2,22 +2,22 @@ datachain/__init__.py,sha256=GeyhE-5LgfJav2OKYGaieP2lBvf2Gm-ihj7thnK9zjI,800
  datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
  datachain/asyn.py,sha256=Lg3Ck1PQLjQziMx9KU4atzbEnJXTE0924WMYkhgWtGU,8247
  datachain/cache.py,sha256=WP-ktH_bRn3w2g1JOOQ7rCPsZyR4OM6K1Kb7yZsSSns,4056
- datachain/cli.py,sha256=alMjnoBUBLvBSMBR51N09rA_aUEdHJwyxSRogF7VbbA,30891
+ datachain/cli.py,sha256=zObcD5W8dzUJKk2RGQ1MxQLEr3jnox6bybU8WyDaIqE,29941
  datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
  datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
  datachain/dataset.py,sha256=sHnsmKfMg2bK88gZH1izk8jlbmJDEhQpyOemdaPQVFo,14761
  datachain/error.py,sha256=OnZ8OaBtDdTZPy8XQiy29SAjqdQArQeorYbP5ju7ldc,1199
  datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
  datachain/listing.py,sha256=vfjOlcb98A7xkGGKWEYON6l7lfrOqNv6kldmdVnlJn4,8178
- datachain/node.py,sha256=2pF3Y9oYzElfiUBcw2LIv7LNNt--V4E-K021zjv0b0I,4748
+ datachain/node.py,sha256=-Y8O7q7NtIm_jX0HgjhjvdFwm73TrO5QBslxvFVwTJE,5208
  datachain/nodes_fetcher.py,sha256=kca19yvu11JxoVY1t4_ydp1FmchiV88GnNicNBQ9NIA,831
  datachain/nodes_thread_pool.py,sha256=ZyzBvUImIPmi4WlKC2SW2msA0UhtembbTdcs2nx29A0,3191
  datachain/progress.py,sha256=7_8FtJs770ITK9sMq-Lt4k4k18QmYl4yIG_kCoWID3o,4559
  datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
- datachain/utils.py,sha256=Z9-lPNvrrAh_VWpzVBJ7L5-Oy_Oo1V0ZW7G0MVDyPK4,13065
+ datachain/utils.py,sha256=VGAcTWjGF0e2qB3Se77shhpiqGMo-ol0QAwf3MH5b7c,11857
  datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
- datachain/catalog/catalog.py,sha256=kPg5ILeCWSjXCj3ewUZY6kzj36HTEqajB3mJDkbs-Vo,69023
+ datachain/catalog/catalog.py,sha256=IAaaSVFxtJxVlIGEbu8sHinmYyeDGY6dg7APrtOchVk,68278
  datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
  datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
  datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
@@ -32,28 +32,28 @@ datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZ
  datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
  datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
  datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
- datachain/data_storage/metastore.py,sha256=cHN0xmbUvChyayHHZm3Vqxr87jFqojPSlGBqhTPStlE,54519
+ datachain/data_storage/metastore.py,sha256=BePe3bVxo-Zuuccok8TLRo4cMHVnAIa8hfZMadbxzqM,52649
  datachain/data_storage/schema.py,sha256=AGbjyEir5UmRZXI3m0jChZogUh5wd8csj6-YlUWaAxQ,8383
  datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
- datachain/data_storage/sqlite.py,sha256=yooLHQXrpoqDguGlF0SGcCiMU1T82OEc4wr1ra8eBHo,28285
- datachain/data_storage/warehouse.py,sha256=Pq6Nt3fyz1WFv6Mdtv2ZUr0_GFCNbafbtS4PdibblUg,32507
+ datachain/data_storage/sqlite.py,sha256=EBKJncuzcyQfcKFm2mUjvHjHRTODsteM-k_zndunBrw,28834
+ datachain/data_storage/warehouse.py,sha256=Vwhu_OfcNAoTtg1BHui80VCzlPeTUjZQL0QWziu8awY,32186
  datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datachain/lib/arrow.py,sha256=voY9KuJ2uhPxw_DS6rIjwfKjWXi84T3LFJ7kGFcDQuk,7272
  datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
  datachain/lib/data_model.py,sha256=gHIjlow84GMRDa78yLL1Ud-N18or21fnTyPEwsatpXY,2045
  datachain/lib/dataset_info.py,sha256=srPPhI2UHf6hFPBecyFEVw2SS5aPisIIMsvGgKqi7ss,2366
- datachain/lib/dc.py,sha256=HERJNR4TISbaAtSLARV72INgKPfQRItyd1l28P-GtzU,68871
+ datachain/lib/dc.py,sha256=DkzuKS14kgAEax47Gi9w_XJXV3dbboW85A0YOxwNjKY,68869
  datachain/lib/file.py,sha256=elQLorLbIkusuQSVfiuC_KrGSZI8cGm-iT8fHmckJlo,13774
  datachain/lib/hf.py,sha256=cPnmLuprr0pYABH7KqA5FARQ1JGlywdDwD3yDzVAm4k,5920
  datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
- datachain/lib/listing.py,sha256=e4O1gs3rKJ0eGwb0hSEfD-l9U7x-f-TYqYGF7Ni-x38,3973
+ datachain/lib/listing.py,sha256=NrKb7_6jwp1vEqp5TavSvx3SbLJdvuBzSEclPvbQr30,4013
  datachain/lib/listing_info.py,sha256=36NZ-tXY5Y118wurkajuWWbcE8UCjkRwZlacDtN9F3g,954
  datachain/lib/meta_formats.py,sha256=3f-0vpMTesagS9iMd3y9-u9r-7g0eqYsxmK4fVfNWlw,6635
  datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
  datachain/lib/pytorch.py,sha256=8LNyFaBrx8zws--MEsFg5g3pb8oLnaQAUlgGvtjKxX4,5960
  datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
  datachain/lib/signal_schema.py,sha256=iqgubjCBRiUJB30miv05qFX4uU04dA_Pzi3DCUsHZGs,24177
- datachain/lib/tar.py,sha256=d7FpYyxbHCL1twRt_Oe9QoPbZa2Tn5lj7iWP0HvvRn0,999
+ datachain/lib/tar.py,sha256=3WIzao6yD5fbLqXLTt9GhPGNonbFIs_fDRu-9vgLgsA,1038
  datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
  datachain/lib/udf.py,sha256=nG7DDuPgZ5ZuijwvDoCq-OZMxlDM8vFNzyxMmik0Y1c,11716
  datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
@@ -67,17 +67,16 @@ datachain/lib/convert/python_to_sql.py,sha256=40SAOdoOgikZRhn8iomCPDRoxC3RFxjJLi
  datachain/lib/convert/sql_to_python.py,sha256=lGnKzSF_tz9Y_5SSKkrIU95QEjpcDzvOxIRkEKTQag0,443
  datachain/lib/convert/unflatten.py,sha256=Ogvh_5wg2f38_At_1lN0D_e2uZOOpYEvwvB2xdq56Tw,2012
  datachain/lib/convert/values_to_tuples.py,sha256=YOdbjzHq-uj6-cV2Qq43G72eN2avMNDGl4x5t6yQMl8,3931
- datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
+ datachain/query/__init__.py,sha256=0NBOZVgIDpCcj1Ci883dQ9A0iiwe03xzmotkOCFbxYc,293
  datachain/query/batch.py,sha256=-vlpINJiertlnaoUVv1C95RatU0F6zuhpIYRufJRo1M,3660
- datachain/query/builtins.py,sha256=U6yHPF9bzxqK5iwyqCqbJxo8ggBVx9FtuXxRrQQ0SNM,2244
- datachain/query/dataset.py,sha256=9lhcgccavqypVParE4pvd_Hgg8gmoDAN6m1IkpSwXhE,58219
+ datachain/query/dataset.py,sha256=27QCFhwz4hK-pqTY0hvfFqwxNIH5yxuSjWRl1ZfELd0,55004
  datachain/query/dispatch.py,sha256=GBh3EZHDp5AaXxrjOpfrpfsuy7Umnqxu-MAXcK9X3gc,12945
  datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
  datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
  datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
- datachain/query/schema.py,sha256=ytlkA1xFAUOia25u8d6pxvxBSRl3uivLuOe2eHaw-qc,7550
+ datachain/query/schema.py,sha256=A-Btmu-rR00841PNcs9CjcppfTUc176wT6-m3BOaSgo,8049
  datachain/query/session.py,sha256=UPH5Z4fzCDsvj81ji0e8GA6Mgra3bOAEpVq4htqOtis,4317
- datachain/query/udf.py,sha256=j3NhmKK5rYG5TclcM2Sr0LhS1tmYLMjzMugx9G9iFLM,8100
+ datachain/query/udf.py,sha256=QRDcSgJ_zrY63dyMVD8nq4ky9Q2kaKSoM6bsDqpXOvQ,6682
  datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datachain/remote/studio.py,sha256=f5s6qSZ9uB4URGUoU_8_W1KZRRQQVSm6cgEBkBUEfuE,7226
  datachain/sql/__init__.py,sha256=A2djrbQwSMUZZEIKGnm-mnRA-NDSbiDJNpAmmwGNyIo,303
@@ -97,9 +96,9 @@ datachain/sql/sqlite/base.py,sha256=WLPHBhZbXbiqPoRV1VgDrXJqku4UuvJpBhYeQ0k5rI8,
  datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
  datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
- datachain-0.3.15.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
- datachain-0.3.15.dist-info/METADATA,sha256=E3jImGtRTyvMPTSqFsgwhsHsnZn_9SRVeThmrDXRuf0,17073
- datachain-0.3.15.dist-info/WHEEL,sha256=5Mi1sN9lKoFv_gxcPtisEVrJZihrm_beibeg5R6xb4I,91
- datachain-0.3.15.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
- datachain-0.3.15.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
- datachain-0.3.15.dist-info/RECORD,,
+ datachain-0.3.17.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+ datachain-0.3.17.dist-info/METADATA,sha256=bPMIQkvQjnflmMQhJa3BH3Mi4DpHTAI8KQ5Vd2ur8Mo,17073
+ datachain-0.3.17.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+ datachain-0.3.17.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+ datachain-0.3.17.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+ datachain-0.3.17.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (75.0.0)
+ Generator: setuptools (75.1.0)
  Root-Is-Purelib: true
  Tag: py3-none-any

datachain/query/builtins.py DELETED
@@ -1,96 +0,0 @@
- import hashlib
- import tarfile
- from functools import partial
-
- from datachain.sql.types import String
-
- from .schema import C, DatasetRow, Object
- from .udf import udf
-
- md5 = partial(hashlib.md5, usedforsecurity=False)
-
- __all__ = ["checksum", "index_tar"]
-
-
- def load_tar(raw):
- with tarfile.open(fileobj=raw, mode="r:") as tar:
- return tar.getmembers()
-
-
- @udf(
- (
- C.source,
- C.path,
- C.size,
- C.is_latest,
- C.last_modified,
- C.version,
- C.etag,
- Object(load_tar),
- ),
- DatasetRow.schema,
- )
- def index_tar(
- source,
- parent_path,
- size,
- is_latest,
- last_modified,
- version,
- etag,
- tar_entries,
- ):
- # generate original tar files as well, along with subobjects
- yield DatasetRow.create(
- source=source,
- path=parent_path,
- size=size,
- is_latest=bool(is_latest),
- last_modified=last_modified,
- version=version,
- etag=etag,
- )
-
- for info in tar_entries:
- if info.isfile():
- full_path = f"{parent_path}/{info.name}"
- yield DatasetRow.create(
- source=source,
- path=full_path,
- size=info.size,
- location={
- "vtype": "tar",
- "offset": info.offset_data,
- "size": info.size,
- "parent": {
- "source": source,
- "path": parent_path,
- "version": version,
- "size": size,
- "etag": etag,
- "location": None,
- },
- },
- )
-
-
- BUFSIZE = 2**18
-
-
- def file_digest(fileobj):
- """Calculate the digest of a file-like object."""
- buf = bytearray(BUFSIZE) # Reusable buffer to reduce allocations.
- view = memoryview(buf)
- digestobj = md5()
- # From 3.11's hashlib.filedigest()
- while True:
- size = fileobj.readinto(buf)
- if size == 0:
- break # EOF
- digestobj.update(view[:size])
- return digestobj.hexdigest()
-
-
- @udf(params=[Object(file_digest)], output={"checksum": String})
- def checksum(digest):
- return (digest,)