datachain 0.3.16__py3-none-any.whl → 0.3.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.

@@ -68,8 +68,6 @@ from datachain.utils import (
  DataChainDir,
  batched,
  datachain_paths_join,
- import_object,
- parse_params_string,
  )

  from .datasource import DataSource
@@ -843,7 +841,7 @@ class Catalog:
  from datachain.query import DatasetQuery

  def _row_to_node(d: dict[str, Any]) -> Node:
- del d["source"]
+ del d["file__source"]
  return Node.from_dict(d)

  enlisted_sources: list[tuple[bool, bool, Any]] = []
@@ -1148,30 +1146,28 @@ class Catalog:
  if not sources:
  raise ValueError("Sources needs to be non empty list")

- from datachain.query import DatasetQuery
+ from datachain.lib.dc import DataChain
+ from datachain.query.session import Session
+
+ session = Session.get(catalog=self, client_config=client_config)

- dataset_queries = []
+ chains = []
  for source in sources:
  if source.startswith(DATASET_PREFIX):
- dq = DatasetQuery(
- name=source[len(DATASET_PREFIX) :],
- catalog=self,
- client_config=client_config,
+ dc = DataChain.from_dataset(
+ source[len(DATASET_PREFIX) :], session=session
  )
  else:
- dq = DatasetQuery(
- path=source,
- catalog=self,
- client_config=client_config,
- recursive=recursive,
+ dc = DataChain.from_storage(
+ source, session=session, recursive=recursive
  )

- dataset_queries.append(dq)
+ chains.append(dc)

  # create union of all dataset queries created from sources
- dq = reduce(lambda ds1, ds2: ds1.union(ds2), dataset_queries)
+ dc = reduce(lambda dc1, dc2: dc1.union(dc2), chains)
  try:
- dq.save(name)
+ dc.save(name)
  except Exception as e: # noqa: BLE001
  try:
  ds = self.get_dataset(name)
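For orientation, the hunk above replaces the internal DatasetQuery-based union with the public DataChain API. A minimal sketch of the new pattern, limited to the calls visible in this diff; the DATASET_PREFIX value and the catalog/client_config variables are illustrative assumptions, not taken from this release:

    from functools import reduce

    from datachain.lib.dc import DataChain
    from datachain.query.session import Session

    DATASET_PREFIX = "ds://"  # assumed value, used only for this sketch

    def union_sources(sources, catalog, client_config, name, recursive=True):
        # Mirrors the updated code path: one chain per source, then a single union.
        session = Session.get(catalog=catalog, client_config=client_config)
        chains = [
            DataChain.from_dataset(src[len(DATASET_PREFIX):], session=session)
            if src.startswith(DATASET_PREFIX)
            else DataChain.from_storage(src, session=session, recursive=recursive)
            for src in sources
        ]
        reduce(lambda dc1, dc2: dc1.union(dc2), chains).save(name)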
@@ -1731,26 +1727,6 @@ class Catalog:
  output, sources, client_config=client_config, recursive=recursive
  )

- def apply_udf(
- self,
- udf_location: str,
- source: str,
- target_name: str,
- parallel: Optional[int] = None,
- params: Optional[str] = None,
- ):
- from datachain.query import DatasetQuery
-
- if source.startswith(DATASET_PREFIX):
- ds = DatasetQuery(name=source[len(DATASET_PREFIX) :], catalog=self)
- else:
- ds = DatasetQuery(path=source, catalog=self)
- udf = import_object(udf_location)
- if params:
- args, kwargs = parse_params_string(params)
- udf = udf(*args, **kwargs)
- ds.add_signals(udf, parallel=parallel).save(target_name)
-
  def query(
  self,
  query_script: str,
datachain/cli.py CHANGED
@@ -494,27 +494,6 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
  help="Query parameters",
  )

- apply_udf_parser = subp.add_parser(
- "apply-udf", parents=[parent_parser], description="Apply UDF"
- )
- apply_udf_parser.add_argument("udf", type=str, help="UDF location")
- apply_udf_parser.add_argument("source", type=str, help="Source storage or dataset")
- apply_udf_parser.add_argument("target", type=str, help="Target dataset name")
- apply_udf_parser.add_argument(
- "--parallel",
- nargs="?",
- type=int,
- const=-1,
- default=None,
- metavar="N",
- help=(
- "Use multiprocessing to run the UDF with N worker processes. "
- "N defaults to the CPU count."
- ),
- )
- apply_udf_parser.add_argument(
- "--udf-params", type=str, default=None, help="UDF class parameters"
- )
  subp.add_parser(
  "clear-cache", parents=[parent_parser], description="Clear the local file cache"
  )
@@ -1016,10 +995,6 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09
  parallel=args.parallel,
  params=args.param,
  )
- elif args.command == "apply-udf":
- catalog.apply_udf(
- args.udf, args.source, args.target, args.parallel, args.udf_params
- )
  elif args.command == "clear-cache":
  clear_cache(catalog)
  elif args.command == "gc":
@@ -297,39 +297,6 @@ class AbstractMetastore(ABC, Serializable):
  #
  # Dataset dependencies
  #
-
- def add_dependency(
- self,
- dependency: DatasetDependency,
- source_dataset_name: str,
- source_dataset_version: int,
- ) -> None:
- """Add dependency to dataset or storage."""
- if dependency.is_dataset:
- self.add_dataset_dependency(
- source_dataset_name,
- source_dataset_version,
- dependency.dataset_name,
- int(dependency.version),
- )
- else:
- self.add_storage_dependency(
- source_dataset_name,
- source_dataset_version,
- StorageURI(dependency.name),
- dependency.version,
- )
-
- @abstractmethod
- def add_storage_dependency(
- self,
- source_dataset_name: str,
- source_dataset_version: int,
- storage_uri: StorageURI,
- storage_timestamp_str: Optional[str] = None,
- ) -> None:
- """Adds storage dependency to dataset."""
-
  @abstractmethod
  def add_dataset_dependency(
  self,
@@ -1268,32 +1235,6 @@ class AbstractDBMetastore(AbstractMetastore):
  #
  # Dataset dependencies
  #
-
- def _insert_dataset_dependency(self, data: dict[str, Any]) -> None:
- """Method for inserting dependencies."""
- self.db.execute(self._datasets_dependencies_insert().values(**data))
-
- def add_storage_dependency(
- self,
- source_dataset_name: str,
- source_dataset_version: int,
- storage_uri: StorageURI,
- storage_timestamp_str: Optional[str] = None,
- ) -> None:
- source_dataset = self.get_dataset(source_dataset_name)
- storage = self.get_storage(storage_uri)
-
- self._insert_dataset_dependency(
- {
- "source_dataset_id": source_dataset.id,
- "source_dataset_version_id": (
- source_dataset.get_version(source_dataset_version).id
- ),
- "bucket_id": storage.id,
- "bucket_version": storage_timestamp_str,
- }
- )
-
  def add_dataset_dependency(
  self,
  source_dataset_name: str,
@@ -1305,15 +1246,15 @@ class AbstractDBMetastore(AbstractMetastore):
  source_dataset = self.get_dataset(source_dataset_name)
  dataset = self.get_dataset(dataset_name)

- self._insert_dataset_dependency(
- {
- "source_dataset_id": source_dataset.id,
- "source_dataset_version_id": (
+ self.db.execute(
+ self._datasets_dependencies_insert().values(
+ source_dataset_id=source_dataset.id,
+ source_dataset_version_id=(
  source_dataset.get_version(source_dataset_version).id
  ),
- "dataset_id": dataset.id,
- "dataset_version_id": dataset.get_version(dataset_version).id,
- }
+ dataset_id=dataset.id,
+ dataset_version_id=dataset.get_version(dataset_version).id,
+ )
  )

  def update_dataset_dependency_source(
@@ -651,11 +651,14 @@ class SQLiteWarehouse(AbstractWarehouse):
  self, dataset: DatasetRecord, version: int
  ) -> list[StorageURI]:
  dr = self.dataset_rows(dataset, version)
- query = dr.select(dr.c.source).distinct()
+ query = dr.select(dr.c.file__source).distinct()
  cur = self.db.cursor()
  cur.row_factory = sqlite3.Row # type: ignore[assignment]

- return [StorageURI(row["source"]) for row in self.db.execute(query, cursor=cur)]
+ return [
+ StorageURI(row["file__source"])
+ for row in self.db.execute(query, cursor=cur)
+ ]

  def merge_dataset_rows(
  self,
@@ -942,28 +942,6 @@ class AbstractWarehouse(ABC, Serializable):
  self.db.drop_table(Table(name, self.db.metadata), if_exists=True)
  pbar.update(1)

- def changed_query(
- self,
- source_query: sa.sql.selectable.Select,
- target_query: sa.sql.selectable.Select,
- ) -> sa.sql.selectable.Select:
- sq = source_query.alias("source_query")
- tq = target_query.alias("target_query")
-
- source_target_join = sa.join(
- sq, tq, (sq.c.source == tq.c.source) & (sq.c.path == tq.c.path)
- )
-
- return (
- select(*sq.c)
- .select_from(source_target_join)
- .where(
- (sq.c.last_modified > tq.c.last_modified)
- & (sq.c.is_latest == true())
- & (tq.c.is_latest == true())
- )
- )
-

  def _random_string(length: int) -> str:
  return "".join(
datachain/lib/dc.py CHANGED
@@ -1337,8 +1337,7 @@ class DataChain(DatasetQuery):
  other.signals_schema.resolve(*right_on).db_signals(),
  ) # type: ignore[arg-type]
  )
-
- return super()._subtract(other, signals) # type: ignore[arg-type]
+ return super().subtract(other, signals) # type: ignore[arg-type]

  @classmethod
  def from_values(
datachain/lib/listing.py CHANGED
@@ -77,6 +77,7 @@ def parse_listing_uri(uri: str, cache, client_config) -> tuple[str, str, str]:
  """
  Parsing uri and returns listing dataset name, listing uri and listing path
  """
+ client_config = client_config or {}
  client = Client.get_client(uri, cache, **client_config)
  storage_uri, path = Client.parse_url(uri)

datachain/lib/tar.py CHANGED
@@ -30,4 +30,5 @@ def process_tar(file: File) -> Iterator[File]:
  with file.open() as fd:
  with tarfile.open(fileobj=fd) as tar:
  for entry in tar.getmembers():
- yield build_tar_member(file, entry)
+ if entry.isfile():
+ yield build_tar_member(file, entry)
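The change above makes process_tar yield only regular file members, skipping directories and links inside the archive. A hedged usage sketch, assuming the generator is mapped over the file signal with DataChain.gen; the bucket URI and dataset name are placeholders:

    from datachain.lib.dc import DataChain
    from datachain.lib.tar import process_tar

    # One output row per regular file member of each archive.
    chain = DataChain.from_storage("s3://example-bucket/archives/").gen(file=process_tar)
    chain.save("tar_members")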
datachain/node.py CHANGED
@@ -114,9 +114,23 @@ class Node:
  )

  @classmethod
- def from_dict(cls, d: dict[str, Any]) -> "Self":
- kw = {f.name: d[f.name] for f in attrs.fields(cls) if f.name in d}
- return cls(**kw)
+ def from_dict(cls, d: dict[str, Any], file_prefix: str = "file") -> "Self":
+ def _dval(field_name: str):
+ return d.get(f"{file_prefix}__{field_name}")
+
+ return cls(
+ sys__id=d["sys__id"],
+ sys__rand=d["sys__rand"],
+ source=_dval("source"),
+ path=_dval("path"),
+ etag=_dval("etag"),
+ is_latest=_dval("is_latest"),
+ size=_dval("size"),
+ last_modified=_dval("last_modified"),
+ version=_dval("version"),
+ location=_dval("location"),
+ dir_type=DirType.FILE,
+ )

  @classmethod
  def from_dir(cls, path, **kwargs) -> "Node":
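Node.from_dict now reads file fields from prefixed row columns instead of copying attrs fields one-to-one. A small sketch of the row shape it expects after this change; the values are made up and only the key names matter:

    from datachain.node import Node

    row = {
        "sys__id": 1,
        "sys__rand": 12345,
        "file__source": "s3://example-bucket",
        "file__path": "images/cat.jpg",
        "file__etag": "abc123",
        "file__is_latest": True,
        "file__size": 1024,
        "file__last_modified": None,
        "file__version": "",
        "file__location": None,
    }
    node = Node.from_dict(row)  # file_prefix defaults to "file"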
@@ -2,7 +2,6 @@ from .dataset import DatasetQuery
  from .params import param
  from .schema import C, DatasetRow, LocalFilename, Object, Stream
  from .session import Session
- from .udf import udf

  __all__ = [
  "C",
@@ -13,5 +12,4 @@ __all__ = [
  "Session",
  "Stream",
  "param",
- "udf",
  ]
@@ -3,7 +3,6 @@ import inspect
  import logging
  import os
  import random
- import re
  import string
  import subprocess
  import sys
@@ -36,7 +35,6 @@ from sqlalchemy.sql.selectable import Select

  from datachain.asyn import ASYNC_WORKERS, AsyncMapper, OrderedMapper
  from datachain.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE, get_catalog
- from datachain.client import Client
  from datachain.data_storage.schema import (
  PARTITION_COLUMN_ID,
  partition_col_names,
@@ -46,7 +44,6 @@ from datachain.dataset import DatasetStatus, RowDict
  from datachain.error import DatasetNotFoundError, QueryScriptCancelError
  from datachain.progress import CombinedDownloadCallback
  from datachain.sql.functions import rand
- from datachain.storage import Storage, StorageURI
  from datachain.utils import (
  batched,
  determine_processes,
@@ -77,9 +74,7 @@ INSERT_BATCH_SIZE = 10000

  PartitionByType = Union[ColumnElement, Sequence[ColumnElement]]
  JoinPredicateType = Union[str, ColumnClause, ColumnElement]
- # dependency can be either dataset_name + dataset_version tuple or just storage uri
- # depending what type of dependency we are adding
- DatasetDependencyType = Union[tuple[str, int], StorageURI]
+ DatasetDependencyType = tuple[str, int]

  logger = logging.getLogger("datachain")

@@ -185,38 +180,6 @@ class QueryStep(StartingStep):
  )


- @frozen
- class IndexingStep(StartingStep):
- path: str
- catalog: "Catalog"
- kwargs: dict[str, Any]
- recursive: Optional[bool] = True
-
- def apply(self):
- self.catalog.index([self.path], **self.kwargs)
- uri, path = Client.parse_url(self.path)
- _partial_id, partial_path = self.catalog.metastore.get_valid_partial_id(
- uri, path
- )
- dataset = self.catalog.get_dataset(Storage.dataset_name(uri, partial_path))
- dataset_rows = self.catalog.warehouse.dataset_rows(
- dataset, dataset.latest_version
- )
-
- def q(*columns):
- col_names = [c.name for c in columns]
- return self.catalog.warehouse.nodes_dataset_query(
- dataset_rows,
- column_names=col_names,
- path=path,
- recursive=self.recursive,
- )
-
- storage = self.catalog.metastore.get_storage(uri)
-
- return step_result(q, dataset_rows.c, dependencies=[storage.uri])
-
-
  def generator_then_call(generator, func: Callable):
  """
  Yield items from generator then execute a function and yield
@@ -230,7 +193,7 @@ def generator_then_call(generator, func: Callable):
  class DatasetDiffOperation(Step):
  """
  Abstract class for operations that are calculation some kind of diff between
- datasets queries like subtract, changed etc.
+ datasets queries like subtract etc.
  """

  dq: "DatasetQuery"
@@ -304,28 +267,6 @@ class Subtract(DatasetDiffOperation):
  return sq.select().except_(sq.select().where(where_clause))


- @frozen
- class Changed(DatasetDiffOperation):
- """
- Calculates rows that are changed in a source query compared to target query
- Changed means it has same source + path but different last_modified
- Example:
- >>> ds = DatasetQuery(name="dogs_cats") # some older dataset with embeddings
- >>> ds_updated = (
- DatasetQuery("gs://dvcx-datalakes/dogs-and-cats")
- .filter(C.size > 1000) # we can also filter out source query
- .changed(ds)
- .add_signals(calc_embeddings) # calculae embeddings only on changed rows
- .union(ds) # union with old dataset that's missing updated rows
- .save("dogs_cats_updated")
- )
-
- """
-
- def query(self, source_query: Select, target_query: Select) -> Select:
- return self.catalog.warehouse.changed_query(source_query, target_query)
-
-
  def adjust_outputs(
  warehouse: "AbstractWarehouse", row: dict[str, Any], udf_col_types: list[tuple]
  ) -> dict[str, Any]:
@@ -1096,28 +1037,14 @@ class ResultIter:
  class DatasetQuery:
  def __init__(
  self,
- path: str = "",
- name: str = "",
+ name: str,
  version: Optional[int] = None,
  catalog: Optional["Catalog"] = None,
- client_config=None,
- recursive: Optional[bool] = True,
  session: Optional[Session] = None,
- anon: bool = False,
- indexing_feature_schema: Optional[dict] = None,
  indexing_column_types: Optional[dict[str, Any]] = None,
- update: Optional[bool] = False,
  in_memory: bool = False,
  ):
- if client_config is None:
- client_config = {}
-
- if anon:
- client_config["anon"] = True
-
- self.session = Session.get(
- session, catalog=catalog, client_config=client_config, in_memory=in_memory
- )
+ self.session = Session.get(session, catalog=catalog, in_memory=in_memory)
  self.catalog = catalog or self.session.catalog
  self.steps: list[Step] = []
  self._chunk_index: Optional[int] = None
@@ -1131,26 +1058,14 @@ class DatasetQuery:
  self.feature_schema: Optional[dict] = None
  self.column_types: Optional[dict[str, Any]] = None

- if path:
- kwargs = {"update": True} if update else {}
- self.starting_step = IndexingStep(path, self.catalog, kwargs, recursive)
- self.feature_schema = indexing_feature_schema
- self.column_types = indexing_column_types
- elif name:
- self.name = name
- ds = self.catalog.get_dataset(name)
- self.version = version or ds.latest_version
- self.feature_schema = ds.get_version(self.version).feature_schema
- self.column_types = copy(ds.schema)
- if "sys__id" in self.column_types:
- self.column_types.pop("sys__id")
- self.starting_step = QueryStep(self.catalog, name, self.version)
- else:
- raise ValueError("must provide path or name")
-
- @staticmethod
- def is_storage_path(path):
- return bool(re.compile(r"^[a-zA-Z0-9]+://").match(path))
+ self.name = name
+ ds = self.catalog.get_dataset(name)
+ self.version = version or ds.latest_version
+ self.feature_schema = ds.get_version(self.version).feature_schema
+ self.column_types = copy(ds.schema)
+ if "sys__id" in self.column_types:
+ self.column_types.pop("sys__id")
+ self.starting_step = QueryStep(self.catalog, name, self.version)

  def __iter__(self):
  return iter(self.db_results())
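With the path/indexing branch gone, DatasetQuery is now constructed only from a saved dataset name; storage paths are expected to go through DataChain.from_storage instead. A brief sketch (the dataset name is a placeholder and must already exist in the catalog):

    from datachain.query import DatasetQuery

    # 0.3.17: name is required; path=, anon=, client_config= and recursive= were removed.
    dq = DatasetQuery("my_dataset", version=1)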
@@ -1595,21 +1510,11 @@ class DatasetQuery:
  return query

  @detach
- def subtract(self, dq: "DatasetQuery") -> "Self":
- return self._subtract(dq, on=[("source", "source"), ("path", "path")])
-
- @detach
- def _subtract(self, dq: "DatasetQuery", on: Sequence[tuple[str, str]]) -> "Self":
+ def subtract(self, dq: "DatasetQuery", on: Sequence[tuple[str, str]]) -> "Self":
  query = self.clone()
  query.steps.append(Subtract(dq, self.catalog, on=on))
  return query

- @detach
- def changed(self, dq: "DatasetQuery") -> "Self":
- query = self.clone()
- query.steps.append(Changed(dq, self.catalog))
- return query
-
  @detach
  def generate(
  self,
@@ -1640,24 +1545,13 @@ class DatasetQuery:

  def _add_dependencies(self, dataset: "DatasetRecord", version: int):
  for dependency in self.dependencies:
- if isinstance(dependency, tuple):
- # dataset dependency
- ds_dependency_name, ds_dependency_version = dependency
- self.catalog.metastore.add_dataset_dependency(
- dataset.name,
- version,
- ds_dependency_name,
- ds_dependency_version,
- )
- else:
- # storage dependency - its name is a valid StorageURI
- storage = self.catalog.metastore.get_storage(dependency)
- self.catalog.metastore.add_storage_dependency(
- StorageURI(dataset.name),
- version,
- storage.uri,
- storage.timestamp_str,
- )
+ ds_dependency_name, ds_dependency_version = dependency
+ self.catalog.metastore.add_dataset_dependency(
+ dataset.name,
+ version,
+ ds_dependency_name,
+ ds_dependency_version,
+ )

  def exec(self) -> "Self":
  """Execute the query."""
datachain/query/schema.py CHANGED
@@ -19,6 +19,17 @@ if TYPE_CHECKING:
  DEFAULT_DELIMITER = "__"


+ def file_signals(row, signal_name="file"):
+ # TODO this is workaround until we decide what to do with these classes
+ prefix = f"{signal_name}{DEFAULT_DELIMITER}"
+ return {
+ c_name.removeprefix(prefix): c_value
+ for c_name, c_value in row.items()
+ if c_name.startswith(prefix)
+ and DEFAULT_DELIMITER not in c_name.removeprefix(prefix)
+ }
+
+
  class ColumnMeta(type):
  @staticmethod
  def to_db_name(name: str) -> str:
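The new file_signals helper strips the file prefix from flat row keys and drops nested signals. A quick illustration with a made-up row:

    from datachain.query.schema import file_signals

    row = {
        "file__source": "s3://example-bucket",
        "file__path": "images/cat.jpg",
        "file__location__offset": 0,  # nested signal: excluded
        "score": 0.9,                 # not under the "file" prefix: excluded
    }
    assert file_signals(row) == {
        "source": "s3://example-bucket",
        "path": "images/cat.jpg",
    }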
@@ -86,8 +97,8 @@ class Object(UDFParameter):
  cb: Callback = DEFAULT_CALLBACK,
  **kwargs,
  ) -> Any:
- client = catalog.get_client(row["source"])
- uid = catalog._get_row_uid(row)
+ client = catalog.get_client(row["file__source"])
+ uid = catalog._get_row_uid(file_signals(row))
  if cache:
  client.download(uid, callback=cb)
  with client.open_object(uid, use_cache=cache, cb=cb) as f:
@@ -103,8 +114,8 @@ class Object(UDFParameter):
  cb: Callback = DEFAULT_CALLBACK,
  **kwargs,
  ) -> Any:
- client = catalog.get_client(row["source"])
- uid = catalog._get_row_uid(row)
+ client = catalog.get_client(row["file__source"])
+ uid = catalog._get_row_uid(file_signals(row))
  if cache:
  await client._download(uid, callback=cb)
  obj = await mapper.to_thread(
@@ -129,8 +140,8 @@ class Stream(UDFParameter):
  cb: Callback = DEFAULT_CALLBACK,
  **kwargs,
  ) -> Any:
- client = catalog.get_client(row["source"])
- uid = catalog._get_row_uid(row)
+ client = catalog.get_client(row["file__source"])
+ uid = catalog._get_row_uid(file_signals(row))
  if cache:
  client.download(uid, callback=cb)
  return client.open_object(uid, use_cache=cache, cb=cb)
@@ -145,8 +156,8 @@ class Stream(UDFParameter):
  cb: Callback = DEFAULT_CALLBACK,
  **kwargs,
  ) -> Any:
- client = catalog.get_client(row["source"])
- uid = catalog._get_row_uid(row)
+ client = catalog.get_client(row["file__source"])
+ uid = catalog._get_row_uid(file_signals(row))
  if cache:
  await client._download(uid, callback=cb)
  return await mapper.to_thread(
@@ -178,8 +189,8 @@ class LocalFilename(UDFParameter):
  # If the glob pattern is specified and the row filename
  # does not match it, then return None
  return None
- client = catalog.get_client(row["source"])
- uid = catalog._get_row_uid(row)
+ client = catalog.get_client(row["file__source"])
+ uid = catalog._get_row_uid(file_signals(row))
  client.download(uid, callback=cb)
  return client.cache.get_path(uid)

@@ -197,8 +208,8 @@ class LocalFilename(UDFParameter):
  # If the glob pattern is specified and the row filename
  # does not match it, then return None
  return None
- client = catalog.get_client(row["source"])
- uid = catalog._get_row_uid(row)
+ client = catalog.get_client(row["file__source"])
+ uid = catalog._get_row_uid(file_signals(row))
  await client._download(uid, callback=cb)
  return client.cache.get_path(uid)

datachain/query/udf.py CHANGED
@@ -1,8 +1,7 @@
  import typing
- from collections.abc import Iterable, Iterator, Mapping, Sequence
+ from collections.abc import Iterable, Iterator, Sequence
  from dataclasses import dataclass
  from functools import WRAPPER_ASSIGNMENTS
- from inspect import isclass
  from typing import (
  TYPE_CHECKING,
  Any,
@@ -23,11 +22,7 @@ from .batch import (
  RowsOutputBatch,
  UDFInputBatch,
  )
- from .schema import (
- UDFParameter,
- UDFParamSpec,
- normalize_param,
- )
+ from .schema import UDFParameter

  if TYPE_CHECKING:
  from datachain.catalog import Catalog
@@ -66,41 +61,6 @@ class UDFProperties:
  return self.output.keys()


- def udf(
- params: Sequence[UDFParamSpec],
- output: UDFOutputSpec,
- *,
- method: Optional[str] = None, # only used for class-based UDFs
- batch: int = 1,
- ):
- """
- Decorate a function or a class to be used as a UDF.
-
- The decorator expects both the outputs and inputs of the UDF to be specified.
- The outputs are defined as a collection of tuples containing the signal name
- and type.
- Parameters are defined as a list of column objects (e.g. C.name).
- Optionally, UDFs can be run on batches of rows to improve performance, this
- is determined by the 'batch' parameter. When operating on batches of inputs,
- the UDF function will be called with a single argument - a list
- of tuples containing inputs (e.g. ((input1_a, input1_b), (input2_a, input2b))).
- """
- if isinstance(params, str):
- params = (params,)
- if not isinstance(output, Mapping):
- raise TypeError(f"'output' must be a mapping, got {type(output).__name__}")
-
- properties = UDFProperties([normalize_param(p) for p in params], output, batch)
-
- def decorator(udf_base: Union[Callable, type]):
- if isclass(udf_base):
- return UDFClassWrapper(udf_base, properties, method=method)
- if callable(udf_base):
- return UDFWrapper(udf_base, properties)
-
- return decorator
-
-
  class UDFBase:
  """A base class for implementing stateful UDFs."""

datachain/utils.py CHANGED
@@ -1,5 +1,4 @@
  import glob
- import importlib.util
  import io
  import json
  import os
@@ -198,45 +197,6 @@ def get_envs_by_prefix(prefix: str) -> dict[str, str]:
  return variables


- def import_object(object_spec):
- filename, identifier = object_spec.rsplit(":", 1)
- filename = filename.strip()
- identifier = identifier.strip()
-
- if not identifier.isidentifier() or not filename.endswith(".py"):
- raise ValueError(f"Invalid object spec: {object_spec}")
-
- modname = os.path.abspath(filename)
- if modname in sys.modules:
- module = sys.modules[modname]
- else:
- # Use importlib to find and load the module from the given filename
- spec = importlib.util.spec_from_file_location(modname, filename)
- module = importlib.util.module_from_spec(spec)
- sys.modules[modname] = module
- spec.loader.exec_module(module)
-
- return getattr(module, identifier)
-
-
- def parse_params_string(params: str):
- """
- Parse a string containing UDF class constructor parameters in the form
- `a, b, key=val` into *args and **kwargs.
- """
- args = []
- kwargs = {}
- for part in params.split():
- if "=" in part:
- key, val = part.split("=")
- kwargs[key] = val
- else:
- args.append(part)
- if any((args, kwargs)):
- return args, kwargs
- return None, None
-
-
  _T_co = TypeVar("_T_co", covariant=True)


@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: datachain
- Version: 0.3.16
+ Version: 0.3.17
  Summary: Wrangle unstructured AI data at scale
  Author-email: Dmitry Petrov <support@dvc.org>
  License: Apache-2.0
@@ -2,22 +2,22 @@ datachain/__init__.py,sha256=GeyhE-5LgfJav2OKYGaieP2lBvf2Gm-ihj7thnK9zjI,800
  datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
  datachain/asyn.py,sha256=Lg3Ck1PQLjQziMx9KU4atzbEnJXTE0924WMYkhgWtGU,8247
  datachain/cache.py,sha256=WP-ktH_bRn3w2g1JOOQ7rCPsZyR4OM6K1Kb7yZsSSns,4056
- datachain/cli.py,sha256=alMjnoBUBLvBSMBR51N09rA_aUEdHJwyxSRogF7VbbA,30891
+ datachain/cli.py,sha256=zObcD5W8dzUJKk2RGQ1MxQLEr3jnox6bybU8WyDaIqE,29941
  datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
  datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
  datachain/dataset.py,sha256=sHnsmKfMg2bK88gZH1izk8jlbmJDEhQpyOemdaPQVFo,14761
  datachain/error.py,sha256=OnZ8OaBtDdTZPy8XQiy29SAjqdQArQeorYbP5ju7ldc,1199
  datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
  datachain/listing.py,sha256=vfjOlcb98A7xkGGKWEYON6l7lfrOqNv6kldmdVnlJn4,8178
- datachain/node.py,sha256=2pF3Y9oYzElfiUBcw2LIv7LNNt--V4E-K021zjv0b0I,4748
+ datachain/node.py,sha256=-Y8O7q7NtIm_jX0HgjhjvdFwm73TrO5QBslxvFVwTJE,5208
  datachain/nodes_fetcher.py,sha256=kca19yvu11JxoVY1t4_ydp1FmchiV88GnNicNBQ9NIA,831
  datachain/nodes_thread_pool.py,sha256=ZyzBvUImIPmi4WlKC2SW2msA0UhtembbTdcs2nx29A0,3191
  datachain/progress.py,sha256=7_8FtJs770ITK9sMq-Lt4k4k18QmYl4yIG_kCoWID3o,4559
  datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
- datachain/utils.py,sha256=Z9-lPNvrrAh_VWpzVBJ7L5-Oy_Oo1V0ZW7G0MVDyPK4,13065
+ datachain/utils.py,sha256=VGAcTWjGF0e2qB3Se77shhpiqGMo-ol0QAwf3MH5b7c,11857
  datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
- datachain/catalog/catalog.py,sha256=kPg5ILeCWSjXCj3ewUZY6kzj36HTEqajB3mJDkbs-Vo,69023
+ datachain/catalog/catalog.py,sha256=IAaaSVFxtJxVlIGEbu8sHinmYyeDGY6dg7APrtOchVk,68278
  datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
  datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
  datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
@@ -32,28 +32,28 @@ datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZ
  datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
  datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
  datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
- datachain/data_storage/metastore.py,sha256=cHN0xmbUvChyayHHZm3Vqxr87jFqojPSlGBqhTPStlE,54519
+ datachain/data_storage/metastore.py,sha256=BePe3bVxo-Zuuccok8TLRo4cMHVnAIa8hfZMadbxzqM,52649
  datachain/data_storage/schema.py,sha256=AGbjyEir5UmRZXI3m0jChZogUh5wd8csj6-YlUWaAxQ,8383
  datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
- datachain/data_storage/sqlite.py,sha256=3OehNpYb4WJYt4RhPxZrQn9UL1yiHX7Fp1W53o-Y1NA,28788
- datachain/data_storage/warehouse.py,sha256=g_yWXpw5iC-VYi8gH0ctDlwO3Mo6AT-32j3Nw6TFgqw,32857
+ datachain/data_storage/sqlite.py,sha256=EBKJncuzcyQfcKFm2mUjvHjHRTODsteM-k_zndunBrw,28834
+ datachain/data_storage/warehouse.py,sha256=Vwhu_OfcNAoTtg1BHui80VCzlPeTUjZQL0QWziu8awY,32186
  datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datachain/lib/arrow.py,sha256=voY9KuJ2uhPxw_DS6rIjwfKjWXi84T3LFJ7kGFcDQuk,7272
  datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
  datachain/lib/data_model.py,sha256=gHIjlow84GMRDa78yLL1Ud-N18or21fnTyPEwsatpXY,2045
  datachain/lib/dataset_info.py,sha256=srPPhI2UHf6hFPBecyFEVw2SS5aPisIIMsvGgKqi7ss,2366
- datachain/lib/dc.py,sha256=HERJNR4TISbaAtSLARV72INgKPfQRItyd1l28P-GtzU,68871
+ datachain/lib/dc.py,sha256=DkzuKS14kgAEax47Gi9w_XJXV3dbboW85A0YOxwNjKY,68869
  datachain/lib/file.py,sha256=elQLorLbIkusuQSVfiuC_KrGSZI8cGm-iT8fHmckJlo,13774
  datachain/lib/hf.py,sha256=cPnmLuprr0pYABH7KqA5FARQ1JGlywdDwD3yDzVAm4k,5920
  datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
- datachain/lib/listing.py,sha256=e4O1gs3rKJ0eGwb0hSEfD-l9U7x-f-TYqYGF7Ni-x38,3973
+ datachain/lib/listing.py,sha256=NrKb7_6jwp1vEqp5TavSvx3SbLJdvuBzSEclPvbQr30,4013
  datachain/lib/listing_info.py,sha256=36NZ-tXY5Y118wurkajuWWbcE8UCjkRwZlacDtN9F3g,954
  datachain/lib/meta_formats.py,sha256=3f-0vpMTesagS9iMd3y9-u9r-7g0eqYsxmK4fVfNWlw,6635
  datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
  datachain/lib/pytorch.py,sha256=8LNyFaBrx8zws--MEsFg5g3pb8oLnaQAUlgGvtjKxX4,5960
  datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
  datachain/lib/signal_schema.py,sha256=iqgubjCBRiUJB30miv05qFX4uU04dA_Pzi3DCUsHZGs,24177
- datachain/lib/tar.py,sha256=d7FpYyxbHCL1twRt_Oe9QoPbZa2Tn5lj7iWP0HvvRn0,999
+ datachain/lib/tar.py,sha256=3WIzao6yD5fbLqXLTt9GhPGNonbFIs_fDRu-9vgLgsA,1038
  datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
  datachain/lib/udf.py,sha256=nG7DDuPgZ5ZuijwvDoCq-OZMxlDM8vFNzyxMmik0Y1c,11716
  datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
@@ -67,17 +67,16 @@ datachain/lib/convert/python_to_sql.py,sha256=40SAOdoOgikZRhn8iomCPDRoxC3RFxjJLi
  datachain/lib/convert/sql_to_python.py,sha256=lGnKzSF_tz9Y_5SSKkrIU95QEjpcDzvOxIRkEKTQag0,443
  datachain/lib/convert/unflatten.py,sha256=Ogvh_5wg2f38_At_1lN0D_e2uZOOpYEvwvB2xdq56Tw,2012
  datachain/lib/convert/values_to_tuples.py,sha256=YOdbjzHq-uj6-cV2Qq43G72eN2avMNDGl4x5t6yQMl8,3931
- datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
+ datachain/query/__init__.py,sha256=0NBOZVgIDpCcj1Ci883dQ9A0iiwe03xzmotkOCFbxYc,293
  datachain/query/batch.py,sha256=-vlpINJiertlnaoUVv1C95RatU0F6zuhpIYRufJRo1M,3660
- datachain/query/builtins.py,sha256=U6yHPF9bzxqK5iwyqCqbJxo8ggBVx9FtuXxRrQQ0SNM,2244
- datachain/query/dataset.py,sha256=tBmAlcz6orJbKWkcvGVE4wom-EWInFaXHJYMSpVZnhA,58892
+ datachain/query/dataset.py,sha256=27QCFhwz4hK-pqTY0hvfFqwxNIH5yxuSjWRl1ZfELd0,55004
  datachain/query/dispatch.py,sha256=GBh3EZHDp5AaXxrjOpfrpfsuy7Umnqxu-MAXcK9X3gc,12945
  datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
  datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
  datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
- datachain/query/schema.py,sha256=ytlkA1xFAUOia25u8d6pxvxBSRl3uivLuOe2eHaw-qc,7550
+ datachain/query/schema.py,sha256=A-Btmu-rR00841PNcs9CjcppfTUc176wT6-m3BOaSgo,8049
  datachain/query/session.py,sha256=UPH5Z4fzCDsvj81ji0e8GA6Mgra3bOAEpVq4htqOtis,4317
- datachain/query/udf.py,sha256=j3NhmKK5rYG5TclcM2Sr0LhS1tmYLMjzMugx9G9iFLM,8100
+ datachain/query/udf.py,sha256=QRDcSgJ_zrY63dyMVD8nq4ky9Q2kaKSoM6bsDqpXOvQ,6682
  datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datachain/remote/studio.py,sha256=f5s6qSZ9uB4URGUoU_8_W1KZRRQQVSm6cgEBkBUEfuE,7226
  datachain/sql/__init__.py,sha256=A2djrbQwSMUZZEIKGnm-mnRA-NDSbiDJNpAmmwGNyIo,303
@@ -97,9 +96,9 @@ datachain/sql/sqlite/base.py,sha256=WLPHBhZbXbiqPoRV1VgDrXJqku4UuvJpBhYeQ0k5rI8,
  datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
  datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
- datachain-0.3.16.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
- datachain-0.3.16.dist-info/METADATA,sha256=EjMy4f4OVbwVttlWRzzXRLr-uAEAGNMPMmge96_CI2o,17073
- datachain-0.3.16.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
- datachain-0.3.16.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
- datachain-0.3.16.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
- datachain-0.3.16.dist-info/RECORD,,
+ datachain-0.3.17.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+ datachain-0.3.17.dist-info/METADATA,sha256=bPMIQkvQjnflmMQhJa3BH3Mi4DpHTAI8KQ5Vd2ur8Mo,17073
+ datachain-0.3.17.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+ datachain-0.3.17.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+ datachain-0.3.17.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+ datachain-0.3.17.dist-info/RECORD,,
@@ -1,96 +0,0 @@
- import hashlib
- import tarfile
- from functools import partial
-
- from datachain.sql.types import String
-
- from .schema import C, DatasetRow, Object
- from .udf import udf
-
- md5 = partial(hashlib.md5, usedforsecurity=False)
-
- __all__ = ["checksum", "index_tar"]
-
-
- def load_tar(raw):
- with tarfile.open(fileobj=raw, mode="r:") as tar:
- return tar.getmembers()
-
-
- @udf(
- (
- C.source,
- C.path,
- C.size,
- C.is_latest,
- C.last_modified,
- C.version,
- C.etag,
- Object(load_tar),
- ),
- DatasetRow.schema,
- )
- def index_tar(
- source,
- parent_path,
- size,
- is_latest,
- last_modified,
- version,
- etag,
- tar_entries,
- ):
- # generate original tar files as well, along with subobjects
- yield DatasetRow.create(
- source=source,
- path=parent_path,
- size=size,
- is_latest=bool(is_latest),
- last_modified=last_modified,
- version=version,
- etag=etag,
- )
-
- for info in tar_entries:
- if info.isfile():
- full_path = f"{parent_path}/{info.name}"
- yield DatasetRow.create(
- source=source,
- path=full_path,
- size=info.size,
- location={
- "vtype": "tar",
- "offset": info.offset_data,
- "size": info.size,
- "parent": {
- "source": source,
- "path": parent_path,
- "version": version,
- "size": size,
- "etag": etag,
- "location": None,
- },
- },
- )
-
-
- BUFSIZE = 2**18
-
-
- def file_digest(fileobj):
- """Calculate the digest of a file-like object."""
- buf = bytearray(BUFSIZE) # Reusable buffer to reduce allocations.
- view = memoryview(buf)
- digestobj = md5()
- # From 3.11's hashlib.filedigest()
- while True:
- size = fileobj.readinto(buf)
- if size == 0:
- break # EOF
- digestobj.update(view[:size])
- return digestobj.hexdigest()
-
-
- @udf(params=[Object(file_digest)], output={"checksum": String})
- def checksum(digest):
- return (digest,)