datachain 0.2.16__py3-none-any.whl → 0.2.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.

datachain/catalog/catalog.py CHANGED
@@ -236,36 +236,36 @@ class DatasetRowsFetcher(NodesThreadPool):
         import lz4.frame
         import pandas as pd
 
-        metastore = self.metastore.clone()  # metastore is not thread safe
-        warehouse = self.warehouse.clone()  # warehouse is not thread safe
-        dataset = metastore.get_dataset(self.dataset_name)
-
-        urls = list(urls)
-        while urls:
-            for url in urls:
-                if self.should_check_for_status():
-                    self.check_for_status()
-
-                r = requests.get(url, timeout=PULL_DATASET_CHUNK_TIMEOUT)
-                if r.status_code == 404:
-                    time.sleep(PULL_DATASET_SLEEP_INTERVAL)
-                    # moving to the next url
-                    continue
+        # metastore and warehouse are not thread safe
+        with self.metastore.clone() as metastore, self.warehouse.clone() as warehouse:
+            dataset = metastore.get_dataset(self.dataset_name)
 
-                r.raise_for_status()
+            urls = list(urls)
+            while urls:
+                for url in urls:
+                    if self.should_check_for_status():
+                        self.check_for_status()
 
-                df = pd.read_parquet(io.BytesIO(lz4.frame.decompress(r.content)))
+                    r = requests.get(url, timeout=PULL_DATASET_CHUNK_TIMEOUT)
+                    if r.status_code == 404:
+                        time.sleep(PULL_DATASET_SLEEP_INTERVAL)
+                        # moving to the next url
+                        continue
 
-                self.fix_columns(df)
+                    r.raise_for_status()
 
-                # id will be autogenerated in DB
-                df = df.drop("sys__id", axis=1)
+                    df = pd.read_parquet(io.BytesIO(lz4.frame.decompress(r.content)))
 
-                inserted = warehouse.insert_dataset_rows(
-                    df, dataset, self.dataset_version
-                )
-                self.increase_counter(inserted)  # type: ignore [arg-type]
-                urls.remove(url)
+                    self.fix_columns(df)
+
+                    # id will be autogenerated in DB
+                    df = df.drop("sys__id", axis=1)
+
+                    inserted = warehouse.insert_dataset_rows(
+                        df, dataset, self.dataset_version
+                    )
+                    self.increase_counter(inserted)  # type: ignore [arg-type]
+                    urls.remove(url)
 
 
 @dataclass
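Both clones now act as context managers, so each worker thread can scope its private metastore and warehouse connections instead of leaking them. A minimal sketch of the same pattern outside the fetcher, assuming a default local catalog and a hypothetical dataset name:

.. code:: python

    from datachain.catalog import get_catalog

    catalog = get_catalog()

    # Per-thread clones; __exit__ releases the clone's resources without
    # touching the shared parent connections.
    with catalog.metastore.clone() as metastore, catalog.warehouse.clone() as warehouse:
        dataset = metastore.get_dataset("my_dataset")  # hypothetical dataset name
        ...  # read or insert rows through `warehouse` here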
@@ -720,7 +720,6 @@ class Catalog:
             client.uri, posixpath.join(prefix, "")
         )
         source_metastore = self.metastore.clone(client.uri)
-        source_warehouse = self.warehouse.clone()
 
         columns = [
             Column("vtype", String),
@@ -1835,25 +1834,29 @@ class Catalog:
         if signed_urls:
             shuffle(signed_urls)
 
-        rows_fetcher = DatasetRowsFetcher(
-            self.metastore.clone(),
-            self.warehouse.clone(),
-            remote_config,
-            dataset.name,
-            version,
-            schema,
-        )
-        try:
-            rows_fetcher.run(
-                batched(
-                    signed_urls,
-                    math.ceil(len(signed_urls) / PULL_DATASET_MAX_THREADS),
-                ),
-                dataset_save_progress_bar,
+        with (
+            self.metastore.clone() as metastore,
+            self.warehouse.clone() as warehouse,
+        ):
+            rows_fetcher = DatasetRowsFetcher(
+                metastore,
+                warehouse,
+                remote_config,
+                dataset.name,
+                version,
+                schema,
             )
-        except:
-            self.remove_dataset(dataset.name, version)
-            raise
+            try:
+                rows_fetcher.run(
+                    batched(
+                        signed_urls,
+                        math.ceil(len(signed_urls) / PULL_DATASET_MAX_THREADS),
+                    ),
+                    dataset_save_progress_bar,
+                )
+            except:
+                self.remove_dataset(dataset.name, version)
+                raise
 
         dataset = self.metastore.update_dataset_status(
             dataset,
datachain/data_storage/db_engine.py CHANGED
@@ -4,7 +4,6 @@ from collections.abc import Iterator
 from typing import TYPE_CHECKING, Any, ClassVar, Optional, Union
 
 import sqlalchemy as sa
-from attrs import frozen
 from sqlalchemy.sql import FROM_LINTING
 from sqlalchemy.sql.roles import DDLRole
 
@@ -23,13 +22,18 @@ logger = logging.getLogger("datachain")
 SELECT_BATCH_SIZE = 100_000  # number of rows to fetch at a time
 
 
-@frozen
 class DatabaseEngine(ABC, Serializable):
     dialect: ClassVar["Dialect"]
 
     engine: "Engine"
     metadata: "MetaData"
 
+    def __enter__(self) -> "DatabaseEngine":
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        self.close()
+
     @abstractmethod
     def clone(self) -> "DatabaseEngine":
         """Clones DatabaseEngine implementation."""
datachain/data_storage/id_generator.py CHANGED
@@ -33,6 +33,16 @@ class AbstractIDGenerator(ABC, Serializable):
     def cleanup_for_tests(self):
         """Cleanup for tests."""
 
+    def close(self) -> None:
+        """Closes any active database connections."""
+
+    def close_on_exit(self) -> None:
+        """Closes any active database or HTTP connections, called on Session exit or
+        for test cleanup only, as some ID Generator implementations may handle this
+        differently.
+        """
+        self.close()
+
     @abstractmethod
     def init_id(self, uri: str) -> None:
         """Initializes the ID generator for the given URI with zero last_id."""
@@ -83,6 +93,10 @@ class AbstractDBIDGenerator(AbstractIDGenerator):
     def clone(self) -> "AbstractDBIDGenerator":
         """Clones AbstractIDGenerator implementation."""
 
+    def close(self) -> None:
+        """Closes any active database connections."""
+        self.db.close()
+
     @property
     def db(self) -> "DatabaseEngine":
         return self._db
datachain/data_storage/metastore.py CHANGED
@@ -78,6 +78,13 @@ class AbstractMetastore(ABC, Serializable):
         self.uri = uri
         self.partial_id: Optional[int] = partial_id
 
+    def __enter__(self) -> "AbstractMetastore":
+        """Returns self upon entering context manager."""
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        """Default behavior is to do nothing, as connections may be shared."""
+
     @abstractmethod
     def clone(
         self,
@@ -97,6 +104,12 @@ class AbstractMetastore(ABC, Serializable):
     def close(self) -> None:
         """Closes any active database or HTTP connections."""
 
+    def close_on_exit(self) -> None:
+        """Closes any active database or HTTP connections, called on Session exit or
+        for test cleanup only, as some Metastore implementations may handle this
+        differently."""
+        self.close()
+
     def cleanup_tables(self, temp_table_names: list[str]) -> None:
         """Cleanup temp tables."""
 
datachain/data_storage/sqlite.py CHANGED
@@ -15,7 +15,6 @@ from typing import (
 )
 
 import sqlalchemy
-from attrs import frozen
 from sqlalchemy import MetaData, Table, UniqueConstraint, exists, select
 from sqlalchemy.dialects import sqlite
 from sqlalchemy.schema import CreateIndex, CreateTable, DropTable
@@ -40,6 +39,7 @@ from datachain.utils import DataChainDir
 
 if TYPE_CHECKING:
     from sqlalchemy.dialects.sqlite import Insert
+    from sqlalchemy.engine.base import Engine
     from sqlalchemy.schema import SchemaItem
     from sqlalchemy.sql.elements import ColumnClause, ColumnElement, TextClause
     from sqlalchemy.sql.selectable import Select
@@ -52,6 +52,8 @@ RETRY_START_SEC = 0.01
 RETRY_MAX_TIMES = 10
 RETRY_FACTOR = 2
 
+DETECT_TYPES = sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES
+
 Column = Union[str, "ColumnClause[Any]", "TextClause"]
 
 datachain.sql.sqlite.setup()
@@ -80,26 +82,41 @@ def retry_sqlite_locks(func):
     return wrapper
 
 
-@frozen
 class SQLiteDatabaseEngine(DatabaseEngine):
     dialect = sqlite_dialect
 
     db: sqlite3.Connection
     db_file: Optional[str]
+    is_closed: bool
+
+    def __init__(
+        self,
+        engine: "Engine",
+        metadata: "MetaData",
+        db: sqlite3.Connection,
+        db_file: Optional[str] = None,
+    ):
+        self.engine = engine
+        self.metadata = metadata
+        self.db = db
+        self.db_file = db_file
+        self.is_closed = False
 
     @classmethod
     def from_db_file(cls, db_file: Optional[str] = None) -> "SQLiteDatabaseEngine":
-        detect_types = sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES
+        return cls(*cls._connect(db_file=db_file))
 
+    @staticmethod
+    def _connect(db_file: Optional[str] = None):
         try:
             if db_file == ":memory:":
                 # Enable multithreaded usage of the same in-memory db
                 db = sqlite3.connect(
-                    "file::memory:?cache=shared", uri=True, detect_types=detect_types
+                    "file::memory:?cache=shared", uri=True, detect_types=DETECT_TYPES
                 )
             else:
                 db = sqlite3.connect(
-                    db_file or DataChainDir.find().db, detect_types=detect_types
+                    db_file or DataChainDir.find().db, detect_types=DETECT_TYPES
                 )
             create_user_defined_sql_functions(db)
             engine = sqlalchemy.create_engine(
@@ -118,7 +135,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):
 
             load_usearch_extension(db)
 
-            return cls(engine, MetaData(), db, db_file)
+            return engine, MetaData(), db, db_file
         except RuntimeError:
             raise DataChainError("Can't connect to SQLite DB") from None
 
@@ -138,6 +155,16 @@ class SQLiteDatabaseEngine(DatabaseEngine):
             {},
         )
 
+    def _reconnect(self) -> None:
+        if not self.is_closed:
+            raise RuntimeError("Cannot reconnect on still-open DB!")
+        engine, metadata, db, db_file = self._connect(db_file=self.db_file)
+        self.engine = engine
+        self.metadata = metadata
+        self.db = db
+        self.db_file = db_file
+        self.is_closed = False
+
     @retry_sqlite_locks
     def execute(
         self,
@@ -145,6 +172,9 @@ class SQLiteDatabaseEngine(DatabaseEngine):
         cursor: Optional[sqlite3.Cursor] = None,
         conn=None,
     ) -> sqlite3.Cursor:
+        if self.is_closed:
+            # Reconnect in case of being closed previously.
+            self._reconnect()
         if cursor is not None:
            result = cursor.execute(*self.compile_to_args(query))
         elif conn is not None:
@@ -179,6 +209,7 @@
 
     def close(self) -> None:
         self.db.close()
+        self.is_closed = True
 
     @contextmanager
     def transaction(self):
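Closing a `SQLiteDatabaseEngine` now only marks it closed; the next `execute()` call reconnects through `_reconnect()`. A rough sketch of that lifecycle, assuming an in-memory database and an illustrative one-row query:

.. code:: python

    import sqlalchemy as sa

    from datachain.data_storage.sqlite import SQLiteDatabaseEngine

    db = SQLiteDatabaseEngine.from_db_file(":memory:")

    db.close()          # closes the sqlite3 connection and sets is_closed = True
    assert db.is_closed

    # execute() sees is_closed and calls _reconnect() before running the query.
    rows = list(db.execute(sa.select(sa.literal(1))))
    assert not db.is_closed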
@@ -359,6 +390,10 @@ class SQLiteMetastore(AbstractDBMetastore):
 
         self._init_tables()
 
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        """Close connection upon exit from context manager."""
+        self.close()
+
     def clone(
         self,
         uri: StorageURI = StorageURI(""),
@@ -521,6 +556,10 @@ class SQLiteWarehouse(AbstractWarehouse):
 
         self.db = db or SQLiteDatabaseEngine.from_db_file(db_file)
 
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        """Close connection upon exit from context manager."""
+        self.close()
+
     def clone(self, use_new_connection: bool = False) -> "SQLiteWarehouse":
         return SQLiteWarehouse(self.id_generator.clone(), db=self.db.clone())
 
datachain/data_storage/warehouse.py CHANGED
@@ -70,6 +70,13 @@ class AbstractWarehouse(ABC, Serializable):
     def __init__(self, id_generator: "AbstractIDGenerator"):
         self.id_generator = id_generator
 
+    def __enter__(self) -> "AbstractWarehouse":
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        # Default behavior is to do nothing, as connections may be shared.
+        pass
+
     def cleanup_for_tests(self):
         """Cleanup for tests."""
 
@@ -158,6 +165,12 @@ class AbstractWarehouse(ABC, Serializable):
         """Closes any active database connections."""
         self.db.close()
 
+    def close_on_exit(self) -> None:
+        """Closes any active database or HTTP connections, called on Session exit or
+        for test cleanup only, as some Warehouse implementations may handle this
+        differently."""
+        self.close()
+
     #
     # Query Tables
     #
datachain/lib/arrow.py CHANGED
@@ -1,5 +1,6 @@
 import re
 from collections.abc import Sequence
+from tempfile import NamedTemporaryFile
 from typing import TYPE_CHECKING, Optional
 
 import pyarrow as pa
@@ -43,13 +44,17 @@ class ArrowGenerator(Generator):
         self.kwargs = kwargs
 
     def process(self, file: File):
-        path = file.get_path()
-        ds = dataset(
-            path, filesystem=file.get_fs(), schema=self.input_schema, **self.kwargs
-        )
+        if self.nrows:
+            path = _nrows_file(file, self.nrows)
+            ds = dataset(path, schema=self.input_schema, **self.kwargs)
+        else:
+            path = file.get_path()
+            ds = dataset(
+                path, filesystem=file.get_fs(), schema=self.input_schema, **self.kwargs
+            )
         index = 0
         with tqdm(desc="Parsed by pyarrow", unit=" rows") as pbar:
-            for record_batch in ds.to_batches(use_threads=False):
+            for record_batch in ds.to_batches():
                 for record in record_batch.to_pylist():
                     vals = list(record.values())
                     if self.output_schema:
@@ -60,8 +65,6 @@ class ArrowGenerator(Generator):
                     else:
                         yield vals
                     index += 1
-                    if self.nrows and index >= self.nrows:
-                        return
                 pbar.update(len(record_batch))
 
 
@@ -125,3 +128,15 @@ def _arrow_type_mapper(col_type: pa.DataType) -> type:  # noqa: PLR0911
     if isinstance(col_type, pa.lib.DictionaryType):
         return _arrow_type_mapper(col_type.value_type)  # type: ignore[return-value]
     raise TypeError(f"{col_type!r} datatypes not supported")
+
+
+def _nrows_file(file: File, nrows: int) -> str:
+    tf = NamedTemporaryFile(delete=False)
+    with file.open(mode="r") as reader:
+        with open(tf.name, "a") as writer:
+            for row, line in enumerate(reader):
+                if row >= nrows:
+                    break
+                writer.write(line)
+                writer.write("\n")
+    return tf.name
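`_nrows_file` materializes only the first `nrows` lines of a line-delimited file into a temporary file, which is what `ArrowGenerator` now hands to `pyarrow.dataset` when a row limit is set. Similar logic for a plain local path, as a hedged standalone sketch (the helper name is illustrative):

.. code:: python

    from tempfile import NamedTemporaryFile


    def head_to_tempfile(path: str, nrows: int) -> str:
        """Copy the first `nrows` lines of a local text file into a temp file."""
        tf = NamedTemporaryFile(delete=False, mode="w", suffix=".csv")
        with open(path) as reader, tf as writer:
            for row, line in enumerate(reader):
                if row >= nrows:
                    break
                writer.write(line)
        return tf.name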
datachain/lib/dc.py CHANGED
@@ -829,8 +829,19 @@ class DataChain(DatasetQuery):
         )
         ```
         """
-        chain = super().mutate(**kwargs)
-        chain.signals_schema = self.signals_schema.mutate(kwargs)
+        mutated = {}
+        schema = self.signals_schema
+        for name, value in kwargs.items():
+            if isinstance(value, Column):
+                # renaming existing column
+                for signal in schema.db_signals(name=value.name, as_columns=True):
+                    mutated[signal.name.replace(value.name, name, 1)] = signal
+            else:
+                # adding new signal
+                mutated[name] = value
+
+        chain = super().mutate(**mutated)
+        chain.signals_schema = schema.mutate(kwargs)
         return chain
 
     @property
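`mutate()` now treats a bare `Column` argument as a rename of an existing signal, carrying all of its flattened DB columns over to the new name; anything else is still added as a new signal. A hedged usage sketch (signal names and values are illustrative):

.. code:: python

    from datachain.lib.dc import DataChain
    from datachain.query.schema import Column

    chain = DataChain.from_values(size=[1, 2, 3])

    renamed = chain.mutate(length=Column("size"))          # rename an existing signal
    enriched = renamed.mutate(twice=Column("length") * 2)  # add a new derived signal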
@@ -1099,7 +1110,7 @@ class DataChain(DatasetQuery):
             )
         else:
             signals = self.signals_schema.resolve(*on).db_signals()
-        return super()._subtract(other, signals)
+        return super()._subtract(other, signals)  # type: ignore[arg-type]
 
     @classmethod
     def from_values(
@@ -1261,8 +1272,21 @@ class DataChain(DatasetQuery):
         dc = dc.parse_tabular(format="json")
         ```
         """
+        from pyarrow.dataset import CsvFileFormat, JsonFileFormat
+
         from datachain.lib.arrow import ArrowGenerator, infer_schema, schema_to_output
 
+        if nrows:
+            format = kwargs.get("format")
+            if format not in ["csv", "json"] and not isinstance(
+                format, (CsvFileFormat, JsonFileFormat)
+            ):
+                raise DatasetPrepareError(
+                    self.name,
+                    "error in `parse_tabular` - "
+                    "`nrows` only supported for csv and json formats.",
+                )
+
         schema = None
         col_names = output if isinstance(output, Sequence) else None
         if col_names or not output:
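With this guard, `nrows` is only honoured for formats that can be truncated line by line (CSV and JSON, given either as strings or as pyarrow format objects); other formats raise `DatasetPrepareError`. A hedged sketch of both outcomes (paths are illustrative):

.. code:: python

    from datachain.lib.dc import DataChain

    # OK: JSON is line-oriented, so only the first 100 rows are parsed.
    preview = DataChain.from_storage("data/records.jsonl").parse_tabular(
        format="json", nrows=100
    )

    # Raises DatasetPrepareError: nrows is only supported for csv and json.
    DataChain.from_storage("data/table.parquet").parse_tabular(
        format="parquet", nrows=100
    )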
@@ -1360,6 +1384,8 @@ class DataChain(DatasetQuery):
             else:
                 msg = f"error parsing csv - incompatible output type {type(output)}"
                 raise DatasetPrepareError(chain.name, msg)
+        elif nrows:
+            nrows += 1
 
         parse_options = ParseOptions(delimiter=delimiter)
         read_options = ReadOptions(column_names=column_names)
@@ -1382,7 +1408,6 @@
         object_name: str = "",
         model_name: str = "",
         source: bool = True,
-        nrows=None,
         **kwargs,
     ) -> "DataChain":
         """Generate chain from parquet files.
@@ -1395,7 +1420,6 @@
             object_name : Created object column name.
             model_name : Generated model name.
             source : Whether to include info about the source file.
-            nrows : Optional row limit.
 
         Example:
             Reading a single file:
@@ -1414,7 +1438,6 @@
             object_name=object_name,
             model_name=model_name,
             source=source,
-            nrows=None,
             format="parquet",
             partitioning=partitioning,
         )
datachain/lib/file.py CHANGED
@@ -317,9 +317,9 @@ class TextFile(File):
     """`DataModel` for reading text files."""
 
     @contextmanager
-    def open(self):
-        """Open the file and return a file object in text mode."""
-        with super().open(mode="r") as stream:
+    def open(self, mode: Literal["rb", "r"] = "r"):
+        """Open the file and return a file object (default to text mode)."""
+        with super().open(mode=mode) as stream:
             yield stream
 
     def read_text(self):
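`TextFile.open()` keeps text mode as its default but can now return the raw byte stream as well. A hedged sketch of a mapper-style function using both modes (the function itself is illustrative):

.. code:: python

    from datachain.lib.file import TextFile


    def char_vs_byte_count(file: TextFile) -> tuple[int, int]:
        """Compare character count (text mode) with byte count (binary mode)."""
        with file.open() as f:            # mode="r" by default
            n_chars = len(f.read())
        with file.open(mode="rb") as f:   # new: explicit binary mode
            n_bytes = len(f.read())
        return n_chars, n_bytes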
datachain/lib/signal_schema.py CHANGED
@@ -25,7 +25,7 @@ from datachain.lib.data_model import DataModel, DataType
 from datachain.lib.file import File
 from datachain.lib.model_store import ModelStore
 from datachain.lib.utils import DataChainParamsError
-from datachain.query.schema import DEFAULT_DELIMITER
+from datachain.query.schema import DEFAULT_DELIMITER, Column
 
 if TYPE_CHECKING:
     from datachain.catalog import Catalog
@@ -222,13 +222,30 @@ class SignalSchema:
                 res.append(obj)
         return res
 
-    def db_signals(self) -> list[str]:
-        return [
+    def db_signals(
+        self, name: Optional[str] = None, as_columns=False
+    ) -> Union[list[str], list[Column]]:
+        """
+        Returns DB columns as strings or Column objects with proper types
+        Optionally, it can filter results by specific object, returning only his signals
+        """
+        signals = [
             DEFAULT_DELIMITER.join(path)
-            for path, _, has_subtree, _ in self.get_flat_tree()
+            if not as_columns
+            else Column(DEFAULT_DELIMITER.join(path), python_to_sql(_type))
+            for path, _type, has_subtree, _ in self.get_flat_tree()
             if not has_subtree
         ]
 
+        if name:
+            signals = [
+                s
+                for s in signals
+                if str(s) == name or str(s).startswith(f"{name}{DEFAULT_DELIMITER}")
+            ]
+
+        return signals  # type: ignore[return-value]
+
     def resolve(self, *names: str) -> "SignalSchema":
         schema = {}
         for field in names:
@@ -282,7 +299,18 @@ class SignalSchema:
         return SignalSchema(schema)
 
     def mutate(self, args_map: dict) -> "SignalSchema":
-        return SignalSchema(self.values | sql_to_python(args_map))
+        new_values = self.values.copy()
+
+        for name, value in args_map.items():
+            if isinstance(value, Column) and value.name in self.values:
+                # renaming existing signal
+                del new_values[value.name]
+                new_values[name] = self.values[value.name]
+            else:
+                # adding new signal
+                new_values.update(sql_to_python({name: value}))
+
+        return SignalSchema(new_values)
 
     def clone_without_sys_signals(self) -> "SignalSchema":
         schema = copy.deepcopy(self.values)
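`db_signals()` can now narrow the flattened column list to a single signal and return typed `Column` objects, which is what the new `mutate()` rename path builds on. A hedged sketch (the example schema and the double-underscore flattening are assumptions):

.. code:: python

    from datachain.lib.file import File
    from datachain.lib.signal_schema import SignalSchema

    schema = SignalSchema({"file": File, "score": float})

    print(schema.db_signals())              # flattened names, e.g. "file__path", "score"
    print(schema.db_signals(name="score"))  # only the columns belonging to "score"
    cols = schema.db_signals(name="file", as_columns=True)  # typed Column objects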
datachain/listing.py CHANGED
@@ -44,6 +44,16 @@ class Listing:
             self.dataset,
         )
 
+    def __enter__(self) -> "Listing":
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        self.close()
+
+    def close(self) -> None:
+        self.metastore.close()
+        self.warehouse.close()
+
     @property
     def id(self):
         return self.storage.id
@@ -56,16 +66,18 @@ class Listing:
         sync(get_loop(), self._fetch, start_prefix, method)
 
     async def _fetch(self, start_prefix: str, method: str) -> None:
-        self = self.clone()
-        if start_prefix:
-            start_prefix = start_prefix.rstrip("/")
-        try:
-            async for entries in self.client.scandir(start_prefix, method=method):
-                self.insert_entries(entries)
-                if len(entries) > 1:
-                    self.metastore.update_last_inserted_at()
-        finally:
-            self.insert_entries_done()
+        with self.clone() as fetch_listing:
+            if start_prefix:
+                start_prefix = start_prefix.rstrip("/")
+            try:
+                async for entries in fetch_listing.client.scandir(
+                    start_prefix, method=method
+                ):
+                    fetch_listing.insert_entries(entries)
+                    if len(entries) > 1:
+                        fetch_listing.metastore.update_last_inserted_at()
+            finally:
+                fetch_listing.insert_entries_done()
 
     def insert_entry(self, entry: Entry) -> None:
         self.warehouse.insert_rows(
datachain/query/dataset.py CHANGED
@@ -1051,8 +1051,11 @@ class DatasetQuery:
         if anon:
             client_config["anon"] = True
 
+        self.session = Session.get(
+            session, catalog=catalog, client_config=client_config
+        )
+        self.catalog = catalog or self.session.catalog
         self.steps: list[Step] = []
-        self.catalog = catalog or get_catalog(client_config=client_config)
         self._chunk_index: Optional[int] = None
         self._chunk_total: Optional[int] = None
         self.temp_table_names: list[str] = []
@@ -1063,7 +1066,6 @@
         self.version: Optional[int] = None
         self.feature_schema: Optional[dict] = None
         self.column_types: Optional[dict[str, Any]] = None
-        self.session = Session.get(session, catalog=catalog)
 
         if path:
             kwargs = {"update": True} if update else {}
@@ -1200,12 +1202,10 @@
         # This is needed to always use a new connection with all metastore and warehouse
         # implementations, as errors may close or render unusable the existing
         # connections.
-        metastore = self.catalog.metastore.clone(use_new_connection=True)
-        metastore.cleanup_tables(self.temp_table_names)
-        metastore.close()
-        warehouse = self.catalog.warehouse.clone(use_new_connection=True)
-        warehouse.cleanup_tables(self.temp_table_names)
-        warehouse.close()
+        with self.catalog.metastore.clone(use_new_connection=True) as metastore:
+            metastore.cleanup_tables(self.temp_table_names)
+        with self.catalog.warehouse.clone(use_new_connection=True) as warehouse:
+            warehouse.cleanup_tables(self.temp_table_names)
         self.temp_table_names = []
 
     def db_results(self, row_factory=None, **kwargs):
@@ -1248,19 +1248,12 @@
         def row_iter() -> Generator[RowDict, None, None]:
             # warehouse isn't threadsafe, we need to clone() it
             # in the thread that uses the results
-            warehouse = None
-            try:
-                warehouse = self.catalog.warehouse.clone()
+            with self.catalog.warehouse.clone() as warehouse:
                 gen = warehouse.dataset_select_paginated(
                     query, limit=query._limit, order_by=query._order_by_clauses
                 )
                 with contextlib.closing(gen) as rows:
                     yield from rows
-            finally:
-                # clone doesn't necessarily create a new connection
-                # we can't do `warehouse.close()` for now. It is a bad design
-                # in clone / close interface that needs to be fixed.
-                pass
 
         async def get_params(row: RowDict) -> tuple:
             return tuple(
@@ -1383,10 +1376,14 @@
     @detach
     def limit(self, n: int) -> "Self":
         query = self.clone(new_table=False)
-        for step in query.steps:
-            if isinstance(step, SQLLimit) and step.n < n:
-                return query
-        query.steps.append(SQLLimit(n))
+        if (
+            query.steps
+            and (last_step := query.steps[-1])
+            and isinstance(last_step, SQLLimit)
+        ):
+            query.steps[-1] = SQLLimit(min(n, last_step.n))
+        else:
+            query.steps.append(SQLLimit(n))
         return query
 
     @detach
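Consecutive `limit()` calls now collapse into a single `SQLLimit` step that keeps the smaller value, rather than being special-cased or appended twice. A hedged sketch of the observable behaviour, assuming the usual `collect()` accessor:

.. code:: python

    from datachain.lib.dc import DataChain

    chain = DataChain.from_values(num=list(range(1000)))

    # Both orders end up with one SQLLimit(10) step.
    assert len(list(chain.limit(100).limit(10).collect())) == 10
    assert len(list(chain.limit(10).limit(100).collect())) == 10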
datachain/query/session.py CHANGED
@@ -41,7 +41,12 @@ class Session:
     SESSION_UUID_LEN = 6
     TEMP_TABLE_UUID_LEN = 6
 
-    def __init__(self, name="", catalog: Optional["Catalog"] = None):
+    def __init__(
+        self,
+        name="",
+        catalog: Optional["Catalog"] = None,
+        client_config: Optional[dict] = None,
+    ):
         if re.match(r"^[0-9a-zA-Z]+$", name) is None:
             raise ValueError(
                 f"Session name can contain only letters or numbers - '{name}' given."
@@ -52,13 +57,18 @@
 
         session_uuid = uuid4().hex[: self.SESSION_UUID_LEN]
         self.name = f"{name}_{session_uuid}"
-        self.catalog = catalog or get_catalog()
+        self.is_new_catalog = not catalog
+        self.catalog = catalog or get_catalog(client_config=client_config)
 
     def __enter__(self):
         return self
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         self._cleanup_temp_datasets()
+        if self.is_new_catalog:
+            self.catalog.metastore.close_on_exit()
+            self.catalog.warehouse.close_on_exit()
+            self.catalog.id_generator.close_on_exit()
 
     def generate_temp_dataset_name(self) -> str:
         tmp_table_uid = uuid4().hex[: self.TEMP_TABLE_UUID_LEN]
@@ -75,7 +85,10 @@
 
     @classmethod
     def get(
-        cls, session: Optional["Session"] = None, catalog: Optional["Catalog"] = None
+        cls,
+        session: Optional["Session"] = None,
+        catalog: Optional["Catalog"] = None,
+        client_config: Optional[dict] = None,
     ) -> "Session":
         """Creates a Session() object from a catalog.
 
@@ -88,7 +101,9 @@
             return session
 
         if cls.GLOBAL_SESSION is None:
-            cls.GLOBAL_SESSION_CTX = Session(cls.GLOBAL_SESSION_NAME, catalog)
+            cls.GLOBAL_SESSION_CTX = Session(
+                cls.GLOBAL_SESSION_NAME, catalog, client_config=client_config
+            )
             cls.GLOBAL_SESSION = cls.GLOBAL_SESSION_CTX.__enter__()
             atexit.register(cls._global_cleanup)
         return cls.GLOBAL_SESSION
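`Session` now owns catalog creation, forwards `client_config` to it, and closes the stores it created itself via `close_on_exit()` when the session ends. A hedged sketch of an explicit session (the config key is illustrative):

.. code:: python

    from datachain.lib.dc import DataChain
    from datachain.query.session import Session

    # Session names may contain only letters and digits.
    with Session("preview", client_config={"anon": True}) as session:
        chain = DataChain.from_values(num=[1, 2, 3], session=session)
        print(list(chain.collect()))
    # On exit, temp datasets are removed and, because this session created the
    # catalog itself, metastore, warehouse and id_generator are closed.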
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.2.16
+Version: 0.2.17
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -100,28 +100,87 @@ Requires-Dist: usearch ; extra == 'vector'
 AI 🔗 DataChain
 ----------------
 
-DataChain is an open-source Python library for processing and curating unstructured
-data at scale.
+DataChain is a data-frame library designed for AI-specific scenarios. It helps ML and
+AI engineers build a metadata layer on top of unstructured files and analyze data using
+this layer.
 
-🤖 AI-Driven Data Curation: Use local ML models or LLM APIs calls to enrich your data.
+📂 **Raw Files Processing**
+Process raw files (images, video, text, PDFs) directly from storage (S3, GCP, Azure,
+Local), version and update datasets.
 
-🚀 GenAI Dataset scale: Handle tens of millions of multimodal files.
+🌟 **Metadata layer.**
+Build a metadata layer on top of files using structured sources like CSV, Parquet,
+and JSON files.
 
-🐍 Python-friendly: Use strictly-typed `Pydantic`_ objects instead of JSON.
+**Metadata enrichment.**
+Enhance the metadata layer with outputs from local ML model inferences and LLM calls.
 
+🛠️ **Data Transformation.**
+Transform metadata using traditional methods like filtering, grouping, joining, and
+others.
 
-Datachain supports parallel processing, parallel data
-downloads, and out-of-memory computing. It excels at optimizing offline batch operations.
-
-The typical use cases include Computer Vision data curation, LLM analytics,
-and validation of multimodal AI applications.
+🐍 **User-friendly interface.**
+Operate efficiently with familiar Python objects and object fields, eliminating the
+need for SQL.
 
 
 .. code:: console
 
    $ pip install datachain
 
-|Flowchart|
+
+Data Structures
+===============
+
+DataChain introduces expressive data structures tailored for AI-specific workload:
+
+- **Dataset:** Preserves the file-references and meta-information. Takes care of Python
+  object serialization, dataset versioning and difference. Operations on dataset:
+
+  - **Transformations:** traditional data-frame or SQL operations such as filtering,
+    grouping, joining.
+  - **Enrichments:** mapping, aggregating and generating using customer’s Python
+    code. This is needed to work with ML inference and LLM calls.
+
+- **Chain** is a sequence of operations on datasets. Chain executes operations in lazy
+  mode - only when needed.
+
+DataChain name comes from these major data structures: dataset and chaining.
+
+
+What’s new in DataChain?
+========================
+
+The project combines multiple ideas from different areas in order to simplify AI
+use-cases and at the same time to fit it into traditional data infrastructure.
+
+- **Python-Native for AI.** Utilizes Python instead of SQL for data manipulation as the
+  native language for AI. It’s powered by `Pydantic`_ data models.
+- **Separation of CPU-GPU workloads.** Distinguishes CPU-heavy transformations (filter,
+  group_by, join) from GPU heavy enrichments (ML-inference or LLM calls). That’s mostly
+  needed for distributed computations.
+- **Resuming data processing** (in development). Introduces idempotent operations,
+  allowing data processing to resume from the last successful process file/record/batch
+  if it fails due to issues like failed LLM calls, ML inference or file download.
+
+Additional relatively new ideas:
+
+- **Functional style data processing.** Using a functional/chaining approach to data
+  processing rather than declarative SQL, inspired by R-dplyr and some Python libraries.
+- **Data Versioning.** Treats raw files in cloud storage as the source of truth for data
+  and implements data versioning, extending ideas from DVC (developed by the same team).
+
+
+What DataChain is NOT?
+======================
+
+- **Not a database** (Postgres, MySQL). Instead, it uses databases under the hood:
+  `SQLite`_ in open-source and ClickHouse and other data warehouses for the commercial
+  version.
+- **Not a data processing tool / data warehouse** (Spark, Snowflake, Big Query) since
+  it delegates heavy data transformations to underlying data warehouses and focuses on
+  AI specific data enrichments and orchestrating all the pieces together.
+
 
 Quick Start
 -----------
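The description above centers on datasets plus lazy chaining; a minimal hedged sketch of what such a chain looks like with this release (signal names and values are illustrative):

.. code:: python

    from datachain.lib.dc import DataChain
    from datachain.query.schema import Column

    chain = (
        DataChain.from_values(fib=[1, 1, 2, 3, 5, 8])
        .mutate(doubled=Column("fib") * 2)  # transformation, recorded lazily
        .limit(3)
    )
    print(list(chain.collect()))  # execution happens only here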
@@ -8,7 +8,7 @@ datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
 datachain/dataset.py,sha256=MZezyuJWNj_3PEtzr0epPMNyWAOTrhTSPI5FmemV6L4,14470
 datachain/error.py,sha256=GY9KYTmb7GHXn2gGHV9X-PBhgwLj3i7VpK7tGHtAoGM,1279
 datachain/job.py,sha256=bk25bIqClhgRPzlXAhxpTtDeewibQe5l3S8Cf7db0gM,1229
-datachain/listing.py,sha256=sX8vZNzAzoTel1li6VJiYeHUJwseUERVEoW9D5P7tII,8192
+datachain/listing.py,sha256=JEhi5WOSV2LUqRQgt0-fdmJ8Zb5fNpNFzBQcuTtx63o,8555
 datachain/node.py,sha256=LwzSOSM9SbPLI5RvYDsiEkk7d5rbMX8huzM_m7uWKx4,5917
 datachain/nodes_fetcher.py,sha256=kca19yvu11JxoVY1t4_ydp1FmchiV88GnNicNBQ9NIA,831
 datachain/nodes_thread_pool.py,sha256=ZyzBvUImIPmi4WlKC2SW2msA0UhtembbTdcs2nx29A0,3191
@@ -17,7 +17,7 @@ datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
 datachain/utils.py,sha256=kgH5NPj47eC_KrFTd6ZS206lKVhnJVFt5XsqkK6ppTc,12483
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=LZo9pIgi_HOUWpxX1c7RMt5OnrlDHXx2YpL5oP8X0kk,80397
+datachain/catalog/catalog.py,sha256=z0tclel0kNdSzJojNRRnRVhgt-K7ElO3CeuurlwQMGI,80612
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=GJ8zhEYkC7TuaPzCsjJQ4LtTdECu-wwYzC12MikPOMQ,7307
 datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
@@ -29,27 +29,27 @@ datachain/client/gcs.py,sha256=ucX8e6JrqlFY-f80zkv084vxnKdtxpO32QJ-RG8Nv1s,4454
 datachain/client/local.py,sha256=NQVkLTJQ-a7Udavqbh_4uT-IejfZQYn10j22owz9sis,5150
 datachain/client/s3.py,sha256=TmW4f7VUM5CMZjSmgyFQFKeMUGrXt2SLoLEbLOUleiU,6296
 datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZv32Y8,398
-datachain/data_storage/db_engine.py,sha256=rgBuqJ-M1j5QyqiUQuJRewctuvRRj8LBDL54-aPEFxE,3287
-datachain/data_storage/id_generator.py,sha256=VlDALKijggegAnNMJwuMETJgnLoPYxpkrkld5DNTPQw,3839
+datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
+datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
 datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
-datachain/data_storage/metastore.py,sha256=ody-hWyrisGuNlzy24bc7QBqPXWIg64NcucIhZYronk,54842
+datachain/data_storage/metastore.py,sha256=nxcY6nwyEmQWMAo33sNGO-FgUFQs2amBGGnZz2ftEz0,55362
 datachain/data_storage/schema.py,sha256=FQvt5MUMSnI5ZAE7Nthae4aaJpt8JC4nH8KiWDuhJkk,8135
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
-datachain/data_storage/sqlite.py,sha256=w0d_cZ2u9LpQYFFXll22mnxHaxPOoJdHlsKAZmONQpA,25605
-datachain/data_storage/warehouse.py,sha256=3iD946WXgGxohZ5lagmwydFZr7j7RceZW423QXU_7_U,33120
+datachain/data_storage/sqlite.py,sha256=0r6L_a2hdGRoR_gl06v1qWhEFOS_Q31aldHyk07Yx-M,26857
+datachain/data_storage/warehouse.py,sha256=G79jsQwA6anYPWoiBXngwPyx-uP7yGIWqhZGc4TL5mY,33591
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/arrow.py,sha256=9C5AVH6tLo9hwzav-1tLLnmWP-3_SReYCOfcOC54pu0,4437
+datachain/lib/arrow.py,sha256=R8wDUDEa-5hYjI3HW9cqvOYYJpeeah5lbhFIL3gkmcE,4915
 datachain/lib/clip.py,sha256=16u4b_y2Y15nUS2UN_8ximMo6r_-_4IQpmct2ol-e-g,5730
 datachain/lib/data_model.py,sha256=qfTtQNncS5pt9SvXdMEa5kClniaT6XBGBfO7onEz2TI,1632
 datachain/lib/dataset_info.py,sha256=lONGr71ozo1DS4CQEhnpKORaU4qFb6Ketv8Xm8CVm2U,2188
-datachain/lib/dc.py,sha256=6RtwA7MC3hosxi9RBgpOXjkv46SdN99g9N_u4mCDUUo,56071
-datachain/lib/file.py,sha256=n9GBmZ1CjzDjHkbUBsUrs8JOJrAoh3MV2Cc8hBkex20,11957
+datachain/lib/dc.py,sha256=bZx7VJ389SJ5gRTkckFD044LHq_hOgHqvhTD7gJoBZY,56963
+datachain/lib/file.py,sha256=MCklths3w9SgQTR0LACnDohfGdEc3t30XD0qNq1oTlI,12000
 datachain/lib/image.py,sha256=TgYhRhzd4nkytfFMeykQkPyzqb5Le_-tU81unVMPn4Q,2328
 datachain/lib/meta_formats.py,sha256=jlSYWRUeDMjun_YCsQ2JxyaDJpEpokzHDPmKUAoCXnU,7034
 datachain/lib/model_store.py,sha256=c4USXsBBjrGH8VOh4seIgOiav-qHOwdoixtxfLgU63c,2409
 datachain/lib/pytorch.py,sha256=9PsypKseyKfIimTmTQOgb-pbNXgeeAHLdlWx0qRPULY,5660
 datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
-datachain/lib/signal_schema.py,sha256=XQTINSN_FJK76Jn8qd03g6J0cum58knP8U7Iuw-zKyU,14704
+datachain/lib/signal_schema.py,sha256=VL9TR0CJ3eRzjIDr-8e-e7cZKuMBbPUZtY2lGAsucc0,15734
 datachain/lib/text.py,sha256=dVe2Ilc_gW2EV0kun0UwegiCkapWcd20cef7CgINWHU,1083
 datachain/lib/udf.py,sha256=IjuDt2B8E3xEHhcJnaK_ZhmivdrOYPXz5uf7ylpktws,11815
 datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
@@ -66,12 +66,12 @@ datachain/lib/convert/values_to_tuples.py,sha256=aVoHWMOUGLAiS6_BBwKJqVIne91VffO
 datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
 datachain/query/batch.py,sha256=j-_ZcuQra2Ro3Wj4crtqQCg-7xuv-p84hr4QHdvT7as,3479
 datachain/query/builtins.py,sha256=ZKNs49t8Oa_OaboCBIEqtXZt7c1Qe9OR_C_HpoDriIU,2781
-datachain/query/dataset.py,sha256=iTz3c5nJ-WmoQ5zcvKGT9ly6xVKJtD_fk76LA7zecWk,60164
+datachain/query/dataset.py,sha256=-AGkz3-K_b-2YBJCMqQz-Qq7FKzMcScPty_77S0AQtE,59938
 datachain/query/dispatch.py,sha256=oGX9ZuoKWPB_EyqAZD_eULcO3OejY44_keSmFS6SHT0,13315
 datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
 datachain/query/schema.py,sha256=hAvux_GxUmuG_PwtnKkkizld9f0Gvt2JBzbu3m74fvE,7840
-datachain/query/session.py,sha256=am4XCNj8NlZPAYJSvh43C13dQ5NsfzzuyVDjPgYAgJE,3655
+datachain/query/session.py,sha256=qTzkXgwMJdJhal3rVt3hdv3x1EXT1IHuXcwkC-Ex0As,4111
 datachain/query/udf.py,sha256=c0IOTkcedpOQEmX-Idlrrl1__1IecNXL0N9oUO9Dtkg,7755
 datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/remote/studio.py,sha256=f5s6qSZ9uB4URGUoU_8_W1KZRRQQVSm6cgEBkBUEfuE,7226
@@ -92,9 +92,9 @@ datachain/sql/sqlite/base.py,sha256=Jb1csbIARjEvwbylnvgNA7ChozSyoL3CQzOGBUf8QAw,
 datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.2.16.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.2.16.dist-info/METADATA,sha256=1f326fK-ZnS0nPvETuUj9PaI4R5SatpGVDIsQiJ0OvM,14577
-datachain-0.2.16.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
-datachain-0.2.16.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.2.16.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.2.16.dist-info/RECORD,,
+datachain-0.2.17.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.2.17.dist-info/METADATA,sha256=STR0-4R9NOW55GgadrPA_-fmx5-WckcwhTmyH_OgaUs,17269
+datachain-0.2.17.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+datachain-0.2.17.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.2.17.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.2.17.dist-info/RECORD,,