datachain 0.3.20__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain might be problematic.
- datachain/catalog/catalog.py +0 -3
- datachain/cli.py +3 -2
- datachain/data_storage/metastore.py +8 -12
- datachain/data_storage/warehouse.py +1 -3
- datachain/dataset.py +0 -8
- datachain/lib/dc.py +197 -113
- datachain/lib/listing.py +5 -3
- datachain/lib/pytorch.py +5 -1
- datachain/query/dataset.py +1 -1
- {datachain-0.3.20.dist-info → datachain-0.5.0.dist-info}/METADATA +1 -1
- {datachain-0.3.20.dist-info → datachain-0.5.0.dist-info}/RECORD +15 -15
- {datachain-0.3.20.dist-info → datachain-0.5.0.dist-info}/LICENSE +0 -0
- {datachain-0.3.20.dist-info → datachain-0.5.0.dist-info}/WHEEL +0 -0
- {datachain-0.3.20.dist-info → datachain-0.5.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.3.20.dist-info → datachain-0.5.0.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED

@@ -979,7 +979,6 @@ class Catalog:
         script_output="",
         create_rows_table=True,
         job_id: Optional[str] = None,
-        is_job_result: bool = False,
     ) -> DatasetRecord:
         """
         Creates dataset version if it doesn't exist.
@@ -1001,7 +1000,6 @@ class Catalog:
             script_output=script_output,
             schema=schema,
             job_id=job_id,
-            is_job_result=is_job_result,
             ignore_if_exists=True,
         )

@@ -1211,7 +1209,6 @@ class Catalog:
             size=dataset_version.size,
             preview=dataset_version.preview,
             job_id=dataset_version.job_id,
-            is_job_result=dataset_version.is_job_result,
         )
         # to avoid re-creating rows table, we are just renaming it for a new version
         # of target dataset
datachain/cli.py
CHANGED

@@ -12,7 +12,7 @@ from typing import TYPE_CHECKING, Optional, Union

 import shtab

-from datachain import utils
+from datachain import Session, utils
 from datachain.cli_utils import BooleanOptionalAction, CommaSeparatedArgs, KeyValueArgs
 from datachain.lib.dc import DataChain
 from datachain.telemetry import telemetry
@@ -770,7 +770,8 @@ def show(
     show_records(records, collapse_columns=not no_collapse)
     if schema and dataset_version.feature_schema:
         print("\nSchema:")
-
+        session = Session.get(catalog=catalog)
+        dc = DataChain.from_dataset(name=name, version=version, session=session)
         dc.print_schema()

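For context on the hunk above: the CLI no longer instantiates DataChain directly but binds a Session to its existing catalog and builds the chain from the saved dataset. A minimal sketch of that pattern, assuming an already-initialized catalog (the helper name here is hypothetical):

```py
from typing import Optional

from datachain import Session
from datachain.lib.dc import DataChain

def print_dataset_schema(catalog, name: str, version: Optional[int] = None) -> None:
    # Reuse the CLI's catalog rather than letting DataChain create a fresh one.
    session = Session.get(catalog=catalog)
    dc = DataChain.from_dataset(name=name, version=version, session=session)
    dc.print_schema()
```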
datachain/data_storage/metastore.py
CHANGED

@@ -15,7 +15,6 @@ from uuid import uuid4
 from sqlalchemy import (
     JSON,
     BigInteger,
-    Boolean,
     Column,
     DateTime,
     ForeignKey,
@@ -228,7 +227,7 @@ class AbstractMetastore(ABC, Serializable):
         self,
         dataset: DatasetRecord,
         version: int,
-        status: int
+        status: int,
         sources: str = "",
         feature_schema: Optional[dict] = None,
         query_script: str = "",
@@ -243,7 +242,6 @@ class AbstractMetastore(ABC, Serializable):
         size: Optional[int] = None,
         preview: Optional[list[dict]] = None,
         job_id: Optional[str] = None,
-        is_job_result: bool = False,
     ) -> DatasetRecord:
         """Creates new dataset version."""

@@ -449,7 +447,6 @@ class AbstractDBMetastore(AbstractMetastore):
         Column("name", Text, nullable=False),
         Column("description", Text),
         Column("labels", JSON, nullable=True),
-        Column("shadow", Boolean, nullable=False),
         Column("status", Integer, nullable=False),
         Column("feature_schema", JSON, nullable=True),
         Column("created_at", DateTime(timezone=True)),
@@ -482,8 +479,11 @@ class AbstractDBMetastore(AbstractMetastore):
             nullable=False,
         ),
         Column("version", Integer, nullable=False),
-
-
+        Column(
+            "status",
+            Integer,
+            nullable=False,
+        ),
         Column("feature_schema", JSON, nullable=True),
         Column("created_at", DateTime(timezone=True)),
         Column("finished_at", DateTime(timezone=True)),
@@ -497,7 +497,6 @@ class AbstractDBMetastore(AbstractMetastore):
         Column("query_script", Text, nullable=False, default=""),
         Column("schema", JSON, nullable=True),
         Column("job_id", Text, nullable=True),
-        Column("is_job_result", Boolean, nullable=False, default=False),
         UniqueConstraint("dataset_id", "version"),
     ]

@@ -971,7 +970,6 @@ class AbstractDBMetastore(AbstractMetastore):
         # TODO abstract this method and add registered = True based on kwargs
         query = self._datasets_insert().values(
             name=name,
-            shadow=False,
             status=status,
             feature_schema=json.dumps(feature_schema or {}),
             created_at=datetime.now(timezone.utc),
@@ -994,7 +992,7 @@ class AbstractDBMetastore(AbstractMetastore):
         self,
         dataset: DatasetRecord,
         version: int,
-        status: int
+        status: int,
         sources: str = "",
         feature_schema: Optional[dict] = None,
         query_script: str = "",
@@ -1009,7 +1007,6 @@ class AbstractDBMetastore(AbstractMetastore):
         size: Optional[int] = None,
         preview: Optional[list[dict]] = None,
         job_id: Optional[str] = None,
-        is_job_result: bool = False,
         conn=None,
     ) -> DatasetRecord:
         """Creates new dataset version."""
@@ -1021,7 +1018,7 @@ class AbstractDBMetastore(AbstractMetastore):
         query = self._datasets_versions_insert().values(
             dataset_id=dataset.id,
             version=version,
-            status=status,
+            status=status,
             feature_schema=json.dumps(feature_schema or {}),
             created_at=created_at or datetime.now(timezone.utc),
             finished_at=finished_at,
@@ -1035,7 +1032,6 @@ class AbstractDBMetastore(AbstractMetastore):
             size=size,
             preview=json.dumps(preview or []),
             job_id=job_id or os.getenv("DATACHAIN_JOB_ID"),
-            is_job_result=is_job_result,
         )
         if ignore_if_exists and hasattr(query, "on_conflict_do_nothing"):
             # SQLite and PostgreSQL both support 'on_conflict_do_nothing',
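Net effect of the metastore changes: the `shadow` column is dropped from the datasets table, the versions table gains a `status` column, and `create_dataset_version` loses `is_job_result`. A hedged caller-side sketch (the wrapper function is hypothetical); passing the removed keyword would now raise `TypeError`:

```py
from typing import Optional

from datachain.data_storage.metastore import AbstractMetastore
from datachain.dataset import DatasetRecord

def register_version(
    metastore: AbstractMetastore,
    dataset: DatasetRecord,
    version: int,
    status: int,
    job_id: Optional[str] = None,
) -> DatasetRecord:
    # 0.5.0 signature: no `is_job_result` keyword anymore; callers must drop it.
    return metastore.create_dataset_version(dataset, version, status, job_id=job_id)
```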
datachain/data_storage/warehouse.py
CHANGED

@@ -919,9 +919,7 @@ class AbstractWarehouse(ABC, Serializable):
     def is_temp_table_name(self, name: str) -> bool:
         """Returns if the given table name refers to a temporary
         or no longer needed table."""
-        return name.startswith(
-            (self.TMP_TABLE_NAME_PREFIX, self.UDF_TABLE_NAME_PREFIX, "ds_shadow_")
-        ) or name.endswith("_shadow")
+        return name.startswith((self.TMP_TABLE_NAME_PREFIX, self.UDF_TABLE_NAME_PREFIX))

     def get_temp_table_names(self) -> list[str]:
         return [
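With shadow datasets gone, the temp-table predicate narrows to the two class-level prefixes. An illustration, assuming the usual `tmp_` and `udf_` prefix values (the real values live in `TMP_TABLE_NAME_PREFIX` and `UDF_TABLE_NAME_PREFIX` on the warehouse class):

```py
# `warehouse` is any concrete AbstractWarehouse instance.
warehouse.is_temp_table_name("tmp_4fz1x")       # True: TMP_TABLE_NAME_PREFIX
warehouse.is_temp_table_name("udf_8abc2")       # True: UDF_TABLE_NAME_PREFIX
warehouse.is_temp_table_name("ds_shadow_cats")  # False (was True before 0.5.0)
warehouse.is_temp_table_name("cats_shadow")     # False (was True before 0.5.0)
```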
datachain/dataset.py
CHANGED

@@ -179,7 +179,6 @@ class DatasetVersion:
     sources: str = ""
     query_script: str = ""
     job_id: Optional[str] = None
-    is_job_result: bool = False

     @classmethod
     def parse(  # noqa: PLR0913
@@ -201,7 +200,6 @@ class DatasetVersion:
         sources: str = "",
         query_script: str = "",
         job_id: Optional[str] = None,
-        is_job_result: bool = False,
     ):
         return cls(
             id,
@@ -221,7 +219,6 @@ class DatasetVersion:
             sources,
             query_script,
             job_id,
-            is_job_result,
         )

     def __eq__(self, other):
@@ -270,7 +267,6 @@ class DatasetRecord:
     name: str
     description: Optional[str]
     labels: list[str]
-    shadow: bool
     schema: dict[str, Union[SQLType, type[SQLType]]]
     feature_schema: dict
     versions: list[DatasetVersion]
@@ -299,7 +295,6 @@ class DatasetRecord:
         name: str,
         description: Optional[str],
         labels: str,
-        shadow: int,
         status: int,
         feature_schema: Optional[str],
         created_at: datetime,
@@ -327,7 +322,6 @@ class DatasetRecord:
         version_query_script: Optional[str],
         version_schema: str,
         version_job_id: Optional[str] = None,
-        version_is_job_result: bool = False,
     ) -> "DatasetRecord":
         labels_lst: list[str] = json.loads(labels) if labels else []
         schema_dct: dict[str, Any] = json.loads(schema) if schema else {}
@@ -353,7 +347,6 @@ class DatasetRecord:
             version_sources,  # type: ignore[arg-type]
             version_query_script,  # type: ignore[arg-type]
             version_job_id,
-            version_is_job_result,
         )

         return cls(
@@ -361,7 +354,6 @@ class DatasetRecord:
             name,
             description,
             labels_lst,
-            bool(shadow),
             cls.parse_schema(schema_dct),  # type: ignore[arg-type]
             json.loads(feature_schema) if feature_schema else {},
             [dataset_version],
datachain/lib/dc.py
CHANGED

@@ -54,7 +54,6 @@ from datachain.query import Session
 from datachain.query.dataset import (
     DatasetQuery,
     PartitionByType,
-    detach,
 )
 from datachain.query.schema import DEFAULT_DELIMITER, Column, DatasetRow
 from datachain.sql.functions import path as pathfunc
@@ -159,7 +158,7 @@ class Sys(DataModel):
     rand: int


-class DataChain(DatasetQuery):
+class DataChain:
     """DataChain - a data structure for batch data processing and evaluation.

     It represents a sequence of data manipulation steps such as reading data from
@@ -238,33 +237,20 @@ class DataChain(DatasetQuery):
         "size": 0,
     }

-    def __init__(
-
-
-
-
-
-
-
-
-
-
-
-
-
-        else:
-            self._settings = Settings()
-            self._setup: dict = {}
-
-        self.signals_schema = SignalSchema({"sys": Sys})
-        if self.feature_schema:
-            self.signals_schema |= SignalSchema.deserialize(self.feature_schema)
-        else:
-            self.signals_schema |= SignalSchema.from_column_types(
-                self.column_types or {}
-            )
-
-        self._sys = False
+    def __init__(
+        self,
+        query: DatasetQuery,
+        settings: Settings,
+        signal_schema: SignalSchema,
+        setup: Optional[dict] = None,
+        _sys: bool = False,
+    ) -> None:
+        """Don't instantiate this directly, use one of the from_XXX constructors."""
+        self._query = query
+        self._settings = settings
+        self.signals_schema = signal_schema
+        self._setup: dict = setup or {}
+        self._sys = _sys

     @property
     def schema(self) -> dict[str, DataType]:
@@ -290,18 +276,55 @@ class DataChain(DatasetQuery):
     def c(self, column: Union[str, Column]) -> Column:
         """Returns Column instance attached to the current chain."""
         c = self.column(column) if isinstance(column, str) else self.column(column.name)
-        c.table = self.table
+        c.table = self._query.table
         return c

+    @property
+    def session(self) -> Session:
+        """Session of the chain."""
+        return self._query.session
+
+    @property
+    def name(self) -> Optional[str]:
+        """Name of the underlying dataset, if there is one."""
+        return self._query.name
+
+    @property
+    def version(self) -> Optional[int]:
+        """Version of the underlying dataset, if there is one."""
+        return self._query.version
+
+    def __or__(self, other: "Self") -> "Self":
+        """Return `self.union(other)`."""
+        return self.union(other)
+
     def print_schema(self) -> None:
         """Print schema of the chain."""
         self._effective_signals_schema.print_tree()

-    def clone(self
+    def clone(self) -> "Self":
         """Make a copy of the chain in a new table."""
-
-
-
+        return self._evolve(query=self._query.clone(new_table=True))
+
+    def _evolve(
+        self,
+        *,
+        query: Optional[DatasetQuery] = None,
+        settings: Optional[Settings] = None,
+        signal_schema=None,
+        _sys=None,
+    ) -> "Self":
+        if query is None:
+            query = self._query.clone(new_table=False)
+        if settings is None:
+            settings = self._settings
+        if signal_schema is None:
+            signal_schema = copy.deepcopy(self.signals_schema)
+        if _sys is None:
+            _sys = self._sys
+        return type(self)(
+            query, settings, signal_schema=signal_schema, setup=self._setup, _sys=_sys
+        )

     def settings(
         self,
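`_evolve` is the construction funnel that replaces the old mutate-in-place inheritance: any field not passed explicitly is taken (or copied) from the current chain, and a brand-new instance comes back. A short sketch of the resulting value semantics, assuming a saved dataset named "my_cats":

```py
from datachain.lib.dc import DataChain

dc = DataChain.from_dataset("my_cats")
dc2 = dc.settings(cache=True)  # returns a new chain via _evolve()
assert dc2 is not dc           # the original chain is untouched
dc3 = dc2.select("file")       # each further step evolves another copy
```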
@@ -332,11 +355,11 @@ class DataChain(DatasetQuery):
         )
         ```
         """
-
-
-
-
-        return
+        if sys is None:
+            sys = self._sys
+        settings = copy.copy(self._settings)
+        settings.add(Settings(cache, parallel, workers, min_task_size))
+        return self._evolve(settings=settings, _sys=sys)

     def reset_settings(self, settings: Optional[Settings] = None) -> "Self":
         """Reset all settings to default values."""
@@ -434,7 +457,7 @@ class DataChain(DatasetQuery):
         version: Optional[int] = None,
         session: Optional[Session] = None,
         settings: Optional[dict] = None,
-    ) -> "
+    ) -> "Self":
         """Get data from a saved Dataset. It returns the chain itself.

         Parameters:
@@ -446,7 +469,24 @@ class DataChain(DatasetQuery):
         chain = DataChain.from_dataset("my_cats")
         ```
         """
-
+        query = DatasetQuery(
+            name=name,
+            version=version,
+            session=session,
+            indexing_column_types=File._datachain_column_types,
+        )
+        telemetry.send_event_once("class", "datachain_init", name=name, version=version)
+        if settings:
+            _settings = Settings(**settings)
+        else:
+            _settings = Settings()
+
+        signals_schema = SignalSchema({"sys": Sys})
+        if query.feature_schema:
+            signals_schema |= SignalSchema.deserialize(query.feature_schema)
+        else:
+            signals_schema |= SignalSchema.from_column_types(query.column_types or {})
+        return cls(query, _settings, signals_schema)

     @classmethod
     def from_json(
@@ -699,7 +739,11 @@ class DataChain(DatasetQuery):
         version : version of a dataset. Default - the last version that exist.
         """
         schema = self.signals_schema.clone_without_sys_signals().serialize()
-        return
+        return self._evolve(
+            query=self._query.save(
+                name=name, version=version, feature_schema=schema, **kwargs
+            )
+        )

     def apply(self, func, *args, **kwargs):
         """Apply any function to the chain.
@@ -765,13 +809,14 @@ class DataChain(DatasetQuery):
         """
         udf_obj = self._udf_to_obj(Mapper, func, params, output, signal_map)

-
-
-
+        return self._evolve(
+            query=self._query.add_signals(
+                udf_obj.to_udf_wrapper(),
+                **self._settings.to_dict(),
+            ),
+            signal_schema=self.signals_schema | udf_obj.output,
         )

-        return chain.add_schema(udf_obj.output).reset_settings(self._settings)
-
     def gen(
         self,
         func: Optional[Callable] = None,
@@ -800,14 +845,14 @@ class DataChain(DatasetQuery):
         ```
         """
         udf_obj = self._udf_to_obj(Generator, func, params, output, signal_map)
-
-        self
-
-
+        return self._evolve(
+            query=self._query.generate(
+                udf_obj.to_udf_wrapper(),
+                **self._settings.to_dict(),
+            ),
+            signal_schema=udf_obj.output,
         )

-        return chain.reset_schema(udf_obj.output).reset_settings(self._settings)
-
     def agg(
         self,
         func: Optional[Callable] = None,
@@ -840,15 +885,15 @@ class DataChain(DatasetQuery):
         ```
         """
         udf_obj = self._udf_to_obj(Aggregator, func, params, output, signal_map)
-
-        self
-
-
-
+        return self._evolve(
+            query=self._query.generate(
+                udf_obj.to_udf_wrapper(),
+                partition_by=partition_by,
+                **self._settings.to_dict(),
+            ),
+            signal_schema=udf_obj.output,
         )

-        return chain.reset_schema(udf_obj.output).reset_settings(self._settings)
-
     def batch_map(
         self,
         func: Optional[Callable] = None,
@@ -876,14 +921,14 @@ class DataChain(DatasetQuery):
         ```
         """
         udf_obj = self._udf_to_obj(BatchMapper, func, params, output, signal_map)
-
-        self
-
-
+        return self._evolve(
+            query=self._query.add_signals(
+                udf_obj.to_udf_wrapper(batch),
+                **self._settings.to_dict(),
+            ),
+            signal_schema=self.signals_schema | udf_obj.output,
         )

-        return chain.add_schema(udf_obj.output).reset_settings(self._settings)
-
     def _udf_to_obj(
         self,
         target_class: type[UDFBase],
@@ -907,17 +952,12 @@ class DataChain(DatasetQuery):
         return target_class._create(sign, params_schema)

     def _extend_to_data_model(self, method_name, *args, **kwargs):
-
+        query_func = getattr(self._query, method_name)

         new_schema = self.signals_schema.resolve(*args)
         columns = [C(col) for col in new_schema.db_signals()]
-
-        if isinstance(res, DataChain):
-            res.signals_schema = new_schema
-
-        return res
+        return query_func(*columns, **kwargs)

-    @detach
     @resolve_columns
     def order_by(self, *args, descending: bool = False) -> "Self":
         """Orders by specified set of signals.
@@ -928,9 +968,8 @@ class DataChain(DatasetQuery):
         if descending:
             args = tuple(sqlalchemy.desc(a) for a in args)

-        return
+        return self._evolve(query=self._query.order_by(*args))

-    @detach
     def distinct(self, arg: str, *args: str) -> "Self":  # type: ignore[override]
         """Removes duplicate rows based on uniqueness of some input column(s)
         i.e if rows are found with the same value of input column(s), only one
@@ -942,29 +981,30 @@ class DataChain(DatasetQuery):
         )
         ```
         """
-        return
+        return self._evolve(
+            query=self._query.distinct(
+                *self.signals_schema.resolve(arg, *args).db_signals()
+            )
+        )

-    @detach
     def select(self, *args: str, _sys: bool = True) -> "Self":
         """Select only a specified set of signals."""
         new_schema = self.signals_schema.resolve(*args)
         if _sys:
             new_schema = SignalSchema({"sys": Sys}) | new_schema
         columns = new_schema.db_signals()
-
-
-
+        return self._evolve(
+            query=self._query.select(*columns), signal_schema=new_schema
+        )

-    @detach
     def select_except(self, *args: str) -> "Self":
         """Select all the signals expect the specified signals."""
         new_schema = self.signals_schema.select_except_signals(*args)
         columns = new_schema.db_signals()
-
-
-
+        return self._evolve(
+            query=self._query.select(*columns), signal_schema=new_schema
+        )

-    @detach
     def mutate(self, **kwargs) -> "Self":
         """Create new signals based on existing signals.

@@ -1029,9 +1069,9 @@ class DataChain(DatasetQuery):
         # adding new signal
         mutated[name] = value

-
-
-
+        return self._evolve(
+            query=self._query.mutate(**mutated), signal_schema=schema.mutate(kwargs)
+        )

     @property
     def _effective_signals_schema(self) -> "SignalSchema":
@@ -1058,7 +1098,7 @@ class DataChain(DatasetQuery):
         a tuple of row values.
         """
         db_signals = self._effective_signals_schema.db_signals()
-        with
+        with self._query.select(*db_signals).as_iterable() as rows:
             if row_factory:
                 rows = (row_factory(db_signals, r) for r in rows)
             yield from rows
@@ -1126,7 +1166,7 @@ class DataChain(DatasetQuery):
         chain = self.select(*cols) if cols else self
         signals_schema = chain._effective_signals_schema
         db_signals = signals_schema.db_signals()
-        with
+        with self._query.select(*db_signals).as_iterable() as rows:
             for row in rows:
                 ret = signals_schema.row_to_features(
                     row, catalog=chain.session.catalog, cache=chain._settings.cache
@@ -1156,7 +1196,7 @@ class DataChain(DatasetQuery):
         """
         from datachain.torch import PytorchDataset

-        if self.attached:
+        if self._query.attached:
             chain = self
         else:
             chain = self.save()
@@ -1164,7 +1204,7 @@ class DataChain(DatasetQuery):
         return PytorchDataset(
             chain.name,
             chain.version,
-            catalog=self.catalog,
+            catalog=self.session.catalog,
             transform=transform,
             tokenizer=tokenizer,
             tokenizer_kwargs=tokenizer_kwargs,
@@ -1175,7 +1215,6 @@ class DataChain(DatasetQuery):
         schema = self.signals_schema.clone_without_file_signals()
         return self.select(*schema.values.keys())

-    @detach
     def merge(
         self,
         right_ds: "DataChain",
@@ -1240,7 +1279,7 @@ class DataChain(DatasetQuery):
         )

         if self == right_ds:
-            right_ds = right_ds.clone(
+            right_ds = right_ds.clone()

         errors = []

@@ -1266,9 +1305,11 @@ class DataChain(DatasetQuery):
                 on, right_on, f"Could not resolve {', '.join(errors)}"
             )

-
-
-
+        query = self._query.join(
+            right_ds._query, sqlalchemy.and_(*ops), inner, rname + "{name}"
+        )
+        query.feature_schema = None
+        ds = self._evolve(query=query)

         signals_schema = self.signals_schema.clone_without_sys_signals()
         right_signals_schema = right_ds.signals_schema.clone_without_sys_signals()
@@ -1278,6 +1319,14 @@ class DataChain(DatasetQuery):

         return ds

+    def union(self, other: "Self") -> "Self":
+        """Return the set union of the two datasets.
+
+        Parameters:
+            other: chain whose rows will be added to `self`.
+        """
+        return self._evolve(query=self._query.union(other._query))
+
     def subtract(  # type: ignore[override]
         self,
         other: "DataChain",
@@ -1341,7 +1390,7 @@ class DataChain(DatasetQuery):
             other.signals_schema.resolve(*right_on).db_signals(),
         )  # type: ignore[arg-type]
         )
-        return
+        return self._evolve(query=self._query.subtract(other._query, signals))  # type: ignore[arg-type]

     @classmethod
     def from_values(
@@ -1449,7 +1498,7 @@ class DataChain(DatasetQuery):
         transpose : Whether to transpose rows and columns.
         truncate : Whether or not to truncate the contents of columns.
         """
-        dc = self.limit(limit) if limit > 0 else self
+        dc = self.limit(limit) if limit > 0 else self  # type: ignore[misc]
         df = dc.to_pandas(flatten)

         if df.empty:
@@ -1782,7 +1831,7 @@ class DataChain(DatasetQuery):
         settings: Optional[dict] = None,
         in_memory: bool = False,
         schema: Optional[dict[str, DataType]] = None,
-    ) -> "
+    ) -> "Self":
         """Create a DataChain from the provided records. This method can be used for
         programmatically generating a chain in contrast of reading data from storages
         or other sources.
@@ -1837,7 +1886,7 @@ class DataChain(DatasetQuery):
         insert_q = dr.get_table().insert()
         for record in to_insert:
             db.execute(insert_q.values(**record))
-        return
+        return cls.from_dataset(name=dsr.name, session=session, settings=settings)

     def sum(self, fr: DataType):  # type: ignore[override]
         """Compute the sum of a column."""
@@ -1898,8 +1947,8 @@ class DataChain(DatasetQuery):
     ) -> None:
         """Method that exports all files from chain to some folder."""
         if placement == "filename" and (
-
-            != self.count()
+            self._query.distinct(pathfunc.name(C(f"{signal}__path"))).count()
+            != self._query.count()
         ):
             raise ValueError("Files with the same name found")

@@ -1919,10 +1968,9 @@ class DataChain(DatasetQuery):
         NOTE: Samples are not deterministic, and streamed/paginated queries or
         multiple workers will draw samples with replacement.
         """
-        return
+        return self._evolve(query=self._query.sample(n))

-
-    def filter(self, *args) -> "Self":
+    def filter(self, *args: Any) -> "Self":
         """Filter the chain according to conditions.

         Example:
@@ -1955,14 +2003,50 @@ class DataChain(DatasetQuery):
         )
         ```
         """
-        return
+        return self._evolve(query=self._query.filter(*args))

-    @detach
     def limit(self, n: int) -> "Self":
-        """Return the first n rows of the chain.
-
+        """Return the first `n` rows of the chain.
+
+        If the chain is unordered, which rows are returned is undefined.
+        If the chain has less than `n` rows, the whole chain is returned.
+
+        Parameters:
+            n (int): Number of rows to return.
+        """
+        return self._evolve(query=self._query.limit(n))

-    @detach
     def offset(self, offset: int) -> "Self":
-        """Return the results starting with the offset row.
-
+        """Return the results starting with the offset row.
+
+        If the chain is unordered, which rows are skipped in undefined.
+        If the chain has less than `offset` rows, the result is an empty chain.
+
+        Parameters:
+            offset (int): Number of rows to skip.
+        """
+        return self._evolve(query=self._query.offset(offset))
+
+    def count(self) -> int:
+        """Return the number of rows in the chain."""
+        return self._query.count()
+
+    def exec(self) -> "Self":
+        """Execute the chain."""
+        return self._evolve(query=self._query.exec())
+
+    def chunk(self, index: int, total: int) -> "Self":
+        """Split a chain into smaller chunks for e.g. parallelization.
+
+        Example:
+            ```py
+            chain = DataChain.from_storage(...)
+            chunk_1 = query._chunk(0, 2)
+            chunk_2 = query._chunk(1, 2)
+            ```
+
+        Note:
+            Bear in mind that `index` is 0-indexed but `total` isn't.
+            Use 0/3, 1/3 and 2/3, not 1/3, 2/3 and 3/3.
+        """
+        return self._evolve(query=self._query.chunk(index, total))
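The methods restored at the end of this file round out the query surface on top of the wrapped DatasetQuery. A brief usage sketch, assuming two saved datasets with compatible schemas (the names are illustrative):

```py
from datachain.lib.dc import DataChain

cats = DataChain.from_dataset("cats")
dogs = DataChain.from_dataset("dogs")

pets = cats.union(dogs)    # same rows as `cats | dogs` via __or__
first = pets.chunk(0, 2)   # chunk() is 0-indexed: use (0, 2) and (1, 2)
second = pets.chunk(1, 2)
print(pets.count(), first.count(), second.count())
```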
datachain/lib/listing.py
CHANGED

@@ -1,7 +1,7 @@
 import posixpath
 from collections.abc import Iterator
 from datetime import datetime, timedelta, timezone
-from typing import TYPE_CHECKING, Callable, Optional
+from typing import TYPE_CHECKING, Callable, Optional, TypeVar

 from fsspec.asyn import get_loop
 from sqlalchemy.sql.expression import true
@@ -20,6 +20,8 @@ if TYPE_CHECKING:
 LISTING_TTL = 4 * 60 * 60  # cached listing lasts 4 hours
 LISTING_PREFIX = "lst__"  # listing datasets start with this name

+D = TypeVar("D", bound="DataChain")
+

 def list_bucket(uri: str, cache, client_config=None) -> Callable:
     """
@@ -38,11 +40,11 @@ def list_bucket(uri: str, cache, client_config=None) -> Callable:


 def ls(
-    dc:
+    dc: D,
     path: str,
     recursive: Optional[bool] = True,
     object_name="file",
-):
+) -> D:
     """
     Return files by some path from DataChain instance which contains bucket listing.
     Path can have globs.
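The point of the new `D` TypeVar is that `ls` now preserves the concrete chain type instead of widening to plain DataChain. A typing-only sketch with a hypothetical subclass:

```py
from datachain.lib.dc import DataChain
from datachain.lib.listing import ls

class MyChain(DataChain):
    """Hypothetical subclass, only to show the generic signature at work."""

def dogs_only(chain: MyChain) -> MyChain:
    # With `def ls(dc: D, ...) -> D`, type checkers infer MyChain here,
    # where a plain `-> "DataChain"` annotation would have widened it.
    return ls(chain, "dogs/*", recursive=True)
```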
datachain/lib/pytorch.py
CHANGED

@@ -9,6 +9,7 @@ from torch.utils.data import IterableDataset, get_worker_info
 from torchvision.transforms import v2
 from tqdm import tqdm

+from datachain import Session
 from datachain.catalog import Catalog, get_catalog
 from datachain.lib.dc import DataChain
 from datachain.lib.text import convert_text
@@ -87,8 +88,11 @@ class PytorchDataset(IterableDataset):
     def __iter__(self) -> Iterator[Any]:
         if self.catalog is None:
             self.catalog = self._get_catalog()
+        session = Session.get(catalog=self.catalog)
         total_rank, total_workers = self.get_rank_and_workers()
-        ds = DataChain(
+        ds = DataChain.from_dataset(
+            name=self.name, version=self.version, session=session
+        )
         ds = ds.remove_file_signals()

         if self.num_samples > 0:
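Each DataLoader worker now rebuilds its chain from the saved dataset's name and version rather than constructing DataChain directly. Roughly what `__iter__` does after this change, with illustrative values:

```py
from datachain import Session
from datachain.lib.dc import DataChain

# `catalog` stands in for the worker's Catalog (self.catalog in the class).
session = Session.get(catalog=catalog)
ds = DataChain.from_dataset(name="my_cats", version=1, session=session)
ds = ds.remove_file_signals()  # same follow-up step as in the diff above
```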
datachain/query/dataset.py
CHANGED

@@ -1037,7 +1037,7 @@ class DatasetQuery:
         session: Optional[Session] = None,
         indexing_column_types: Optional[dict[str, Any]] = None,
         in_memory: bool = False,
-    ):
+    ) -> None:
         self.session = Session.get(session, catalog=catalog, in_memory=in_memory)
         self.catalog = catalog or self.session.catalog
         self.steps: list[Step] = []
{datachain-0.3.20.dist-info → datachain-0.5.0.dist-info}/RECORD
CHANGED

@@ -2,10 +2,10 @@ datachain/__init__.py,sha256=ofPJ6B-d-ybSDRrE7J6wqF_ZRAB2W9U8l-eeuBtqPLg,865
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=Lg3Ck1PQLjQziMx9KU4atzbEnJXTE0924WMYkhgWtGU,8247
 datachain/cache.py,sha256=s0YHN7qurmQv-eC265TjeureK84TebWWAnL07cxchZQ,2997
-datachain/cli.py,sha256=
+datachain/cli.py,sha256=vVK7hNEyF7p5bUTmixkbgS7JYyTSpXeyRZJkWfpYUOw,30164
 datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
 datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
-datachain/dataset.py,sha256=
+datachain/dataset.py,sha256=w7qqJP7xYrm9CmBSmSezSxUQHZDsHKkwviF8AYUob7o,14671
 datachain/error.py,sha256=vbIbamnFMIojh1UpmxWoA6Omup7WFAFNJnf8xAkGWwI,1146
 datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
 datachain/listing.py,sha256=TkMmBzCiru26x4RaZiagWJTmTGbiy6yGrAsSJMr8cFE,8213
@@ -18,7 +18,7 @@ datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
 datachain/utils.py,sha256=KeFSRHsiYthnTu4a6bH-rw04mX1m8krTX0f2NqfQGFI,12114
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=
+datachain/catalog/catalog.py,sha256=FuKuIiCwPgN5Ea25hnFe_ZFZH9YEUZ2ma9k_Lczk-JU,63867
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
 datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
@@ -33,25 +33,25 @@ datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZ
 datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
 datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
 datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
-datachain/data_storage/metastore.py,sha256=
+datachain/data_storage/metastore.py,sha256=NV4FJ_W16Q19Sx70i5Qtre-n4DC2kMD0qw0vBz3j7Ks,52228
 datachain/data_storage/schema.py,sha256=AGbjyEir5UmRZXI3m0jChZogUh5wd8csj6-YlUWaAxQ,8383
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
 datachain/data_storage/sqlite.py,sha256=EBKJncuzcyQfcKFm2mUjvHjHRTODsteM-k_zndunBrw,28834
-datachain/data_storage/warehouse.py,sha256=
+datachain/data_storage/warehouse.py,sha256=fXhVfao3NfWFGbbG5uJ-Ga4bX1FiKVfcbDyQgECYfk8,32122
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/arrow.py,sha256=aUsoQmxDmuSnB8Ik9p57Y66gc_dgx6NBqkDDIfLsvno,7630
 datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
 datachain/lib/data_model.py,sha256=gHIjlow84GMRDa78yLL1Ud-N18or21fnTyPEwsatpXY,2045
 datachain/lib/dataset_info.py,sha256=srPPhI2UHf6hFPBecyFEVw2SS5aPisIIMsvGgKqi7ss,2366
-datachain/lib/dc.py,sha256=
+datachain/lib/dc.py,sha256=yTyHrKIswCzdlvl2n-wdEVZEEF5VQpkLJPzPfUL9CTU,72054
 datachain/lib/file.py,sha256=LjTW_-PDAnoUhvyB4bJ8Y8n__XGqrxvmd9mDOF0Gir8,14875
 datachain/lib/hf.py,sha256=cPnmLuprr0pYABH7KqA5FARQ1JGlywdDwD3yDzVAm4k,5920
 datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
-datachain/lib/listing.py,sha256=
+datachain/lib/listing.py,sha256=_2oQXh03RAOydeyW3G4OSXCncZaapMGlyGCYcvuUPhc,4145
 datachain/lib/listing_info.py,sha256=36NZ-tXY5Y118wurkajuWWbcE8UCjkRwZlacDtN9F3g,954
 datachain/lib/meta_formats.py,sha256=3f-0vpMTesagS9iMd3y9-u9r-7g0eqYsxmK4fVfNWlw,6635
 datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
-datachain/lib/pytorch.py,sha256=
+datachain/lib/pytorch.py,sha256=W-ARi2xH1f1DUkVfRuerW-YWYgSaJASmNCxtz2lrJGI,6072
 datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
 datachain/lib/signal_schema.py,sha256=iqgubjCBRiUJB30miv05qFX4uU04dA_Pzi3DCUsHZGs,24177
 datachain/lib/tar.py,sha256=3WIzao6yD5fbLqXLTt9GhPGNonbFIs_fDRu-9vgLgsA,1038
@@ -70,7 +70,7 @@ datachain/lib/convert/unflatten.py,sha256=Ogvh_5wg2f38_At_1lN0D_e2uZOOpYEvwvB2xd
 datachain/lib/convert/values_to_tuples.py,sha256=YOdbjzHq-uj6-cV2Qq43G72eN2avMNDGl4x5t6yQMl8,3931
 datachain/query/__init__.py,sha256=0NBOZVgIDpCcj1Ci883dQ9A0iiwe03xzmotkOCFbxYc,293
 datachain/query/batch.py,sha256=-vlpINJiertlnaoUVv1C95RatU0F6zuhpIYRufJRo1M,3660
-datachain/query/dataset.py,sha256=
+datachain/query/dataset.py,sha256=tLCTaj4K93BY93GgOPv9PknZByEF89zpHc7y9s8ZF_w,53610
 datachain/query/dispatch.py,sha256=CFAc09O6UllcyUSSEY1GUlEMPzeO8RYhXinNN4HBl9M,12405
 datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -97,9 +97,9 @@ datachain/sql/sqlite/base.py,sha256=WLPHBhZbXbiqPoRV1VgDrXJqku4UuvJpBhYeQ0k5rI8,
 datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.
-datachain-0.
-datachain-0.
-datachain-0.
-datachain-0.
-datachain-0.
+datachain-0.5.0.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.5.0.dist-info/METADATA,sha256=tKSZNiHZY0WJ_w6irkpSF7qDfuOTfiYNEQ6St3eBs-M,17156
+datachain-0.5.0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+datachain-0.5.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.5.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.5.0.dist-info/RECORD,,
{datachain-0.3.20.dist-info → datachain-0.5.0.dist-info}/LICENSE: file without changes
{datachain-0.3.20.dist-info → datachain-0.5.0.dist-info}/WHEEL: file without changes
{datachain-0.3.20.dist-info → datachain-0.5.0.dist-info}/entry_points.txt: file without changes
{datachain-0.3.20.dist-info → datachain-0.5.0.dist-info}/top_level.txt: file without changes