datachain 0.1.13__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic; consult the registry's advisory page for more details.

Files changed (44):
  1. datachain/_version.py +2 -2
  2. datachain/asyn.py +3 -3
  3. datachain/catalog/__init__.py +3 -3
  4. datachain/catalog/catalog.py +6 -6
  5. datachain/catalog/loader.py +3 -3
  6. datachain/cli.py +2 -1
  7. datachain/client/azure.py +37 -1
  8. datachain/client/fsspec.py +1 -1
  9. datachain/client/local.py +1 -1
  10. datachain/data_storage/__init__.py +1 -1
  11. datachain/data_storage/metastore.py +11 -3
  12. datachain/data_storage/schema.py +2 -3
  13. datachain/data_storage/warehouse.py +31 -30
  14. datachain/dataset.py +1 -3
  15. datachain/lib/arrow.py +85 -0
  16. datachain/lib/dc.py +377 -178
  17. datachain/lib/feature.py +41 -90
  18. datachain/lib/feature_registry.py +3 -1
  19. datachain/lib/feature_utils.py +2 -2
  20. datachain/lib/file.py +20 -20
  21. datachain/lib/image.py +9 -2
  22. datachain/lib/meta_formats.py +66 -34
  23. datachain/lib/settings.py +5 -5
  24. datachain/lib/signal_schema.py +103 -105
  25. datachain/lib/udf.py +3 -12
  26. datachain/lib/udf_signature.py +11 -6
  27. datachain/lib/webdataset_laion.py +5 -22
  28. datachain/listing.py +8 -8
  29. datachain/node.py +1 -1
  30. datachain/progress.py +1 -1
  31. datachain/query/builtins.py +1 -1
  32. datachain/query/dataset.py +39 -110
  33. datachain/query/dispatch.py +1 -1
  34. datachain/query/metrics.py +19 -0
  35. datachain/query/schema.py +13 -3
  36. datachain/sql/__init__.py +1 -1
  37. datachain/utils.py +1 -122
  38. {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/METADATA +10 -3
  39. {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/RECORD +43 -42
  40. {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/WHEEL +1 -1
  41. datachain/lib/parquet.py +0 -32
  42. {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/LICENSE +0 -0
  43. {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/entry_points.txt +0 -0
  44. {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/top_level.txt +0 -0
@@ -56,13 +56,13 @@ from datachain.storage import Storage, StorageURI
56
56
  from datachain.utils import batched, determine_processes
57
57
 
58
58
  from .batch import RowBatch
59
+ from .metrics import metrics
59
60
  from .schema import C, UDFParamSpec, normalize_param
60
61
  from .session import Session
61
62
  from .udf import UDFBase, UDFClassWrapper, UDFFactory, UDFType
62
63
 
63
64
  if TYPE_CHECKING:
64
65
  import pandas as pd
65
- from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg
66
66
  from sqlalchemy.sql.elements import ClauseElement
67
67
  from sqlalchemy.sql.schema import Table
68
68
  from sqlalchemy.sql.selectable import GenerativeSelect
@@ -71,7 +71,6 @@ if TYPE_CHECKING:
71
71
  from datachain.catalog import Catalog
72
72
  from datachain.data_storage import AbstractWarehouse
73
73
  from datachain.dataset import DatasetRecord
74
- from datachain.sql.types import SQLType
75
74
 
76
75
  from .udf import UDFResult
77
76
 
@@ -197,7 +196,7 @@ class IndexingStep(StartingStep):
197
196
  def apply(self):
198
197
  self.catalog.index([self.path], **self.kwargs)
199
198
  uri, path = self.parse_path()
200
- partial_id, partial_path = self.catalog.metastore.get_valid_partial_id(
199
+ _partial_id, partial_path = self.catalog.metastore.get_valid_partial_id(
201
200
  uri, path
202
201
  )
203
202
  dataset = self.catalog.get_dataset(Storage.dataset_name(uri, partial_path))
@@ -523,30 +522,23 @@ class UDF(Step, ABC):
523
522
  "cache": self.cache,
524
523
  }
525
524
 
526
- feature_module_name, feature_file = self.process_feature_module()
527
-
528
- # Write the module content to a .py file
529
- with open(f"{feature_module_name}.py", "w") as module_file:
530
- module_file.write(feature_file)
531
-
532
- process_data = dumps(udf_info, recurse=True)
533
525
  # Run the UDFDispatcher in another process to avoid needing
534
526
  # if __name__ == '__main__': in user scripts
535
527
  datachain_exec_path = os.environ.get("DATACHAIN_EXEC_PATH", "datachain")
536
528
 
537
529
  envs = dict(os.environ)
538
530
  envs.update({"PYTHONPATH": os.getcwd()})
539
- try:
540
- result = subprocess.run(
541
- [datachain_exec_path, "--internal-run-udf"], # noqa: S603
531
+ with self.process_feature_module():
532
+ process_data = dumps(udf_info, recurse=True)
533
+ result = subprocess.run( # noqa: S603
534
+ [datachain_exec_path, "--internal-run-udf"],
542
535
  input=process_data,
543
536
  check=False,
544
537
  env=envs,
545
538
  )
546
539
  if result.returncode != 0:
547
540
  raise RuntimeError("UDF Execution Failed!")
548
- finally:
549
- os.unlink(f"{feature_module_name}.py")
541
+
550
542
  else:
551
543
  # Otherwise process single-threaded (faster for smaller UDFs)
552
544
  # Optionally instantiate the UDF instance if a class is provided.
@@ -600,6 +592,7 @@ class UDF(Step, ABC):
600
592
  self.catalog.warehouse.close()
601
593
  raise
602
594
 
595
+ @contextlib.contextmanager
603
596
  def process_feature_module(self):
604
597
  # Generate a random name for the feature module
605
598
  feature_module_name = "tmp" + _random_string(10)
@@ -611,10 +604,14 @@ class UDF(Step, ABC):
611
604
  for name, obj in inspect.getmembers(sys.modules["__main__"], _imports)
612
605
  if not (name.startswith("__") and name.endswith("__"))
613
606
  ]
607
+ main_module = sys.modules["__main__"]
608
+
614
609
  # Get the feature classes from the main module
615
- feature_classes = dict(
616
- inspect.getmembers(sys.modules["__main__"], _feature_predicate)
617
- )
610
+ feature_classes = {
611
+ name: obj
612
+ for name, obj in main_module.__dict__.items()
613
+ if _feature_predicate(obj)
614
+ }
618
615
  # Get the source code of the feature classes
619
616
  feature_sources = [source.getsource(cls) for _, cls in feature_classes.items()]
620
617
  # Set the module name for the feature classes to the generated name
@@ -626,7 +623,18 @@ class UDF(Step, ABC):
626
623
  # Combine the import lines and feature sources
627
624
  feature_file = "".join(import_lines) + "\n".join(feature_sources)
628
625
 
629
- return feature_module_name, feature_file
626
+ # Write the module content to a .py file
627
+ with open(f"{feature_module_name}.py", "w") as module_file:
628
+ module_file.write(feature_file)
629
+
630
+ try:
631
+ yield feature_module_name
632
+ finally:
633
+ for cls in feature_classes.values():
634
+ cls.__module__ = main_module.__name__
635
+ os.unlink(f"{feature_module_name}.py")
636
+ # Remove the dynamic module from sys.modules
637
+ del sys.modules[feature_module_name]
630
638
 
631
639
  def create_partitions_table(self, query: Select) -> "Table":
632
640
  """
@@ -685,8 +693,7 @@ class UDF(Step, ABC):
685
693
  )
686
694
 
687
695
  query, tables = self.process_input_query(query)
688
- for t in tables:
689
- temp_tables.append(t.name)
696
+ temp_tables.extend(t.name for t in tables)
690
697
  udf_table = self.create_udf_table(_query)
691
698
  temp_tables.append(udf_table.name)
692
699
  self.populate_udf_table(udf_table, query)
@@ -1120,6 +1127,12 @@ class DatasetQuery:
1120
1127
  indexing_feature_schema: Optional[dict] = None,
1121
1128
  indexing_column_types: Optional[dict[str, Any]] = None,
1122
1129
  ):
1130
+ if client_config is None:
1131
+ client_config = {}
1132
+
1133
+ if anon:
1134
+ client_config["anon"] = True
1135
+
1123
1136
  self.steps: list[Step] = []
1124
1137
  self.catalog = catalog or get_catalog(client_config=client_config)
1125
1138
  self._chunk_index: Optional[int] = None
@@ -1134,22 +1147,14 @@ class DatasetQuery:
1134
1147
  self.column_types: Optional[dict[str, Any]] = None
1135
1148
  self.session = Session.get(session, catalog=catalog)
1136
1149
 
1137
- if client_config is None:
1138
- client_config = {}
1139
-
1140
- if anon:
1141
- client_config["anon"] = True
1142
-
1143
1150
  if path:
1144
- self.starting_step = IndexingStep(
1145
- path, self.catalog, {"client_config": client_config}, recursive
1146
- )
1151
+ self.starting_step = IndexingStep(path, self.catalog, {}, recursive)
1147
1152
  self.feature_schema = indexing_feature_schema
1148
1153
  self.column_types = indexing_column_types
1149
1154
  elif name:
1150
1155
  ds = self.catalog.get_dataset(name)
1151
1156
  self.version = version or ds.latest_version
1152
- self.feature_schema = ds.feature_schema
1157
+ self.feature_schema = ds.get_version(self.version).feature_schema
1153
1158
  self.column_types = copy(ds.schema)
1154
1159
  if "id" in self.column_types:
1155
1160
  self.column_types.pop("id")
@@ -1348,8 +1353,7 @@ class DatasetQuery:
1348
1353
  MapperCls = OrderedMapper if query._order_by_clauses else AsyncMapper # noqa: N806
1349
1354
  with contextlib.closing(row_iter()) as rows:
1350
1355
  mapper = MapperCls(get_params, rows, workers=workers)
1351
- for params in mapper.iterate():
1352
- yield params
1356
+ yield from mapper.iterate()
1353
1357
  finally:
1354
1358
  self.cleanup()
1355
1359
 
@@ -1386,82 +1390,6 @@ class DatasetQuery:
1386
1390
  records = self.to_records()
1387
1391
  return pd.DataFrame.from_records(records)
1388
1392
 
1389
- @classmethod
1390
- def from_dataframe(
1391
- cls,
1392
- df: Union["DataFrameXchg", "pd.DataFrame"],
1393
- name: str = "",
1394
- version: Optional[int] = None,
1395
- catalog: Optional["Catalog"] = None,
1396
- session: Optional[Session] = None,
1397
- ) -> "Self":
1398
- from datachain.utils import dtype_mapper
1399
-
1400
- catalog = catalog or get_catalog()
1401
- assert catalog is not None
1402
- session = Session.get(session, catalog=catalog)
1403
- assert session is not None
1404
-
1405
- try:
1406
- if name and version and catalog.get_dataset(name).has_version(version):
1407
- raise RuntimeError(f"Dataset {name} already has version {version}")
1408
- except DatasetNotFoundError:
1409
- pass
1410
-
1411
- if not name and version:
1412
- raise RuntimeError("Cannot set version for temporary datasets")
1413
-
1414
- import pandas as pd # noqa: F401
1415
- from pandas.api.interchange import from_dataframe
1416
-
1417
- # This is not optimal for dataframes other than pd.DataFrame, as it may copy
1418
- # all the data to a new dataframe.
1419
- pd_df = from_dataframe(df)
1420
-
1421
- dtype: dict[str, type[SQLType]] = {
1422
- str(pd_df.columns[i]): dtype_mapper(pd_df.iloc[:, i])
1423
- for i in range(len(pd_df.columns))
1424
- }
1425
-
1426
- name = name or session.generate_temp_dataset_name()
1427
- dataset = catalog.create_dataset(
1428
- name,
1429
- version=version,
1430
- columns=[Column(name, typ) for name, typ in dtype.items()],
1431
- )
1432
- version = version or dataset.latest_version
1433
-
1434
- dr = catalog.warehouse.dataset_rows(dataset)
1435
- pd_df.to_sql(
1436
- dr.table.name,
1437
- catalog.warehouse.db.engine,
1438
- if_exists="append",
1439
- index=False,
1440
- chunksize=10_000,
1441
- dtype=dtype,
1442
- )
1443
-
1444
- catalog.metastore.update_dataset_status(
1445
- dataset, DatasetStatus.COMPLETE, version=version
1446
- )
1447
- catalog.update_dataset_version_with_warehouse_info(dataset, version)
1448
- return cls(name=name, version=version, catalog=catalog, session=session)
1449
-
1450
- from_pandas = from_dataframe
1451
-
1452
- @classmethod
1453
- def from_parquet(
1454
- cls,
1455
- uri: str,
1456
- *args,
1457
- **kwargs,
1458
- ) -> "Self":
1459
- import pandas as pd
1460
-
1461
- pd_df = pd.read_parquet(uri, dtype_backend="pyarrow")
1462
-
1463
- return cls.from_dataframe(pd_df, *args, **kwargs)
1464
-
1465
1393
  def shuffle(self) -> "Self":
1466
1394
  # ToDo: implement shaffle based on seed and/or generating random column
1467
1395
  return self.order_by(C.random)
@@ -1853,6 +1781,7 @@ def _get_output_fd_for_write() -> Union[str, int]:
1853
1781
  class ExecutionResult:
1854
1782
  preview: list[dict] = attrs.field(factory=list)
1855
1783
  dataset: Optional[tuple[str, int]] = None
1784
+ metrics: dict[str, Any] = attrs.field(factory=dict)
1856
1785
 
1857
1786
 
1858
1787
  def _send_result(dataset_query: DatasetQuery) -> None:
@@ -1886,7 +1815,7 @@ def _send_result(dataset_query: DatasetQuery) -> None:
1886
1815
  dataset = dataset_query.name, dataset_query.version
1887
1816
 
1888
1817
  preview = preview_query.to_records()
1889
- result = ExecutionResult(preview, dataset)
1818
+ result = ExecutionResult(preview, dataset, metrics)
1890
1819
  data = attrs.asdict(result)
1891
1820
 
1892
1821
  with open(_get_output_fd_for_write(), mode="w") as f:
@@ -257,7 +257,7 @@ class UDFDispatcher:
257
257
 
258
258
  if self.buffer_size < n_workers:
259
259
  raise RuntimeError(
260
- f"Parallel run error: buffer size is smaller than "
260
+ "Parallel run error: buffer size is smaller than "
261
261
  f"number of workers: {self.buffer_size} < {n_workers}"
262
262
  )
263
263
 
@@ -0,0 +1,19 @@
1
+ from typing import Optional, Union
2
+
3
+ metrics: dict[str, Union[str, int, float, bool, None]] = {}
4
+
5
+
6
+ def set(key: str, value: Union[str, int, float, bool, None]) -> None: # noqa: PYI041
7
+ """Set a metric value."""
8
+ if not isinstance(key, str):
9
+ raise TypeError("Key must be a string")
10
+ if not key:
11
+ raise ValueError("Key must not be empty")
12
+ if not isinstance(value, (str, int, float, bool, type(None))):
13
+ raise TypeError("Value must be a string, int, float or bool")
14
+ metrics[key] = value
15
+
16
+
17
+ def get(key: str) -> Optional[Union[str, int, float, bool]]:
18
+ """Get a metric value."""
19
+ return metrics[key]
datachain/query/schema.py CHANGED
@@ -18,20 +18,30 @@ if TYPE_CHECKING:
18
18
  from datachain.dataset import RowDict
19
19
 
20
20
 
21
+ DEFAULT_DELIMITER = "__"
22
+
23
+
21
24
  class ColumnMeta(type):
25
+ @staticmethod
26
+ def to_db_name(name: str) -> str:
27
+ return name.replace(".", DEFAULT_DELIMITER)
28
+
22
29
  def __getattr__(cls, name: str):
23
- return cls(name)
30
+ return cls(ColumnMeta.to_db_name(name))
24
31
 
25
32
 
26
33
  class Column(sa.ColumnClause, metaclass=ColumnMeta):
27
34
  inherit_cache: Optional[bool] = True
28
35
 
29
36
  def __init__(self, text, type_=None, is_literal=False, _selectable=None):
30
- self.name = text
37
+ self.name = ColumnMeta.to_db_name(text)
31
38
  super().__init__(
32
- text, type_=type_, is_literal=is_literal, _selectable=_selectable
39
+ self.name, type_=type_, is_literal=is_literal, _selectable=_selectable
33
40
  )
34
41
 
42
+ def __getattr__(self, name: str):
43
+ return Column(self.name + DEFAULT_DELIMITER + name)
44
+
35
45
  def glob(self, glob_str):
36
46
  return self.op("GLOB")(glob_str)
37
47
 
datachain/sql/__init__.py CHANGED
@@ -7,10 +7,10 @@ from .selectable import select, values
7
7
 
8
8
  __all__ = [
9
9
  "column",
10
+ "functions",
10
11
  "literal",
11
12
  "select",
12
13
  "values",
13
- "functions",
14
14
  ]
15
15
 
16
16
  default_setup()
datachain/utils.py CHANGED
@@ -18,9 +18,6 @@ from dateutil.parser import isoparse
18
18
 
19
19
  if TYPE_CHECKING:
20
20
  import pandas as pd
21
- import pyarrow as pa
22
-
23
- from datachain.sql.types import SQLType
24
21
 
25
22
  NUL = b"\0"
26
23
  TIME_ZERO = datetime.fromtimestamp(0, tz=timezone.utc)
@@ -78,7 +75,7 @@ class DataChainDir:
78
75
  if create:
79
76
  instance.init()
80
77
  else:
81
- NotADirectoryError(root)
78
+ raise NotADirectoryError(root)
82
79
  return instance
83
80
 
84
81
 
@@ -363,121 +360,3 @@ class JSONSerialize(json.JSONEncoder):
363
360
  return str(obj)
364
361
 
365
362
  return super().default(obj)
366
-
367
-
368
- def dtype_mapper(col: Union["pd.Index", "pd.Series"]) -> type["SQLType"]: # noqa: PLR0911
369
- from pandas import ArrowDtype
370
- from pandas.api.types import infer_dtype
371
-
372
- from datachain.sql.types import (
373
- Binary,
374
- Boolean,
375
- DateTime,
376
- Float,
377
- Float32,
378
- Float64,
379
- Int,
380
- Int32,
381
- Int64,
382
- String,
383
- UInt64,
384
- )
385
-
386
- if isinstance(col.dtype, ArrowDtype):
387
- return arrow_type_mapper(col.dtype.pyarrow_dtype)
388
-
389
- col_type = infer_dtype(col, skipna=True)
390
-
391
- if col_type in ("datetime", "datetime64"):
392
- return DateTime
393
- if col_type == "bytes":
394
- return Binary
395
- if col_type == "floating":
396
- if col.dtype == "float32":
397
- return Float32
398
- if col.dtype == "float64":
399
- return Float64
400
- return Float
401
- if col_type == "integer":
402
- if col.dtype.name.lower() in ("int8", "int16", "int32"):
403
- return Int32
404
- if col.dtype.name.lower() == "int64":
405
- return Int64
406
- if col.dtype.name.lower().startswith("uint"):
407
- return UInt64
408
- return Int
409
- if col_type == "boolean":
410
- return Boolean
411
- if col_type == "date":
412
- return DateTime
413
- if col_type in (
414
- "complex",
415
- "time",
416
- "timedelta",
417
- "timedelta64",
418
- "period",
419
- "interval",
420
- ):
421
- raise ValueError(f"{col_type!r} datatypes not supported")
422
- return String
423
-
424
-
425
- def arrow_type_mapper(col_type: "pa.DataType") -> type["SQLType"]: # noqa: PLR0911,C901
426
- try:
427
- import pyarrow as pa
428
- except ImportError as exc:
429
- raise ImportError(
430
- "Missing required dependency pyarrow for inferring types"
431
- ) from exc
432
-
433
- from datachain.sql.types import (
434
- JSON,
435
- Array,
436
- Binary,
437
- Boolean,
438
- DateTime,
439
- Float,
440
- Float32,
441
- Float64,
442
- Int,
443
- Int32,
444
- Int64,
445
- String,
446
- UInt64,
447
- )
448
-
449
- if pa.types.is_timestamp(col_type):
450
- return DateTime
451
- if pa.types.is_binary(col_type):
452
- return Binary
453
- if pa.types.is_floating(col_type):
454
- if pa.types.is_float32(col_type):
455
- return Float32
456
- if pa.types.is_float64(col_type):
457
- return Float64
458
- return Float
459
- if pa.types.is_integer(col_type):
460
- if (
461
- pa.types.is_int8(col_type)
462
- or pa.types.is_int16(col_type)
463
- or pa.types.is_int32(col_type)
464
- ):
465
- return Int32
466
- if pa.types.is_int64(col_type):
467
- return Int64
468
- if pa.types.is_unsigned_integer(col_type):
469
- return UInt64
470
- return Int
471
- if pa.types.is_boolean(col_type):
472
- return Boolean
473
- if pa.types.is_date(col_type):
474
- return DateTime
475
- if pa.types.is_string(col_type):
476
- return String
477
- if pa.types.is_list(col_type):
478
- return Array(arrow_type_mapper(col_type.value_type)) # type: ignore[return-value]
479
- if pa.types.is_struct(col_type) or pa.types.is_map(col_type):
480
- return JSON
481
- if isinstance(col_type, pa.lib.DictionaryType):
482
- return arrow_type_mapper(col_type.value_type) # type: ignore[return-value]
483
- raise ValueError(f"{col_type!r} datatypes not supported")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.1.13
3
+ Version: 0.2.0
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -44,12 +44,19 @@ Requires-Dist: torch >=2.1.0 ; extra == 'cv'
44
44
  Requires-Dist: torchvision ; extra == 'cv'
45
45
  Requires-Dist: transformers >=4.36.0 ; extra == 'cv'
46
46
  Provides-Extra: dev
47
- Requires-Dist: datachain[tests] ; extra == 'dev'
48
- Requires-Dist: mypy ==1.10.0 ; extra == 'dev'
47
+ Requires-Dist: datachain[docs,tests] ; extra == 'dev'
48
+ Requires-Dist: mypy ==1.10.1 ; extra == 'dev'
49
49
  Requires-Dist: types-python-dateutil ; extra == 'dev'
50
50
  Requires-Dist: types-PyYAML ; extra == 'dev'
51
51
  Requires-Dist: types-requests ; extra == 'dev'
52
52
  Requires-Dist: types-ujson ; extra == 'dev'
53
+ Provides-Extra: docs
54
+ Requires-Dist: mkdocs >=1.5.2 ; extra == 'docs'
55
+ Requires-Dist: mkdocs-gen-files >=0.5.0 ; extra == 'docs'
56
+ Requires-Dist: mkdocs-material >=9.3.1 ; extra == 'docs'
57
+ Requires-Dist: mkdocs-section-index >=0.3.6 ; extra == 'docs'
58
+ Requires-Dist: mkdocstrings-python >=1.6.3 ; extra == 'docs'
59
+ Requires-Dist: mkdocs-literate-nav >=0.6.1 ; extra == 'docs'
53
60
  Provides-Extra: remote
54
61
  Requires-Dist: datachain[pandas] ; extra == 'remote'
55
62
  Requires-Dist: lz4 ; extra == 'remote'
@@ -1,82 +1,83 @@
1
1
  datachain/__init__.py,sha256=9a0qX6tqyA9KC3ahLmGarqlRTZJXhM7HijAWpfUaOnQ,102
2
2
  datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
3
- datachain/_version.py,sha256=S22EPqqZRb53L2H7sobVA3TUXv9skvkYd-YtLuHuV6M,413
4
- datachain/asyn.py,sha256=opARBVZJxTKU3EGYd-8gcpNXoshuCfVz_b0ut3oxC50,7641
3
+ datachain/_version.py,sha256=H-qsvrxCpdhaQzyddR-yajEqI71hPxLa4KxzpP3uS1g,411
4
+ datachain/asyn.py,sha256=CKCFQJ0CbB3r04S7mUTXxriKzPnOvdUaVPXjM8vCtJw,7644
5
5
  datachain/cache.py,sha256=FaPWrqWznPffmskTb1pdPkt2jAMMf__9FC2zEnP0vDU,4022
6
- datachain/cli.py,sha256=1mBozBJS9Nq-EeahxwyKH8ef64E2v93o0CAEzxjcbkY,32209
6
+ datachain/cli.py,sha256=FLKRimIq917Dq0EmG3yLzMTqDaMA0vyCRUREOobUspY,32256
7
7
  datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
8
8
  datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
9
- datachain/dataset.py,sha256=4ksFJlfo_CEmt5xqXPca-hhQL1syFpKxCl_ZOhTS30s,14506
9
+ datachain/dataset.py,sha256=MZezyuJWNj_3PEtzr0epPMNyWAOTrhTSPI5FmemV6L4,14470
10
10
  datachain/error.py,sha256=GY9KYTmb7GHXn2gGHV9X-PBhgwLj3i7VpK7tGHtAoGM,1279
11
- datachain/listing.py,sha256=-Cm74Ne2Q36QuCpA22feDA_v-7uPqkwAOg-QzkiZAGQ,8243
12
- datachain/node.py,sha256=jCBvwiEUYSKQa27Tb6RORgaUjoiz7mOX63NQmP7JQY0,5703
11
+ datachain/listing.py,sha256=1arE_9gpjhHqGQCpQZj_mLoocrZWRNDHJ-bkPc08NQs,8247
12
+ datachain/node.py,sha256=fHe7k5ajI2g2qnzsG-_NQR_T-QdBYctVeEa8c8dsu_Y,5703
13
13
  datachain/nodes_fetcher.py,sha256=kca19yvu11JxoVY1t4_ydp1FmchiV88GnNicNBQ9NIA,831
14
14
  datachain/nodes_thread_pool.py,sha256=ZyzBvUImIPmi4WlKC2SW2msA0UhtembbTdcs2nx29A0,3191
15
- datachain/progress.py,sha256=cFKpoPon4iRjc4C213j5fKdl-Ga_80rUaKlS67kMa_Y,4550
15
+ datachain/progress.py,sha256=7_8FtJs770ITK9sMq-Lt4k4k18QmYl4yIG_kCoWID3o,4559
16
16
  datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
17
  datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
18
- datachain/utils.py,sha256=FW1LR5qCL5BtCYk-B-6LUCCMq8zOobkKKMrLqfFfCAg,13535
19
- datachain/catalog/__init__.py,sha256=Gkto1V7rUbVjJmgMEnB_VpVeHOfV47IQh1fSjEKnit4,409
20
- datachain/catalog/catalog.py,sha256=7ZqCsyr7W4enOIX6jiLJbBfFZvjkqjI1E_NOyL3V3AA,78585
18
+ datachain/utils.py,sha256=DV-_OON2OomEbxuQuK1lE_2qNTf28QByNcNcEhYsilE,10202
19
+ datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
20
+ datachain/catalog/catalog.py,sha256=5WkICtTYCN5xSMGDd5djLnEBw8kkcDf-IpFYf7kfeuQ,78654
21
21
  datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
22
- datachain/catalog/loader.py,sha256=FTI9s1b8iX0_TffSAx1mwm-ucsRV14NHX-F1xtTXRSE,7310
22
+ datachain/catalog/loader.py,sha256=GJ8zhEYkC7TuaPzCsjJQ4LtTdECu-wwYzC12MikPOMQ,7307
23
23
  datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
24
24
  datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
25
- datachain/client/azure.py,sha256=5yRxhejOpifYWswbyKZ1Y8pWb6v0K2DfnvVNB_ywF0w,920
25
+ datachain/client/azure.py,sha256=rxvF5erntGD32Y3DYK_TUCsyV2ALfuWWTnE8IWGwKEo,2542
26
26
  datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
27
- datachain/client/fsspec.py,sha256=kSr_fgqpWB5YljM9my7R6FzJ59AxHg3nrriR9f6C-4Y,13389
27
+ datachain/client/fsspec.py,sha256=F1Iyyw0iTrp2wQTFeignGtaHpm5Rg_cvbKaIzBX5aSc,13390
28
28
  datachain/client/gcs.py,sha256=ucX8e6JrqlFY-f80zkv084vxnKdtxpO32QJ-RG8Nv1s,4454
29
- datachain/client/local.py,sha256=FwtlrUdpKi1jdqk43XTK8uEPsAqy57Kf9X1FldxFxyk,5148
29
+ datachain/client/local.py,sha256=NQVkLTJQ-a7Udavqbh_4uT-IejfZQYn10j22owz9sis,5150
30
30
  datachain/client/s3.py,sha256=TmW4f7VUM5CMZjSmgyFQFKeMUGrXt2SLoLEbLOUleiU,6296
31
- datachain/data_storage/__init__.py,sha256=arlkQIj2J0ozcT_GvNDxm6PLT9NeabHvIsxPNDY_TxQ,398
31
+ datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZv32Y8,398
32
32
  datachain/data_storage/db_engine.py,sha256=mxOoWP4ntBMgLeTAk4dlEeIJArAz4x_tFrHytcAfLpo,3341
33
33
  datachain/data_storage/id_generator.py,sha256=VlDALKijggegAnNMJwuMETJgnLoPYxpkrkld5DNTPQw,3839
34
34
  datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
35
- datachain/data_storage/metastore.py,sha256=GnJH2NlFngdj30aK9CSaimJNnh_x_pSjntWUnvQuI2A,53649
36
- datachain/data_storage/schema.py,sha256=pF3KBi-8Pz3n5jRYoJpDR3gF8qUFdyAu2XR58J4Fyuo,8724
35
+ datachain/data_storage/metastore.py,sha256=y-4fYvuOPnWeYxAvqhDnw6CdlTvQiurg0Gg4TaG9LR0,54074
36
+ datachain/data_storage/schema.py,sha256=FrhmeZ_btT1CfVisa4ScabS11ixZ3xn3d_whvVsBtDA,8700
37
37
  datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
38
38
  datachain/data_storage/sqlite.py,sha256=eHTiJ0VIxU-chnhKNTN14EsaSnw5LAaxTLi9aMCZpl4,24978
39
- datachain/data_storage/warehouse.py,sha256=sQLOrv6DH8UcWH1aqlg3YJKmaHr696XkVafBxccZZ3U,33213
39
+ datachain/data_storage/warehouse.py,sha256=tL2mYoXVZe-coKLTRXEJ0sMdEr2BD0GwgIWip5PP5CM,33300
40
40
  datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
41
+ datachain/lib/arrow.py,sha256=7lAas8hSh3vL7S7s2KOlkYn4viQpfVbM_FQ_hLCh5oc,2593
41
42
  datachain/lib/cached_stream.py,sha256=BQI6gpJ2y7_-jqQo_0VB9ntbkOVISvj9wlDwGDQbqw8,3537
42
43
  datachain/lib/claude.py,sha256=iAauA1zNVNONpLzUo1t0QN5PZ5Ot6cZkfib7Ka_c638,1969
43
- datachain/lib/dc.py,sha256=MAy1Bsxknaz2aduZ28ffuq88x8Ja8QHA59CsyyiUlZE,27048
44
- datachain/lib/feature.py,sha256=C5lxQ_Ef4rL0-mef4A4EeoqB0rcNZ0ExRE26ehx20RM,14196
45
- datachain/lib/feature_registry.py,sha256=hg_S_9JPEYaQ-8PI64mU0sEhSJ-rcrKtwQk5TPBotEw,1570
46
- datachain/lib/feature_utils.py,sha256=6wbKZ2xq08b751EFBRJy1OZLqWYd_gxq9A_Em_aMFk4,4713
47
- datachain/lib/file.py,sha256=ZNGzmJSq7PNVxLhGLNdR9YSYkP-1ZeqY_yhDMcDNfkI,8586
44
+ datachain/lib/dc.py,sha256=szYQC4FOoYDMlSEDAPWZ25z4Nn-WeoaKiqKwwXbOJws,35355
45
+ datachain/lib/feature.py,sha256=KiPiMrU8ec-bJuUs70Xh4jytZdzKk9puQNQnx03K-po,12057
46
+ datachain/lib/feature_registry.py,sha256=YQsLYChNkYK6p2MpcVfAyBybtfN5EMiOJ8LIYakjmeQ,1602
47
+ datachain/lib/feature_utils.py,sha256=LIK233IWGWFhuav5Rm8de0xIOSnuwA1ubk6OYrxrfN0,4712
48
+ datachain/lib/file.py,sha256=K0jH8Q5Xle2TiVDTCzmopku_7Lh-IVufV_mgtaCNHYI,8744
48
49
  datachain/lib/gpt4_vision.py,sha256=idyXVZVWzltstGaVIu5RYE5UNbdqcPEjIWy81O1MwkM,2922
49
50
  datachain/lib/hf_image_to_text.py,sha256=HiPSWzJRDT-vnz9DXJbJBNCMNl9wmpxiSS3PbbVz8SE,3310
50
51
  datachain/lib/hf_pipeline.py,sha256=f0AH_XCziOF1OKN3d1w1swTBLaeajMJ8xgdsX37i5-o,2287
51
- datachain/lib/image.py,sha256=gb-My4rx5zMwOlDkcu_2G8GtRAMfsRvd7-QWUBErDw8,3486
52
+ datachain/lib/image.py,sha256=l2lgUR3YQzjpBmTJewzUtL5zJsLDQH32lbbaLu9WvWA,3631
52
53
  datachain/lib/image_transform.py,sha256=NXWtnVOcofWBgl_YMxb4ABpaT7JTBMx7tLKvErH1IC4,3024
53
54
  datachain/lib/iptc_exif_xmp.py,sha256=xrbxFeY-wRP6T5JsUgE3EXfTxKvZVymRaRD_VIfxD0A,2236
54
- datachain/lib/meta_formats.py,sha256=-JAS47NOO6rx1vmr0Cy-G_txxmTvMflXfzJiFD7rWlQ,5742
55
- datachain/lib/parquet.py,sha256=_MbRBzcgLLLegjKZNGF9Rm9IkYRSy0IqOksVjL1nntg,917
55
+ datachain/lib/meta_formats.py,sha256=wIVVLRLp45Zk4vjZRd_P1UtD24vpDCb-vILWtcsACwk,6630
56
56
  datachain/lib/pytorch.py,sha256=oU16XXAyAmiiabe1IoQoID00-u3uZ5GhCN48uAl6WDs,5421
57
57
  datachain/lib/reader.py,sha256=rPXXNoTUdm6PQwkAlaU-nOBreP_q4ett_EjFStrA_W0,1727
58
- datachain/lib/settings.py,sha256=mVtzyA_y9JA-6chMv1baggDvgeFsaUszySp660Gu4gw,2854
59
- datachain/lib/signal_schema.py,sha256=WPKHzgZ6HatbDQ2IN_L0JPi46n6acfHpkq91DYdlgSg,11753
58
+ datachain/lib/settings.py,sha256=6Nkoh8riETrftYwDp3aniK53Dsjc07MdztL8N0cW1D8,2849
59
+ datachain/lib/signal_schema.py,sha256=KaH194dAH8Zt8FtlNAgdVqcZlJc42y7RbcB37ldPPAY,11688
60
60
  datachain/lib/text.py,sha256=EEZrYohADi5rAGg3aLLRwtvyAV9js_yWAGhr2C3QbwI,2424
61
- datachain/lib/udf.py,sha256=PeZ-UbprfxlmgVbzH4FtNib3kIhTi9C869QM8RuM5dw,6292
62
- datachain/lib/udf_signature.py,sha256=1cOMcGXHbdBjyBRkvNxIEt9A_CoyiADxio2wkYu8U5M,7140
61
+ datachain/lib/udf.py,sha256=kPc_6fQ4DzbiYiXvbps7QPlJWTu9MSCS8eUfGqOhjG4,6124
62
+ datachain/lib/udf_signature.py,sha256=DAWMQ0dvFkKabpY5MV5K2q9YmOSTKfiV8KuUBs_6kMg,7258
63
63
  datachain/lib/unstructured.py,sha256=9Y6rAelXdYqkNbPaqz6DhXjhS8d6qXcP0ieIsWkzvkk,1143
64
64
  datachain/lib/utils.py,sha256=YQKzuW096SGe7QwHwdyS47k_9l2Rh73b-wBqt1-niw4,213
65
65
  datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
66
66
  datachain/lib/webdataset.py,sha256=JouI5WORgkl-am_DwQwWqO8RI1UwgbUPWsauZZj2Fmc,8221
67
- datachain/lib/webdataset_laion.py,sha256=tHn3Zhqx7Eb5Ywy_mobs6jDI0o_pFUbsuHqv0W_aNho,2840
67
+ datachain/lib/webdataset_laion.py,sha256=HAtSCbVvEQqzKkoRamRxDKaQALSB3QmJRU2yWRFNxwY,2147
68
68
  datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
69
69
  datachain/query/batch.py,sha256=sOMxXbaNii7lVyFIEZ2noqbhy_S8qtZ-WWxrka72shc,3474
70
- datachain/query/builtins.py,sha256=RyVEPZEuC7K1vlulrsaUjATLG_tZEvYYW7N5i6Fg-tQ,2781
71
- datachain/query/dataset.py,sha256=2DZAaEwX9gQlQgrRY3t-ymXN9SUkN_3XN0AfMFT6Mto,66861
72
- datachain/query/dispatch.py,sha256=9zcwKkLIuK5-xyRSQNw3yTqYLMHVbuZIn6KcB0g_ZBQ,13107
70
+ datachain/query/builtins.py,sha256=ZKNs49t8Oa_OaboCBIEqtXZt7c1Qe9OR_C_HpoDriIU,2781
71
+ datachain/query/dataset.py,sha256=c0ZoNEjAMmn0BdSnRm8XRWEsbaMH3xa_jd6FBJQDY1o,64576
72
+ datachain/query/dispatch.py,sha256=fEk1qalxAb5JJhN-iq0Mg9MyWve4XoN1Q7uvrX4mJY4,13106
73
+ datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
73
74
  datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
74
- datachain/query/schema.py,sha256=CGu9NBIFvX4iHQnaThLLxwWndxqkyUtYmo2JBgnZ4YQ,7660
75
+ datachain/query/schema.py,sha256=tWlUiu9eiS5y8BTQaPI2raGclt0YzcO3DoUN1OkwnrE,7946
75
76
  datachain/query/session.py,sha256=e4_vv4RqAjU-g3KK0avgLd9MEsmJBzRVEj1w8v7fP1k,3663
76
77
  datachain/query/udf.py,sha256=0WkBPW5ymZbOGMimSXpVWVc8whjTuYfRrnxPWNHabSk,7127
77
78
  datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
78
79
  datachain/remote/studio.py,sha256=bZb85WjtqMNFBoRuPbH-TEGpAyz0afROR7E9UgIef_Y,7438
79
- datachain/sql/__init__.py,sha256=AUU_NHscXxNt8gfI9WZg08x41JaI4aQNpBlUr6HA4rU,303
80
+ datachain/sql/__init__.py,sha256=A2djrbQwSMUZZEIKGnm-mnRA-NDSbiDJNpAmmwGNyIo,303
80
81
  datachain/sql/selectable.py,sha256=fBM-wS1TUA42kVEAAiwqGtibIevyZAEritwt8PZGyLQ,1589
81
82
  datachain/sql/types.py,sha256=BzUm0nCcMPASvdqpQouX5bdVcK3G3DBfeeNhau7X_hA,10234
82
83
  datachain/sql/utils.py,sha256=rzlJw08etivdrcuQPqNVvVWhuVSyUPUQEEc6DOhu258,818
@@ -92,9 +93,9 @@ datachain/sql/sqlite/__init__.py,sha256=TAdJX0Bg28XdqPO-QwUVKy8rg78cgMileHvMNot7
92
93
  datachain/sql/sqlite/base.py,sha256=XVxn4pB-N4pPfiby5uVvfH7feNzRKlBNzsc5eyKPvhI,10965
93
94
  datachain/sql/sqlite/types.py,sha256=oP93nLfTBaYnN0z_4Dsv-HZm8j9rrUf1esMM-z3JLbg,1754
94
95
  datachain/sql/sqlite/vector.py,sha256=stBeEW6fbVbILmAtV4khjXdJIGT13HkRWJeCoqIOk50,315
95
- datachain-0.1.13.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
96
- datachain-0.1.13.dist-info/METADATA,sha256=aqjqnY-YxqDJZhpkKaPQ35QZkehWOcsGIdqNzdLRw-0,13972
97
- datachain-0.1.13.dist-info/WHEEL,sha256=mguMlWGMX-VHnMpKOjjQidIo1ssRlCFu4a4mBpz1s2M,91
98
- datachain-0.1.13.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
99
- datachain-0.1.13.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
100
- datachain-0.1.13.dist-info/RECORD,,
96
+ datachain-0.2.0.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
97
+ datachain-0.2.0.dist-info/METADATA,sha256=iMX8hWEMXu-4MtXlD_SVwW3ija6bOLqSbeQvHoiMNfQ,14344
98
+ datachain-0.2.0.dist-info/WHEEL,sha256=y4mX-SOX4fYIkonsAGA5N0Oy-8_gI4FXw5HNI1xqvWg,91
99
+ datachain-0.2.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
100
+ datachain-0.2.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
101
+ datachain-0.2.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (70.1.1)
2
+ Generator: setuptools (70.2.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5