datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. datachain/__init__.py +20 -0
  2. datachain/asyn.py +11 -12
  3. datachain/cache.py +7 -7
  4. datachain/catalog/__init__.py +2 -2
  5. datachain/catalog/catalog.py +621 -507
  6. datachain/catalog/dependency.py +164 -0
  7. datachain/catalog/loader.py +28 -18
  8. datachain/checkpoint.py +43 -0
  9. datachain/cli/__init__.py +24 -33
  10. datachain/cli/commands/__init__.py +1 -8
  11. datachain/cli/commands/datasets.py +83 -52
  12. datachain/cli/commands/ls.py +17 -17
  13. datachain/cli/commands/show.py +4 -4
  14. datachain/cli/parser/__init__.py +8 -74
  15. datachain/cli/parser/job.py +95 -3
  16. datachain/cli/parser/studio.py +11 -4
  17. datachain/cli/parser/utils.py +1 -2
  18. datachain/cli/utils.py +2 -15
  19. datachain/client/azure.py +4 -4
  20. datachain/client/fsspec.py +45 -28
  21. datachain/client/gcs.py +6 -6
  22. datachain/client/hf.py +29 -2
  23. datachain/client/http.py +157 -0
  24. datachain/client/local.py +15 -11
  25. datachain/client/s3.py +17 -9
  26. datachain/config.py +4 -8
  27. datachain/data_storage/db_engine.py +12 -6
  28. datachain/data_storage/job.py +5 -1
  29. datachain/data_storage/metastore.py +1252 -186
  30. datachain/data_storage/schema.py +58 -45
  31. datachain/data_storage/serializer.py +105 -15
  32. datachain/data_storage/sqlite.py +286 -127
  33. datachain/data_storage/warehouse.py +250 -113
  34. datachain/dataset.py +353 -148
  35. datachain/delta.py +391 -0
  36. datachain/diff/__init__.py +27 -29
  37. datachain/error.py +60 -0
  38. datachain/func/__init__.py +2 -1
  39. datachain/func/aggregate.py +66 -42
  40. datachain/func/array.py +242 -38
  41. datachain/func/base.py +7 -4
  42. datachain/func/conditional.py +110 -60
  43. datachain/func/func.py +96 -45
  44. datachain/func/numeric.py +55 -38
  45. datachain/func/path.py +32 -20
  46. datachain/func/random.py +2 -2
  47. datachain/func/string.py +67 -37
  48. datachain/func/window.py +7 -8
  49. datachain/hash_utils.py +123 -0
  50. datachain/job.py +11 -7
  51. datachain/json.py +138 -0
  52. datachain/lib/arrow.py +58 -22
  53. datachain/lib/audio.py +245 -0
  54. datachain/lib/clip.py +14 -13
  55. datachain/lib/convert/flatten.py +5 -3
  56. datachain/lib/convert/python_to_sql.py +6 -10
  57. datachain/lib/convert/sql_to_python.py +8 -0
  58. datachain/lib/convert/values_to_tuples.py +156 -51
  59. datachain/lib/data_model.py +42 -20
  60. datachain/lib/dataset_info.py +36 -8
  61. datachain/lib/dc/__init__.py +8 -2
  62. datachain/lib/dc/csv.py +25 -28
  63. datachain/lib/dc/database.py +398 -0
  64. datachain/lib/dc/datachain.py +1289 -425
  65. datachain/lib/dc/datasets.py +320 -38
  66. datachain/lib/dc/hf.py +38 -24
  67. datachain/lib/dc/json.py +29 -32
  68. datachain/lib/dc/listings.py +112 -8
  69. datachain/lib/dc/pandas.py +16 -12
  70. datachain/lib/dc/parquet.py +35 -23
  71. datachain/lib/dc/records.py +31 -23
  72. datachain/lib/dc/storage.py +154 -64
  73. datachain/lib/dc/storage_pattern.py +251 -0
  74. datachain/lib/dc/utils.py +24 -16
  75. datachain/lib/dc/values.py +8 -9
  76. datachain/lib/file.py +622 -89
  77. datachain/lib/hf.py +69 -39
  78. datachain/lib/image.py +14 -14
  79. datachain/lib/listing.py +14 -11
  80. datachain/lib/listing_info.py +1 -2
  81. datachain/lib/meta_formats.py +3 -4
  82. datachain/lib/model_store.py +39 -7
  83. datachain/lib/namespaces.py +125 -0
  84. datachain/lib/projects.py +130 -0
  85. datachain/lib/pytorch.py +32 -21
  86. datachain/lib/settings.py +192 -56
  87. datachain/lib/signal_schema.py +427 -104
  88. datachain/lib/tar.py +1 -2
  89. datachain/lib/text.py +8 -7
  90. datachain/lib/udf.py +164 -76
  91. datachain/lib/udf_signature.py +60 -35
  92. datachain/lib/utils.py +118 -4
  93. datachain/lib/video.py +17 -9
  94. datachain/lib/webdataset.py +61 -56
  95. datachain/lib/webdataset_laion.py +15 -16
  96. datachain/listing.py +22 -10
  97. datachain/model/bbox.py +3 -1
  98. datachain/model/ultralytics/bbox.py +16 -12
  99. datachain/model/ultralytics/pose.py +16 -12
  100. datachain/model/ultralytics/segment.py +16 -12
  101. datachain/namespace.py +84 -0
  102. datachain/node.py +6 -6
  103. datachain/nodes_thread_pool.py +0 -1
  104. datachain/plugins.py +24 -0
  105. datachain/project.py +78 -0
  106. datachain/query/batch.py +40 -41
  107. datachain/query/dataset.py +604 -322
  108. datachain/query/dispatch.py +261 -154
  109. datachain/query/metrics.py +4 -6
  110. datachain/query/params.py +2 -3
  111. datachain/query/queue.py +3 -12
  112. datachain/query/schema.py +11 -6
  113. datachain/query/session.py +200 -33
  114. datachain/query/udf.py +34 -2
  115. datachain/remote/studio.py +171 -69
  116. datachain/script_meta.py +12 -12
  117. datachain/semver.py +68 -0
  118. datachain/sql/__init__.py +2 -0
  119. datachain/sql/functions/array.py +33 -1
  120. datachain/sql/postgresql_dialect.py +9 -0
  121. datachain/sql/postgresql_types.py +21 -0
  122. datachain/sql/sqlite/__init__.py +5 -1
  123. datachain/sql/sqlite/base.py +102 -29
  124. datachain/sql/sqlite/types.py +8 -13
  125. datachain/sql/types.py +70 -15
  126. datachain/studio.py +223 -46
  127. datachain/toolkit/split.py +31 -10
  128. datachain/utils.py +101 -59
  129. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
  130. datachain-0.39.0.dist-info/RECORD +173 -0
  131. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
  132. datachain/cli/commands/query.py +0 -53
  133. datachain/query/utils.py +0 -42
  134. datachain-0.14.2.dist-info/RECORD +0 -158
  135. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
  136. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
  137. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/lib/convert/sql_to_python.py CHANGED
@@ -9,6 +9,14 @@ def sql_to_python(sql_exp: ColumnElement) -> Any:
         type_ = sql_exp.type.python_type
         if type_ == Decimal:
             type_ = float
+        elif type_ is list:
+            if hasattr(sql_exp.type, "item_type") and hasattr(
+                sql_exp.type.item_type, "python_type"
+            ):
+                item_type = getattr(sql_exp.type.item_type, "python_type", Any)
+                type_ = list[item_type]  # type: ignore[valid-type]
+            else:
+                type_ = list
     except NotImplementedError:
         type_ = str
     return type_
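For context, a minimal sketch of what the new list branch enables (the SQLAlchemy column below is illustrative, not taken from the diff): array-typed columns now map to a parametrized list type instead of falling back to str.

    from sqlalchemy import ARRAY, Column, Integer

    col = Column("scores", ARRAY(Integer))
    assert col.type.python_type is list           # enters the new `elif type_ is list` branch
    assert col.type.item_type.python_type is int  # item type is recoverable,
                                                  # so sql_to_python returns list[int]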
datachain/lib/convert/values_to_tuples.py CHANGED
@@ -1,62 +1,177 @@
+import itertools
 from collections.abc import Sequence
-from typing import Any, Union
-
-from datachain.lib.data_model import (
-    DataType,
-    DataTypeNames,
-    DataValue,
-    is_chain_type,
-)
+from typing import Any
+
+from datachain.lib.data_model import DataType, DataTypeNames, DataValue, is_chain_type
 from datachain.lib.utils import DataChainParamsError
 
 
 class ValuesToTupleError(DataChainParamsError):
     def __init__(self, ds_name: str, msg: str):
+        self.ds_name = ds_name
+        self.msg = msg
+
         if ds_name:
             ds_name = f"' {ds_name}'"
+
         super().__init__(f"Cannot convert signals for dataset{ds_name}: {msg}")
 
+    def __reduce__(self):
+        return ValuesToTupleError, (self.ds_name, self.msg)
 
-def values_to_tuples(  # noqa: C901, PLR0912
-    ds_name: str = "",
-    output: Union[None, DataType, Sequence[str], dict[str, DataType]] = None,
-    **fr_map: Sequence[DataValue],
-) -> tuple[Any, Any, Any]:
-    if output:
-        if not isinstance(output, (Sequence, str, dict)):
-            if len(fr_map) != 1:
-                raise ValuesToTupleError(
-                    ds_name,
-                    f"only one output type was specified, {len(fr_map)} expected",
-                )
-            if not isinstance(output, type):
-                raise ValuesToTupleError(
-                    ds_name,
-                    f"output must specify a type while '{output}' was given",
-                )
 
-            key: str = next(iter(fr_map.keys()))
-            output = {key: output}  # type: ignore[dict-item]
+def _find_first_non_none(sequence: Sequence[Any]) -> Any | None:
+    """Find the first non-None element in a sequence."""
+    try:
+        return next(itertools.dropwhile(lambda i: i is None, sequence))
+    except StopIteration:
+        return None
+
+
+def _infer_list_item_type(lst: list) -> type:
+    """Infer the item type of a list, handling None values and nested lists."""
+    if len(lst) == 0:
+        # Default to str when list is empty to avoid generic list
+        return str
+
+    first_item = _find_first_non_none(lst)
+    if first_item is None:
+        # Default to str when all items are None
+        return str
 
-    if not isinstance(output, dict):
+    item_type = type(first_item)
+
+    # Handle nested lists one level deep
+    if isinstance(first_item, list) and len(first_item) > 0:
+        nested_item = _find_first_non_none(first_item)
+        if nested_item is not None:
+            return list[type(nested_item)]  # type: ignore[misc, return-value]
+        # Default to str for nested lists with all None
+        return list[str]  # type: ignore[return-value]
+
+    return item_type
+
+
+def _infer_dict_value_type(dct: dict) -> type:
+    """Infer the value type of a dict, handling None values and list values."""
+    if len(dct) == 0:
+        # Default to str when dict is empty to avoid generic dict values
+        return str
+
+    # Find first non-None value
+    first_value = None
+    for val in dct.values():
+        if val is not None:
+            first_value = val
+            break
+
+    if first_value is None:
+        # Default to str when all values are None
+        return str
+
+    # Handle list values
+    if isinstance(first_value, list) and len(first_value) > 0:
+        list_item = _find_first_non_none(first_value)
+        if list_item is not None:
+            return list[type(list_item)]  # type: ignore[misc, return-value]
+        # Default to str for lists with all None
+        return list[str]  # type: ignore[return-value]
+
+    return type(first_value)
+
+
+def _infer_type_from_sequence(
+    sequence: Sequence[DataValue], signal_name: str, ds_name: str
+) -> type:
+    """
+    Infer the type from a sequence of values.
+
+    Returns str if all values are None, otherwise infers from the first non-None value.
+    Handles lists and dicts with proper type inference for nested structures.
+    """
+    first_element = _find_first_non_none(sequence)
+
+    if first_element is None:
+        # Default to str if column is empty or all values are None
+        return str
+
+    typ = type(first_element)
+
+    if not is_chain_type(typ):
+        raise ValuesToTupleError(
+            ds_name,
+            f"signal '{signal_name}' has unsupported type '{typ.__name__}'."
+            f" Please use DataModel types: {DataTypeNames}",
+        )
+
+    if isinstance(first_element, list):
+        item_type = _infer_list_item_type(first_element)
+        return list[item_type]  # type: ignore[valid-type, return-value]
+
+    if isinstance(first_element, dict):
+        # If the first dict is empty, use str as default key/value types
+        if len(first_element) == 0:
+            return dict[str, str]  # type: ignore[return-value]
+        first_key = next(iter(first_element.keys()))
+        value_type = _infer_dict_value_type(first_element)
+        return dict[type(first_key), value_type]  # type: ignore[misc, return-value]
+
+    return typ
+
+
+def _validate_and_normalize_output(
+    output: DataType | Sequence[str] | dict[str, DataType] | None,
+    fr_map: dict[str, Sequence[DataValue]],
+    ds_name: str,
+) -> dict[str, DataType] | None:
+    """Validate and normalize the output parameter to a dict format."""
+    if not output:
+        return None
+
+    if not isinstance(output, (Sequence, str, dict)):
+        if len(fr_map) != 1:
             raise ValuesToTupleError(
                 ds_name,
-                "output type must be dict[str, DataType] while "
-                f"'{type(output).__name__}' is given",
+                f"only one output type was specified, {len(fr_map)} expected",
             )
-
-        if len(output) != len(fr_map):
+        if not isinstance(output, type):
             raise ValuesToTupleError(
                 ds_name,
-                f"number of outputs '{len(output)}' should match"
-                f" number of signals '{len(fr_map)}'",
+                f"output must specify a type while '{output}' was given",
            )
 
+        key: str = next(iter(fr_map.keys()))
+        return {key: output}  # type: ignore[dict-item]
+
+    if not isinstance(output, dict):
+        raise ValuesToTupleError(
+            ds_name,
+            "output type must be dict[str, DataType] while "
+            f"'{type(output).__name__}' is given",
+        )
+
+    if len(output) != len(fr_map):
+        raise ValuesToTupleError(
+            ds_name,
+            f"number of outputs '{len(output)}' should match"
+            f" number of signals '{len(fr_map)}'",
+        )
+
+    return output  # type: ignore[return-value]
+
+
+def values_to_tuples(
+    ds_name: str = "",
+    output: DataType | Sequence[str] | dict[str, DataType] | None = None,
+    **fr_map: Sequence[DataValue],
+) -> tuple[Any, Any, Any]:
+    output = _validate_and_normalize_output(output, fr_map, ds_name)
+
     types_map: dict[str, type] = {}
     length = -1
     for k, v in fr_map.items():
         if not isinstance(v, Sequence) or isinstance(v, str):  # type: ignore[unreachable]
-            raise ValuesToTupleError(ds_name, f"signals '{k}' is not a sequence")
+            raise ValuesToTupleError(ds_name, f"signal '{k}' is not a sequence")
         len_ = len(v)
 
         if output:
@@ -66,21 +181,11 @@ def values_to_tuples(  # noqa: C901, PLR0912
                 f"signal '{k}' is not present in the output",
             )
         else:
-            if len_ == 0:
-                raise ValuesToTupleError(ds_name, f"signal '{k}' is empty list")
-
-            first_element = next(iter(v))
-            typ = type(first_element)
-            if not is_chain_type(typ):
-                raise ValuesToTupleError(
-                    ds_name,
-                    f"signal '{k}' has unsupported type '{typ.__name__}'."
-                    f" Please use DataModel types: {DataTypeNames}",
-                )
-            if isinstance(first_element, list):
-                types_map[k] = list[type(first_element[0])]  # type: ignore[assignment, misc]
-            else:
-                types_map[k] = typ
+            # FIXME: Stops as soon as it finds the first non-None value.
+            # If a non-None value appears early, it won't check the remaining items for
+            # `None` values.
+            typ = _infer_type_from_sequence(v, k, ds_name)
+            types_map[k] = typ
 
         if length < 0:
             length = len_
@@ -104,7 +209,7 @@ def values_to_tuples(  # noqa: C901, PLR0912
     if len(output) > 1:  # type: ignore[arg-type]
         tuple_type = tuple(output_types)
         res_type = tuple[tuple_type]  # type: ignore[valid-type]
-        res_values: Sequence[Any] = list(zip(*fr_map.values()))
+        res_values: Sequence[Any] = list(zip(*fr_map.values(), strict=False))
     else:
         res_type = output_types[0]  # type: ignore[misc]
         res_values = next(iter(fr_map.values()))
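The old "empty list" error is gone: type inference now tolerates empty and all-None columns. A hedged sketch of the new behavior (module path assumed unchanged; the returned triple is left unpacked since this hunk does not show its order):

    from datachain.lib.convert.values_to_tuples import values_to_tuples

    result = values_to_tuples(
        "demo",
        title=["a", None, "c"],          # type taken from first non-None value -> str
        score=[None, None, None],        # all None: defaults to str instead of raising
        tags=[["x", "y"], None, ["z"]],  # nested list -> list[str]
    )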
datachain/lib/data_model.py CHANGED
@@ -1,25 +1,29 @@
+import inspect
+import types
+import uuid
 from collections.abc import Sequence
 from datetime import datetime
-from typing import ClassVar, Optional, Union, get_args, get_origin
+from typing import ClassVar, Union, get_args, get_origin
 
 from pydantic import AliasChoices, BaseModel, Field, create_model
+from pydantic.fields import FieldInfo
 
 from datachain.lib.model_store import ModelStore
 from datachain.lib.utils import normalize_col_names
 
-StandardType = Union[
-    type[int],
-    type[str],
-    type[float],
-    type[bool],
-    type[list],
-    type[dict],
-    type[bytes],
-    type[datetime],
-]
-DataType = Union[type[BaseModel], StandardType]
+StandardType = (
+    type[int]
+    | type[str]
+    | type[float]
+    | type[bool]
+    | type[list]
+    | type[dict]
+    | type[bytes]
+    | type[datetime]
+)
+DataType = type[BaseModel] | StandardType
 DataTypeNames = "BaseModel, int, str, float, bool, list, dict, bytes, datetime"
-DataValue = Union[BaseModel, int, str, float, bool, list, dict, bytes, datetime]
+DataValue = BaseModel | int | str | float | bool | list | dict | bytes | datetime
 
 
 class DataModel(BaseModel):
@@ -34,7 +38,7 @@ class DataModel(BaseModel):
         ModelStore.register(cls)
 
     @staticmethod
-    def register(models: Union[DataType, Sequence[DataType]]):
+    def register(models: DataType | Sequence[DataType]):
         """For registering classes manually. It accepts a single class or a sequence of
         classes."""
         if not isinstance(models, Sequence):
@@ -60,8 +64,11 @@ def is_chain_type(t: type) -> bool:
     if orig is list and len(args) == 1:
         return is_chain_type(get_args(t)[0])
 
-    if orig is Union and len(args) == 2 and (type(None) in args):
-        return is_chain_type(args[0])
+    if orig is dict and len(args) == 2:
+        return is_chain_type(args[0]) and is_chain_type(args[1])
+
+    if orig in (Union, types.UnionType) and len(args) == 2 and (type(None) in args):
+        return is_chain_type(args[0] if args[1] is type(None) else args[1])
 
     return False
 
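A short sketch of what the widened checks now accept, inferred from the branches above (assumes is_chain_type is importable from datachain.lib.data_model):

    from typing import Optional

    from datachain.lib.data_model import is_chain_type

    assert is_chain_type(list[str])       # unchanged
    assert is_chain_type(dict[str, int])  # dict key/value args are now checked recursively
    assert is_chain_type(int | None)      # PEP 604 optionals are now recognized
    assert is_chain_type(Optional[int])   # the typing.Union spelling still works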
@@ -69,17 +76,19 @@ def is_chain_type(t: type) -> bool:
 def dict_to_data_model(
     name: str,
     data_dict: dict[str, DataType],
-    original_names: Optional[list[str]] = None,
+    original_names: list[str] | None = None,
 ) -> type[BaseModel]:
     if not original_names:
         # Gets a map of a normalized_name -> original_name
         columns = normalize_col_names(list(data_dict))
-        data_dict = dict(zip(columns.keys(), data_dict.values()))
+        data_dict = dict(zip(columns.keys(), data_dict.values(), strict=False))
         original_names = list(columns.values())
 
     fields = {
         name: (
-            anno,
+            anno
+            if inspect.isclass(anno) and issubclass(anno, BaseModel)
+            else anno | None,
             Field(
                 validation_alias=AliasChoices(name, original_names[idx] or name),
                 default=None,
@@ -89,7 +98,20 @@ def dict_to_data_model(
     }
 
     class _DataModelStrict(BaseModel, extra="forbid"):
-        pass
+        @classmethod
+        def _model_fields_by_aliases(cls) -> dict[str, tuple[str, FieldInfo]]:
+            """Returns a map of aliases to original field names and info."""
+            field_info = {}
+            for _name, field in cls.model_fields.items():
+                assert isinstance(field.validation_alias, AliasChoices)
+                # Add mapping for all aliases (both normalized and original names)
+                for alias in field.validation_alias.choices:
+                    field_info[str(alias)] = (_name, field)
+            return field_info
+
+    # Generate random unique name if not provided
+    if not name:
+        name = f"DataModel_{uuid.uuid4().hex[:8]}"
 
     return create_model(
         name,
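A hedged illustration of the two behavior changes in dict_to_data_model, the generated-name fallback and optional plain fields (that normalize_col_names maps "user id" to "user_id" is an assumption):

    from datachain.lib.data_model import dict_to_data_model

    Model = dict_to_data_model("", {"user id": str, "score": float})  # name auto-generated
    row = Model(**{"user id": "u1"})  # original column name accepted via validation alias
    assert row.user_id == "u1"
    assert row.score is None          # plain annotations become `anno | None`, default None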
datachain/lib/dataset_info.py CHANGED
@@ -1,17 +1,19 @@
-import json
 from datetime import datetime
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any
 from uuid import uuid4
 
 from pydantic import Field, field_validator
 
+from datachain import json
 from datachain.dataset import (
+    DEFAULT_DATASET_VERSION,
     DatasetListRecord,
     DatasetListVersion,
     DatasetStatus,
 )
 from datachain.job import Job
 from datachain.lib.data_model import DataModel
+from datachain.query.session import Session
 from datachain.utils import TIME_ZERO
 
 if TYPE_CHECKING:
@@ -20,21 +22,44 @@ if TYPE_CHECKING:
 
 class DatasetInfo(DataModel):
     name: str
+    namespace: str
+    project: str
     uuid: str = Field(default=str(uuid4()))
-    version: int = Field(default=1)
+    version: str = Field(default=DEFAULT_DATASET_VERSION)
     status: int = Field(default=DatasetStatus.CREATED)
     created_at: datetime = Field(default=TIME_ZERO)
-    finished_at: Optional[datetime] = Field(default=None)
-    num_objects: Optional[int] = Field(default=None)
-    size: Optional[int] = Field(default=None)
+    finished_at: datetime | None = Field(default=None)
+    num_objects: int | None = Field(default=None)
+    size: int | None = Field(default=None)
     params: dict[str, str] = Field(default={})
     metrics: dict[str, Any] = Field(default={})
     error_message: str = Field(default="")
     error_stack: str = Field(default="")
+    attrs: list[str] = Field(default=[])
+
+    @property
+    def is_temp(self) -> bool:
+        return Session.is_temp_dataset(self.name)
+
+    def has_attr(self, attr: str) -> bool:
+        s = attr.split("=")
+        if len(s) == 1:
+            return attr in self.attrs
+
+        name = s[0]
+        value = s[1]
+        for a in self.attrs:
+            s = a.split("=")
+            if value == "*" and s[0] == name:
+                return True
+            if len(s) == 2 and s[0] == name and s[1] == value:
+                return True
+
+        return False
 
     @staticmethod
     def _validate_dict(
-        v: Optional[Union[str, dict]],
+        v: str | dict | None,
     ) -> dict:
         if v is None or v == "":
             return {}
@@ -63,11 +88,13 @@ class DatasetInfo(DataModel):
         cls,
         dataset: DatasetListRecord,
         version: DatasetListVersion,
-        job: Optional[Job],
+        job: Job | None,
     ) -> "Self":
         return cls(
             uuid=version.uuid,
             name=dataset.name,
+            namespace=dataset.project.namespace.name,
+            project=dataset.project.name,
             version=version.version,
             status=version.status,
             created_at=version.created_at,
@@ -78,4 +105,5 @@ class DatasetInfo(DataModel):
             metrics=job.metrics if job else {},
             error_message=version.error_message,
             error_stack=version.error_stack,
+            attrs=dataset.attrs,
         )
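The new has_attr supports three match forms; a short sketch (required fields filled with placeholder values):

    from datachain.lib.dataset_info import DatasetInfo

    info = DatasetInfo(
        name="cats",
        namespace="dev",
        project="demo",
        attrs=["verified", "stage=prod"],
    )
    assert info.has_attr("verified")    # bare attribute: exact membership
    assert info.has_attr("stage=prod")  # name=value: both parts must match
    assert info.has_attr("stage=*")     # wildcard matches any value for that name
    assert not info.has_attr("stage=dev")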
datachain/lib/dc/__init__.py CHANGED
@@ -1,6 +1,7 @@
 from .csv import read_csv
+from .database import read_database
 from .datachain import C, Column, DataChain
-from .datasets import datasets, read_dataset
+from .datasets import datasets, delete_dataset, move_dataset, read_dataset
 from .hf import read_hf
 from .json import read_json
 from .listings import listings
@@ -8,7 +9,7 @@ from .pandas import read_pandas
 from .parquet import read_parquet
 from .records import read_records
 from .storage import read_storage
-from .utils import DatasetMergeError, DatasetPrepareError, Sys
+from .utils import DatasetMergeError, DatasetPrepareError, Sys, is_local, is_studio
 from .values import read_values
 
 __all__ = [
@@ -19,8 +20,13 @@ __all__ = [
     "DatasetPrepareError",
     "Sys",
     "datasets",
+    "delete_dataset",
+    "is_local",
+    "is_studio",
     "listings",
+    "move_dataset",
     "read_csv",
+    "read_database",
     "read_dataset",
     "read_hf",
     "read_json",
datachain/lib/dc/csv.py CHANGED
@@ -1,10 +1,6 @@
-from collections.abc import Sequence
-from typing import (
-    TYPE_CHECKING,
-    Callable,
-    Optional,
-    Union,
-)
+import os
+from collections.abc import Callable, Sequence
+from typing import TYPE_CHECKING
 
 from datachain.lib.dc.utils import DatasetPrepareError, OutputType
 from datachain.lib.model_store import ModelStore
@@ -17,38 +13,38 @@ if TYPE_CHECKING:
 
 
 def read_csv(
-    path,
-    delimiter: Optional[str] = None,
+    path: str | os.PathLike[str] | list[str] | list[os.PathLike[str]],
+    delimiter: str | None = None,
     header: bool = True,
     output: OutputType = None,
-    object_name: str = "",
+    column: str = "",
     model_name: str = "",
     source: bool = True,
-    nrows=None,
-    session: Optional[Session] = None,
-    settings: Optional[dict] = None,
-    column_types: Optional[dict[str, "Union[str, ArrowDataType]"]] = None,
-    parse_options: Optional[dict[str, "Union[str, Union[bool, Callable]]"]] = None,
+    nrows: int | None = None,
+    session: Session | None = None,
+    settings: dict | None = None,
+    column_types: dict[str, "str | ArrowDataType"] | None = None,
+    parse_options: dict[str, str | bool | Callable] | None = None,
     **kwargs,
 ) -> "DataChain":
     """Generate chain from csv files.
 
     Parameters:
-        path : Storage URI with directory. URI must start with storage prefix such
+        path: Storage URI with directory. URI must start with storage prefix such
             as `s3://`, `gs://`, `az://` or "file:///".
-        delimiter : Character for delimiting columns. Takes precedence if also
+        delimiter: Character for delimiting columns. Takes precedence if also
             specified in `parse_options`. Defaults to ",".
-        header : Whether the files include a header row.
-        output : Dictionary or feature class defining column names and their
+        header: Whether the files include a header row.
+        output: Dictionary or feature class defining column names and their
             corresponding types. List of column names is also accepted, in which
             case types will be inferred.
-        object_name : Created object column name.
-        model_name : Generated model name.
-        source : Whether to include info about the source file.
-        nrows : Optional row limit.
-        session : Session to use for the chain.
-        settings : Settings to use for the chain.
-        column_types : Dictionary of column names and their corresponding types.
+        column: Created column name.
+        model_name: Generated model name.
+        source: Whether to include info about the source file.
+        nrows: Optional row limit.
+        session: Session to use for the chain.
+        settings: Settings to use for the chain.
+        column_types: Dictionary of column names and their corresponding types.
             It is passed to CSV reader and for each column specified type auto
             inference is disabled.
         parse_options: Tells the parser how to process lines.
@@ -67,7 +63,7 @@ def read_csv(
     chain = dc.read_csv("s3://mybucket/dir")
     ```
     """
-    from pandas.io.parsers.readers import STR_NA_VALUES
+    from pandas._libs.parsers import STR_NA_VALUES
     from pyarrow.csv import ConvertOptions, ParseOptions, ReadOptions
     from pyarrow.dataset import CsvFileFormat
     from pyarrow.lib import type_for_alias
@@ -119,9 +115,10 @@ def read_csv(
     )
     return chain.parse_tabular(
         output=output,
-        object_name=object_name,
+        column=column,
         model_name=model_name,
         source=source,
         nrows=nrows,
         format=format,
+        parse_options=parse_options,
     )
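Callers upgrading from 0.14.x mainly need the object_name -> column rename; a hedged sketch (bucket path and options are illustrative, and the top-level dc.read_csv entry point matches the docstring above):

    import datachain as dc

    chain = dc.read_csv(
        "s3://mybucket/dir",
        delimiter=";",
        column="row",  # was object_name= in 0.14.x
        parse_options={"newlines_in_values": True},  # now also forwarded to parse_tabular
    )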