datachain 0.14.4__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

@@ -1,7 +1,4 @@
-from typing import (
-    TYPE_CHECKING,
-    Optional,
-)
+from typing import TYPE_CHECKING, Optional, get_origin, get_type_hints

 from datachain.lib.dataset_info import DatasetInfo
 from datachain.lib.file import (
@@ -102,7 +99,7 @@ def datasets(
     session: Optional[Session] = None,
     settings: Optional[dict] = None,
     in_memory: bool = False,
-    object_name: str = "dataset",
+    column: Optional[str] = None,
     include_listing: bool = False,
     studio: bool = False,
 ) -> "DataChain":
@@ -112,7 +109,8 @@ def datasets(
         session: Optional session instance. If not provided, uses default session.
         settings: Optional dictionary of settings to configure the chain.
         in_memory: If True, creates an in-memory session. Defaults to False.
-        object_name: Name of the output object in the chain. Defaults to "dataset".
+        column: Name of the output column in the chain. Defaults to None which
+            means no top level column will be created.
         include_listing: If True, includes listing datasets. Defaults to False.
         studio: If True, returns datasets from Studio only,
             otherwise returns all local datasets. Defaults to False.
@@ -124,7 +122,7 @@ def datasets(
         ```py
         import datachain as dc

-        chain = dc.datasets()
+        chain = dc.datasets(column="dataset")
         for ds in chain.collect("dataset"):
             print(f"{ds.name}@v{ds.version}")
         ```
@@ -139,13 +137,75 @@ def datasets(
             include_listing=include_listing, studio=studio
         )
     ]
-
     datasets_values = [d for d in datasets_values if not d.is_temp]

+    if not column:
+        # flattening dataset fields
+        schema = {
+            k: get_origin(v) if get_origin(v) is dict else v
+            for k, v in get_type_hints(DatasetInfo).items()
+            if k in DatasetInfo.model_fields
+        }
+        data = {k: [] for k in DatasetInfo.model_fields}  # type: ignore[var-annotated]
+        for d in [d.model_dump() for d in datasets_values]:
+            for field, value in d.items():
+                data[field].append(value)
+
+        return read_values(
+            session=session,
+            settings=settings,
+            in_memory=in_memory,
+            output=schema,
+            **data,  # type: ignore[arg-type]
+        )
+
     return read_values(
         session=session,
         settings=settings,
         in_memory=in_memory,
-        output={object_name: DatasetInfo},
-        **{object_name: datasets_values},  # type: ignore[arg-type]
+        output={column: DatasetInfo},
+        **{column: datasets_values},  # type: ignore[arg-type]
     )
+
+
+def delete_dataset(
+    name: str,
+    version: Optional[int] = None,
+    force: Optional[bool] = False,
+    studio: Optional[bool] = False,
+    session: Optional[Session] = None,
+    in_memory: bool = False,
+) -> None:
+    """Removes specific dataset version or all dataset versions, depending on
+    a force flag.
+
+    Args:
+        name : Dataset name
+        version : Optional dataset version
+        force: If true, all datasets versions will be removed. Defaults to False.
+        studio: If True, removes dataset from Studio only,
+            otherwise remove from local. Defaults to False.
+        session: Optional session instance. If not provided, uses default session.
+        in_memory: If True, creates an in-memory session. Defaults to False.
+
+    Returns: None
+
+    Example:
+        ```py
+        import datachain as dc
+        dc.delete_dataset("cats")
+        ```
+
+        ```py
+        import datachain as dc
+        dc.delete_dataset("cats", version=1)
+        ```
+    """
+
+    session = Session.get(session, in_memory=in_memory)
+    catalog = session.catalog
+    if not force:
+        version = version or catalog.get_dataset(name).latest_version
+    else:
+        version = None
+    catalog.remove_dataset(name, version=version, force=force, studio=studio)
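Taken together, the hunks above rename `object_name` to `column` in `datasets()`, make flattened `DatasetInfo` fields the default output when no column is given, and add a top-level `delete_dataset()` helper (the old `DatasetQuery.delete()` staticmethod is removed later in this diff). A minimal usage sketch based on the signatures and docstrings shown here; the flattened field names are an assumption drawn from the `ds.name`/`ds.version` docstring example:

```py
import datachain as dc

# New default (column=None): DatasetInfo fields become top-level columns.
# "name" and "version" are assumed field names, per the docstring example.
for name, version in dc.datasets().collect("name", "version"):
    print(f"{name}@v{version}")

# The 0.14.x-style wrapper object is still available on request:
for ds in dc.datasets(column="dataset").collect("dataset"):
    print(f"{ds.name}@v{ds.version}")

# New top-level helper for removing datasets:
dc.delete_dataset("cats", version=1)   # remove a single version
dc.delete_dataset("cats", force=True)  # remove all versions
```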
datachain/lib/dc/hf.py CHANGED
@@ -23,7 +23,7 @@ def read_hf(
     *args,
     session: Optional[Session] = None,
     settings: Optional[dict] = None,
-    object_name: str = "",
+    column: str = "",
     model_name: str = "",
     **kwargs,
 ) -> "DataChain":
@@ -34,7 +34,7 @@ def read_hf(
             or an instance of `datasets.Dataset`-like object.
         session : Session to use for the chain.
         settings : Settings to use for the chain.
-        object_name : Generated object column name.
+        column : Generated object column name.
         model_name : Generated model name.
         kwargs : Parameters to pass to datasets.load_dataset.

@@ -62,12 +62,12 @@ def read_hf(
     if len(ds_dict) > 1:
         output = {"split": str}

-    model_name = model_name or object_name or ""
+    model_name = model_name or column or ""
     hf_features = next(iter(ds_dict.values())).features
     output = output | get_output_schema(hf_features)
     model = dict_to_data_model(model_name, output)
-    if object_name:
-        output = {object_name: model}
+    if column:
+        output = {column: model}

     chain = read_values(split=list(ds_dict.keys()), session=session, settings=settings)
     return chain.gen(HFGenerator(dataset, model, *args, **kwargs), output=output)
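The `read_hf()` change is purely the `object_name` → `column` keyword rename; the wrapping behavior is unchanged. A short sketch of the new spelling (the Hugging Face dataset name and split are illustrative; extra keyword arguments are still forwarded to `datasets.load_dataset`):

```py
import datachain as dc

# Wrap the generated signals under a single "row" column;
# omit `column` to keep them at the top level.
chain = dc.read_hf("beans", split="train", column="row")
```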
datachain/lib/dc/json.py CHANGED
@@ -28,7 +28,7 @@ def read_json(
     spec: Optional[DataType] = None,
     schema_from: Optional[str] = "auto",
     jmespath: Optional[str] = None,
-    object_name: Optional[str] = "",
+    column: Optional[str] = "",
     model_name: Optional[str] = None,
     format: Optional[str] = "json",
     nrows=None,
@@ -42,7 +42,7 @@ def read_json(
         type : read file as "binary", "text", or "image" data. Default is "text".
         spec : optional Data Model
         schema_from : path to sample to infer spec (if schema not provided)
-        object_name : generated object column name
+        column : generated column name
         model_name : optional generated model name
         format: "json", "jsonl"
         jmespath : optional JMESPATH expression to reduce JSON
@@ -70,13 +70,13 @@ def read_json(
         name_end = re.search(r"\W", s).start() if re.search(r"\W", s) else len(s)  # type: ignore[union-attr]
         return s[:name_end]

-    if (not object_name) and jmespath:
-        object_name = jmespath_to_name(jmespath)
-    if not object_name:
-        object_name = format
+    if (not column) and jmespath:
+        column = jmespath_to_name(jmespath)
+    if not column:
+        column = format
     chain = read_storage(uri=path, type=type, **kwargs)
     signal_dict = {
-        object_name: read_meta(
+        column: read_meta(
             schema_from=schema_from,
             format=format,
             spec=spec,
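Apart from the rename, the column-derivation logic in `read_json()` is unchanged: with no explicit `column`, the name is taken from the leading identifier of the JMESPath expression, falling back to the format name otherwise. A sketch (the local path and JMESPath expression are illustrative; the `gs://` example mirrors the README change later in this diff):

```py
import datachain as dc

# Explicit column name:
meta = dc.read_json("gs://datachain-demo/dogs-and-cats/*json", column="meta", anon=True)

# No column given: derived from the JMESPath prefix, i.e. "annotations";
# with neither `column` nor `jmespath`, it falls back to "json"/"jsonl".
ann = dc.read_json("data/annotations.json", jmespath="annotations[*]")
```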
@@ -19,7 +19,7 @@ if TYPE_CHECKING:
 def listings(
     session: Optional[Session] = None,
     in_memory: bool = False,
-    object_name: str = "listing",
+    column: str = "listing",
     **kwargs,
 ) -> "DataChain":
     """Generate chain with list of cached listings.
@@ -38,6 +38,6 @@ def listings(
     return read_values(
         session=session,
         in_memory=in_memory,
-        output={object_name: ListingInfo},
-        **{object_name: catalog.listings()},  # type: ignore[arg-type]
+        output={column: ListingInfo},
+        **{column: catalog.listings()},  # type: ignore[arg-type]
     )
@@ -22,7 +22,7 @@ def read_pandas(  # type: ignore[override]
     session: Optional[Session] = None,
     settings: Optional[dict] = None,
     in_memory: bool = False,
-    object_name: str = "",
+    column: str = "",
 ) -> "DataChain":
     """Generate chain from pandas data-frame.

@@ -37,20 +37,27 @@ def read_pandas(  # type: ignore[override]
     """
     from .utils import DatasetPrepareError

-    fr_map = {col.lower(): df[col].tolist() for col in df.columns}
+    def get_col_name(col):
+        if isinstance(col, tuple):
+            # Join tuple elements with underscore for MultiIndex columns
+            return "_".join(map(str, col)).lower()
+        # Handle regular string column names
+        return str(col).lower()

-    for column in fr_map:
-        if not column.isidentifier():
+    fr_map = {get_col_name(col): df[col].tolist() for col in df.columns}
+
+    for c in fr_map:
+        if not c.isidentifier():
             raise DatasetPrepareError(
                 name,
-                f"import from pandas error - '{column}' cannot be a column name",
+                f"import from pandas error - '{c}' cannot be a column name",
             )

     return read_values(
         name,
         session,
         settings=settings,
-        object_name=object_name,
+        column=column,
         in_memory=in_memory,
         **fr_map,
     )
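`read_pandas()` now accepts pandas MultiIndex columns: tuple names are joined with an underscore and lower-cased before the identifier check. A small sketch of the resulting column names:

```py
import pandas as pd

import datachain as dc

# ("Metrics", "Loss") -> "metrics_loss", ("Metrics", "Acc") -> "metrics_acc"
df = pd.DataFrame(
    [[0.1, 0.9], [0.2, 0.8]],
    columns=pd.MultiIndex.from_tuples([("Metrics", "Loss"), ("Metrics", "Acc")]),
)
chain = dc.read_pandas(df)
```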
@@ -19,7 +19,7 @@ def read_parquet(
     path,
     partitioning: Any = "hive",
     output: Optional[dict[str, DataType]] = None,
-    object_name: str = "",
+    column: str = "",
     model_name: str = "",
     source: bool = True,
     session: Optional[Session] = None,
@@ -33,7 +33,7 @@ def read_parquet(
             as `s3://`, `gs://`, `az://` or "file:///".
         partitioning : Any pyarrow partitioning schema.
         output : Dictionary defining column names and their corresponding types.
-        object_name : Created object column name.
+        column : Created column name.
         model_name : Generated model name.
         source : Whether to include info about the source file.
         session : Session to use for the chain.
@@ -57,7 +57,7 @@ def read_parquet(
     chain = read_storage(path, session=session, settings=settings, **kwargs)
     return chain.parse_tabular(
         output=output,
-        object_name=object_name,
+        column=column,
         model_name=model_name,
         source=source,
         format="parquet",
@@ -1,8 +1,5 @@
-from typing import (
-    TYPE_CHECKING,
-    Optional,
-    Union,
-)
+from collections.abc import Iterable
+from typing import TYPE_CHECKING, Optional, Union

 import sqlalchemy

@@ -12,6 +9,7 @@ from datachain.lib.file import (
 )
 from datachain.lib.signal_schema import SignalSchema
 from datachain.query import Session
+from datachain.query.schema import Column

 if TYPE_CHECKING:
     from typing_extensions import ParamSpec
@@ -22,7 +20,7 @@ if TYPE_CHECKING:


 def read_records(
-    to_insert: Optional[Union[dict, list[dict]]],
+    to_insert: Optional[Union[dict, Iterable[dict]]],
     session: Optional[Session] = None,
     settings: Optional[dict] = None,
     in_memory: bool = False,
@@ -54,10 +52,11 @@ def read_records(

     if schema:
         signal_schema = SignalSchema(schema)
-        columns = [
-            sqlalchemy.Column(c.name, c.type)  # type: ignore[union-attr]
-            for c in signal_schema.db_signals(as_columns=True)  # type: ignore[assignment]
-        ]
+        columns = []
+        for c in signal_schema.db_signals(as_columns=True):
+            assert isinstance(c, Column)
+            kw = {"nullable": c.nullable} if c.nullable is not None else {}
+            columns.append(sqlalchemy.Column(c.name, c.type, **kw))
     else:
         columns = [
             sqlalchemy.Column(name, typ)
@@ -83,8 +82,7 @@ def read_records(

     warehouse = catalog.warehouse
     dr = warehouse.dataset_rows(dsr)
-    db = warehouse.db
-    insert_q = dr.get_table().insert()
-    for record in to_insert:
-        db.execute(insert_q.values(**record))
+    table = dr.get_table()
+    warehouse.insert_rows(table, to_insert)
+    warehouse.insert_rows_done(table)
     return read_dataset(name=dsr.name, session=session, settings=settings)
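`read_records()` now accepts any iterable of dicts and bulk-inserts them through the warehouse (`insert_rows`/`insert_rows_done`) instead of executing one INSERT per record, so rows can be streamed from a generator. A sketch; the `schema` keyword is assumed from the `if schema:` branch above (the full signature is not shown in this diff), and the field names are illustrative:

```py
import datachain as dc

def rows():
    # Rows are consumed lazily; they no longer have to be materialized in a list.
    for i in range(1000):
        yield {"id": i, "value": i * i}

chain = dc.read_records(rows(), schema={"id": int, "value": int})
```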
@@ -29,7 +29,7 @@ def read_storage(
     settings: Optional[dict] = None,
     in_memory: bool = False,
     recursive: Optional[bool] = True,
-    object_name: str = "file",
+    column: str = "file",
     update: bool = False,
     anon: bool = False,
     client_config: Optional[dict] = None,
@@ -43,7 +43,7 @@ def read_storage(
             as `s3://`, `gs://`, `az://` or "file:///"
         type : read file as "binary", "text", or "image" data. Default is "binary".
         recursive : search recursively for the given path.
-        object_name : Created object column name.
+        column : Created column name.
         update : force storage reindexing. Default is False.
         anon : If True, we will treat cloud bucket as public one
         client_config : Optional client configuration for the storage client.
@@ -124,7 +124,7 @@ def read_storage(

         dc = read_dataset(list_ds_name, session=session, settings=settings)
         dc._query.update = update
-        dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
+        dc.signals_schema = dc.signals_schema.mutate({f"{column}": file_type})

         if update or not list_ds_exists:

@@ -140,7 +140,7 @@ def read_storage(
                 .settings(prefetch=0)
                 .gen(
                     list_bucket(lst_uri, cache, client_config=client_config),
-                    output={f"{object_name}": file_type},
+                    output={f"{column}": file_type},
                 )
                 .save(ds_name, listing=True)
             )
@@ -149,7 +149,7 @@ def read_storage(
                 lambda ds_name=list_ds_name, lst_uri=list_uri: lst_fn(ds_name, lst_uri)
             )

-        chain = ls(dc, list_path, recursive=recursive, object_name=object_name)
+        chain = ls(dc, list_path, recursive=recursive, column=column)

         storage_chain = storage_chain.union(chain) if storage_chain else chain
         listed_ds_name.add(list_ds_name)
@@ -162,7 +162,7 @@ def read_storage(
             file=file_values,
         )
         file_chain.signals_schema = file_chain.signals_schema.mutate(
-            {f"{object_name}": file_type}
+            {f"{column}": file_type}
         )
         storage_chain = storage_chain.union(file_chain) if storage_chain else file_chain

@@ -24,7 +24,7 @@ def read_values(
     settings: Optional[dict] = None,
     in_memory: bool = False,
     output: OutputType = None,
-    object_name: str = "",
+    column: str = "",
     **fr_map,
 ) -> "DataChain":
     """Generate chain from list of values.
@@ -48,6 +48,6 @@ def read_values(
         settings=settings,
         in_memory=in_memory,
     )
-    if object_name:
-        output = {object_name: dict_to_data_model(object_name, output)}  # type: ignore[arg-type]
+    if column:
+        output = {column: dict_to_data_model(column, output)}  # type: ignore[arg-type]
     return chain.gen(_func_fr, output=output)
datachain/lib/listing.py CHANGED
@@ -72,7 +72,7 @@ def ls(
     dc: D,
     path: str,
     recursive: Optional[bool] = True,
-    object_name="file",
+    column="file",
 ) -> D:
     """
     Return files by some path from DataChain instance which contains bucket listing.
@@ -82,7 +82,7 @@ def ls(
     """

     def _file_c(name: str) -> Column:
-        return Column(f"{object_name}.{name}")
+        return Column(f"{column}.{name}")

     dc = dc.filter(_file_c("is_latest") == true())

@@ -87,6 +87,12 @@ class SignalResolvingTypeError(SignalResolvingError):
     )


+class SignalRemoveError(SignalSchemaError):
+    def __init__(self, path: Optional[list[str]], msg: str):
+        name = " '" + ".".join(path) + "'" if path else ""
+        super().__init__(f"cannot remove signal name{name}: {msg}")
+
+
 class CustomType(BaseModel):
     schema_version: int = Field(ge=1, le=2, strict=True)
     name: str
@@ -575,7 +581,11 @@ class SignalSchema:
         signals = [
             DEFAULT_DELIMITER.join(path)
             if not as_columns
-            else Column(DEFAULT_DELIMITER.join(path), python_to_sql(_type))
+            else Column(
+                DEFAULT_DELIMITER.join(path),
+                python_to_sql(_type),
+                nullable=is_optional(_type),
+            )
             for path, _type, has_subtree, _ in self.get_flat_tree(
                 include_hidden=include_hidden
             )
@@ -620,18 +630,27 @@ class SignalSchema:
         return curr_type

     def select_except_signals(self, *args: str) -> "SignalSchema":
-        schema = copy.deepcopy(self.values)
-        for field in args:
-            if not isinstance(field, str):
-                raise SignalResolvingTypeError("select_except()", field)
+        def has_signal(signal: str):
+            signal = signal.replace(".", DEFAULT_DELIMITER)
+            return any(signal == s for s in self.db_signals())

-            if field not in self.values:
+        schema = copy.deepcopy(self.values)
+        for signal in args:
+            if not isinstance(signal, str):
+                raise SignalResolvingTypeError("select_except()", signal)
+
+            if signal not in self.values:
+                if has_signal(signal):
+                    raise SignalRemoveError(
+                        signal.split("."),
+                        "select_except() error - removing nested signal would"
+                        " break parent schema, which isn't supported.",
+                    )
                 raise SignalResolvingError(
-                    field.split("."),
-                    "select_except() error - the feature name does not exist or "
-                    "inside of feature (not supported)",
+                    signal.split("."),
+                    "select_except() error - the signal does not exist",
                 )
-            del schema[field]
+            del schema[signal]

         return SignalSchema(schema)

@@ -975,3 +994,8 @@ class SignalSchema:
         }

         return SignalSchema.deserialize(schema)
+
+
+def is_optional(type_: Any) -> bool:
+    """Check if a type is Optional."""
+    return get_origin(type_) is Union and type(None) in get_args(type_)
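Two notes on the schema hunks above (they appear to belong to `datachain/lib/signal_schema.py`, the module imported elsewhere in this diff): `select_except()` now raises the new `SignalRemoveError` when asked to drop a nested signal such as `"file.path"`, since that would break the parent schema, and the module-level `is_optional()` helper feeds the `nullable=` flag on generated columns. A sketch of the helper's behavior:

```py
from typing import Optional, Union

from datachain.lib.signal_schema import is_optional

assert is_optional(Optional[int])      # Union[int, None] -> True
assert is_optional(Union[str, None])   # True
assert not is_optional(int)            # plain types are not Optional
```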
datachain/listing.py CHANGED
@@ -27,14 +27,14 @@ class Listing:
         client: "Client",
         dataset_name: Optional["str"] = None,
         dataset_version: Optional[int] = None,
-        object_name: str = "file",
+        column: str = "file",
     ):
         self.metastore = metastore
         self.warehouse = warehouse
         self.client = client
         self.dataset_name = dataset_name  # dataset representing bucket listing
         self.dataset_version = dataset_version  # dataset representing bucket listing
-        self.object_name = object_name
+        self.column = column

     def clone(self) -> "Listing":
         return self.__class__(
@@ -43,7 +43,7 @@ class Listing:
             self.client,
             self.dataset_name,
             self.dataset_version,
-            self.object_name,
+            self.column,
         )

     def __enter__(self) -> "Listing":
@@ -74,7 +74,7 @@ class Listing:
         return self.warehouse.dataset_rows(
             dataset,
             self.dataset_version or dataset.latest_version,
-            object_name=self.object_name,
+            column=self.column,
         )

     def expand_path(self, path, use_glob=True) -> list[Node]:
@@ -437,9 +437,17 @@ class UDFStep(Step, ABC):
                    "distributed processing."
                )

-            from datachain.catalog.loader import get_udf_distributor_class
+            from datachain.catalog.loader import (
+                DISTRIBUTED_IMPORT_PATH,
+                get_udf_distributor_class,
+            )
+
+            if not (udf_distributor_class := get_udf_distributor_class()):
+                raise RuntimeError(
+                    f"{DISTRIBUTED_IMPORT_PATH} import path is required "
+                    "for distributed UDF processing."
+                )

-            udf_distributor_class = get_udf_distributor_class()
             udf_distributor = udf_distributor_class(
                 catalog=catalog,
                 table=udf_table,
@@ -1162,16 +1170,6 @@ class DatasetQuery:
             )
         return sqlalchemy.table(table_name)

-    @staticmethod
-    def delete(
-        name: str, version: Optional[int] = None, catalog: Optional["Catalog"] = None
-    ) -> None:
-        from datachain.catalog import get_catalog
-
-        catalog = catalog or get_catalog()
-        version = version or catalog.get_dataset(name).latest_version
-        catalog.remove_dataset(name, version)
-
     @property
     def attached(self) -> bool:
         """
@@ -13,7 +13,7 @@ from multiprocess import get_context

 from datachain.catalog import Catalog
 from datachain.catalog.catalog import clone_catalog_with_cache
-from datachain.catalog.loader import get_udf_distributor_class
+from datachain.catalog.loader import DISTRIBUTED_IMPORT_PATH, get_udf_distributor_class
 from datachain.lib.udf import _get_cache
 from datachain.query.batch import RowsOutput, RowsOutputBatch
 from datachain.query.dataset import (
@@ -91,7 +91,12 @@ def udf_entrypoint() -> int:


 def udf_worker_entrypoint() -> int:
-    return get_udf_distributor_class().run_worker()
+    if not (udf_distributor_class := get_udf_distributor_class()):
+        raise RuntimeError(
+            f"{DISTRIBUTED_IMPORT_PATH} import path is required "
+            "for distributed UDF processing."
+        )
+    return udf_distributor_class.run_worker()


 class UDFDispatcher:
datachain/query/schema.py CHANGED
@@ -40,12 +40,15 @@ class ColumnMeta(type):
 class Column(sa.ColumnClause, metaclass=ColumnMeta):
     inherit_cache: Optional[bool] = True

-    def __init__(self, text, type_=None, is_literal=False, _selectable=None):
+    def __init__(
+        self, text, type_=None, is_literal=False, nullable=None, _selectable=None
+    ):
         """Dataset column."""
         self.name = ColumnMeta.to_db_name(text)
         super().__init__(
             self.name, type_=type_, is_literal=is_literal, _selectable=_selectable
         )
+        self.nullable = nullable

     def __getattr__(self, name: str):
         return Column(self.name + DEFAULT_DELIMITER + name)
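`Column` now carries an optional `nullable` flag, which `SignalSchema.db_signals(as_columns=True)` fills from `is_optional()` and `read_records()` forwards into the physical `sqlalchemy.Column`. A minimal sketch of the constructor change:

```py
import sqlalchemy as sa

from datachain.query.schema import Column

col = Column("score", sa.Float, nullable=True)
print(col.name, col.nullable)  # score True
```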
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.14.4
+Version: 0.15.0
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0
@@ -171,7 +171,7 @@ high confidence scores.

 import datachain as dc

-meta = dc.read_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
+meta = dc.read_json("gs://datachain-demo/dogs-and-cats/*json", column="meta", anon=True)
 images = dc.read_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)

 images_id = images.map(id=lambda file: file.path.split('.')[-2])
@@ -213,7 +213,7 @@ Python code:
     return result.lower().startswith("success")

 chain = (
-    dc.read_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
+    dc.read_storage("gs://datachain-demo/chatbot-KiT/", column="file", anon=True)
     .settings(parallel=4, cache=True)
     .map(is_success=eval_dialogue)
     .save("mistral_files")