datachain 0.3.12__py3-none-any.whl → 0.3.14__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.

Potentially problematic release.

datachain/lib/file.py CHANGED
@@ -118,7 +118,6 @@ class File(DataModel):
     is_latest: bool = Field(default=True)
     last_modified: datetime = Field(default=TIME_ZERO)
     location: Optional[Union[dict, list[dict]]] = Field(default=None)
-    vtype: str = Field(default="")
 
     _datachain_column_types: ClassVar[dict[str, Any]] = {
         "source": String,
@@ -129,7 +128,6 @@ class File(DataModel):
         "is_latest": Boolean,
         "last_modified": DateTime,
         "location": JSON,
-        "vtype": String,
     }
 
     _unique_id_keys: ClassVar[list[str]] = [
@@ -139,7 +137,6 @@ class File(DataModel):
         "etag",
         "version",
         "is_latest",
-        "vtype",
         "location",
         "last_modified",
     ]
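
The change above removes the `vtype` column from `File` entirely: the field itself, its SQL column type, and its entry in `_unique_id_keys`. A minimal sketch of constructing a `File` after this release, using only fields visible in this diff; the values are illustrative:

from datetime import datetime, timezone

from datachain.lib.file import File

f = File(
    source="s3://my-bucket",   # illustrative values throughout
    path="data/report.txt",
    size=1024,
    etag="abc123",
    is_latest=True,
    last_modified=datetime(2024, 8, 1, tzinfo=timezone.utc),
    location=None,  # virtual-object details (e.g. tar members) now live here
)
# There is no longer a vtype field or column to set.
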
datachain/lib/listing.py CHANGED
@@ -30,8 +30,7 @@ def list_bucket(uri: str, client_config=None) -> Callable:
         config = client_config or {}
         client, path = Client.parse_url(uri, None, **config)  # type: ignore[arg-type]
         for entries in iter_over_async(client.scandir(path.rstrip("/")), get_loop()):
-            for entry in entries:
-                yield entry.to_file(client.uri)
+            yield from entries
 
     return list_func
 
datachain/lib/model_store.py CHANGED
@@ -1,6 +1,6 @@
 import inspect
 import logging
-from typing import ClassVar, Optional
+from typing import Any, ClassVar, Optional
 
 from pydantic import BaseModel
 
@@ -69,7 +69,7 @@ class ModelStore:
         del cls.store[fr.__name__][version]
 
     @staticmethod
-    def is_pydantic(val):
+    def is_pydantic(val: Any) -> bool:
         return (
             not hasattr(val, "__origin__")
             and inspect.isclass(val)
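
For reference, `is_pydantic` distinguishes plain pydantic model classes from parameterized generics. The hunk truncates the return expression, so the sketch below completes it with a presumed `issubclass` check; treat that final clause as an assumption:

import inspect
from typing import Optional

from pydantic import BaseModel

class MyFeature(BaseModel):
    name: str

def is_pydantic(val) -> bool:
    # First two clauses are from the diff; the issubclass clause is assumed.
    return (
        not hasattr(val, "__origin__")
        and inspect.isclass(val)
        and issubclass(val, BaseModel)
    )

assert is_pydantic(MyFeature)
assert not is_pydantic(list[int])      # parameterized generics carry __origin__
assert not is_pydantic(Optional[int])  # Union[int, None] carries __origin__ too
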
datachain/lib/pytorch.py CHANGED
@@ -7,6 +7,7 @@ from torch import float32
 from torch.distributed import get_rank, get_world_size
 from torch.utils.data import IterableDataset, get_worker_info
 from torchvision.transforms import v2
+from tqdm import tqdm
 
 from datachain.catalog import Catalog, get_catalog
 from datachain.lib.dc import DataChain
@@ -93,33 +94,38 @@ class PytorchDataset(IterableDataset):
         if self.num_samples > 0:
             ds = ds.sample(self.num_samples)
         ds = ds.chunk(total_rank, total_workers)
-        for row_features in ds.collect():
-            row = []
-            for fr in row_features:
-                if hasattr(fr, "read"):
-                    row.append(fr.read())  # type: ignore[unreachable]
-                else:
-                    row.append(fr)
-            # Apply transforms
-            if self.transform:
-                try:
-                    if isinstance(self.transform, v2.Transform):
-                        row = self.transform(row)
+        desc = f"Parsed PyTorch dataset for rank={total_rank} worker"
+        with tqdm(desc=desc, unit=" rows") as pbar:
+            for row_features in ds.collect():
+                row = []
+                for fr in row_features:
+                    if hasattr(fr, "read"):
+                        row.append(fr.read())  # type: ignore[unreachable]
+                    else:
+                        row.append(fr)
+                # Apply transforms
+                if self.transform:
+                    try:
+                        if isinstance(self.transform, v2.Transform):
+                            row = self.transform(row)
+                        for i, val in enumerate(row):
+                            if isinstance(val, Image.Image):
+                                row[i] = self.transform(val)
+                    except ValueError:
+                        logger.warning(
+                            "Skipping transform due to unsupported data types."
+                        )
+                        self.transform = None
+                if self.tokenizer:
                     for i, val in enumerate(row):
-                        if isinstance(val, Image.Image):
-                            row[i] = self.transform(val)
-                except ValueError:
-                    logger.warning("Skipping transform due to unsupported data types.")
-                    self.transform = None
-            if self.tokenizer:
-                for i, val in enumerate(row):
-                    if isinstance(val, str) or (
-                        isinstance(val, list) and isinstance(val[0], str)
-                    ):
-                        row[i] = convert_text(
-                            val, self.tokenizer, self.tokenizer_kwargs
-                        ).squeeze(0)  # type: ignore[union-attr]
-            yield row
+                        if isinstance(val, str) or (
+                            isinstance(val, list) and isinstance(val[0], str)
+                        ):
+                            row[i] = convert_text(
+                                val, self.tokenizer, self.tokenizer_kwargs
+                            ).squeeze(0)  # type: ignore[union-attr]
+                yield row
+                pbar.update(1)
 
     @staticmethod
     def get_rank_and_workers() -> tuple[int, int]:
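
The `__iter__` change wraps row production in a tqdm progress bar and ticks it once per yielded row. A standalone sketch of the same pattern; the names here are illustrative, not the datachain API:

from tqdm import tqdm

def iter_with_progress(rows, rank: int = 0):
    # One bar per worker; advance it after each row is handed out.
    desc = f"Parsed PyTorch dataset for rank={rank} worker"
    with tqdm(desc=desc, unit=" rows") as pbar:
        for row in rows:
            yield row
            pbar.update(1)

for _ in iter_with_progress(range(3)):
    pass  # bar advances once per row
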
datachain/lib/signal_schema.py CHANGED
@@ -4,11 +4,14 @@ from collections.abc import Iterator, Sequence
 from dataclasses import dataclass
 from datetime import datetime
 from inspect import isclass
-from typing import (
+from typing import (  # noqa: UP035
     TYPE_CHECKING,
     Annotated,
     Any,
     Callable,
+    Dict,
+    Final,
+    List,
     Literal,
     Optional,
     Union,
@@ -42,8 +45,13 @@ NAMES_TO_TYPES = {
     "dict": dict,
     "bytes": bytes,
     "datetime": datetime,
-    "Literal": Literal,
+    "Final": Final,
     "Union": Union,
+    "Optional": Optional,
+    "List": list,
+    "Dict": dict,
+    "Literal": Any,
+    "Any": Any,
 }
 
 
@@ -146,35 +154,11 @@ class SignalSchema:
         return SignalSchema(signals)
 
     @staticmethod
-    def _get_name_original_type(fr_type: type) -> tuple[str, type]:
-        """Returns the name of and the original type for the given type,
-        based on whether the type is Optional or not."""
-        orig = get_origin(fr_type)
-        args = get_args(fr_type)
-        # Check if fr_type is Optional
-        if orig == Union and len(args) == 2 and (type(None) in args):
-            fr_type = args[0]
-            orig = get_origin(fr_type)
-        if orig in (Literal, LiteralEx):
-            # Literal has no __name__ in Python 3.9
-            type_name = "Literal"
-        elif orig == Union:
-            # Union also has no __name__ in Python 3.9
-            type_name = "Union"
-        else:
-            type_name = str(fr_type.__name__)  # type: ignore[union-attr]
-        return type_name, fr_type
-
-    @staticmethod
-    def serialize_custom_model_fields(
-        name: str, fr: type, custom_types: dict[str, Any]
+    def _serialize_custom_model_fields(
+        version_name: str, fr: type[BaseModel], custom_types: dict[str, Any]
     ) -> str:
         """This serializes any custom type information to the provided custom_types
-        dict, and returns the name of the type provided."""
-        if hasattr(fr, "__origin__") or not issubclass(fr, BaseModel):
-            # Don't store non-feature types.
-            return name
-        version_name = ModelStore.get_name(fr)
+        dict, and returns the name of the type serialized."""
         if version_name in custom_types:
             # This type is already stored in custom_types.
             return version_name
@@ -183,37 +167,102 @@ class SignalSchema:
             field_type = info.annotation
             # All fields should be typed.
             assert field_type
-            field_type_name, field_type = SignalSchema._get_name_original_type(
-                field_type
-            )
-            # Serialize this type to custom_types if it is a custom type as well.
-            fields[field_name] = SignalSchema.serialize_custom_model_fields(
-                field_type_name, field_type, custom_types
-            )
+            fields[field_name] = SignalSchema._serialize_type(field_type, custom_types)
         custom_types[version_name] = fields
         return version_name
 
+    @staticmethod
+    def _serialize_type(fr: type, custom_types: dict[str, Any]) -> str:
+        """Serialize a given type to a string, including automatic ModelStore
+        registration, and save this type and subtypes to custom_types as well."""
+        subtypes: list[Any] = []
+        type_name = SignalSchema._type_to_str(fr, subtypes)
+        # Iterate over all subtypes (includes the input type).
+        for st in subtypes:
+            if st is None or not ModelStore.is_pydantic(st):
+                continue
+            # Register and save feature types.
+            ModelStore.register(st)
+            st_version_name = ModelStore.get_name(st)
+            if st is fr:
+                # If the main type is Pydantic, then use the ModelStore version name.
+                type_name = st_version_name
+            # Save this type to custom_types.
+            SignalSchema._serialize_custom_model_fields(
+                st_version_name, st, custom_types
+            )
+        return type_name
+
     def serialize(self) -> dict[str, Any]:
         signals: dict[str, Any] = {}
         custom_types: dict[str, Any] = {}
         for name, fr_type in self.values.items():
-            if (fr := ModelStore.to_pydantic(fr_type)) is not None:
-                ModelStore.register(fr)
-                signals[name] = ModelStore.get_name(fr)
-                type_name, fr_type = SignalSchema._get_name_original_type(fr)
-            else:
-                type_name, fr_type = SignalSchema._get_name_original_type(fr_type)
-                signals[name] = type_name
-            self.serialize_custom_model_fields(type_name, fr_type, custom_types)
+            signals[name] = self._serialize_type(fr_type, custom_types)
         if custom_types:
             signals["_custom_types"] = custom_types
         return signals
 
     @staticmethod
-    def _resolve_type(type_name: str, custom_types: dict[str, Any]) -> Optional[type]:
+    def _split_subtypes(type_name: str) -> list[str]:
+        """This splits a list of subtypes, including proper square bracket handling."""
+        start = 0
+        depth = 0
+        subtypes = []
+        for i, c in enumerate(type_name):
+            if c == "[":
+                depth += 1
+            elif c == "]":
+                if depth == 0:
+                    raise TypeError(
+                        "Extra closing square bracket when parsing subtype list"
+                    )
+                depth -= 1
+            elif c == "," and depth == 0:
+                subtypes.append(type_name[start:i].strip())
+                start = i + 1
+        if depth > 0:
+            raise TypeError("Unclosed square bracket when parsing subtype list")
+        subtypes.append(type_name[start:].strip())
+        return subtypes
+
+    @staticmethod
+    def _resolve_type(type_name: str, custom_types: dict[str, Any]) -> Optional[type]:  # noqa: PLR0911
         """Convert a string-based type back into a python type."""
+        type_name = type_name.strip()
+        if not type_name:
+            raise TypeError("Type cannot be empty")
+        if type_name == "NoneType":
+            return None
+
+        bracket_idx = type_name.find("[")
+        subtypes: Optional[tuple[Optional[type], ...]] = None
+        if bracket_idx > -1:
+            if bracket_idx == 0:
+                raise TypeError("Type cannot start with '['")
+            close_bracket_idx = type_name.rfind("]")
+            if close_bracket_idx == -1:
+                raise TypeError("Unclosed square bracket when parsing type")
+            if close_bracket_idx < bracket_idx:
+                raise TypeError("Square brackets are out of order when parsing type")
+            if close_bracket_idx == bracket_idx + 1:
+                raise TypeError("Empty square brackets when parsing type")
+            subtype_names = SignalSchema._split_subtypes(
                type_name[bracket_idx + 1 : close_bracket_idx]
+            )
+            # Types like Union require the parameters to be a tuple of types.
+            subtypes = tuple(
+                SignalSchema._resolve_type(st, custom_types) for st in subtype_names
+            )
+            type_name = type_name[:bracket_idx].strip()
+
         fr = NAMES_TO_TYPES.get(type_name)
         if fr:
+            if subtypes:
+                if len(subtypes) == 1:
+                    # Types like Optional require there to be only one argument.
+                    return fr[subtypes[0]]  # type: ignore[index]
+                # Other types like Union require the parameters to be a tuple of types.
+                return fr[subtypes]  # type: ignore[index]
             return fr  # type: ignore[return-value]
 
         model_name, version = ModelStore.parse_name_version(type_name)
@@ -228,7 +277,14 @@ class SignalSchema:
                 for field_name, field_type_str in fields.items()
             }
             return create_feature_model(type_name, fields)
-        return None
+        # This can occur if a third-party or custom type is used, which is not
+        # available when deserializing.
+        warnings.warn(
+            f"Could not resolve type: '{type_name}'.",
+            SignalSchemaWarning,
+            stacklevel=2,
+        )
+        return Any  # type: ignore[return-value]
 
     @staticmethod
     def deserialize(schema: dict[str, Any]) -> "SignalSchema":
@@ -242,9 +298,14 @@ class SignalSchema:
                 # This entry is used as a lookup for custom types,
                 # and is not an actual field.
                 continue
+            if not isinstance(type_name, str):
+                raise SignalSchemaError(
+                    f"cannot deserialize '{type_name}': "
+                    "serialized types must be a string"
+                )
             try:
                 fr = SignalSchema._resolve_type(type_name, custom_types)
-                if fr is None:
+                if fr is Any:
                     # Skip if the type is not found, so all data can be displayed.
                     warnings.warn(
                         f"In signal '{signal}': "
@@ -258,7 +319,7 @@ class SignalSchema:
                 raise SignalSchemaError(
                     f"cannot deserialize '{signal}': {err}"
                 ) from err
-            signals[signal] = fr
+            signals[signal] = fr  # type: ignore[assignment]
 
         return SignalSchema(signals)
 
@@ -509,31 +570,58 @@ class SignalSchema:
         return self.values.pop(name)
 
     @staticmethod
-    def _type_to_str(type_):  # noqa: PLR0911
+    def _type_to_str(type_: Optional[type], subtypes: Optional[list] = None) -> str:  # noqa: PLR0911
+        """Convert a type to a string-based representation."""
+        if type_ is None:
+            return "NoneType"
+
         origin = get_origin(type_)
 
         if origin == Union:
             args = get_args(type_)
-            formatted_types = ", ".join(SignalSchema._type_to_str(arg) for arg in args)
+            formatted_types = ", ".join(
+                SignalSchema._type_to_str(arg, subtypes) for arg in args
+            )
             return f"Union[{formatted_types}]"
         if origin == Optional:
             args = get_args(type_)
-            type_str = SignalSchema._type_to_str(args[0])
+            type_str = SignalSchema._type_to_str(args[0], subtypes)
             return f"Optional[{type_str}]"
-        if origin is list:
+        if origin in (list, List):  # noqa: UP006
             args = get_args(type_)
-            type_str = SignalSchema._type_to_str(args[0])
+            type_str = SignalSchema._type_to_str(args[0], subtypes)
             return f"list[{type_str}]"
-        if origin is dict:
+        if origin in (dict, Dict):  # noqa: UP006
             args = get_args(type_)
-            type_str = SignalSchema._type_to_str(args[0]) if len(args) > 0 else ""
-            vals = f", {SignalSchema._type_to_str(args[1])}" if len(args) > 1 else ""
+            type_str = (
+                SignalSchema._type_to_str(args[0], subtypes) if len(args) > 0 else ""
+            )
+            vals = (
+                f", {SignalSchema._type_to_str(args[1], subtypes)}"
+                if len(args) > 1
+                else ""
+            )
             return f"dict[{type_str}{vals}]"
         if origin == Annotated:
             args = get_args(type_)
-            return SignalSchema._type_to_str(args[0])
-        if origin in (Literal, LiteralEx):
+            return SignalSchema._type_to_str(args[0], subtypes)
+        if origin in (Literal, LiteralEx) or type_ in (Literal, LiteralEx):
             return "Literal"
+        if Any in (origin, type_):
+            return "Any"
+        if Final in (origin, type_):
+            return "Final"
+        if subtypes is not None:
+            # Include this type in the list of all subtypes, if requested.
+            subtypes.append(type_)
+        if not hasattr(type_, "__name__"):
+            # This can happen for some third-party or custom types, mostly on Python 3.9
+            warnings.warn(
+                f"Unable to determine name of type '{type_}'.",
+                SignalSchemaWarning,
+                stacklevel=2,
+            )
+            return "Any"
         return type_.__name__
 
     @staticmethod
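
To illustrate the new parsing helpers: `_split_subtypes` splits on commas only at bracket depth zero, and `_resolve_type` can now rebuild parameterized types. A hedged usage check; these are private helpers subject to change, and the `str`/`list` entries of `NAMES_TO_TYPES` are assumed from context (they are not shown in this hunk):

from typing import Optional

from datachain.lib.signal_schema import SignalSchema

# Commas nested inside brackets are not split points:
assert SignalSchema._split_subtypes("str, dict[str, int]") == ["str", "dict[str, int]"]
assert SignalSchema._split_subtypes("Union[str, int], float") == [
    "Union[str, int]",
    "float",
]

# Round-trip a parameterized type string back into a python type:
assert SignalSchema._resolve_type("Optional[list[str]]", {}) == Optional[list[str]]
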
datachain/listing.py CHANGED
@@ -9,7 +9,8 @@ from sqlalchemy import Column
 from sqlalchemy.sql import func
 from tqdm import tqdm
 
-from datachain.node import DirType, Entry, Node, NodeWithPath
+from datachain.lib.file import File
+from datachain.node import DirType, Node, NodeWithPath
 from datachain.sql.functions import path as pathfunc
 from datachain.utils import suffix_to_number
 
@@ -80,16 +81,13 @@ class Listing:
         finally:
             fetch_listing.insert_entries_done()
 
-    def insert_entry(self, entry: Entry) -> None:
-        self.warehouse.insert_rows(
-            self.dataset_rows.get_table(),
-            self.warehouse.prepare_entries(self.client.uri, [entry]),
-        )
+    def insert_entry(self, entry: File) -> None:
+        self.insert_entries([entry])
 
-    def insert_entries(self, entries: Iterable[Entry]) -> None:
+    def insert_entries(self, entries: Iterable[File]) -> None:
         self.warehouse.insert_rows(
             self.dataset_rows.get_table(),
-            self.warehouse.prepare_entries(self.client.uri, entries),
+            self.warehouse.prepare_entries(entries),
         )
 
     def insert_entries_done(self) -> None:
@@ -104,7 +102,7 @@ class Listing:
         return self.warehouse.get_node_by_path(self.dataset_rows, path)
 
     def ls_path(self, node, fields):
-        if node.vtype == "tar" or node.dir_type == DirType.TAR_ARCHIVE:
+        if node.location or node.dir_type == DirType.TAR_ARCHIVE:
             return self.warehouse.select_node_fields_by_parent_path_tar(
                 self.dataset_rows, node.path, fields
             )
@@ -235,7 +233,7 @@ class Listing:
         return self.warehouse.size(self.dataset_rows, node, count_files)
 
     def subtree_files(self, node: Node, sort=None):
-        if node.dir_type == DirType.TAR_ARCHIVE or node.vtype != "":
+        if node.dir_type == DirType.TAR_ARCHIVE or node.location:
             include_subobjects = True
         else:
             include_subobjects = False
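
Both checks above replace the removed `vtype` marker with a test on `location`. A small sketch of the updated predicate, mirroring `subtree_files`:

from datachain.node import DirType, Node

def has_subobjects(node: Node) -> bool:
    # A node yields sub-objects if it is a tar archive or carries a
    # location payload (previously signalled by vtype != "").
    return node.dir_type == DirType.TAR_ARCHIVE or bool(node.location)
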
datachain/node.py CHANGED
@@ -4,7 +4,6 @@ from typing import TYPE_CHECKING, Any, Optional
 import attrs
 
 from datachain.cache import UniqueId
-from datachain.lib.file import File
 from datachain.storage import StorageURI
 from datachain.utils import TIME_ZERO, time_to_str
 
@@ -49,18 +48,15 @@ class DirTypeGroup:
 class Node:
     sys__id: int = 0
     sys__rand: int = 0
-    vtype: str = ""
-    dir_type: Optional[int] = None
     path: str = ""
     etag: str = ""
     version: Optional[str] = None
     is_latest: bool = True
     last_modified: Optional[datetime] = None
     size: int = 0
-    owner_name: str = ""
-    owner_id: str = ""
     location: Optional[str] = None
     source: StorageURI = StorageURI("")
+    dir_type: int = DirType.FILE
 
     @property
     def is_dir(self) -> bool:
@@ -113,7 +109,6 @@ class Node:
             version=self.version or "",
             etag=self.etag,
             is_latest=self.is_latest,
-            vtype=self.vtype,
             location=self.location,
             last_modified=self.last_modified or TIME_ZERO,
         )
@@ -143,66 +138,6 @@ class Node:
         return split[0]
 
 
-@attrs.define
-class Entry:
-    vtype: str = ""
-    dir_type: Optional[int] = None
-    path: str = ""
-    etag: str = ""
-    version: str = ""
-    is_latest: bool = True
-    last_modified: Optional[datetime] = None
-    size: int = 0
-    owner_name: str = ""
-    owner_id: str = ""
-    location: Optional[str] = None
-
-    @property
-    def is_dir(self) -> bool:
-        return self.dir_type == DirType.DIR
-
-    @classmethod
-    def from_dir(cls, path: str, **kwargs) -> "Entry":
-        return cls(dir_type=DirType.DIR, path=path, **kwargs)
-
-    @classmethod
-    def from_file(cls, path: str, **kwargs) -> "Entry":
-        return cls(dir_type=DirType.FILE, path=path, **kwargs)
-
-    @classmethod
-    def root(cls):
-        return cls(dir_type=DirType.DIR)
-
-    @property
-    def full_path(self) -> str:
-        if self.is_dir and self.path:
-            return self.path + "/"
-        return self.path
-
-    @property
-    def name(self):
-        return self.path.rsplit("/", 1)[-1]
-
-    @property
-    def parent(self):
-        split = self.path.rsplit("/", 1)
-        if len(split) <= 1:
-            return ""
-        return split[0]
-
-    def to_file(self, source: str) -> File:
-        return File(
-            source=source,
-            path=self.path,
-            size=self.size,
-            version=self.version,
-            etag=self.etag,
-            is_latest=self.is_latest,
-            last_modified=self.last_modified,
-            location=self.location,
-        )
-
-
 def get_path(parent: str, name: str):
     return f"{parent}/{name}" if parent else name
 
@@ -229,9 +164,9 @@ class NodeWithPath:
 TIME_FMT = "%Y-%m-%d %H:%M"
 
 
-def long_line_str(name: str, timestamp: Optional[datetime], owner: str) -> str:
+def long_line_str(name: str, timestamp: Optional[datetime]) -> str:
     if timestamp is None:
         time = "-"
     else:
         time = timestamp.strftime(TIME_FMT)
-    return f"{owner: <19} {time: <19} {name}"
+    return f"{time: <19} {name}"
@@ -22,10 +22,6 @@ def load_tar(raw):
     C.source,
     C.path,
     C.size,
-    C.vtype,
-    C.dir_type,
-    C.owner_name,
-    C.owner_id,
     C.is_latest,
     C.last_modified,
     C.version,
@@ -38,10 +34,6 @@ def index_tar(
     source,
     parent_path,
     size,
-    vtype,
-    dir_type,
-    owner_name,
-    owner_id,
     is_latest,
     last_modified,
     version,
@@ -53,10 +45,6 @@ def index_tar(
         source=source,
         path=parent_path,
         size=size,
-        vtype=vtype,
-        dir_type=dir_type,
-        owner_name=owner_name,
-        owner_id=owner_id,
         is_latest=bool(is_latest),
         last_modified=last_modified,
         version=version,
@@ -70,7 +58,6 @@ def index_tar(
         source=source,
         path=full_path,
         size=info.size,
-        vtype="tar",
         location={
             "vtype": "tar",
             "offset": info.offset_data,
@@ -81,7 +68,6 @@ def index_tar(
             "version": version,
             "size": size,
             "etag": etag,
-            "vtype": "",
             "location": None,
         },
     },
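
After these hunks, a tar member is described entirely by its `location` payload; the per-row `vtype` column and the parent stub's `"vtype": ""` entry are gone. A sketch of the resulting payload with placeholder values; the `"parent"` key name is an assumption, since the diff does not show it:

tar_member_location = {
    "vtype": "tar",    # the kind marker now lives inside location
    "offset": 512,     # info.offset_data: member's byte offset in the archive
    "parent": {        # assumed key; fields below are the ones visible above
        "version": "",
        "size": 10240,
        "etag": "abc123",
        "location": None,
    },
}
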
datachain/query/schema.py CHANGED
@@ -9,7 +9,7 @@ import attrs
 import sqlalchemy as sa
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
 
-from datachain.sql.types import JSON, Boolean, DateTime, Int, Int64, SQLType, String
+from datachain.sql.types import JSON, Boolean, DateTime, Int64, SQLType, String
 
 if TYPE_CHECKING:
     from datachain.catalog import Catalog
@@ -222,10 +222,6 @@ class DatasetRow:
         "path": String,
         "size": Int64,
         "location": JSON,
-        "vtype": String,
-        "dir_type": Int,
-        "owner_name": String,
-        "owner_id": String,
         "is_latest": Boolean,
         "last_modified": DateTime,
         "version": String,
@@ -238,10 +234,6 @@ class DatasetRow:
         source: str = "",
         size: int = 0,
         location: Optional[dict[str, Any]] = None,
-        vtype: str = "",
-        dir_type: int = 0,
-        owner_name: str = "",
-        owner_id: str = "",
         is_latest: bool = True,
         last_modified: Optional[datetime] = None,
         version: str = "",
@@ -251,10 +243,7 @@ class DatasetRow:
             str,
             int,
             Optional[str],
-            str,
             int,
-            str,
-            str,
             bool,
             datetime,
             str,
@@ -271,10 +260,6 @@ class DatasetRow:
         path,
         size,
         location,
-        vtype,
-        dir_type,
-        owner_name,
-        owner_id,
         is_latest,
         last_modified,
         version,