datachain 0.3.12__py3-none-any.whl → 0.3.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



datachain/asyn.py CHANGED
@@ -1,14 +1,8 @@
  import asyncio
- from collections.abc import Awaitable, Coroutine, Iterable
+ from collections.abc import AsyncIterable, Awaitable, Coroutine, Iterable, Iterator
  from concurrent.futures import ThreadPoolExecutor
  from heapq import heappop, heappush
- from typing import (
- Any,
- Callable,
- Generic,
- Optional,
- TypeVar,
- )
+ from typing import Any, Callable, Generic, Optional, TypeVar

  from fsspec.asyn import get_loop

@@ -16,6 +10,7 @@ ASYNC_WORKERS = 20

  InputT = TypeVar("InputT", contravariant=True) # noqa: PLC0105
  ResultT = TypeVar("ResultT", covariant=True) # noqa: PLC0105
+ T = TypeVar("T")


  class AsyncMapper(Generic[InputT, ResultT]):
@@ -226,7 +221,7 @@ class OrderedMapper(AsyncMapper[InputT, ResultT]):
  self._push_result(self._next_yield, None)


- def iter_over_async(ait, loop):
+ def iter_over_async(ait: AsyncIterable[T], loop) -> Iterator[T]:
  """Wrap an asynchronous iterator into a synchronous one"""
  ait = ait.__aiter__()

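With the new annotations, `iter_over_async` is explicitly an `AsyncIterable[T]` to `Iterator[T]` bridge. A minimal usage sketch (the async generator below is illustrative, not from the package), driving it on fsspec's shared I/O loop:

```python
import asyncio
from collections.abc import AsyncIterator

from fsspec.asyn import get_loop

from datachain.asyn import iter_over_async


async def numbers() -> AsyncIterator[int]:
    # stand-in for a real async source, e.g. a storage listing
    for i in range(3):
        await asyncio.sleep(0)
        yield i


# The helper drives the async iterator on the given event loop and yields
# its items synchronously; with the new hints this is an Iterator[int].
print(list(iter_over_async(numbers(), get_loop())))  # expected: [0, 1, 2]
```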
datachain/cache.py CHANGED
@@ -29,7 +29,6 @@ class UniqueId:
  etag: str
  version: str = ""
  is_latest: bool = True
- vtype: str = ""
  location: Optional[str] = None
  last_modified: datetime = TIME_ZERO

@@ -62,7 +62,7 @@ from datachain.listing import Listing
  from datachain.node import DirType, Node, NodeWithPath
  from datachain.nodes_thread_pool import NodesThreadPool
  from datachain.remote.studio import StudioClient
- from datachain.sql.types import JSON, Boolean, DateTime, Int, Int64, SQLType, String
+ from datachain.sql.types import JSON, Boolean, DateTime, Int64, SQLType, String
  from datachain.storage import Storage, StorageStatus, StorageURI
  from datachain.utils import (
  DataChainDir,
@@ -513,8 +513,6 @@ def find_column_to_str( # noqa: PLR0911
  )
  if column == "name":
  return posixpath.basename(row[field_lookup["path"]]) or ""
- if column == "owner":
- return row[field_lookup["owner_name"]] or ""
  if column == "path":
  is_dir = row[field_lookup["dir_type"]] == DirType.DIR
  path = row[field_lookup["path"]]
@@ -666,16 +664,12 @@ class Catalog:
  source_metastore = self.metastore.clone(client.uri)

  columns = [
- Column("vtype", String),
- Column("dir_type", Int),
  Column("path", String),
  Column("etag", String),
  Column("version", String),
  Column("is_latest", Boolean),
  Column("last_modified", DateTime(timezone=True)),
  Column("size", Int64),
- Column("owner_name", String),
- Column("owner_id", String),
  Column("location", JSON),
  Column("source", String),
  ]
@@ -1396,12 +1390,12 @@ class Catalog:
  dataset = self.get_dataset(name)
  return self.warehouse.dataset_table_export_file_names(dataset, version)

- def dataset_stats(self, name: str, version: int) -> DatasetStats:
+ def dataset_stats(self, name: str, version: Optional[int]) -> DatasetStats:
  """
  Returns tuple with dataset stats: total number of rows and total dataset size.
  """
  dataset = self.get_dataset(name)
- dataset_version = dataset.get_version(version)
+ dataset_version = dataset.get_version(version or dataset.latest_version)
  return DatasetStats(
  num_objects=dataset_version.num_objects,
  size=dataset_version.size,
@@ -1516,7 +1510,6 @@ class Catalog:
  row["etag"],
  row["version"],
  row["is_latest"],
- row["vtype"],
  row["location"],
  row["last_modified"],
  )
@@ -1987,8 +1980,6 @@ class Catalog:
  field_set.add("path")
  elif column == "name":
  field_set.add("path")
- elif column == "owner":
- field_set.add("owner_name")
  elif column == "path":
  field_set.add("dir_type")
  field_set.add("path")
datachain/cli.py CHANGED
@@ -24,7 +24,7 @@ logger = logging.getLogger("datachain")

  TTL_HUMAN = "4h"
  TTL_INT = 4 * 60 * 60
- FIND_COLUMNS = ["du", "name", "owner", "path", "size", "type"]
+ FIND_COLUMNS = ["du", "name", "path", "size", "type"]


  def human_time_type(value_str: str, can_be_none: bool = False) -> Optional[int]:
@@ -579,9 +579,8 @@ def _node_data_to_ls_values(row, long_format=False):
  value = name + ending
  if long_format:
  last_modified = row[2]
- owner_name = row[3]
  timestamp = last_modified if not is_dir else None
- return long_line_str(value, timestamp, owner_name)
+ return long_line_str(value, timestamp)
  return value


@@ -599,7 +598,7 @@ def _ls_urls_flat(
  if client_cls.is_root_url(source):
  buckets = client_cls.ls_buckets(**catalog.client_config)
  if long:
- values = (long_line_str(b.name, b.created, "") for b in buckets)
+ values = (long_line_str(b.name, b.created) for b in buckets)
  else:
  values = (b.name for b in buckets)
  yield source, values
@@ -607,7 +606,7 @@ def _ls_urls_flat(
  found = False
  fields = ["name", "dir_type"]
  if long:
- fields.extend(["last_modified", "owner_name"])
+ fields.append("last_modified")
  for data_source, results in catalog.ls([source], fields=fields, **kwargs):
  values = (_node_data_to_ls_values(r, long) for r in results)
  found = True
@@ -683,7 +682,6 @@ def ls_remote(
  entry = long_line_str(
  row["name"] + ("/" if row["dir_type"] else ""),
  row["last_modified"],
- row["owner_name"],
  )
  print(format_ls_entry(entry))
  else:
datachain/client/azure.py CHANGED
@@ -4,7 +4,6 @@ from adlfs import AzureBlobFileSystem
  from tqdm import tqdm

  from datachain.lib.file import File
- from datachain.node import Entry

  from .fsspec import DELIMITER, Client, ResultQueue

@@ -14,17 +13,6 @@ class AzureClient(Client):
  PREFIX = "az://"
  protocol = "az"

- def convert_info(self, v: dict[str, Any], path: str) -> Entry:
- version_id = v.get("version_id")
- return Entry.from_file(
- path=path,
- etag=v.get("etag", "").strip('"'),
- version=version_id or "",
- is_latest=version_id is None or bool(v.get("is_current_version")),
- last_modified=v["last_modified"],
- size=v.get("size", ""),
- )
-
  def info_to_file(self, v: dict[str, Any], path: str) -> File:
  version_id = v.get("version_id")
  return File(
@@ -57,7 +45,7 @@ class AzureClient(Client):
  continue
  info = (await self.fs._details([b]))[0]
  entries.append(
- self.convert_info(info, self.rel_path(info["name"]))
+ self.info_to_file(info, self.rel_path(info["name"]))
  )
  if entries:
  await result_queue.put(entries)
@@ -29,7 +29,7 @@ from tqdm import tqdm
  from datachain.cache import DataChainCache, UniqueId
  from datachain.client.fileslice import FileSlice, FileWrapper
  from datachain.error import ClientError as DataChainClientError
- from datachain.node import Entry
+ from datachain.lib.file import File
  from datachain.nodes_fetcher import NodesFetcher
  from datachain.nodes_thread_pool import NodeChunk
  from datachain.storage import StorageURI
@@ -45,7 +45,7 @@ DELIMITER = "/" # Path delimiter.

  DATA_SOURCE_URI_PATTERN = re.compile(r"^[\w]+:\/\/.*$")

- ResultQueue = asyncio.Queue[Optional[Sequence[Entry]]]
+ ResultQueue = asyncio.Queue[Optional[Sequence[File]]]


  def _is_win_local_path(uri: str) -> bool:
@@ -188,7 +188,7 @@ class Client(ABC):

  async def get_current_etag(self, uid: UniqueId) -> str:
  info = await self.fs._info(self.get_full_path(uid.path))
- return self.convert_info(info, "").etag
+ return self.info_to_file(info, "").etag

  async def get_size(self, path: str) -> int:
  return await self.fs._size(path)
@@ -198,7 +198,7 @@ class Client(ABC):

  async def scandir(
  self, start_prefix: str, method: str = "default"
- ) -> AsyncIterator[Sequence[Entry]]:
+ ) -> AsyncIterator[Sequence[File]]:
  try:
  impl = getattr(self, f"_fetch_{method}")
  except AttributeError:
@@ -264,7 +264,7 @@ class Client(ABC):
  ) -> None:
  await self._fetch_nested(start_prefix, result_queue)

- async def _fetch_dir(self, prefix, pbar, result_queue) -> set[str]:
+ async def _fetch_dir(self, prefix, pbar, result_queue: ResultQueue) -> set[str]:
  path = f"{self.name}/{prefix}"
  infos = await self.ls_dir(path)
  files = []
@@ -277,7 +277,7 @@ class Client(ABC):
  if info["type"] == "directory":
  subdirs.add(subprefix)
  else:
- files.append(self.convert_info(info, subprefix))
+ files.append(self.info_to_file(info, subprefix))
  if files:
  await result_queue.put(files)
  found_count = len(subdirs) + len(files)
@@ -303,7 +303,7 @@ class Client(ABC):
  return f"{self.PREFIX}{self.name}/{rel_path}"

  @abstractmethod
- def convert_info(self, v: dict[str, Any], parent: str) -> Entry: ...
+ def info_to_file(self, v: dict[str, Any], parent: str) -> File: ...

  def fetch_nodes(
  self,
@@ -363,7 +363,6 @@ class Client(ABC):
  parent["path"],
  parent["size"],
  parent["etag"],
- vtype=parent["vtype"],
  location=parent["location"],
  )
  f = self.open_object(parent_uid, use_cache=use_cache)
datachain/client/gcs.py CHANGED
@@ -10,7 +10,6 @@ from gcsfs import GCSFileSystem
  from tqdm import tqdm

  from datachain.lib.file import File
- from datachain.node import Entry

  from .fsspec import DELIMITER, Client, ResultQueue

@@ -108,19 +107,9 @@ class GCSClient(Client):
  finally:
  await page_queue.put(None)

- def _entry_from_dict(self, d: dict[str, Any]) -> Entry:
+ def _entry_from_dict(self, d: dict[str, Any]) -> File:
  info = self.fs._process_object(self.name, d)
- return self.convert_info(info, self.rel_path(info["name"]))
-
- def convert_info(self, v: dict[str, Any], path: str) -> Entry:
- return Entry.from_file(
- path=path,
- etag=v.get("etag", ""),
- version=v.get("generation", ""),
- is_latest=not v.get("timeDeleted"),
- last_modified=self.parse_timestamp(v["updated"]),
- size=v.get("size", ""),
- )
+ return self.info_to_file(info, self.rel_path(info["name"]))

  def info_to_file(self, v: dict[str, Any], path: str) -> File:
  return File(
datachain/client/hf.py CHANGED
@@ -5,7 +5,6 @@ from typing import Any, cast
  from huggingface_hub import HfFileSystem

  from datachain.lib.file import File
- from datachain.node import Entry

  from .fsspec import Client

@@ -22,15 +21,6 @@ class HfClient(Client):

  return cast(HfFileSystem, super().create_fs(**kwargs))

- def convert_info(self, v: dict[str, Any], path: str) -> Entry:
- return Entry.from_file(
- path=path,
- size=v["size"],
- version=v["last_commit"].oid,
- etag=v.get("blob_id", ""),
- last_modified=v["last_commit"].date,
- )
-
  def info_to_file(self, v: dict[str, Any], path: str) -> File:
  return File(
  path=path,
datachain/client/local.py CHANGED
@@ -7,8 +7,8 @@ from urllib.parse import urlparse

  from fsspec.implementations.local import LocalFileSystem

+ from datachain.cache import UniqueId
  from datachain.lib.file import File
- from datachain.node import Entry
  from datachain.storage import StorageURI

  from .fsspec import Client
@@ -114,9 +114,9 @@ class FileClient(Client):
  use_symlinks=use_symlinks,
  )

- async def get_current_etag(self, uid) -> str:
+ async def get_current_etag(self, uid: UniqueId) -> str:
  info = self.fs.info(self.get_full_path(uid.path))
- return self.convert_info(info, "").etag
+ return self.info_to_file(info, "").etag

  async def get_size(self, path: str) -> int:
  return self.fs.size(path)
@@ -136,15 +136,6 @@ class FileClient(Client):
  full_path += "/"
  return full_path

- def convert_info(self, v: dict[str, Any], path: str) -> Entry:
- return Entry.from_file(
- path=path,
- etag=v["mtime"].hex(),
- is_latest=True,
- last_modified=datetime.fromtimestamp(v["mtime"], timezone.utc),
- size=v.get("size", ""),
- )
-
  def info_to_file(self, v: dict[str, Any], path: str) -> File:
  return File(
  source=self.uri,
datachain/client/s3.py CHANGED
@@ -1,12 +1,11 @@
  import asyncio
- from typing import Any, cast
+ from typing import Any, Optional, cast

  from botocore.exceptions import NoCredentialsError
  from s3fs import S3FileSystem
  from tqdm import tqdm

  from datachain.lib.file import File
- from datachain.node import Entry

  from .fsspec import DELIMITER, Client, ResultQueue

@@ -111,24 +110,23 @@ class ClientS3(Client):
  ) -> None:
  await self._fetch_flat(start_prefix, result_queue)

- def _entry_from_boto(self, v, bucket, versions=False):
- return Entry.from_file(
+ def _entry_from_boto(self, v, bucket, versions=False) -> File:
+ return File(
+ source=self.uri,
  path=v["Key"],
  etag=v.get("ETag", "").strip('"'),
  version=ClientS3.clean_s3_version(v.get("VersionId", "")),
  is_latest=v.get("IsLatest", True),
  last_modified=v.get("LastModified", ""),
  size=v["Size"],
- owner_name=v.get("Owner", {}).get("DisplayName", ""),
- owner_id=v.get("Owner", {}).get("ID", ""),
  )

  async def _fetch_dir(
  self,
  prefix,
  pbar,
- result_queue,
- ):
+ result_queue: ResultQueue,
+ ) -> set[str]:
  if prefix:
  prefix = prefix.lstrip(DELIMITER) + DELIMITER
  files = []
@@ -143,7 +141,7 @@ class ClientS3(Client):
  if info["type"] == "directory":
  subdirs.add(subprefix)
  else:
- files.append(self.convert_info(info, subprefix))
+ files.append(self.info_to_file(info, subprefix))
  pbar.update()
  found = True
  if not found:
@@ -154,20 +152,8 @@ class ClientS3(Client):
  return subdirs

  @staticmethod
- def clean_s3_version(ver):
- return ver if ver != "null" else ""
-
- def convert_info(self, v: dict[str, Any], path: str) -> Entry:
- return Entry.from_file(
- path=path,
- etag=v.get("ETag", "").strip('"'),
- version=ClientS3.clean_s3_version(v.get("VersionId", "")),
- is_latest=v.get("IsLatest", True),
- last_modified=v.get("LastModified", ""),
- size=v["size"],
- owner_name=v.get("Owner", {}).get("DisplayName", ""),
- owner_id=v.get("Owner", {}).get("ID", ""),
- )
+ def clean_s3_version(ver: Optional[str]) -> str:
+ return ver if (ver is not None and ver != "null") else ""

  def info_to_file(self, v: dict[str, Any], path: str) -> File:
  return File(
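The reworked `clean_s3_version` now tolerates `None` as well as the literal "null" marker S3 reports for unversioned objects. A quick sketch of the expected behavior, assuming the S3 client dependencies (s3fs) are installed:

```python
from datachain.client.s3 import ClientS3

# None and the "null" placeholder both normalize to an empty version string;
# real version IDs pass through unchanged.
assert ClientS3.clean_s3_version(None) == ""
assert ClientS3.clean_s3_version("null") == ""
assert ClientS3.clean_s3_version("abc123") == "abc123"
```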
@@ -10,9 +10,8 @@ from typing import (

  import sqlalchemy as sa
  from sqlalchemy.sql import func as f
- from sqlalchemy.sql.expression import null, true
+ from sqlalchemy.sql.expression import false, null, true

- from datachain.node import DirType
  from datachain.sql.functions import path
  from datachain.sql.types import Int, SQLType, UInt64

@@ -81,8 +80,7 @@ class DirExpansion:
  def base_select(q):
  return sa.select(
  q.c.sys__id,
- q.c.vtype,
- (q.c.dir_type == DirType.DIR).label("is_dir"),
+ false().label("is_dir"),
  q.c.source,
  q.c.path,
  q.c.version,
@@ -94,7 +92,6 @@ class DirExpansion:
  return (
  sa.select(
  f.min(q.c.sys__id).label("sys__id"),
- q.c.vtype,
  q.c.is_dir,
  q.c.source,
  q.c.path,
@@ -102,8 +99,8 @@ class DirExpansion:
  f.max(q.c.location).label("location"),
  )
  .select_from(q)
- .group_by(q.c.source, q.c.path, q.c.vtype, q.c.is_dir, q.c.version)
- .order_by(q.c.source, q.c.path, q.c.vtype, q.c.is_dir, q.c.version)
+ .group_by(q.c.source, q.c.path, q.c.is_dir, q.c.version)
+ .order_by(q.c.source, q.c.path, q.c.is_dir, q.c.version)
  )

  @classmethod
@@ -113,7 +110,6 @@ class DirExpansion:
  q = q.union_all(
  sa.select(
  sa.literal(-1).label("sys__id"),
- sa.literal("").label("vtype"),
  true().label("is_dir"),
  q.c.source,
  parent.label("path"),
@@ -43,6 +43,8 @@ if TYPE_CHECKING:
  from sqlalchemy.sql.elements import ColumnElement
  from sqlalchemy.types import TypeEngine

+ from datachain.lib.file import File
+

  logger = logging.getLogger("datachain")

@@ -58,6 +60,10 @@ quote_schema = sqlite_dialect.identifier_preparer.quote_schema
  quote = sqlite_dialect.identifier_preparer.quote


+ def _get_in_memory_uri():
+ return "file::memory:?cache=shared"
+
+
  def get_retry_sleep_sec(retry_count: int) -> int:
  return RETRY_START_SEC * (RETRY_FACTOR**retry_count)

@@ -119,7 +125,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):
  if db_file == ":memory:":
  # Enable multithreaded usage of the same in-memory db
  db = sqlite3.connect(
- "file::memory:?cache=shared", uri=True, detect_types=DETECT_TYPES
+ _get_in_memory_uri(), uri=True, detect_types=DETECT_TYPES
  )
  else:
  db = sqlite3.connect(
@@ -704,6 +710,9 @@ class SQLiteWarehouse(AbstractWarehouse):

  self.db.execute(insert_query)

+ def prepare_entries(self, entries: "Iterable[File]") -> Iterable[dict[str, Any]]:
+ return (e.model_dump() for e in entries)
+
  def insert_rows(self, table: Table, rows: Iterable[dict[str, Any]]) -> None:
  rows = list(rows)
  if not rows:
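The new `_get_in_memory_uri()` helper centralizes the shared-cache URI used for the in-memory database. The idea, sketched outside datachain with plain `sqlite3` (connection names below are illustrative), is that a shared-cache URI lets several connections, and thus several threads, see the same in-memory database, unlike separate plain `:memory:` connections:

```python
import sqlite3

URI = "file::memory:?cache=shared"

# Two independent connections to the same shared in-memory database.
writer = sqlite3.connect(URI, uri=True)
reader = sqlite3.connect(URI, uri=True)

writer.execute("CREATE TABLE t (x INTEGER)")
writer.execute("INSERT INTO t VALUES (1)")
writer.commit()

# The second connection observes data written by the first one.
print(reader.execute("SELECT x FROM t").fetchall())  # [(1,)]
```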
@@ -20,7 +20,7 @@ from datachain.client import Client
  from datachain.data_storage.schema import convert_rows_custom_column_types
  from datachain.data_storage.serializer import Serializable
  from datachain.dataset import DatasetRecord
- from datachain.node import DirType, DirTypeGroup, Entry, Node, NodeWithPath, get_path
+ from datachain.node import DirType, DirTypeGroup, Node, NodeWithPath, get_path
  from datachain.sql.functions import path as pathfunc
  from datachain.sql.types import Int, SQLType
  from datachain.storage import StorageURI
@@ -28,13 +28,13 @@ from datachain.utils import sql_escape_like

  if TYPE_CHECKING:
  from sqlalchemy.sql._typing import _ColumnsClauseArgument
- from sqlalchemy.sql.elements import ColumnElement
  from sqlalchemy.sql.selectable import Select
  from sqlalchemy.types import TypeEngine

  from datachain.data_storage import AbstractIDGenerator, schema
  from datachain.data_storage.db_engine import DatabaseEngine
  from datachain.data_storage.schema import DataTable
+ from datachain.lib.file import File

  try:
  import numpy as np
@@ -341,9 +341,7 @@ class AbstractWarehouse(ABC, Serializable):

  column_objects = [dr.c[c] for c in column_names]
  # include all object types - file, tar archive, tar file (subobject)
- select_query = dr.select(*column_objects).where(
- dr.c.dir_type.in_(DirTypeGroup.FILE) & (dr.c.is_latest == true())
- )
+ select_query = dr.select(*column_objects).where(dr.c.is_latest == true())
  if path is None:
  return select_query
  if recursive:
@@ -404,26 +402,18 @@ class AbstractWarehouse(ABC, Serializable):
  expressions: tuple[_ColumnsClauseArgument[Any], ...] = (
  sa.func.count(table.c.sys__id),
  )
- if "file__size" in table.columns:
- expressions = (*expressions, sa.func.sum(table.c.file__size))
- elif "size" in table.columns:
- expressions = (*expressions, sa.func.sum(table.c.size))
+ size_columns = [
+ c for c in table.columns if c.name == "size" or c.name.endswith("__size")
+ ]
+ if size_columns:
+ expressions = (*expressions, sa.func.sum(sum(size_columns)))
  query = select(*expressions)
  ((nrows, *rest),) = self.db.execute(query)
- return nrows, rest[0] if rest else None
-
- def prepare_entries(
- self, uri: str, entries: Iterable[Entry]
- ) -> list[dict[str, Any]]:
- """
- Prepares bucket listing entry (row) for inserting into database
- """
-
- def _prepare_entry(entry: Entry):
- assert entry.dir_type is not None
- return attrs.asdict(entry) | {"source": uri}
+ return nrows, rest[0] if rest else 0

- return [_prepare_entry(e) for e in entries]
+ @abstractmethod
+ def prepare_entries(self, entries: "Iterable[File]") -> Iterable[dict[str, Any]]:
+ """Convert File entries so they can be passed on to `insert_rows()`"""

  @abstractmethod
  def insert_rows(self, table: Table, rows: Iterable[dict[str, Any]]) -> None:
@@ -440,7 +430,7 @@ class AbstractWarehouse(ABC, Serializable):
  """Inserts dataset rows directly into dataset table"""

  @abstractmethod
- def instr(self, source, target) -> "ColumnElement":
+ def instr(self, source, target) -> sa.ColumnElement:
  """
  Return SQLAlchemy Boolean determining if a target substring is present in
  source string column
@@ -500,7 +490,7 @@ class AbstractWarehouse(ABC, Serializable):
  c = query.selected_columns
  q = query.where(c.dir_type.in_(file_group))
  if not include_subobjects:
- q = q.where(c.vtype == "")
+ q = q.where((c.location == "") | (c.location.is_(None)))
  return q

  def get_nodes(self, query) -> Iterator[Node]:
@@ -624,8 +614,7 @@ class AbstractWarehouse(ABC, Serializable):

  return sa.select(
  de.c.sys__id,
- with_default(dr.c.vtype),
- case((de.c.is_dir == true(), DirType.DIR), else_=dr.c.dir_type).label(
+ case((de.c.is_dir == true(), DirType.DIR), else_=DirType.FILE).label(
  "dir_type"
  ),
  de.c.path,
@@ -634,8 +623,6 @@
  with_default(dr.c.is_latest),
  dr.c.last_modified,
  with_default(dr.c.size),
- with_default(dr.c.owner_name),
- with_default(dr.c.owner_id),
  with_default(dr.c.sys__rand),
  dr.c.location,
  de.c.source,
@@ -650,7 +637,6 @@
  query = dr.select().where(
  self.path_expr(dr) == path,
  dr.c.is_latest == true(),
- dr.c.dir_type != DirType.DIR,
  )
  row = next(self.db.execute(query), None)
  if row is not None:
@@ -660,7 +646,6 @@
  dr.select()
  .where(
  dr.c.is_latest == true(),
- dr.c.dir_type != DirType.DIR,
  dr.c.path.startswith(path),
  )
  .exists()
@@ -761,13 +746,11 @@

  sub_glob = posixpath.join(path, "*")
  dr = dataset_rows
- selections = [
+ selections: list[sa.ColumnElement] = [
  func.sum(dr.c.size),
  ]
  if count_files:
- selections.append(
- func.sum(dr.c.dir_type.in_(DirTypeGroup.FILE)),
- )
+ selections.append(func.count())
  results = next(
  self.db.execute(
  dr.select(*selections).where(
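The new size aggregation adds every `size` / `*__size` column per row and sums that per-row total across the table. A standalone sketch with SQLAlchemy Core, using a made-up `rows` table purely for illustration:

```python
import sqlalchemy as sa

md = sa.MetaData()
rows = sa.Table(
    "rows",
    md,
    sa.Column("sys__id", sa.Integer, primary_key=True),
    sa.Column("file__size", sa.Integer),
    sa.Column("meta__size", sa.Integer),
)

# Same selection logic as in the diff: pick every size-like column,
# add them per row (Python's sum builds a SQL "+" expression), then SUM.
size_columns = [c for c in rows.columns if c.name == "size" or c.name.endswith("__size")]
query = sa.select(sa.func.count(rows.c.sys__id), sa.func.sum(sum(size_columns)))
print(query)  # SELECT count(...), sum(... + file__size + meta__size) FROM rows
```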
datachain/lib/dc.py CHANGED
@@ -234,7 +234,6 @@ class DataChain(DatasetQuery):
  DEFAULT_FILE_RECORD: ClassVar[dict] = {
  "source": "",
  "path": "",
- "vtype": "",
  "size": 0,
  }