datachain 0.2.18__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

datachain/cache.py CHANGED
@@ -24,8 +24,7 @@ sha256 = partial(hashlib.sha256, usedforsecurity=False)
 @attrs.frozen
 class UniqueId:
     storage: "StorageURI"
-    parent: str
-    name: str
+    path: str
     size: int
     etag: str
     version: str = ""
@@ -34,10 +33,6 @@ class UniqueId:
     location: Optional[str] = None
     last_modified: datetime = TIME_ZERO
 
-    @property
-    def path(self) -> str:
-        return f"{self.parent}/{self.name}" if self.parent else self.name
-
     def get_parsed_location(self) -> Optional[dict]:
         if not self.location:
             return None
@@ -53,10 +48,10 @@ class UniqueId:
         return loc_stack[0]
 
     def get_hash(self) -> str:
-        etag = f"{self.vtype}{self.location}" if self.vtype else self.etag
-        return sha256(
-            f"{self.storage}/{self.parent}/{self.name}/{self.version}/{etag}".encode()
-        ).hexdigest()
+        fingerprint = f"{self.storage}/{self.path}/{self.version}/{self.etag}"
+        if self.location:
+            fingerprint += f"/{self.location}"
+        return sha256(fingerprint.encode()).hexdigest()
 
 
 def try_scandir(path):
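
The net effect in cache.py: `UniqueId` stores a single flat `path` instead of a `parent`/`name` pair, and `get_hash` appends `location` to the fingerprint only when it is set. A minimal sketch of the new call shape, with illustrative values:

```py
from datachain.cache import UniqueId

# Illustrative values; 0.2.x call sites passed parent="animals/dogs" and
# name="dog.jpg" as two separate fields, 0.3.x passes one flat path.
uid = UniqueId(
    "s3://ldb-public",       # storage
    "animals/dogs/dog.jpg",  # path
    size=8192,
    etag="abc123",
)
# sha256 over "storage/path/version/etag", plus "/location" when set
print(uid.get_hash())
```
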
datachain/catalog/catalog.py CHANGED
@@ -529,21 +529,16 @@ def find_column_to_str( # noqa: PLR0911
     if column == "du":
         return str(
             src.listing.du(
-                {
-                    f: row[field_lookup[f]]
-                    for f in ["dir_type", "size", "parent", "name"]
-                }
+                {f: row[field_lookup[f]] for f in ["dir_type", "size", "path"]}
             )[0]
         )
     if column == "name":
-        return row[field_lookup["name"]] or ""
+        return posixpath.basename(row[field_lookup["path"]]) or ""
     if column == "owner":
         return row[field_lookup["owner_name"]] or ""
     if column == "path":
         is_dir = row[field_lookup["dir_type"]] == DirType.DIR
-        parent = row[field_lookup["parent"]]
-        name = row[field_lookup["name"]]
-        path = f"{parent}/{name}" if parent else name
+        path = row[field_lookup["path"]]
         if is_dir and path:
             full_path = path + "/"
         else:
@@ -681,7 +676,7 @@ class Catalog:
 
     def parse_url(self, uri: str, **config: Any) -> tuple[Client, str]:
         config = config or self.client_config
-        return Client.parse_url(uri, self.metastore, self.cache, **config)
+        return Client.parse_url(uri, self.cache, **config)
 
     def get_client(self, uri: StorageURI, **config: Any) -> Client:
         """
@@ -724,8 +719,7 @@ class Catalog:
         columns = [
             Column("vtype", String),
             Column("dir_type", Int),
-            Column("parent", String),
-            Column("name", String),
+            Column("path", String),
             Column("etag", String),
             Column("version", String),
             Column("is_latest", Boolean),
@@ -1623,8 +1617,7 @@ class Catalog:
         Example output:
             {
                 "source": "s3://ldb-public",
-                "parent": "animals/dogs",
-                "name": "dog.jpg",
+                "path": "animals/dogs/dog.jpg",
                 ...
             }
         """
@@ -1675,8 +1668,7 @@ class Catalog:
     def _get_row_uid(self, row: RowDict) -> UniqueId:
         return UniqueId(
             row["source"],
-            row["parent"],
-            row["name"],
+            row["path"],
             row["size"],
             row["etag"],
             row["version"],
@@ -2308,16 +2300,14 @@ class Catalog:
         if column == "du":
             field_set.add("dir_type")
             field_set.add("size")
-            field_set.add("parent")
-            field_set.add("name")
+            field_set.add("path")
         elif column == "name":
-            field_set.add("name")
+            field_set.add("path")
         elif column == "owner":
             field_set.add("owner_name")
        elif column == "path":
             field_set.add("dir_type")
-            field_set.add("parent")
-            field_set.add("name")
+            field_set.add("path")
         elif column == "size":
             field_set.add("size")
         elif column == "type":
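
Throughout the catalog, the `parent`/`name` pair collapses into one `path` column, and `posixpath` recovers either piece where needed. A self-contained sketch of the mapping (the row dicts are illustrative):

```py
import posixpath

# Illustrative row shapes before (0.2.x) and after (0.3.x) the schema change.
old_row = {"source": "s3://ldb-public", "parent": "animals/dogs", "name": "dog.jpg"}
new_row = {"source": "s3://ldb-public", "path": "animals/dogs/dog.jpg"}

assert posixpath.basename(new_row["path"]) == old_row["name"]
assert posixpath.dirname(new_row["path"]) == old_row["parent"]
```
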
datachain/client/azure.py CHANGED
@@ -1,4 +1,3 @@
-import posixpath
 from typing import Any
 
 from adlfs import AzureBlobFileSystem
@@ -14,16 +13,10 @@ class AzureClient(Client):
     PREFIX = "az://"
     protocol = "az"
 
-    def convert_info(self, v: dict[str, Any], parent: str) -> Entry:
+    def convert_info(self, v: dict[str, Any], path: str) -> Entry:
         version_id = v.get("version_id")
-        name = v.get("name", "").split(DELIMITER)[-1]
-        if version_id:
-            version_suffix = f"?versionid={version_id}"
-            if name.endswith(version_suffix):
-                name = name[: -len(version_suffix)]
         return Entry.from_file(
-            parent=parent,
-            name=name,
+            path=path,
             etag=v.get("etag", "").strip('"'),
             version=version_id or "",
             is_latest=version_id is None or bool(v.get("is_current_version")),
@@ -50,9 +43,9 @@ class AzureClient(Client):
                 if not self._is_valid_key(b["name"]):
                     continue
                 info = (await self.fs._details([b]))[0]
-                full_path = info["name"]
-                parent = posixpath.dirname(self.rel_path(full_path))
-                entries.append(self.convert_info(info, parent))
+                entries.append(
+                    self.convert_info(info, self.rel_path(info["name"]))
+                )
             if entries:
                 await result_queue.put(entries)
                 pbar.update(len(entries))
datachain/client/fsspec.py CHANGED
@@ -37,7 +37,6 @@ from datachain.storage import StorageURI
 if TYPE_CHECKING:
     from fsspec.spec import AbstractFileSystem
 
-    from datachain.data_storage import AbstractMetastore
 
 logger = logging.getLogger("datachain")
 
@@ -116,13 +115,12 @@ class Client(ABC):
     @staticmethod
     def parse_url(
         source: str,
-        metastore: "AbstractMetastore",
         cache: DataChainCache,
         **kwargs,
     ) -> tuple["Client", str]:
         cls = Client.get_implementation(source)
         storage_url, rel_path = cls.split_url(source)
-        client = cls.from_name(storage_url, metastore, cache, kwargs)
+        client = cls.from_name(storage_url, cache, kwargs)
         return client, rel_path
 
     @classmethod
@@ -136,7 +134,6 @@ class Client(ABC):
     def from_name(
         cls,
         name: str,
-        metastore: "AbstractMetastore",
         cache: DataChainCache,
         kwargs: dict[str, Any],
     ) -> "Client":
@@ -277,7 +274,7 @@ class Client(ABC):
             if info["type"] == "directory":
                 subdirs.add(subprefix)
             else:
-                files.append(self.convert_info(info, prefix))
+                files.append(self.convert_info(info, subprefix))
         if files:
             await result_queue.put(files)
         found_count = len(subdirs) + len(files)
@@ -360,12 +357,11 @@ class Client(ABC):
 
         parent_uid = UniqueId(
             parent["source"],
-            parent["parent"],
-            parent["name"],
-            parent["etag"],
+            parent["path"],
             parent["size"],
-            parent["vtype"],
-            parent["location"],
+            parent["etag"],
+            vtype=parent["vtype"],
+            location=parent["location"],
         )
         f = self.open_object(parent_uid, use_cache=use_cache)
         return FileSlice(f, offset, size, posixpath.basename(uid.path))
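
With the metastore dependency dropped from the client layer, constructing a client now needs only a cache. A sketch of the 0.3.x call shape (the wrapper function is ours, not part of the API):

```py
from datachain.cache import DataChainCache
from datachain.client import Client

def make_client(uri: str, cache: DataChainCache) -> tuple[Client, str]:
    # 0.3.x drops the metastore argument; parse_url returns the client plus
    # the path relative to the storage root, e.g. ("s3://...", "animals/dogs").
    return Client.parse_url(uri, cache)
```
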
datachain/client/gcs.py CHANGED
@@ -1,7 +1,6 @@
 import asyncio
 import json
 import os
-import posixpath
 from collections.abc import Iterable
 from datetime import datetime
 from typing import Any, Optional, cast
@@ -110,20 +109,11 @@ class GCSClient(Client):
 
     def _entry_from_dict(self, d: dict[str, Any]) -> Entry:
         info = self.fs._process_object(self.name, d)
-        full_path = info["name"]
-        subprefix = self.rel_path(full_path)
-        parent = posixpath.dirname(subprefix)
-        return self.convert_info(info, parent)
-
-    def convert_info(self, v: dict[str, Any], parent: str) -> Entry:
-        name = v.get("name", "").split(DELIMITER)[-1]
-        if "generation" in v:
-            gen = f"#{v['generation']}"
-            if name.endswith(gen):
-                name = name[: -len(gen)]
+        return self.convert_info(info, self.rel_path(info["name"]))
+
+    def convert_info(self, v: dict[str, Any], path: str) -> Entry:
         return Entry.from_file(
-            parent=parent,
-            name=name,
+            path=path,
             etag=v.get("etag", ""),
             version=v.get("generation", ""),
             is_latest=not v.get("timeDeleted"),
datachain/client/local.py CHANGED
@@ -2,7 +2,7 @@ import os
 import posixpath
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import TYPE_CHECKING, Any
+from typing import Any
 from urllib.parse import urlparse
 
 from fsspec.implementations.local import LocalFileSystem
@@ -12,9 +12,6 @@ from datachain.storage import StorageURI
 
 from .fsspec import Client
 
-if TYPE_CHECKING:
-    from datachain.data_storage import AbstractMetastore
-
 
 class FileClient(Client):
     FS_CLASS = LocalFileSystem
@@ -97,9 +94,7 @@ class FileClient(Client):
         return cls.root_dir(), uri.removeprefix(cls.root_path().as_uri())
 
     @classmethod
-    def from_name(
-        cls, name: str, metastore: "AbstractMetastore", cache, kwargs
-    ) -> "FileClient":
+    def from_name(cls, name: str, cache, kwargs) -> "FileClient":
         use_symlinks = kwargs.pop("use_symlinks", False)
         return cls(name, kwargs, cache, use_symlinks=use_symlinks)
 
@@ -140,11 +135,9 @@ class FileClient(Client):
             full_path += "/"
         return full_path
 
-    def convert_info(self, v: dict[str, Any], parent: str) -> Entry:
-        name = posixpath.basename(v["name"])
+    def convert_info(self, v: dict[str, Any], path: str) -> Entry:
         return Entry.from_file(
-            parent=parent,
-            name=name,
+            path=path,
             etag=v["mtime"].hex(),
             is_latest=True,
             last_modified=datetime.fromtimestamp(v["mtime"], timezone.utc),
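
Every client (Azure, GCS, local, S3) now feeds `Entry.from_file` a single `path` keyword. A sketch with illustrative values; the `size=` keyword is an assumption based on the surrounding fields, not shown in these hunks:

```py
from datachain.node import Entry

# Illustrative values; only path= differs from the 0.2.x call, which took
# parent= and name=. size= is assumed here for completeness.
entry = Entry.from_file(
    path="animals/dogs/dog.jpg",
    size=8192,
    etag="abc123",
    is_latest=True,
)
```
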
datachain/client/s3.py CHANGED
@@ -1,5 +1,4 @@
 import asyncio
-import posixpath
 from typing import Any, cast
 
 from botocore.exceptions import NoCredentialsError
@@ -112,10 +111,8 @@ class ClientS3(Client):
         await self._fetch_flat(start_prefix, result_queue)
 
     def _entry_from_boto(self, v, bucket, versions=False):
-        parent, name = posixpath.split(v["Key"])
         return Entry.from_file(
-            parent=parent,
-            name=name,
+            path=v["Key"],
             etag=v.get("ETag", "").strip('"'),
             version=ClientS3.clean_s3_version(v.get("VersionId", "")),
             is_latest=v.get("IsLatest", True),
@@ -145,7 +142,7 @@ class ClientS3(Client):
             if info["type"] == "directory":
                 subdirs.add(subprefix)
             else:
-                files.append(self.convert_info(info, prefix.rstrip("/")))
+                files.append(self.convert_info(info, subprefix))
             pbar.update()
             found = True
         if not found:
@@ -159,10 +156,9 @@ class ClientS3(Client):
     def clean_s3_version(ver):
         return ver if ver != "null" else ""
 
-    def convert_info(self, v: dict[str, Any], parent: str) -> Entry:
+    def convert_info(self, v: dict[str, Any], path: str) -> Entry:
         return Entry.from_file(
-            parent=parent,
-            name=v.get("Key", "").split(DELIMITER)[-1],
+            path=path,
             etag=v.get("ETag", "").strip('"'),
             version=ClientS3.clean_s3_version(v.get("VersionId", "")),
             is_latest=v.get("IsLatest", True),
datachain/data_storage/schema.py CHANGED
@@ -80,8 +80,7 @@ class DirExpansion:
             q.c.vtype,
             (q.c.dir_type == DirType.DIR).label("is_dir"),
             q.c.source,
-            q.c.parent,
-            q.c.name,
+            q.c.path,
             q.c.version,
             q.c.location,
         )
@@ -94,36 +93,29 @@ class DirExpansion:
                 q.c.vtype,
                 q.c.is_dir,
                 q.c.source,
-                q.c.parent,
-                q.c.name,
+                q.c.path,
                 q.c.version,
                 f.max(q.c.location).label("location"),
             )
             .select_from(q)
-            .group_by(
-                q.c.source, q.c.parent, q.c.name, q.c.vtype, q.c.is_dir, q.c.version
-            )
-            .order_by(
-                q.c.source, q.c.parent, q.c.name, q.c.vtype, q.c.is_dir, q.c.version
-            )
+            .group_by(q.c.source, q.c.path, q.c.vtype, q.c.is_dir, q.c.version)
+            .order_by(q.c.source, q.c.path, q.c.vtype, q.c.is_dir, q.c.version)
         )
 
     @classmethod
     def query(cls, q):
         q = cls.base_select(q).cte(recursive=True)
-        parent_parent = path.parent(q.c.parent)
-        parent_name = path.name(q.c.parent)
+        parent = path.parent(q.c.path)
        q = q.union_all(
             sa.select(
                 sa.literal(-1).label("sys__id"),
                 sa.literal("").label("vtype"),
                 true().label("is_dir"),
                 q.c.source,
-                parent_parent.label("parent"),
-                parent_name.label("name"),
+                parent.label("path"),
                 sa.literal("").label("version"),
                 null().label("location"),
-            ).where((parent_name != "") | (parent_parent != ""))
+            ).where(parent != "")
         )
         return cls.apply_group_by(q)
 
datachain/data_storage/warehouse.py CHANGED
@@ -17,8 +17,9 @@ from sqlalchemy.sql.expression import true
 
 from datachain.client import Client
 from datachain.data_storage.serializer import Serializable
-from datachain.dataset import DatasetRecord, RowDict
+from datachain.dataset import DatasetRecord
 from datachain.node import DirType, DirTypeGroup, Entry, Node, NodeWithPath, get_path
+from datachain.sql.functions import path as pathfunc
 from datachain.sql.types import Int, SQLType
 from datachain.storage import StorageURI
 from datachain.utils import sql_escape_like
@@ -200,23 +201,17 @@ class AbstractWarehouse(ABC, Serializable):
     def dataset_select_paginated(
         self,
         query,
-        limit: Optional[int] = None,
-        order_by: tuple["ColumnElement[Any]", ...] = (),
         page_size: int = SELECT_BATCH_SIZE,
-    ) -> Generator[RowDict, None, None]:
+    ) -> Generator[Sequence, None, None]:
         """
         This is equivalent to `db.execute`, but for selecting rows in batches
         """
-        cols = query.selected_columns
-        cols_names = [c.name for c in cols]
+        limit = query._limit
+        paginated_query = query.limit(page_size)
 
-        if not order_by:
-            ordering = [cols.sys__id]
-        else:
-            ordering = order_by  # type: ignore[assignment]
-
-        # reset query order by and apply new order by id
-        paginated_query = query.order_by(None).order_by(*ordering).limit(page_size)
+        if not paginated_query._order_by_clauses:
+            # default order by is order by `sys__id`
+            paginated_query = paginated_query.order_by(query.selected_columns.sys__id)
 
         results = None
         offset = 0
@@ -235,7 +230,7 @@ class AbstractWarehouse(ABC, Serializable):
             processed = False
             for row in results:
                 processed = True
-                yield RowDict(zip(cols_names, row))
+                yield row
                 num_yielded += 1
 
             if not processed:
@@ -373,9 +368,7 @@ class AbstractWarehouse(ABC, Serializable):
 
         else:
             parent = self.get_node_by_path(dr, path.lstrip("/").rstrip("/*"))
-            select_query = select_query.where(
-                (dr.c.parent == parent.path) | (self.path_expr(dr) == path)
-            )
+            select_query = select_query.where(pathfunc.parent(dr.c.path) == parent.path)
         return select_query
 
     def rename_dataset_table(
@@ -532,8 +525,8 @@ class AbstractWarehouse(ABC, Serializable):
             dr,
             parent_path,
             type="dir",
-            conds=[sa.Column("parent") == parent_path],
-            order_by=["source", "parent", "name"],
+            conds=[pathfunc.parent(sa.Column("path")) == parent_path],
+            order_by=["source", "path"],
         )
         return self.get_nodes(query)
 
@@ -556,7 +549,7 @@ class AbstractWarehouse(ABC, Serializable):
                 & ~self.instr(relpath, "/")
                 & (self.path_expr(de) != dirpath)
             )
-            .order_by(de.c.source, de.c.parent, de.c.name, de.c.version)
+            .order_by(de.c.source, de.c.path, de.c.version)
         )
 
     def _get_node_by_path_list(
@@ -572,8 +565,8 @@ class AbstractWarehouse(ABC, Serializable):
         ).subquery()
         query = self.expand_query(de, dr)
 
-        q = query.where((de.c.parent == parent) & (de.c.name == name)).order_by(
-            de.c.source, de.c.parent, de.c.name, de.c.version
+        q = query.where(de.c.path == get_path(parent, name)).order_by(
+            de.c.source, de.c.path, de.c.version
         )
         row = next(self.dataset_rows_select(q), None)
         if not row:
@@ -636,8 +629,7 @@ class AbstractWarehouse(ABC, Serializable):
             case((de.c.is_dir == true(), DirType.DIR), else_=dr.c.dir_type).label(
                 "dir_type"
             ),
-            de.c.parent,
-            de.c.name,
+            de.c.path,
             with_default(dr.c.etag),
             de.c.version,
             with_default(dr.c.is_latest),
@@ -670,7 +662,7 @@ class AbstractWarehouse(ABC, Serializable):
             .where(
                 dr.c.is_latest == true(),
                 dr.c.dir_type != DirType.DIR,
-                (dr.c.parent + "/").startswith(path),
+                dr.c.path.startswith(path),
             )
             .exists()
         )
@@ -678,8 +670,7 @@ class AbstractWarehouse(ABC, Serializable):
         if not row:
             raise FileNotFoundError(f"Unable to resolve path {path}")
         path = path.removesuffix("/")
-        parent, name = path.rsplit("/", 1) if "/" in path else ("", path)
-        return Node.from_dir(parent, name)
+        return Node.from_dir(path)
 
     def expand_path(self, dataset_rows: "DataTable", path: str) -> list[Node]:
         """Simulates Unix-like shell expansion"""
@@ -703,18 +694,21 @@ class AbstractWarehouse(ABC, Serializable):
         de = dr.dataset_dir_expansion(
             dr.select().where(dr.c.is_latest == true()).subquery()
         ).subquery()
-        where_cond = de.c.parent == parent_path
+        where_cond = pathfunc.parent(de.c.path) == parent_path
         if parent_path == "":
             # Exclude the root dir
-            where_cond = where_cond & (de.c.name != "")
+            where_cond = where_cond & (de.c.path != "")
         inner_query = self.expand_query(de, dr).where(where_cond).subquery()
+
+        def field_to_expr(f):
+            if f == "name":
+                return pathfunc.name(inner_query.c.path)
+            return getattr(inner_query.c, f)
+
         return self.db.execute(
-            sa.select(*(getattr(inner_query.c, f) for f in fields))
-            .select_from(inner_query)
-            .order_by(
+            select(*(field_to_expr(f) for f in fields)).order_by(
                 inner_query.c.source,
-                inner_query.c.parent,
-                inner_query.c.name,
+                inner_query.c.path,
                 inner_query.c.version,
             )
         )
@@ -727,21 +721,20 @@ class AbstractWarehouse(ABC, Serializable):
         """
         dr = dataset_rows
         dirpath = f"{parent_path}/"
-        relpath = func.substr(self.path_expr(dr), len(dirpath) + 1)
 
         def field_to_expr(f):
             if f == "name":
-                return relpath
+                return pathfunc.name(dr.c.path)
             return getattr(dr.c, f)
 
         q = (
             select(*(field_to_expr(f) for f in fields))
             .where(
                 self.path_expr(dr).like(f"{sql_escape_like(dirpath)}%"),
-                ~self.instr(relpath, "/"),
+                ~self.instr(pathfunc.name(dr.c.path), "/"),
                 dr.c.is_latest == true(),
             )
-            .order_by(dr.c.source, dr.c.parent, dr.c.name, dr.c.version, dr.c.etag)
+            .order_by(dr.c.source, dr.c.path, dr.c.version, dr.c.etag)
         )
         return self.db.execute(q)
 
@@ -758,7 +751,7 @@ class AbstractWarehouse(ABC, Serializable):
         if isinstance(node, dict):
             is_dir = node.get("is_dir", node["dir_type"] in DirTypeGroup.SUBOBJ_DIR)
             node_size = node["size"]
-            path = get_path(node["parent"], node["name"])
+            path = node["path"]
         else:
             is_dir = node.is_container
             node_size = node.size
@@ -790,7 +783,7 @@ class AbstractWarehouse(ABC, Serializable):
         return results[0] or 0, 0
 
     def path_expr(self, t):
-        return case((t.c.parent == "", t.c.name), else_=t.c.parent + "/" + t.c.name)
+        return t.c.path
 
     def _find_query(
         self,
@@ -947,11 +940,7 @@ class AbstractWarehouse(ABC, Serializable):
         tq = target_query.alias("target_query")
 
         source_target_join = sa.join(
-            sq,
-            tq,
-            (sq.c.source == tq.c.source)
-            & (sq.c.parent == tq.c.parent)
-            & (sq.c.name == tq.c.name),
+            sq, tq, (sq.c.source == tq.c.source) & (sq.c.path == tq.c.path)
        )
 
         return (
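
Queries that used to compare the `parent` and `name` columns now derive both from `path` via the SQL helpers in `datachain.sql.functions.path`. A sketch against a hypothetical `rows` table:

```py
import sqlalchemy as sa

from datachain.sql.functions import path as pathfunc

# Hypothetical table; only the path column matters for this example.
rows = sa.table("rows", sa.column("path"))

# List direct children of animals/dogs, ordered the way warehouse.py orders.
q = (
    sa.select(rows.c.path)
    .where(pathfunc.parent(rows.c.path) == "animals/dogs")
    .order_by(rows.c.path)
)
```
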
datachain/lib/dc.py CHANGED
@@ -49,6 +49,7 @@ from datachain.query.dataset import (
     detach,
 )
 from datachain.query.schema import Column, DatasetRow
+from datachain.sql.functions import path as pathfunc
 from datachain.utils import inside_notebook
 
 if TYPE_CHECKING:
@@ -202,7 +203,7 @@ class DataChain(DatasetQuery):
 
     DEFAULT_FILE_RECORD: ClassVar[dict] = {
         "source": "",
-        "name": "",
+        "path": "",
         "vtype": "",
         "size": 0,
     }
@@ -1586,10 +1587,11 @@ class DataChain(DatasetQuery):
         use_cache: bool = True,
     ) -> None:
         """Method that exports all files from chain to some folder."""
-        if placement == "filename":
-            print("Checking if file names are unique")
-            if self.distinct(f"{signal}.name").count() != self.count():
-                raise ValueError("Files with the same name found")
+        if placement == "filename" and (
+            super().distinct(pathfunc.name(C(f"{signal}__path"))).count()
+            != self.count()
+        ):
+            raise ValueError("Files with the same name found")
 
         for file in self.collect(signal):
             file.export(output, placement, use_cache)  # type: ignore[union-attr]
@@ -1621,7 +1623,7 @@ class DataChain(DatasetQuery):
 
         Using glob to match patterns
         ```py
-        dc.filter(C("file.name").glob("*.jpg))
+        dc.filter(C("file.name").glob("*.jpg"))
         ```
 
         Using `datachain.sql.functions`
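
The `export_files` guard now checks filename uniqueness through the SQL `path` helpers rather than a Python-side `distinct` on `name`. Usage is unchanged; a sketch with an illustrative bucket URI:

```py
from datachain.lib.dc import DataChain

# Illustrative source. With placement="filename", 0.3.x raises
# ValueError("Files with the same name found") when basenames collide.
dc = DataChain.from_storage("s3://ldb-public/animals/")
dc.export_files("./output", placement="filename")
```
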