datachain 0.3.11__py3-none-any.whl → 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.

datachain/cache.py CHANGED
@@ -29,7 +29,6 @@ class UniqueId:
     etag: str
     version: str = ""
     is_latest: bool = True
-    vtype: str = ""
     location: Optional[str] = None
     last_modified: datetime = TIME_ZERO

datachain/catalog/catalog.py CHANGED
@@ -12,7 +12,6 @@ import sys
 import time
 import traceback
 from collections.abc import Iterable, Iterator, Mapping, Sequence
-from contextlib import contextmanager, nullcontext
 from copy import copy
 from dataclasses import dataclass
 from functools import cached_property, reduce
@@ -23,7 +22,6 @@ from typing import (
     TYPE_CHECKING,
     Any,
     Callable,
-    NamedTuple,
     NoReturn,
     Optional,
     Union,
@@ -58,14 +56,13 @@ from datachain.error import (
     PendingIndexingError,
     QueryScriptCancelError,
     QueryScriptCompileError,
-    QueryScriptDatasetNotFound,
     QueryScriptRunError,
 )
 from datachain.listing import Listing
 from datachain.node import DirType, Node, NodeWithPath
 from datachain.nodes_thread_pool import NodesThreadPool
 from datachain.remote.studio import StudioClient
-from datachain.sql.types import JSON, Boolean, DateTime, Int, Int64, SQLType, String
+from datachain.sql.types import JSON, Boolean, DateTime, Int64, SQLType, String
 from datachain.storage import Storage, StorageStatus, StorageURI
 from datachain.utils import (
     DataChainDir,
@@ -115,44 +112,19 @@ def noop(_: str):
     pass


-@contextmanager
-def print_and_capture(
-    stream: "IO[bytes]|IO[str]", callback: Callable[[str], None] = noop
-) -> "Iterator[list[str]]":
-    lines: list[str] = []
-    append = lines.append
+def _process_stream(stream: "IO[bytes]", callback: Callable[[str], None]) -> None:
+    buffer = b""
+    while byt := stream.read(1):  # Read one byte at a time
+        buffer += byt

-    def loop() -> None:
-        buffer = b""
-        while byt := stream.read(1):  # Read one byte at a time
-            buffer += byt.encode("utf-8") if isinstance(byt, str) else byt
-
-            if byt in (b"\n", b"\r"):  # Check for newline or carriage return
-                line = buffer.decode("utf-8")
-                print(line, end="")
-                callback(line)
-                append(line)
-                buffer = b""  # Clear buffer for next line
-
-        if buffer:  # Handle any remaining data in the buffer
+        if byt in (b"\n", b"\r"):  # Check for newline or carriage return
             line = buffer.decode("utf-8")
-            print(line, end="")
             callback(line)
-            append(line)
-
-    thread = Thread(target=loop, daemon=True)
-    thread.start()
-
-    try:
-        yield lines
-    finally:
-        thread.join()
-
+            buffer = b""  # Clear buffer for next line

-class QueryResult(NamedTuple):
-    dataset: Optional[DatasetRecord]
-    version: Optional[int]
-    output: str
+    if buffer:  # Handle any remaining data in the buffer
+        line = buffer.decode("utf-8")
+        callback(line)


 class DatasetRowsFetcher(NodesThreadPool):
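
Note: the new `_process_stream` helper above replaces `print_and_capture`. It reads the child process's stdout one byte at a time and invokes the callback for every chunk terminated by a newline or carriage return, so carriage-return progress updates are forwarded as they arrive. A minimal, self-contained sketch of the same splitting rule (the `io.BytesIO` input is illustrative, not part of datachain):

    import io

    def process_stream(stream, callback):
        # Same rule as _process_stream: a "line" ends at b"\n" or b"\r".
        buffer = b""
        while byt := stream.read(1):  # read one byte at a time
            buffer += byt
            if byt in (b"\n", b"\r"):
                callback(buffer.decode("utf-8"))
                buffer = b""
        if buffer:  # flush whatever remains when the stream ends
            callback(buffer.decode("utf-8"))

    process_stream(io.BytesIO(b"epoch 1\repoch 2\ndone\n"), print)
    # the callback receives "epoch 1\r", "epoch 2\n", "done\n", in order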
@@ -541,8 +513,6 @@ def find_column_to_str(  # noqa: PLR0911
     )
     if column == "name":
         return posixpath.basename(row[field_lookup["path"]]) or ""
-    if column == "owner":
-        return row[field_lookup["owner_name"]] or ""
     if column == "path":
         is_dir = row[field_lookup["dir_type"]] == DirType.DIR
         path = row[field_lookup["path"]]
@@ -651,11 +621,6 @@ class Catalog:
         code_ast.body[-1:] = new_expressions
         return code_ast

-    def compile_query_script(self, script: str) -> str:
-        code_ast = ast.parse(script)
-        code_ast = self.attach_query_wrapper(code_ast)
-        return ast.unparse(code_ast)
-
     def parse_url(self, uri: str, **config: Any) -> tuple[Client, str]:
         config = config or self.client_config
         return Client.parse_url(uri, self.cache, **config)
@@ -699,16 +664,12 @@ class Catalog:
         source_metastore = self.metastore.clone(client.uri)

         columns = [
-            Column("vtype", String),
-            Column("dir_type", Int),
             Column("path", String),
             Column("etag", String),
             Column("version", String),
             Column("is_latest", Boolean),
             Column("last_modified", DateTime(timezone=True)),
             Column("size", Int64),
-            Column("owner_name", String),
-            Column("owner_id", String),
             Column("location", JSON),
             Column("source", String),
         ]
@@ -1549,7 +1510,6 @@ class Catalog:
             row["etag"],
             row["version"],
             row["is_latest"],
-            row["vtype"],
             row["location"],
             row["last_modified"],
         )
@@ -1805,14 +1765,15 @@ class Catalog:
     def query(
         self,
         query_script: str,
-        envs: Optional[Mapping[str, str]] = None,
-        python_executable: Optional[str] = None,
+        env: Optional[Mapping[str, str]] = None,
+        python_executable: str = sys.executable,
         save: bool = False,
         capture_output: bool = True,
         output_hook: Callable[[str], None] = noop,
         params: Optional[dict[str, str]] = None,
         job_id: Optional[str] = None,
-    ) -> QueryResult:
+        _execute_last_expression: bool = False,
+    ) -> None:
         """
         Method to run custom user Python script to run a query and, as result,
         creates new dataset from the results of a query.
@@ -1835,92 +1796,21 @@ class Catalog:
                 C.size > 1000
             )
         """
-        if not job_id:
-            python_version = f"{sys.version_info.major}.{sys.version_info.minor}"
-            job_id = self.metastore.create_job(
-                name="",
-                query=query_script,
-                params=params,
-                python_version=python_version,
-            )
-
-        lines, proc = self.run_query(
-            python_executable or sys.executable,
-            query_script,
-            envs,
-            capture_output,
-            output_hook,
-            params,
-            save,
-            job_id,
-        )
-        output = "".join(lines)
-
-        if proc.returncode:
-            if proc.returncode == QUERY_SCRIPT_CANCELED_EXIT_CODE:
-                raise QueryScriptCancelError(
-                    "Query script was canceled by user",
-                    return_code=proc.returncode,
-                    output=output,
-                )
-            raise QueryScriptRunError(
-                f"Query script exited with error code {proc.returncode}",
-                return_code=proc.returncode,
-                output=output,
-            )
-
-        def _get_dataset_versions_by_job_id():
-            for dr, dv, job in self.list_datasets_versions():
-                if job and str(job.id) == job_id:
-                    yield dr, dv
-
-        try:
-            dr, dv = max(
-                _get_dataset_versions_by_job_id(), key=lambda x: x[1].created_at
-            )
-        except ValueError as e:
-            if not save:
-                return QueryResult(dataset=None, version=None, output=output)
-
-            raise QueryScriptDatasetNotFound(
-                "No dataset found after running Query script",
-                output=output,
-            ) from e
-
-        dr = self.update_dataset(
-            dr,
-            script_output=output,
-            query_script=query_script,
-        )
-        self.update_dataset_version_with_warehouse_info(
-            dr,
-            dv.version,
-            script_output=output,
-            query_script=query_script,
-            job_id=job_id,
-            is_job_result=True,
-        )
-        return QueryResult(dataset=dr, version=dv.version, output=output)
+        if _execute_last_expression:
+            try:
+                code_ast = ast.parse(query_script)
+                code_ast = self.attach_query_wrapper(code_ast)
+                query_script_compiled = ast.unparse(code_ast)
+            except Exception as exc:
+                raise QueryScriptCompileError(
+                    f"Query script failed to compile, reason: {exc}"
+                ) from exc
+        else:
+            query_script_compiled = query_script
+            assert not save

-    def run_query(
-        self,
-        python_executable: str,
-        query_script: str,
-        envs: Optional[Mapping[str, str]],
-        capture_output: bool,
-        output_hook: Callable[[str], None],
-        params: Optional[dict[str, str]],
-        save: bool,
-        job_id: Optional[str],
-    ) -> tuple[list[str], subprocess.Popen]:
-        try:
-            query_script_compiled = self.compile_query_script(query_script)
-        except Exception as exc:
-            raise QueryScriptCompileError(
-                f"Query script failed to compile, reason: {exc}"
-            ) from exc
-        envs = dict(envs or os.environ)
-        envs.update(
+        env = dict(env or os.environ)
+        env.update(
             {
                 "DATACHAIN_QUERY_PARAMS": json.dumps(params or {}),
                 "PYTHONPATH": os.getcwd(),  # For local imports
@@ -1929,19 +1819,28 @@ class Catalog:
                 "DATACHAIN_JOB_ID": job_id or "",
             },
         )
-        with subprocess.Popen(  # noqa: S603
-            [python_executable, "-c", query_script_compiled],
-            env=envs,
-            stdout=subprocess.PIPE if capture_output else None,
-            stderr=subprocess.STDOUT if capture_output else None,
-            bufsize=1,
-            text=False,
-        ) as proc:
-            out = proc.stdout
-            _lines: list[str] = []
-            ctx = print_and_capture(out, output_hook) if out else nullcontext(_lines)
-            with ctx as lines:
-                return lines, proc
+        popen_kwargs = {}
+        if capture_output:
+            popen_kwargs = {"stdout": subprocess.PIPE, "stderr": subprocess.STDOUT}
+
+        cmd = [python_executable, "-c", query_script_compiled]
+        with subprocess.Popen(cmd, env=env, **popen_kwargs) as proc:  # type: ignore[call-overload] # noqa: S603
+            if capture_output:
+                args = (proc.stdout, output_hook)
+                thread = Thread(target=_process_stream, args=args, daemon=True)
+                thread.start()
+                thread.join()  # wait for the reader thread
+
+            if proc.returncode == QUERY_SCRIPT_CANCELED_EXIT_CODE:
+                raise QueryScriptCancelError(
+                    "Query script was canceled by user",
+                    return_code=proc.returncode,
+                )
+            if proc.returncode:
+                raise QueryScriptRunError(
+                    f"Query script exited with error code {proc.returncode}",
+                    return_code=proc.returncode,
+                )

     def cp(
         self,
@@ -2081,8 +1980,6 @@ class Catalog:
             field_set.add("path")
         elif column == "name":
             field_set.add("path")
-        elif column == "owner":
-            field_set.add("owner_name")
         elif column == "path":
             field_set.add("dir_type")
             field_set.add("path")
datachain/cli.py CHANGED
@@ -24,7 +24,7 @@ logger = logging.getLogger("datachain")

 TTL_HUMAN = "4h"
 TTL_INT = 4 * 60 * 60
-FIND_COLUMNS = ["du", "name", "owner", "path", "size", "type"]
+FIND_COLUMNS = ["du", "name", "path", "size", "type"]


 def human_time_type(value_str: str, can_be_none: bool = False) -> Optional[int]:
@@ -579,9 +579,8 @@ def _node_data_to_ls_values(row, long_format=False):
     value = name + ending
     if long_format:
         last_modified = row[2]
-        owner_name = row[3]
         timestamp = last_modified if not is_dir else None
-        return long_line_str(value, timestamp, owner_name)
+        return long_line_str(value, timestamp)
     return value


@@ -599,7 +598,7 @@ def _ls_urls_flat(
     if client_cls.is_root_url(source):
         buckets = client_cls.ls_buckets(**catalog.client_config)
         if long:
-            values = (long_line_str(b.name, b.created, "") for b in buckets)
+            values = (long_line_str(b.name, b.created) for b in buckets)
         else:
            values = (b.name for b in buckets)
         yield source, values
@@ -607,7 +606,7 @@ def _ls_urls_flat(
         found = False
         fields = ["name", "dir_type"]
         if long:
-            fields.extend(["last_modified", "owner_name"])
+            fields.append("last_modified")
         for data_source, results in catalog.ls([source], fields=fields, **kwargs):
             values = (_node_data_to_ls_values(r, long) for r in results)
             found = True
@@ -683,7 +682,6 @@ def ls_remote(
             entry = long_line_str(
                 row["name"] + ("/" if row["dir_type"] else ""),
                 row["last_modified"],
-                row["owner_name"],
             )
             print(format_ls_entry(entry))
         else:
datachain/client/fsspec.py CHANGED
@@ -363,7 +363,6 @@ class Client(ABC):
             parent["path"],
             parent["size"],
             parent["etag"],
-            vtype=parent["vtype"],
             location=parent["location"],
         )
         f = self.open_object(parent_uid, use_cache=use_cache)
datachain/client/s3.py CHANGED
@@ -119,8 +119,6 @@ class ClientS3(Client):
             is_latest=v.get("IsLatest", True),
             last_modified=v.get("LastModified", ""),
             size=v["Size"],
-            owner_name=v.get("Owner", {}).get("DisplayName", ""),
-            owner_id=v.get("Owner", {}).get("ID", ""),
         )

     async def _fetch_dir(
@@ -165,8 +163,6 @@ class ClientS3(Client):
             is_latest=v.get("IsLatest", True),
             last_modified=v.get("LastModified", ""),
             size=v["size"],
-            owner_name=v.get("Owner", {}).get("DisplayName", ""),
-            owner_id=v.get("Owner", {}).get("ID", ""),
         )

     def info_to_file(self, v: dict[str, Any], path: str) -> File:
datachain/data_storage/schema.py CHANGED
@@ -10,9 +10,8 @@ from typing import (

 import sqlalchemy as sa
 from sqlalchemy.sql import func as f
-from sqlalchemy.sql.expression import null, true
+from sqlalchemy.sql.expression import false, null, true

-from datachain.node import DirType
 from datachain.sql.functions import path
 from datachain.sql.types import Int, SQLType, UInt64

@@ -81,8 +80,7 @@ class DirExpansion:
     def base_select(q):
         return sa.select(
             q.c.sys__id,
-            q.c.vtype,
-            (q.c.dir_type == DirType.DIR).label("is_dir"),
+            false().label("is_dir"),
             q.c.source,
             q.c.path,
             q.c.version,
@@ -94,7 +92,6 @@ class DirExpansion:
         return (
             sa.select(
                 f.min(q.c.sys__id).label("sys__id"),
-                q.c.vtype,
                 q.c.is_dir,
                 q.c.source,
                 q.c.path,
@@ -102,8 +99,8 @@ class DirExpansion:
                 f.max(q.c.location).label("location"),
             )
             .select_from(q)
-            .group_by(q.c.source, q.c.path, q.c.vtype, q.c.is_dir, q.c.version)
-            .order_by(q.c.source, q.c.path, q.c.vtype, q.c.is_dir, q.c.version)
+            .group_by(q.c.source, q.c.path, q.c.is_dir, q.c.version)
+            .order_by(q.c.source, q.c.path, q.c.is_dir, q.c.version)
         )

     @classmethod
@@ -113,7 +110,6 @@ class DirExpansion:
         q = q.union_all(
             sa.select(
                 sa.literal(-1).label("sys__id"),
-                sa.literal("").label("vtype"),
                 true().label("is_dir"),
                 q.c.source,
                 parent.label("path"),
datachain/data_storage/warehouse.py CHANGED
@@ -28,7 +28,6 @@ from datachain.utils import sql_escape_like

 if TYPE_CHECKING:
     from sqlalchemy.sql._typing import _ColumnsClauseArgument
-    from sqlalchemy.sql.elements import ColumnElement
     from sqlalchemy.sql.selectable import Select
     from sqlalchemy.types import TypeEngine

@@ -341,9 +340,7 @@ class AbstractWarehouse(ABC, Serializable):

         column_objects = [dr.c[c] for c in column_names]
         # include all object types - file, tar archive, tar file (subobject)
-        select_query = dr.select(*column_objects).where(
-            dr.c.dir_type.in_(DirTypeGroup.FILE) & (dr.c.is_latest == true())
-        )
+        select_query = dr.select(*column_objects).where(dr.c.is_latest == true())
         if path is None:
             return select_query
         if recursive:
@@ -420,7 +417,6 @@ class AbstractWarehouse(ABC, Serializable):
         """

         def _prepare_entry(entry: Entry):
-            assert entry.dir_type is not None
             return attrs.asdict(entry) | {"source": uri}

         return [_prepare_entry(e) for e in entries]
@@ -440,7 +436,7 @@ class AbstractWarehouse(ABC, Serializable):
         """Inserts dataset rows directly into dataset table"""

     @abstractmethod
-    def instr(self, source, target) -> "ColumnElement":
+    def instr(self, source, target) -> sa.ColumnElement:
         """
         Return SQLAlchemy Boolean determining if a target substring is present in
         source string column
@@ -500,7 +496,7 @@ class AbstractWarehouse(ABC, Serializable):
         c = query.selected_columns
         q = query.where(c.dir_type.in_(file_group))
         if not include_subobjects:
-            q = q.where(c.vtype == "")
+            q = q.where((c.location == "") | (c.location.is_(None)))
         return q

     def get_nodes(self, query) -> Iterator[Node]:
@@ -624,8 +620,7 @@ class AbstractWarehouse(ABC, Serializable):

         return sa.select(
             de.c.sys__id,
-            with_default(dr.c.vtype),
-            case((de.c.is_dir == true(), DirType.DIR), else_=dr.c.dir_type).label(
+            case((de.c.is_dir == true(), DirType.DIR), else_=DirType.FILE).label(
                 "dir_type"
             ),
             de.c.path,
@@ -634,8 +629,6 @@ class AbstractWarehouse(ABC, Serializable):
             with_default(dr.c.is_latest),
             dr.c.last_modified,
             with_default(dr.c.size),
-            with_default(dr.c.owner_name),
-            with_default(dr.c.owner_id),
             with_default(dr.c.sys__rand),
             dr.c.location,
             de.c.source,
@@ -650,7 +643,6 @@ class AbstractWarehouse(ABC, Serializable):
         query = dr.select().where(
             self.path_expr(dr) == path,
             dr.c.is_latest == true(),
-            dr.c.dir_type != DirType.DIR,
         )
         row = next(self.db.execute(query), None)
         if row is not None:
@@ -660,7 +652,6 @@ class AbstractWarehouse(ABC, Serializable):
             dr.select()
             .where(
                 dr.c.is_latest == true(),
-                dr.c.dir_type != DirType.DIR,
                 dr.c.path.startswith(path),
             )
             .exists()
@@ -761,13 +752,11 @@ class AbstractWarehouse(ABC, Serializable):

         sub_glob = posixpath.join(path, "*")
         dr = dataset_rows
-        selections = [
+        selections: list[sa.ColumnElement] = [
             func.sum(dr.c.size),
         ]
         if count_files:
-            selections.append(
-                func.sum(dr.c.dir_type.in_(DirTypeGroup.FILE)),
-            )
+            selections.append(func.count())
         results = next(
             self.db.execute(
                 dr.select(*selections).where(
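
Note: with the `vtype` column gone, subobjects (for example files unpacked from a tar archive) are now recognized by a non-empty `location`, so the `include_subobjects=False` filter changes from `c.vtype == ""` to an empty-or-NULL check on `location`. A small SQLAlchemy Core sketch of that predicate; the table and columns below are illustrative, not datachain's real schema:

    import sqlalchemy as sa

    rows = sa.table("rows", sa.column("path"), sa.column("location"))

    # Keep only top-level objects: location is empty or NULL, mirroring the
    # new filter in AbstractWarehouse.
    top_level_only = sa.select(rows).where(
        (rows.c.location == "") | (rows.c.location.is_(None))
    )
    print(top_level_only)  # shows the compiled SELECT ... WHERE clause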
datachain/error.py CHANGED
@@ -42,10 +42,6 @@ class QueryScriptRunError(Exception):
         super().__init__(self.message)


-class QueryScriptDatasetNotFound(QueryScriptRunError):  # noqa: N818
-    pass
-
-
 class QueryScriptCancelError(QueryScriptRunError):
     pass

datachain/lib/clip.py CHANGED
@@ -18,7 +18,7 @@ def _get_encoder(model: Any, type: Literal["image", "text"]) -> Callable:
         hasattr(model, method_name) and inspect.ismethod(getattr(model, method_name))
     ):
         method = getattr(model, method_name)
-        return lambda x: method(torch.tensor(x))
+        return lambda x: method(torch.as_tensor(x).clone().detach())

     # Check for model from clip or open_clip library
     method_name = f"encode_{type}"
datachain/lib/dc.py CHANGED
@@ -234,7 +234,6 @@ class DataChain(DatasetQuery):
     DEFAULT_FILE_RECORD: ClassVar[dict] = {
         "source": "",
         "path": "",
-        "vtype": "",
         "size": 0,
     }

@@ -415,7 +414,7 @@ class DataChain(DatasetQuery):
             .save(list_dataset_name, listing=True)
         )

-        dc = cls.from_dataset(list_dataset_name, session=session)
+        dc = cls.from_dataset(list_dataset_name, session=session, settings=settings)
         dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})

         return ls(dc, list_path, recursive=recursive, object_name=object_name)
@@ -426,6 +425,7 @@ class DataChain(DatasetQuery):
         name: str,
         version: Optional[int] = None,
         session: Optional[Session] = None,
+        settings: Optional[dict] = None,
     ) -> "DataChain":
         """Get data from a saved Dataset. It returns the chain itself.

@@ -438,7 +438,7 @@ class DataChain(DatasetQuery):
            chain = DataChain.from_dataset("my_cats")
            ```
        """
-        return DataChain(name=name, version=version, session=session)
+        return DataChain(name=name, version=version, session=session, settings=settings)

     @classmethod
     def from_json(
@@ -1622,6 +1622,8 @@ class DataChain(DatasetQuery):
         model_name: str = "",
         source: bool = True,
         nrows=None,
+        session: Optional[Session] = None,
+        settings: Optional[dict] = None,
         **kwargs,
     ) -> "DataChain":
         """Generate chain from csv files.
@@ -1638,6 +1640,8 @@ class DataChain(DatasetQuery):
            model_name : Generated model name.
            source : Whether to include info about the source file.
            nrows : Optional row limit.
+           session : Session to use for the chain.
+           settings : Settings to use for the chain.

        Example:
            Reading a csv file:
@@ -1654,7 +1658,9 @@ class DataChain(DatasetQuery):
         from pyarrow.csv import ConvertOptions, ParseOptions, ReadOptions
         from pyarrow.dataset import CsvFileFormat

-        chain = DataChain.from_storage(path, **kwargs)
+        chain = DataChain.from_storage(
+            path, session=session, settings=settings, **kwargs
+        )

         column_names = None
         if not header:
@@ -1701,6 +1707,8 @@ class DataChain(DatasetQuery):
         object_name: str = "",
         model_name: str = "",
         source: bool = True,
+        session: Optional[Session] = None,
+        settings: Optional[dict] = None,
         **kwargs,
     ) -> "DataChain":
         """Generate chain from parquet files.
@@ -1713,6 +1721,8 @@ class DataChain(DatasetQuery):
            object_name : Created object column name.
            model_name : Generated model name.
            source : Whether to include info about the source file.
+           session : Session to use for the chain.
+           settings : Settings to use for the chain.

        Example:
            Reading a single file:
@@ -1725,7 +1735,9 @@ class DataChain(DatasetQuery):
            dc = DataChain.from_parquet("s3://mybucket/dir")
            ```
        """
-        chain = DataChain.from_storage(path, **kwargs)
+        chain = DataChain.from_storage(
+            path, session=session, settings=settings, **kwargs
+        )
         return chain.parse_tabular(
             output=output,
             object_name=object_name,
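
Note: `from_csv` and `from_parquet` now take explicit `session` and `settings` arguments and forward them to the underlying `from_storage` call, and `from_dataset` gained `settings` as well. A hedged usage sketch; the bucket path is a placeholder and the settings key is assumed to match what `DataChain.settings()` accepts:

    from datachain.lib.dc import DataChain

    chain = DataChain.from_parquet(
        "s3://mybucket/dir",          # placeholder location
        settings={"cache": True},     # assumed settings key, now forwarded to from_storage
    )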
datachain/lib/file.py CHANGED
@@ -118,7 +118,6 @@ class File(DataModel):
     is_latest: bool = Field(default=True)
     last_modified: datetime = Field(default=TIME_ZERO)
     location: Optional[Union[dict, list[dict]]] = Field(default=None)
-    vtype: str = Field(default="")

     _datachain_column_types: ClassVar[dict[str, Any]] = {
         "source": String,
@@ -129,7 +128,6 @@ class File(DataModel):
         "is_latest": Boolean,
         "last_modified": DateTime,
         "location": JSON,
-        "vtype": String,
     }

     _unique_id_keys: ClassVar[list[str]] = [
@@ -139,7 +137,6 @@ class File(DataModel):
         "etag",
         "version",
         "is_latest",
-        "vtype",
         "location",
         "last_modified",
     ]
@@ -195,14 +192,15 @@ class File(DataModel):
            with VFileRegistry.resolve(self, self.location) as f:  # type: ignore[arg-type]
                yield f

-        uid = self.get_uid()
-        client = self._catalog.get_client(self.source)
-        if self._caching_enabled:
-            client.download(uid, callback=self._download_cb)
-        with client.open_object(
-            uid, use_cache=self._caching_enabled, cb=self._download_cb
-        ) as f:
-            yield io.TextIOWrapper(f) if mode == "r" else f
+        else:
+            uid = self.get_uid()
+            client = self._catalog.get_client(self.source)
+            if self._caching_enabled:
+                client.download(uid, callback=self._download_cb)
+            with client.open_object(
+                uid, use_cache=self._caching_enabled, cb=self._download_cb
+            ) as f:
+                yield io.TextIOWrapper(f) if mode == "r" else f

     def read(self, length: int = -1):
         """Returns file contents."""
datachain/lib/image.py CHANGED
@@ -34,7 +34,7 @@ def convert_image(
         from transformers.image_processing_utils import BaseImageProcessor

         if isinstance(transform, BaseImageProcessor):
-            img = torch.tensor(img.pixel_values[0])  # type: ignore[assignment,attr-defined]
+            img = torch.as_tensor(img.pixel_values[0]).clone().detach()  # type: ignore[assignment,attr-defined]
     except ImportError:
         pass
     if device:
datachain/lib/meta_formats.py CHANGED
@@ -1,13 +1,10 @@
-# pip install datamodel-code-generator
-# pip install jmespath
-#
 import csv
 import json
 import tempfile
 import uuid
 from collections.abc import Iterator
 from pathlib import Path
-from typing import Any, Callable
+from typing import Callable

 import datamodel_code_generator
 import jmespath as jsp
@@ -85,7 +82,6 @@ def read_schema(source_file, data_type="csv", expr=None, model_name=None):
         use_standard_collections=True,
     )
     epilogue = f"""
-{model_name}.model_rebuild()
 DataModel.register({model_name})
 spec = {model_name}
 """
@@ -122,9 +118,9 @@ def read_meta(  # noqa: C901
         print(f"{model_output}")
     # Below 'spec' should be a dynamically converted DataModel from Pydantic
     if not spec:
-        local_vars: dict[str, Any] = {}
-        exec(model_output, globals(), local_vars)  # type: ignore[arg-type] # noqa: S102
-        spec = local_vars["spec"]
+        gl = globals()
+        exec(model_output, gl)  # type: ignore[arg-type] # noqa: S102
+        spec = gl["spec"]

     if not (spec) and not (schema_from):
         raise ValueError(
datachain/lib/model_store.py CHANGED
@@ -1,3 +1,4 @@
+import inspect
 import logging
 from typing import ClassVar, Optional

@@ -69,7 +70,11 @@ class ModelStore:

     @staticmethod
     def is_pydantic(val):
-        return not hasattr(val, "__origin__") and issubclass(val, BaseModel)
+        return (
+            not hasattr(val, "__origin__")
+            and inspect.isclass(val)
+            and issubclass(val, BaseModel)
+        )

     @staticmethod
     def to_pydantic(val) -> Optional[type[BaseModel]]:
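
Note: the `inspect.isclass` guard protects `issubclass`, which raises `TypeError` when its first argument is not a class; with it, `is_pydantic` can be called on arbitrary values (instances, ints, typing constructs) and simply return False. A small sketch of the same check:

    import inspect
    from pydantic import BaseModel

    def is_pydantic(val):
        # only call issubclass() on real classes
        return (
            not hasattr(val, "__origin__")
            and inspect.isclass(val)
            and issubclass(val, BaseModel)
        )

    class Pet(BaseModel):
        name: str = ""

    print(is_pydantic(Pet))    # True
    print(is_pydantic(Pet()))  # False - an instance, not a class
    print(is_pydantic(42))     # False - previously issubclass(42, BaseModel) raised TypeError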
datachain/lib/text.py CHANGED
@@ -33,7 +33,7 @@ def convert_text(
     res = tokenizer(text)

     tokens = res.input_ids if isinstance(tokenizer, PreTrainedTokenizerBase) else res
-    tokens = torch.tensor(tokens)
+    tokens = torch.as_tensor(tokens).clone().detach()
     if device:
         tokens = tokens.to(device)

datachain/lib/webdataset.py CHANGED
@@ -1,6 +1,7 @@
 import hashlib
 import json
 import tarfile
+import warnings
 from collections.abc import Iterator, Sequence
 from pathlib import Path
 from typing import (
@@ -19,6 +20,18 @@ from datachain.lib.data_model import DataModel
 from datachain.lib.file import File, TarVFile
 from datachain.lib.utils import DataChainError

+# The `json` method of the Pydantic `BaseModel` class has been deprecated
+# and will be removed in Pydantic v3. For more details, see:
+# https://github.com/pydantic/pydantic/issues/10033
+# Until then, we can ignore the warning.
+warnings.filterwarnings(
+    "ignore",
+    category=UserWarning,
+    message=(
+        'Field name "json" in "WDSAllFile" shadows an attribute in parent "WDSBasic"'
+    ),
+)
+

 class WDSError(DataChainError):
     def __init__(self, tar_stream, message: str):
datachain/lib/webdataset_laion.py CHANGED
@@ -1,3 +1,4 @@
+import warnings
 from collections.abc import Iterator
 from typing import Optional

@@ -7,6 +8,18 @@ from pydantic import BaseModel, Field
 from datachain.lib.file import File
 from datachain.lib.webdataset import WDSBasic, WDSReadableSubclass

+# The `json` method of the Pydantic `BaseModel` class has been deprecated
+# and will be removed in Pydantic v3. For more details, see:
+# https://github.com/pydantic/pydantic/issues/10033
+# Until then, we can ignore the warning.
+warnings.filterwarnings(
+    "ignore",
+    category=UserWarning,
+    message=(
+        'Field name "json" in "WDSLaion" shadows an attribute in parent "WDSBasic"'
+    ),
+)
+

 class Laion(WDSReadableSubclass):
     uid: str = Field(default="")
datachain/listing.py CHANGED
@@ -104,7 +104,7 @@ class Listing:
         return self.warehouse.get_node_by_path(self.dataset_rows, path)

     def ls_path(self, node, fields):
-        if node.vtype == "tar" or node.dir_type == DirType.TAR_ARCHIVE:
+        if node.location or node.dir_type == DirType.TAR_ARCHIVE:
             return self.warehouse.select_node_fields_by_parent_path_tar(
                 self.dataset_rows, node.path, fields
             )
@@ -235,7 +235,7 @@ class Listing:
         return self.warehouse.size(self.dataset_rows, node, count_files)

     def subtree_files(self, node: Node, sort=None):
-        if node.dir_type == DirType.TAR_ARCHIVE or node.vtype != "":
+        if node.dir_type == DirType.TAR_ARCHIVE or node.location:
             include_subobjects = True
         else:
             include_subobjects = False
datachain/node.py CHANGED
@@ -49,18 +49,15 @@ class DirTypeGroup:
 class Node:
     sys__id: int = 0
     sys__rand: int = 0
-    vtype: str = ""
-    dir_type: Optional[int] = None
     path: str = ""
     etag: str = ""
     version: Optional[str] = None
     is_latest: bool = True
     last_modified: Optional[datetime] = None
     size: int = 0
-    owner_name: str = ""
-    owner_id: str = ""
     location: Optional[str] = None
     source: StorageURI = StorageURI("")
+    dir_type: int = DirType.FILE

     @property
     def is_dir(self) -> bool:
@@ -113,7 +110,6 @@ class Node:
             version=self.version or "",
             etag=self.etag,
             is_latest=self.is_latest,
-            vtype=self.vtype,
             location=self.location,
             last_modified=self.last_modified or TIME_ZERO,
         )
@@ -145,38 +141,20 @@ class Node:

 @attrs.define
 class Entry:
-    vtype: str = ""
-    dir_type: Optional[int] = None
     path: str = ""
     etag: str = ""
     version: str = ""
     is_latest: bool = True
     last_modified: Optional[datetime] = None
     size: int = 0
-    owner_name: str = ""
-    owner_id: str = ""
     location: Optional[str] = None

-    @property
-    def is_dir(self) -> bool:
-        return self.dir_type == DirType.DIR
-
-    @classmethod
-    def from_dir(cls, path: str, **kwargs) -> "Entry":
-        return cls(dir_type=DirType.DIR, path=path, **kwargs)
-
     @classmethod
     def from_file(cls, path: str, **kwargs) -> "Entry":
-        return cls(dir_type=DirType.FILE, path=path, **kwargs)
-
-    @classmethod
-    def root(cls):
-        return cls(dir_type=DirType.DIR)
+        return cls(path=path, **kwargs)

     @property
     def full_path(self) -> str:
-        if self.is_dir and self.path:
-            return self.path + "/"
         return self.path

     @property
@@ -229,9 +207,9 @@ class NodeWithPath:
 TIME_FMT = "%Y-%m-%d %H:%M"


-def long_line_str(name: str, timestamp: Optional[datetime], owner: str) -> str:
+def long_line_str(name: str, timestamp: Optional[datetime]) -> str:
     if timestamp is None:
         time = "-"
     else:
         time = timestamp.strftime(TIME_FMT)
-    return f"{owner: <19} {time: <19} {name}"
+    return f"{time: <19} {name}"
datachain/query/builtins.py CHANGED
@@ -22,10 +22,6 @@ def load_tar(raw):
     C.source,
     C.path,
     C.size,
-    C.vtype,
-    C.dir_type,
-    C.owner_name,
-    C.owner_id,
     C.is_latest,
     C.last_modified,
     C.version,
@@ -38,10 +34,6 @@ def index_tar(
     source,
     parent_path,
     size,
-    vtype,
-    dir_type,
-    owner_name,
-    owner_id,
     is_latest,
     last_modified,
     version,
@@ -53,10 +45,6 @@ def index_tar(
         source=source,
         path=parent_path,
         size=size,
-        vtype=vtype,
-        dir_type=dir_type,
-        owner_name=owner_name,
-        owner_id=owner_id,
         is_latest=bool(is_latest),
         last_modified=last_modified,
         version=version,
@@ -70,7 +58,6 @@ def index_tar(
             source=source,
             path=full_path,
             size=info.size,
-            vtype="tar",
             location={
                 "vtype": "tar",
                 "offset": info.offset_data,
@@ -81,7 +68,6 @@ def index_tar(
                 "version": version,
                 "size": size,
                 "etag": etag,
-                "vtype": "",
                 "location": None,
             },
         },
datachain/query/schema.py CHANGED
@@ -9,7 +9,7 @@ import attrs
 import sqlalchemy as sa
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback

-from datachain.sql.types import JSON, Boolean, DateTime, Int, Int64, SQLType, String
+from datachain.sql.types import JSON, Boolean, DateTime, Int64, SQLType, String

 if TYPE_CHECKING:
     from datachain.catalog import Catalog
@@ -222,10 +222,6 @@ class DatasetRow:
         "path": String,
         "size": Int64,
         "location": JSON,
-        "vtype": String,
-        "dir_type": Int,
-        "owner_name": String,
-        "owner_id": String,
         "is_latest": Boolean,
         "last_modified": DateTime,
         "version": String,
@@ -238,10 +234,6 @@ class DatasetRow:
         source: str = "",
         size: int = 0,
         location: Optional[dict[str, Any]] = None,
-        vtype: str = "",
-        dir_type: int = 0,
-        owner_name: str = "",
-        owner_id: str = "",
         is_latest: bool = True,
         last_modified: Optional[datetime] = None,
         version: str = "",
@@ -251,10 +243,7 @@ class DatasetRow:
         str,
         int,
         Optional[str],
-        str,
         int,
-        str,
-        str,
         bool,
         datetime,
         str,
@@ -271,10 +260,6 @@ class DatasetRow:
         path,
         size,
         location,
-        vtype,
-        dir_type,
-        owner_name,
-        owner_id,
         is_latest,
         last_modified,
         version,
datachain/utils.py CHANGED
@@ -340,11 +340,8 @@ def show_df(
             "etag",
             "is_latest",
             "last_modified",
-            "owner_id",
-            "owner_name",
             "size",
             "version",
-            "vtype",
         ],
         inplace=True,
         errors="ignore",
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.3.11
+Version: 0.3.13
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -80,7 +80,6 @@ Requires-Dist: pytest-cov >=4.1.0 ; extra == 'tests'
 Requires-Dist: pytest-mock >=3.12.0 ; extra == 'tests'
 Requires-Dist: pytest-servers[all] >=0.5.5 ; extra == 'tests'
 Requires-Dist: pytest-benchmark[histogram] ; extra == 'tests'
-Requires-Dist: pytest-asyncio >=0.23.2 ; extra == 'tests'
 Requires-Dist: pytest-xdist >=3.3.1 ; extra == 'tests'
 Requires-Dist: virtualenv ; extra == 'tests'
 Requires-Dist: dulwich ; extra == 'tests'
@@ -96,12 +95,14 @@ Requires-Dist: transformers >=4.36.0 ; extra == 'torch'
 Provides-Extra: vector
 Requires-Dist: usearch ; extra == 'vector'

-.. image:: docs/assets/datachain_logotype.svg
-   :height: 48
-   :alt: DataChain logo
+================
+|logo| DataChain
+================

 |PyPI| |Python Version| |Codecov| |Tests|

+.. |logo| image:: docs/assets/datachain.svg
+   :height: 24
 .. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
    :target: https://pypi.org/project/datachain/
    :alt: PyPI
@@ -115,8 +116,6 @@ Requires-Dist: usearch ; extra == 'vector'
    :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
    :alt: Tests

-----------------
-
 DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
 It is made to organize your unstructured data into datasets and wrangle it at scale on
 your local machine. Datachain does not abstract or hide the AI models and API calls, but helps to integrate them into the postmodern data stack.
@@ -1,65 +1,65 @@
 datachain/__init__.py,sha256=GeyhE-5LgfJav2OKYGaieP2lBvf2Gm-ihj7thnK9zjI,800
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=biF8M8fQujtj5xs0VLi8S16eBtzG6kceWlO_NILbCsg,8197
-datachain/cache.py,sha256=wznC2pge6RhlPTaJfBVGjmBc6bxWCPThu4aTFMltvFU,4076
-datachain/cli.py,sha256=ECf_z5X8ILDJdUn2Cpb_z-ZjSRIzn7skiuMGfM-y9i0,30999
+datachain/cache.py,sha256=WP-ktH_bRn3w2g1JOOQ7rCPsZyR4OM6K1Kb7yZsSSns,4056
+datachain/cli.py,sha256=alMjnoBUBLvBSMBR51N09rA_aUEdHJwyxSRogF7VbbA,30891
 datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
 datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
 datachain/dataset.py,sha256=EcYjhHg1dxxPbDwSuIxc-mDRDo3v_pYf79fMy4re1oA,14740
-datachain/error.py,sha256=GY9KYTmb7GHXn2gGHV9X-PBhgwLj3i7VpK7tGHtAoGM,1279
+datachain/error.py,sha256=OnZ8OaBtDdTZPy8XQiy29SAjqdQArQeorYbP5ju7ldc,1199
 datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
-datachain/listing.py,sha256=keLkvPfumDA3gijeIiinH5yGWe71qCxgF5HqqP5AeH4,8299
-datachain/node.py,sha256=dcm_7dVurFHpI0EHV2K6SjpJyh-gN4PVWAB-20quk04,6382
+datachain/listing.py,sha256=LgL0lV10AzD1v52ajSaJKFnyiq4hNXwQiqaGySWGQsw,8290
+datachain/node.py,sha256=gacKxUPLgJ1ul6LJWz7nylYjUWPbyUY5cqaBFDOnO9E,5756
 datachain/nodes_fetcher.py,sha256=kca19yvu11JxoVY1t4_ydp1FmchiV88GnNicNBQ9NIA,831
 datachain/nodes_thread_pool.py,sha256=ZyzBvUImIPmi4WlKC2SW2msA0UhtembbTdcs2nx29A0,3191
 datachain/progress.py,sha256=7_8FtJs770ITK9sMq-Lt4k4k18QmYl4yIG_kCoWID3o,4559
 datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
-datachain/utils.py,sha256=HKUdVqreBTzcCULAYRw1sC6z33OaomVD1WoMSoFcPHg,13148
+datachain/utils.py,sha256=Z9-lPNvrrAh_VWpzVBJ7L5-Oy_Oo1V0ZW7G0MVDyPK4,13065
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=NgS7_SlmpJdUSp1v8KdCuLTjFklmYvT_jOLdzTyyK5I,72313
+datachain/catalog/catalog.py,sha256=hhLciKHD0dVwniFzUsYORQ72WpnM40QYT0ydoyx1Kvw,69308
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
 datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
 datachain/client/azure.py,sha256=LXSahE0Z6r4dXqpBkKnq3J5fg7N7ymC1lSn-1SoILGc,2687
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
-datachain/client/fsspec.py,sha256=LQb5tr-pP9umCFYo3nGJR_dZxUyiSN7IDE8jhp1TXco,13333
+datachain/client/fsspec.py,sha256=Hy3-4HRV-3MozOybqAnF-qL0EoMYFHynpTG_YZphjZE,13298
 datachain/client/gcs.py,sha256=P_E3mhzhXR9mJ_wc3AYZuczzwOJ0-D3J5qhJXeSU-xk,4518
 datachain/client/hf.py,sha256=R-F6Ks6aVM9wSNkIXOkOnZFwsJlfdRwJjymRa78RLjM,1246
 datachain/client/local.py,sha256=H8TNY8pi2kA8y9_f_1XLUjJF66f229qC_b2y4xGkzdU,5300
-datachain/client/s3.py,sha256=aQxfMH8G8bUjmHF1-6P90MSkXsU5DgOPEVlKWLu459I,6568
+datachain/client/s3.py,sha256=zs41EvYW1bS_pUxnkCnJILzUJpL2V1jvvVKSN4BKYcc,6326
 datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZv32Y8,398
 datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
 datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
 datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
 datachain/data_storage/metastore.py,sha256=cHN0xmbUvChyayHHZm3Vqxr87jFqojPSlGBqhTPStlE,54519
-datachain/data_storage/schema.py,sha256=JKpSEz8igpwZ9zkpRPYVXZxEpiXuLKEs2WNhH0KqM6U,8552
+datachain/data_storage/schema.py,sha256=AGbjyEir5UmRZXI3m0jChZogUh5wd8csj6-YlUWaAxQ,8383
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
 datachain/data_storage/sqlite.py,sha256=Z4B2KDL4C8Uio2aLMxaKv0t2MoOtCV3bSqWg4X9mTFg,28048
-datachain/data_storage/warehouse.py,sha256=f7ETnYIXx5KMcPfwg_4bh_00QJiMLIliwE_41vmRGUo,33037
+datachain/data_storage/warehouse.py,sha256=s5hhVUWrlEopE6eGOqzXHeNtRapK30G8gj0Vkt_HHFQ,32649
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/arrow.py,sha256=dV17oGiknqEW55ogGK_9T0ycNFwd2z-EFOW0AQiR6TU,5840
-datachain/lib/clip.py,sha256=33RL11OIqfbwyhvBgiMGM8rDAnZx1IRmxk9dY89ls3Q,6130
+datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
 datachain/lib/data_model.py,sha256=gHIjlow84GMRDa78yLL1Ud-N18or21fnTyPEwsatpXY,2045
 datachain/lib/dataset_info.py,sha256=srPPhI2UHf6hFPBecyFEVw2SS5aPisIIMsvGgKqi7ss,2366
-datachain/lib/dc.py,sha256=s4E-bD6_T6JFJ7TEa5Y9RS705lIfcV9OUJwDD6RNCX0,68156
-datachain/lib/file.py,sha256=WOOYw3LcGROA6wshJ_aZkSgcTqfB4UxTbZDTx9KqAOg,11429
+datachain/lib/dc.py,sha256=C-sfWRinV8pDK2P6UHLbScOahTlTiVQpoxUUdVllF2k,68710
+datachain/lib/file.py,sha256=rXmyzUFgnLQ4J3CyOCcg-guhzAz4x9Ug595FbNn4Y2E,11398
 datachain/lib/hf.py,sha256=ZiMvgy3DYiklGKZv-w7gevrHOgn3bGfpTlpDPOHCNqs,5336
-datachain/lib/image.py,sha256=WbcwSaFzuyqjg4x4hH5CUogeUQjkZFjQHqw_oDEV1nA,2655
+datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
 datachain/lib/listing.py,sha256=S9Xn_Saxu4xk3K_01VexkfMZW0INQiATlidt2bzgWKY,3938
 datachain/lib/listing_info.py,sha256=sr5KzCXlCxlPuRmy_pVadD4miLpp5y0btvyaIPcluwI,996
-datachain/lib/meta_formats.py,sha256=67uF9trQ2II6xFvN0u6eo5NNRf5xvCkpMHj7ThiG41Y,6777
-datachain/lib/model_store.py,sha256=c4USXsBBjrGH8VOh4seIgOiav-qHOwdoixtxfLgU63c,2409
+datachain/lib/meta_formats.py,sha256=3f-0vpMTesagS9iMd3y9-u9r-7g0eqYsxmK4fVfNWlw,6635
+datachain/lib/model_store.py,sha256=xcrQ69-jcQs716U4UFOSoSKM7EvFIWqxlPhIcE4X7oI,2497
 datachain/lib/pytorch.py,sha256=vK3GbWCy7kunN7ubul6w1hrWmJLja56uTCiMG_7XVQA,5623
 datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
 datachain/lib/signal_schema.py,sha256=hqQLwUmt3w8RLa96MtubK9N2CBXqqTPrUkSRXc0ktt4,20275
-datachain/lib/text.py,sha256=vqs1SQdsw1vCzfvOanIeT4xY2R2TmPonElBgYDVeZmY,1241
+datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
 datachain/lib/udf.py,sha256=nG7DDuPgZ5ZuijwvDoCq-OZMxlDM8vFNzyxMmik0Y1c,11716
 datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
 datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
 datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/webdataset.py,sha256=Q3UlCk66341sq-nvFbBCX4Cv3cYXBK9n12ejG4axPXE,8298
-datachain/lib/webdataset_laion.py,sha256=PQP6tQmUP7Xu9fPuAGK1JDBYA6T5UufYMUTGaxgspJA,2118
+datachain/lib/webdataset.py,sha256=ZzGLtOUA-QjP4kttGgNqhrioDuDnomWFlsow4fLdezQ,8717
+datachain/lib/webdataset_laion.py,sha256=aGMWeFmeYNK75ewO9JTA11iB1i3QtTzUfenQA5jajfo,2535
 datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/convert/flatten.py,sha256=Uebc5CeqCsacp-nr6IG9i6OGuUavXqdqnoGctZBk3RQ,1384
 datachain/lib/convert/python_to_sql.py,sha256=40SAOdoOgikZRhn8iomCPDRoxC3RFxjJLivEAA9MHDU,2880
@@ -68,13 +68,13 @@ datachain/lib/convert/unflatten.py,sha256=Ogvh_5wg2f38_At_1lN0D_e2uZOOpYEvwvB2xd
 datachain/lib/convert/values_to_tuples.py,sha256=YOdbjzHq-uj6-cV2Qq43G72eN2avMNDGl4x5t6yQMl8,3931
 datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
 datachain/query/batch.py,sha256=-vlpINJiertlnaoUVv1C95RatU0F6zuhpIYRufJRo1M,3660
-datachain/query/builtins.py,sha256=EmKPYsoQ46zwdyOn54MuCzvYFmfsBn5F8zyF7UBUfrc,2550
+datachain/query/builtins.py,sha256=U6yHPF9bzxqK5iwyqCqbJxo8ggBVx9FtuXxRrQQ0SNM,2244
 datachain/query/dataset.py,sha256=B2EmGOL8gjrdU_WhU88Dj7FsxvxrNeKwe2STXnU9T9E,58369
 datachain/query/dispatch.py,sha256=GBh3EZHDp5AaXxrjOpfrpfsuy7Umnqxu-MAXcK9X3gc,12945
 datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
 datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
-datachain/query/schema.py,sha256=BvHipN79CnSTbVFcfIEwzo1npe7HmThnk0iY-CSLEkM,7899
+datachain/query/schema.py,sha256=ytlkA1xFAUOia25u8d6pxvxBSRl3uivLuOe2eHaw-qc,7550
 datachain/query/session.py,sha256=UPH5Z4fzCDsvj81ji0e8GA6Mgra3bOAEpVq4htqOtis,4317
 datachain/query/udf.py,sha256=j3NhmKK5rYG5TclcM2Sr0LhS1tmYLMjzMugx9G9iFLM,8100
 datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -96,9 +96,9 @@ datachain/sql/sqlite/base.py,sha256=WLPHBhZbXbiqPoRV1VgDrXJqku4UuvJpBhYeQ0k5rI8,
 datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.3.11.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.3.11.dist-info/METADATA,sha256=iSdfjWpVT1Iqzlg82eN5QzJ-icaYxkG7TUKEpEOi5sk,17124
-datachain-0.3.11.dist-info/WHEEL,sha256=cVxcB9AmuTcXqmwrtPhNK88dr7IR_b6qagTj0UvIEbY,91
-datachain-0.3.11.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.3.11.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.3.11.dist-info/RECORD,,
+datachain-0.3.13.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.3.13.dist-info/METADATA,sha256=pzMOR9LYuLR26Wifk4GPS9Wi1mmqCC5CIBZyA-X5_oo,17073
+datachain-0.3.13.dist-info/WHEEL,sha256=cVxcB9AmuTcXqmwrtPhNK88dr7IR_b6qagTj0UvIEbY,91
+datachain-0.3.13.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.3.13.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.3.13.dist-info/RECORD,,