datachain 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of datachain might be problematic.

datachain/lib/listing_info.py ADDED
@@ -0,0 +1,32 @@
+ from datetime import datetime, timedelta, timezone
+ from typing import Optional
+
+ from datachain.client import Client
+ from datachain.lib.dataset_info import DatasetInfo
+ from datachain.lib.listing import LISTING_PREFIX, LISTING_TTL
+
+
+ class ListingInfo(DatasetInfo):
+     @property
+     def uri(self) -> str:
+         return self.name.removeprefix(LISTING_PREFIX)
+
+     @property
+     def storage_uri(self) -> str:
+         client, _ = Client.parse_url(self.uri, None)  # type: ignore[arg-type]
+         return client.uri
+
+     @property
+     def expires(self) -> Optional[datetime]:
+         if not self.finished_at:
+             return None
+         return self.finished_at + timedelta(seconds=LISTING_TTL)
+
+     @property
+     def is_expired(self) -> bool:
+         return datetime.now(timezone.utc) > self.expires if self.expires else False
+
+     @property
+     def last_inserted_at(self):
+         # TODO we need to add updated_at to dataset version or explicit last_inserted_at
+         raise NotImplementedError
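
A note on the TTL logic above (not part of the diff): expires is finished_at plus LISTING_TTL, and an unfinished listing never expires. A minimal standalone sketch, with an illustrative 4-hour TTL standing in for the real LISTING_TTL constant from datachain.lib.listing:

    # Hypothetical values; finished_at normally comes from the listing dataset.
    from datetime import datetime, timedelta, timezone

    LISTING_TTL = 4 * 60 * 60  # illustrative stand-in, seconds
    finished_at = datetime.now(timezone.utc) - timedelta(hours=1)
    expires = finished_at + timedelta(seconds=LISTING_TTL)
    is_expired = datetime.now(timezone.utc) > expires
    print(is_expired)  # False: the hour-old listing is still reusable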
datachain/lib/meta_formats.py CHANGED
@@ -2,14 +2,14 @@
  # pip install jmespath
  #
  import csv
- import io
  import json
- import subprocess
- import sys
+ import tempfile
  import uuid
  from collections.abc import Iterator
+ from pathlib import Path
  from typing import Any, Callable

+ import datamodel_code_generator
  import jmespath as jsp
  from pydantic import BaseModel, ConfigDict, Field, ValidationError  # noqa: F401

@@ -47,17 +47,16 @@ def read_schema(source_file, data_type="csv", expr=None, model_name=None):
      data_string = ""
      # using uuid to get around issue #1617
      if not model_name:
-         uid_str = str(generate_uuid()).replace(
-             "-", ""
-         )  # comply with Python class names
+         # comply with Python class names
+         uid_str = str(generate_uuid()).replace("-", "")
          model_name = f"Model{data_type}{uid_str}"
      try:
          with source_file.open() as fd:  # CSV can be larger than memory
              if data_type == "csv":
-                 data_string += fd.readline().decode("utf-8", "ignore").replace("\r", "")
-                 data_string += fd.readline().decode("utf-8", "ignore").replace("\r", "")
+                 data_string += fd.readline().replace("\r", "")
+                 data_string += fd.readline().replace("\r", "")
              elif data_type == "jsonl":
-                 data_string = fd.readline().decode("utf-8", "ignore").replace("\r", "")
+                 data_string = fd.readline().replace("\r", "")
              else:
                  data_string = fd.read()  # other meta must fit into RAM
      except OSError as e:
@@ -70,33 +69,27 @@ def read_schema(source_file, data_type="csv", expr=None, model_name=None):
      if data_type == "jsonl":
          data_type = "json"  # treat json line as plain JSON in auto-schema
          data_string = json.dumps(json_object)
-     command = [
-         "datamodel-codegen",
-         "--input-file-type",
-         data_type,
-         "--class-name",
-         model_name,
-         "--base-class",
-         "datachain.lib.meta_formats.UserModel",
-     ]
-     try:
-         result = subprocess.run(  # noqa: S603
-             command,
-             input=data_string,
-             text=True,
-             capture_output=True,
-             check=True,
+
+     input_file_types = {i.value: i for i in datamodel_code_generator.InputFileType}
+     input_file_type = input_file_types[data_type]
+     with tempfile.TemporaryDirectory() as tmpdir:
+         output = Path(tmpdir) / "model.py"
+         datamodel_code_generator.generate(
+             data_string,
+             input_file_type=input_file_type,
+             output=output,
+             target_python_version=datamodel_code_generator.PythonVersion.PY_39,
+             base_class="datachain.lib.meta_formats.UserModel",
+             class_name=model_name,
+             additional_imports=["datachain.lib.data_model.DataModel"],
+             use_standard_collections=True,
          )
-         model_output = (
-             result.stdout
-         )  # This will contain the output from datamodel-codegen
-     except subprocess.CalledProcessError as e:
-         model_output = f"An error occurred in datamodel-codegen: {e.stderr}"
-     print(f"{model_output}")
-     print("from datachain.lib.data_model import DataModel")
-     print("\n" + f"DataModel.register({model_name})" + "\n")
-     print("\n" + f"spec={model_name}" + "\n")
-     return model_output
+         epilogue = f"""
+ {model_name}.model_rebuild()
+ DataModel.register({model_name})
+ spec = {model_name}
+ """
+         return output.read_text() + epilogue


  #
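For readers unfamiliar with the library call that replaces the subprocess: datamodel-code-generator (already a dependency, pinned >=0.25 in METADATA below) exposes the same codegen in-process. A minimal sketch under that assumption, with a made-up JSON sample:

    import tempfile
    from pathlib import Path

    import datamodel_code_generator

    sample = '{"name": "cat.jpg", "size": 2048}'
    with tempfile.TemporaryDirectory() as tmpdir:
        output = Path(tmpdir) / "model.py"
        datamodel_code_generator.generate(
            sample,
            input_file_type=datamodel_code_generator.InputFileType.Json,
            output=output,
            class_name="ModelJsonExample",
        )
        print(output.read_text())  # a pydantic model named ModelJsonExample
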
@@ -113,34 +106,24 @@ def read_meta(  # noqa: C901
  ) -> Callable:
      from datachain.lib.dc import DataChain

-     # ugly hack: datachain is run redirecting printed outputs to a variable
      if schema_from:
-         captured_output = io.StringIO()
-         current_stdout = sys.stdout
-         sys.stdout = captured_output
-         try:
-             chain = (
-                 DataChain.from_storage(schema_from)
-                 .limit(1)
-                 .map(  # dummy column created (#1615)
-                     meta_schema=lambda file: read_schema(
-                         file, data_type=meta_type, expr=jmespath, model_name=model_name
-                     ),
-                     output=str,
-                 )
+         chain = (
+             DataChain.from_storage(schema_from, type="text")
+             .limit(1)
+             .map(  # dummy column created (#1615)
+                 meta_schema=lambda file: read_schema(
+                     file, data_type=meta_type, expr=jmespath, model_name=model_name
+                 ),
+                 output=str,
              )
-             chain.exec()
-         finally:
-             sys.stdout = current_stdout
-             model_output = captured_output.getvalue()
-             captured_output.close()
-
+         )
+         (model_output,) = chain.collect("meta_schema")
      if print_schema:
          print(f"{model_output}")
      # Below 'spec' should be a dynamically converted DataModel from Pydantic
      if not spec:
          local_vars: dict[str, Any] = {}
-         exec(model_output, globals(), local_vars)  # noqa: S102
+         exec(model_output, globals(), local_vars)  # type: ignore[arg-type]  # noqa: S102
          spec = local_vars["spec"]

      if not (spec) and not (schema_from):
datachain/lib/signal_schema.py CHANGED
@@ -16,7 +16,6 @@ from typing import (
      get_origin,
  )

- import sqlalchemy as sa
  from pydantic import BaseModel, create_model
  from typing_extensions import Literal as LiteralEx

@@ -341,7 +340,7 @@ class SignalSchema:
          signals = [
              DEFAULT_DELIMITER.join(path)
              if not as_columns
-             else sa.Column(DEFAULT_DELIMITER.join(path), python_to_sql(_type))
+             else Column(DEFAULT_DELIMITER.join(path), python_to_sql(_type))
              for path, _type, has_subtree, _ in self.get_flat_tree()
              if not has_subtree
          ]
@@ -415,6 +414,10 @@ class SignalSchema:
              # renaming existing signal
              del new_values[value.name]
              new_values[name] = self.values[value.name]
+         elif name in self.values:
+             # changing the type of an existing signal, e.g. File -> ImageFile
+             del new_values[name]
+             new_values[name] = args_map[name]
          else:
              # adding new signal
              new_values.update(sql_to_python({name: value}))
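
The new elif handles mutate() calls that keep a signal's name but change its type. A toy sketch of the bookkeeping, with plain dicts standing in for real signal types:

    # delete-then-reinsert moves the signal to the end of the ordered dict,
    # mirroring the rename branch above
    values = {"file": "File", "score": "float"}
    new_values = dict(values)
    del new_values["file"]
    new_values["file"] = "ImageFile"  # same name, new type
    print(new_values)  # {'score': 'float', 'file': 'ImageFile'}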
datachain/node.py CHANGED
@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING, Any, Optional
  import attrs

  from datachain.cache import UniqueId
+ from datachain.lib.file import File
  from datachain.storage import StorageURI
  from datachain.utils import TIME_ZERO, time_to_str

@@ -189,6 +190,18 @@ class Entry:
              return ""
          return split[0]

+     def to_file(self, source: str) -> File:
+         return File(
+             source=source,
+             path=self.path,
+             size=self.size,
+             version=self.version,
+             etag=self.etag,
+             is_latest=self.is_latest,
+             last_modified=self.last_modified,
+             location=self.location,
+         )
+

  def get_path(parent: str, name: str):
      return f"{parent}/{name}" if parent else name
datachain/query/dataset.py CHANGED
@@ -1,7 +1,5 @@
  import contextlib
- import datetime
  import inspect
- import json
  import logging
  import os
  import random
@@ -38,11 +36,7 @@ from sqlalchemy.sql.selectable import Select
  from tqdm import tqdm

  from datachain.asyn import ASYNC_WORKERS, AsyncMapper, OrderedMapper
- from datachain.catalog import (
-     QUERY_SCRIPT_CANCELED_EXIT_CODE,
-     QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE,
-     get_catalog,
- )
+ from datachain.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE, get_catalog
  from datachain.data_storage.schema import (
      PARTITION_COLUMN_ID,
      partition_col_names,
@@ -60,7 +54,6 @@ from datachain.utils import (
      get_datachain_executable,
  )

- from .metrics import metrics
  from .schema import C, UDFParamSpec, normalize_param
  from .session import Session
  from .udf import UDFBase, UDFClassWrapper, UDFFactory, UDFType
@@ -219,7 +212,7 @@ class IndexingStep(StartingStep):
              recursive=self.recursive,
          )

-         storage = self.catalog.get_storage(uri)
+         storage = self.catalog.metastore.get_storage(uri)

          return step_result(q, dataset_rows.c, dependencies=[storage.uri])

@@ -1175,8 +1168,12 @@ class DatasetQuery:
          """
          return self.name is not None and self.version is not None

-     def c(self, name: Union[C, str]) -> "ColumnClause[Any]":
-         col = sqlalchemy.column(name) if isinstance(name, str) else name
+     def c(self, column: Union[C, str]) -> "ColumnClause[Any]":
+         col: sqlalchemy.ColumnClause = (
+             sqlalchemy.column(column)
+             if isinstance(column, str)
+             else sqlalchemy.column(column.name, column.type)
+         )
          col.table = self.table
          return col

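The practical effect of the reworked c(): a typed C column no longer loses its SQL type when rebuilt as a ColumnClause. A rough sketch in plain SQLAlchemy (the BigInteger type is illustrative):

    import sqlalchemy

    untyped = sqlalchemy.column("size")                         # string input
    typed = sqlalchemy.column("size", sqlalchemy.BigInteger())  # what C input now yields
    print(type(untyped.type).__name__)  # NullType
    print(type(typed.type).__name__)    # BigInteger
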
@@ -1634,7 +1631,7 @@ class DatasetQuery:
              )
          else:
              # storage dependency - its name is a valid StorageURI
-             storage = self.catalog.get_storage(dependency)
+             storage = self.catalog.metastore.get_storage(dependency)
              self.catalog.metastore.add_storage_dependency(
                  StorageURI(dataset.name),
                  version,
@@ -1712,113 +1709,23 @@
      return self.__class__(name=name, version=version, catalog=self.catalog)


- def _get_output_fd_for_write() -> Union[str, int]:
-     handle = os.getenv("DATACHAIN_OUTPUT_FD")
-     if not handle:
-         return os.devnull
-
-     if os.name != "nt":
-         return int(handle)
-
-     import msvcrt
-
-     return msvcrt.open_osfhandle(int(handle), os.O_WRONLY)  # type: ignore[attr-defined]
-
-
- @attrs.define
- class ExecutionResult:
-     preview: list[dict] = attrs.field(factory=list)
-     dataset: Optional[tuple[str, int]] = None
-     metrics: dict[str, Any] = attrs.field(factory=dict)
-
-
- def _send_result(dataset_query: DatasetQuery) -> None:
-     class JSONSerialize(json.JSONEncoder):
-         def default(self, obj):
-             if isinstance(obj, (datetime.datetime, datetime.date)):
-                 return obj.isoformat()
-             if isinstance(obj, bytes):
-                 return list(obj[:1024])
-             return super().default(obj)
-
-     try:
-         preview_args: dict[str, Any] = json.loads(
-             os.getenv("DATACHAIN_QUERY_PREVIEW_ARGS", "")
-         )
-     except ValueError:
-         preview_args = {}
-
-     columns = preview_args.get("columns") or []
-
-     if type(dataset_query) is DatasetQuery:
-         preview_query = dataset_query.select(*columns)
-     else:
-         preview_query = dataset_query.select(*columns, _sys=False)
-
-     preview_query = preview_query.limit(preview_args.get("limit", 10)).offset(
-         preview_args.get("offset", 0)
-     )
-
-     dataset: Optional[tuple[str, int]] = None
-     if dataset_query.attached:
-         assert dataset_query.name, "Dataset name should be provided"
-         assert dataset_query.version, "Dataset version should be provided"
-         dataset = dataset_query.name, dataset_query.version
-
-     preview = preview_query.to_db_records()
-     result = ExecutionResult(preview, dataset, metrics)
-     data = attrs.asdict(result)
-
-     with open(_get_output_fd_for_write(), mode="w") as f:
-         json.dump(data, f, cls=JSONSerialize)
-
-
- def query_wrapper(dataset_query: DatasetQuery) -> DatasetQuery:
+ def query_wrapper(dataset_query: Any) -> Any:
      """
      Wrapper function that wraps the last statement of user query script.
      Last statement MUST be instance of DatasetQuery, otherwise script exits with
      error code 10
      """
      if not isinstance(dataset_query, DatasetQuery):
-         sys.exit(QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE)
+         return dataset_query

      catalog = dataset_query.catalog
      save = bool(os.getenv("DATACHAIN_QUERY_SAVE"))
-     save_as = os.getenv("DATACHAIN_QUERY_SAVE_AS")

      is_session_temp_dataset = dataset_query.name and dataset_query.name.startswith(
          dataset_query.session.get_temp_prefix()
      )

-     if save_as:
-         if dataset_query.attached:
-             dataset_name = dataset_query.name
-             version = dataset_query.version
-             assert dataset_name, "Dataset name should be provided in attached mode"
-             assert version, "Dataset version should be provided in attached mode"
-
-             dataset = catalog.get_dataset(dataset_name)
-
-             try:
-                 target_dataset = catalog.get_dataset(save_as)
-             except DatasetNotFoundError:
-                 target_dataset = None
-
-             if target_dataset:
-                 dataset = catalog.register_dataset(dataset, version, target_dataset)
-             else:
-                 dataset = catalog.register_new_dataset(dataset, version, save_as)
-
-             dataset_query = DatasetQuery(
-                 name=dataset.name,
-                 version=dataset.latest_version,
-                 catalog=catalog,
-             )
-         else:
-             dataset_query = dataset_query.save(save_as)
-     elif save and (is_session_temp_dataset or not dataset_query.attached):
+     if save and (is_session_temp_dataset or not dataset_query.attached):
          name = catalog.generate_query_dataset_name()
          dataset_query = dataset_query.save(name)
-
-     _send_result(dataset_query)
      return dataset_query
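
Net effect on the wrapper's contract, sketched below; note the "error code 10" docstring above is now stale for non-DatasetQuery values, which are passed through instead:

    # Before: sys.exit(QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE), i.e. exit code 10.
    # Now: any non-DatasetQuery last statement comes back unchanged.
    assert query_wrapper(42) == 42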
datachain/query/metrics.py CHANGED
@@ -1,3 +1,4 @@
+ import os
  from typing import Optional, Union

  metrics: dict[str, Union[str, int, float, bool, None]] = {}
@@ -13,6 +14,13 @@ def set(key: str, value: Union[str, int, float, bool, None]) -> None:  # noqa: P
          raise TypeError("Value must be a string, int, float or bool")
      metrics[key] = value

+     if job_id := os.getenv("DATACHAIN_JOB_ID"):
+         from datachain.data_storage.job import JobStatus
+         from datachain.query.session import Session
+
+         metastore = Session.get().catalog.metastore
+         metastore.set_job_status(job_id, JobStatus.RUNNING, metrics=metrics)
+

  def get(key: str) -> Optional[Union[str, int, float, bool]]:
      """Get a metric value."""
datachain/utils.py CHANGED
@@ -448,3 +448,8 @@ def get_datachain_executable() -> list[str]:
      if datachain_exec_path := os.getenv("DATACHAIN_EXEC_PATH"):
          return [datachain_exec_path]
      return [sys.executable, "-m", "datachain"]
+
+
+ def uses_glob(path: str) -> bool:
+     """Checks if some URI path has glob syntax in it"""
+     return glob.has_magic(os.path.basename(os.path.normpath(path)))
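
Since only the basename is inspected, a wildcard in a parent directory does not count as a glob. A quick standalone sketch of the same logic:

    import glob
    import os

    def uses_glob(path: str) -> bool:
        return glob.has_magic(os.path.basename(os.path.normpath(path)))

    print(uses_glob("s3://bucket/images/*.jpg"))    # True
    print(uses_glob("s3://bucket/*/images/c.jpg"))  # False: glob not in basename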
{datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: datachain
- Version: 0.3.9
+ Version: 0.3.11
  Summary: Wrangle unstructured AI data at scale
  Author-email: Dmitry Petrov <support@dvc.org>
  License: Apache-2.0
@@ -42,6 +42,7 @@ Requires-Dist: datamodel-code-generator >=0.25
  Requires-Dist: Pillow <11,>=10.0.0
  Requires-Dist: msgpack <2,>=1.0.4
  Requires-Dist: psutil
+ Requires-Dist: huggingface-hub
  Requires-Dist: numpy <2,>=1 ; sys_platform == "win32"
  Provides-Extra: dev
  Requires-Dist: datachain[docs,tests] ; extra == 'dev'
@@ -67,7 +68,7 @@ Requires-Dist: pdfplumber ==0.11.4 ; extra == 'examples'
  Requires-Dist: huggingface-hub[hf_transfer] ; extra == 'examples'
  Provides-Extra: hf
  Requires-Dist: numba >=0.60.0 ; extra == 'hf'
- Requires-Dist: datasets[audio,vision] ; extra == 'hf'
+ Requires-Dist: datasets[audio,vision] >=2.21.0 ; extra == 'hf'
  Provides-Extra: remote
  Requires-Dist: lz4 ; extra == 'remote'
  Requires-Dist: requests >=2.22.0 ; extra == 'remote'
@@ -95,6 +96,10 @@ Requires-Dist: transformers >=4.36.0 ; extra == 'torch'
  Provides-Extra: vector
  Requires-Dist: usearch ; extra == 'vector'

+ .. image:: docs/assets/datachain_logotype.svg
+    :height: 48
+    :alt: DataChain logo
+
  |PyPI| |Python Version| |Codecov| |Tests|

  .. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
@@ -110,7 +115,6 @@ Requires-Dist: usearch ; extra == 'vector'
     :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
     :alt: Tests

- AI 🔗 DataChain
  ----------------

  DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
{datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/RECORD RENAMED
@@ -2,56 +2,57 @@ datachain/__init__.py,sha256=GeyhE-5LgfJav2OKYGaieP2lBvf2Gm-ihj7thnK9zjI,800
  datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
  datachain/asyn.py,sha256=biF8M8fQujtj5xs0VLi8S16eBtzG6kceWlO_NILbCsg,8197
  datachain/cache.py,sha256=wznC2pge6RhlPTaJfBVGjmBc6bxWCPThu4aTFMltvFU,4076
- datachain/cli.py,sha256=otR2eN0JL-JhZ9SOTPcPwt_-_TiT-vHifx2h4YzD6Tg,32052
+ datachain/cli.py,sha256=ECf_z5X8ILDJdUn2Cpb_z-ZjSRIzn7skiuMGfM-y9i0,30999
  datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
  datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
- datachain/dataset.py,sha256=MZezyuJWNj_3PEtzr0epPMNyWAOTrhTSPI5FmemV6L4,14470
+ datachain/dataset.py,sha256=EcYjhHg1dxxPbDwSuIxc-mDRDo3v_pYf79fMy4re1oA,14740
  datachain/error.py,sha256=GY9KYTmb7GHXn2gGHV9X-PBhgwLj3i7VpK7tGHtAoGM,1279
- datachain/job.py,sha256=bk25bIqClhgRPzlXAhxpTtDeewibQe5l3S8Cf7db0gM,1229
+ datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
  datachain/listing.py,sha256=keLkvPfumDA3gijeIiinH5yGWe71qCxgF5HqqP5AeH4,8299
- datachain/node.py,sha256=ihrP5l9HKpXLR0fR1wyb7QIdb7NR26dX6bB09qGX5B4,6005
+ datachain/node.py,sha256=dcm_7dVurFHpI0EHV2K6SjpJyh-gN4PVWAB-20quk04,6382
  datachain/nodes_fetcher.py,sha256=kca19yvu11JxoVY1t4_ydp1FmchiV88GnNicNBQ9NIA,831
  datachain/nodes_thread_pool.py,sha256=ZyzBvUImIPmi4WlKC2SW2msA0UhtembbTdcs2nx29A0,3191
  datachain/progress.py,sha256=7_8FtJs770ITK9sMq-Lt4k4k18QmYl4yIG_kCoWID3o,4559
  datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
- datachain/utils.py,sha256=ROVCLwb37VmFRzgTlSGUDw4eJNgYGiQ4yMX581HfUX8,12988
+ datachain/utils.py,sha256=HKUdVqreBTzcCULAYRw1sC6z33OaomVD1WoMSoFcPHg,13148
  datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
- datachain/catalog/catalog.py,sha256=kGpp9IEyr1YS7QFWjLYprRT1gp7freyt-WLaLNzqUZg,77859
+ datachain/catalog/catalog.py,sha256=NgS7_SlmpJdUSp1v8KdCuLTjFklmYvT_jOLdzTyyK5I,72313
  datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
  datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
- datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
  datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
  datachain/client/azure.py,sha256=LXSahE0Z6r4dXqpBkKnq3J5fg7N7ymC1lSn-1SoILGc,2687
  datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
- datachain/client/fsspec.py,sha256=G4QTm3KPhlaV74T3gLXJ86345_ak8CH38ezn2ET-oLc,13230
+ datachain/client/fsspec.py,sha256=LQb5tr-pP9umCFYo3nGJR_dZxUyiSN7IDE8jhp1TXco,13333
  datachain/client/gcs.py,sha256=P_E3mhzhXR9mJ_wc3AYZuczzwOJ0-D3J5qhJXeSU-xk,4518
+ datachain/client/hf.py,sha256=R-F6Ks6aVM9wSNkIXOkOnZFwsJlfdRwJjymRa78RLjM,1246
  datachain/client/local.py,sha256=H8TNY8pi2kA8y9_f_1XLUjJF66f229qC_b2y4xGkzdU,5300
  datachain/client/s3.py,sha256=aQxfMH8G8bUjmHF1-6P90MSkXsU5DgOPEVlKWLu459I,6568
  datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZv32Y8,398
  datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
  datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
  datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
- datachain/data_storage/metastore.py,sha256=nxcY6nwyEmQWMAo33sNGO-FgUFQs2amBGGnZz2ftEz0,55362
+ datachain/data_storage/metastore.py,sha256=cHN0xmbUvChyayHHZm3Vqxr87jFqojPSlGBqhTPStlE,54519
  datachain/data_storage/schema.py,sha256=JKpSEz8igpwZ9zkpRPYVXZxEpiXuLKEs2WNhH0KqM6U,8552
  datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
- datachain/data_storage/sqlite.py,sha256=jLgkvikYkENQUO_ykoNFfsBc2ofZXwFHLMa1nyWP3aw,28316
- datachain/data_storage/warehouse.py,sha256=cvlfa-nyIxqrrpSRtCdeVjlTwhn7rcIoWjOq91HhItU,33668
+ datachain/data_storage/sqlite.py,sha256=Z4B2KDL4C8Uio2aLMxaKv0t2MoOtCV3bSqWg4X9mTFg,28048
+ datachain/data_storage/warehouse.py,sha256=f7ETnYIXx5KMcPfwg_4bh_00QJiMLIliwE_41vmRGUo,33037
  datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datachain/lib/arrow.py,sha256=17-jHLdYhsSdO5kfKWpBS5OAWbMjNi5r8ao0zGXUBoA,4941
+ datachain/lib/arrow.py,sha256=dV17oGiknqEW55ogGK_9T0ycNFwd2z-EFOW0AQiR6TU,5840
  datachain/lib/clip.py,sha256=33RL11OIqfbwyhvBgiMGM8rDAnZx1IRmxk9dY89ls3Q,6130
  datachain/lib/data_model.py,sha256=gHIjlow84GMRDa78yLL1Ud-N18or21fnTyPEwsatpXY,2045
- datachain/lib/dataset_info.py,sha256=lONGr71ozo1DS4CQEhnpKORaU4qFb6Ketv8Xm8CVm2U,2188
- datachain/lib/dc.py,sha256=tY_ccOsv9njsXF23cwoZ7tSTCDKCfakyRvsIBLKE0SE,63976
- datachain/lib/file.py,sha256=ZHpdilDPYCob8uqtwUPtBvBNxVvQRq4AC_0IGg5m-G4,12003
- datachain/lib/hf.py,sha256=mYaHFPS4CW2-stRZHBMWW-NKN4dhrnhjZobBgRocnvo,5317
+ datachain/lib/dataset_info.py,sha256=srPPhI2UHf6hFPBecyFEVw2SS5aPisIIMsvGgKqi7ss,2366
+ datachain/lib/dc.py,sha256=s4E-bD6_T6JFJ7TEa5Y9RS705lIfcV9OUJwDD6RNCX0,68156
+ datachain/lib/file.py,sha256=WOOYw3LcGROA6wshJ_aZkSgcTqfB4UxTbZDTx9KqAOg,11429
+ datachain/lib/hf.py,sha256=ZiMvgy3DYiklGKZv-w7gevrHOgn3bGfpTlpDPOHCNqs,5336
  datachain/lib/image.py,sha256=WbcwSaFzuyqjg4x4hH5CUogeUQjkZFjQHqw_oDEV1nA,2655
- datachain/lib/listing.py,sha256=nXLmGae_oQke4hnurzzWiHTEjHjWiqqHdB41Wb-hMTk,3521
- datachain/lib/meta_formats.py,sha256=Hels85LJmNCz1aYVJvhymNdAt3qdJ2-qoxsIiUezrow,7198
+ datachain/lib/listing.py,sha256=S9Xn_Saxu4xk3K_01VexkfMZW0INQiATlidt2bzgWKY,3938
+ datachain/lib/listing_info.py,sha256=sr5KzCXlCxlPuRmy_pVadD4miLpp5y0btvyaIPcluwI,996
+ datachain/lib/meta_formats.py,sha256=67uF9trQ2II6xFvN0u6eo5NNRf5xvCkpMHj7ThiG41Y,6777
  datachain/lib/model_store.py,sha256=c4USXsBBjrGH8VOh4seIgOiav-qHOwdoixtxfLgU63c,2409
  datachain/lib/pytorch.py,sha256=vK3GbWCy7kunN7ubul6w1hrWmJLja56uTCiMG_7XVQA,5623
  datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
- datachain/lib/signal_schema.py,sha256=rW1R6nIzdtmqWzpXk7aNAfrQD58_gbvkvEGyNTQ4WNM,20099
+ datachain/lib/signal_schema.py,sha256=hqQLwUmt3w8RLa96MtubK9N2CBXqqTPrUkSRXc0ktt4,20275
  datachain/lib/text.py,sha256=vqs1SQdsw1vCzfvOanIeT4xY2R2TmPonElBgYDVeZmY,1241
  datachain/lib/udf.py,sha256=nG7DDuPgZ5ZuijwvDoCq-OZMxlDM8vFNzyxMmik0Y1c,11716
  datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
@@ -68,9 +69,9 @@ datachain/lib/convert/values_to_tuples.py,sha256=YOdbjzHq-uj6-cV2Qq43G72eN2avMND
  datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
  datachain/query/batch.py,sha256=-vlpINJiertlnaoUVv1C95RatU0F6zuhpIYRufJRo1M,3660
  datachain/query/builtins.py,sha256=EmKPYsoQ46zwdyOn54MuCzvYFmfsBn5F8zyF7UBUfrc,2550
- datachain/query/dataset.py,sha256=mHqSyovJlCQ7pKVMQKKKCiTJs3bP1GDXLKpOSpzVxx8,61378
+ datachain/query/dataset.py,sha256=B2EmGOL8gjrdU_WhU88Dj7FsxvxrNeKwe2STXnU9T9E,58369
  datachain/query/dispatch.py,sha256=GBh3EZHDp5AaXxrjOpfrpfsuy7Umnqxu-MAXcK9X3gc,12945
- datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
+ datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
  datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
  datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
  datachain/query/schema.py,sha256=BvHipN79CnSTbVFcfIEwzo1npe7HmThnk0iY-CSLEkM,7899
@@ -95,9 +96,9 @@ datachain/sql/sqlite/base.py,sha256=WLPHBhZbXbiqPoRV1VgDrXJqku4UuvJpBhYeQ0k5rI8,
  datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
  datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
- datachain-0.3.9.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
- datachain-0.3.9.dist-info/METADATA,sha256=r5uNlVdal7YrsX7nYE56c_Ak8YZIgXqCiSwNJF5KjlY,17015
- datachain-0.3.9.dist-info/WHEEL,sha256=UvcQYKBHoFqaQd6LKyqHw9fxEolWLQnlzP0h_LgJAfI,91
- datachain-0.3.9.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
- datachain-0.3.9.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
- datachain-0.3.9.dist-info/RECORD,,
+ datachain-0.3.11.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+ datachain-0.3.11.dist-info/METADATA,sha256=iSdfjWpVT1Iqzlg82eN5QzJ-icaYxkG7TUKEpEOi5sk,17124
+ datachain-0.3.11.dist-info/WHEEL,sha256=cVxcB9AmuTcXqmwrtPhNK88dr7IR_b6qagTj0UvIEbY,91
+ datachain-0.3.11.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+ datachain-0.3.11.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+ datachain-0.3.11.dist-info/RECORD,,
{datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/WHEEL RENAMED
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (74.0.0)
+ Generator: setuptools (74.1.2)
  Root-Is-Purelib: true
  Tag: py3-none-any

datachain/catalog/subclass.py DELETED
@@ -1,60 +0,0 @@
- import ast
-
-
- class SubclassFinder(ast.NodeVisitor):
-     """Finds subclasses of a target class in an AST."""
-
-     def __init__(self, target_classes: list[str]):
-         self.imports: list[ast.AST] = []
-         self.main_body: list[ast.AST] = []
-
-         self.target_classes: list[str] = target_classes
-         self.aliases: dict[str, str] = {}
-         self.feature_class: list[ast.AST] = []
-
-     def visit_ImportFrom(self, node):  # noqa: N802
-         module = node.module
-         for alias in node.names:
-             full_name = f"{module}.{alias.name}"
-             self.aliases[alias.asname or alias.name] = full_name
-         self.imports.append(node)
-
-     def visit_Import(self, node):  # noqa: N802
-         for alias in node.names:
-             self.aliases[alias.asname or alias.name] = alias.name
-         self.imports.append(node)
-
-     def visit_ClassDef(self, node):  # noqa: N802
-         base_names = [self.get_base_name(base) for base in node.bases]
-         if any(self.is_subclass(name) for name in base_names):
-             self.feature_class.append(node)
-         else:
-             self.main_body.append(node)
-
-     def visit(self, node):
-         if isinstance(
-             node,
-             (ast.Import, ast.ImportFrom, ast.ClassDef, ast.Module),
-         ):
-             return super().visit(node)
-         self.main_body.append(node)
-         return node
-
-     def get_base_name(self, node):
-         if isinstance(node, ast.Name):
-             return self.aliases.get(node.id, node.id)
-         if isinstance(node, ast.Attribute):
-             return self.get_full_attr_name(node)
-         if isinstance(node, ast.Subscript):
-             return self.get_base_name(node.value)
-         return None
-
-     def get_full_attr_name(self, node):
-         if isinstance(node.value, ast.Name):
-             return f"{node.value.id}.{node.attr}"
-         if isinstance(node.value, ast.Attribute):
-             return f"{self.get_full_attr_name(node.value)}.{node.attr}"
-         return node.attr
-
-     def is_subclass(self, base_name):
-         return base_name and base_name.split(".")[-1] in self.target_classes