datachain 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

@@ -1,3 +1,4 @@
1
+ import ast
1
2
  import contextlib
2
3
  import datetime
3
4
  import inspect
@@ -51,9 +52,10 @@ from datachain.data_storage.schema import (
51
52
  from datachain.dataset import DatasetStatus, RowDict
52
53
  from datachain.error import DatasetNotFoundError, QueryScriptCancelError
53
54
  from datachain.progress import CombinedDownloadCallback
55
+ from datachain.query.schema import DEFAULT_DELIMITER
54
56
  from datachain.sql.functions import rand
55
57
  from datachain.storage import Storage, StorageURI
56
- from datachain.utils import batched, determine_processes
58
+ from datachain.utils import batched, determine_processes, inside_notebook
57
59
 
58
60
  from .batch import RowBatch
59
61
  from .metrics import metrics
@@ -62,7 +64,6 @@ from .session import Session
62
64
  from .udf import UDFBase, UDFClassWrapper, UDFFactory, UDFType
63
65
 
64
66
  if TYPE_CHECKING:
65
- import pandas as pd
66
67
  from sqlalchemy.sql.elements import ClauseElement
67
68
  from sqlalchemy.sql.schema import Table
68
69
  from sqlalchemy.sql.selectable import GenerativeSelect
@@ -547,8 +548,9 @@ class UDF(Step, ABC):
547
548
  else:
548
549
  udf = self.udf
549
550
 
550
- if hasattr(udf.func, "bootstrap") and callable(udf.func.bootstrap):
551
- udf.func.bootstrap()
551
+ if hasattr(udf.func, "setup") and callable(udf.func.setup):
552
+ udf.func.setup()
553
+
552
554
  warehouse = self.catalog.warehouse
553
555
 
554
556
  with contextlib.closing(
@@ -599,12 +601,15 @@ class UDF(Step, ABC):
599
601
  # Create a dynamic module with the generated name
600
602
  dynamic_module = types.ModuleType(feature_module_name)
601
603
  # Get the import lines for the necessary objects from the main module
602
- import_lines = [
603
- source.getimport(obj, alias=name)
604
- for name, obj in inspect.getmembers(sys.modules["__main__"], _imports)
605
- if not (name.startswith("__") and name.endswith("__"))
606
- ]
607
604
  main_module = sys.modules["__main__"]
605
+ if getattr(main_module, "__file__", None):
606
+ import_lines = list(get_imports(main_module))
607
+ else:
608
+ import_lines = [
609
+ source.getimport(obj, alias=name)
610
+ for name, obj in main_module.__dict__.items()
611
+ if _imports(obj) and not (name.startswith("__") and name.endswith("__"))
612
+ ]
608
613
 
609
614
  # Get the feature classes from the main module
610
615
  feature_classes = {
@@ -612,6 +617,10 @@ class UDF(Step, ABC):
612
617
  for name, obj in main_module.__dict__.items()
613
618
  if _feature_predicate(obj)
614
619
  }
620
+ if not feature_classes:
621
+ yield None
622
+ return
623
+
615
624
  # Get the source code of the feature classes
616
625
  feature_sources = [source.getsource(cls) for _, cls in feature_classes.items()]
617
626
  # Set the module name for the feature classes to the generated name
@@ -621,7 +630,7 @@ class UDF(Step, ABC):
621
630
  # Add the dynamic module to the sys.modules dictionary
622
631
  sys.modules[feature_module_name] = dynamic_module
623
632
  # Combine the import lines and feature sources
624
- feature_file = "".join(import_lines) + "\n".join(feature_sources)
633
+ feature_file = "\n".join(import_lines) + "\n" + "\n".join(feature_sources)
625
634
 
626
635
  # Write the module content to a .py file
627
636
  with open(f"{feature_module_name}.py", "w") as module_file:
@@ -1362,33 +1371,11 @@ class DatasetQuery:
1362
1371
  cols = result.columns
1363
1372
  return [dict(zip(cols, row)) for row in result]
1364
1373
 
1365
- @classmethod
1366
- def create_empty_record(
1367
- cls, name: Optional[str] = None, session: Optional[Session] = None
1368
- ) -> "DatasetRecord":
1369
- session = Session.get(session)
1370
- if name is None:
1371
- name = session.generate_temp_dataset_name()
1372
- columns = session.catalog.warehouse.dataset_row_cls.file_columns()
1373
- return session.catalog.create_dataset(name, columns=columns)
1374
-
1375
- @classmethod
1376
- def insert_record(
1377
- cls,
1378
- dsr: "DatasetRecord",
1379
- record: dict[str, Any],
1380
- session: Optional[Session] = None,
1381
- ) -> None:
1382
- session = Session.get(session)
1383
- dr = session.catalog.warehouse.dataset_rows(dsr)
1384
- insert_q = dr.get_table().insert().values(**record)
1385
- session.catalog.warehouse.db.execute(insert_q)
1386
-
1387
1374
  def to_pandas(self) -> "pd.DataFrame":
1388
- import pandas as pd
1389
-
1390
1375
  records = self.to_records()
1391
- return pd.DataFrame.from_records(records)
1376
+ df = pd.DataFrame.from_records(records)
1377
+ df.columns = [c.replace(DEFAULT_DELIMITER, ".") for c in df.columns]
1378
+ return df
1392
1379
 
1393
1380
  def shuffle(self) -> "Self":
1394
1381
  # ToDo: implement shaffle based on seed and/or generating random column
@@ -1410,8 +1397,17 @@ class DatasetQuery:
1410
1397
 
1411
1398
  def show(self, limit=20) -> None:
1412
1399
  df = self.limit(limit).to_pandas()
1413
- no_footer = re.sub(r"\n\[\d+ rows x \d+ columns\]$", "", str(df))
1414
- print(no_footer.rstrip(" \n"))
1400
+
1401
+ options = ["display.max_colwidth", 50, "display.show_dimensions", False]
1402
+ with pd.option_context(*options):
1403
+ if inside_notebook():
1404
+ from IPython.display import display
1405
+
1406
+ display(df)
1407
+
1408
+ else:
1409
+ print(df.to_string())
1410
+
1415
1411
  if len(df) == limit:
1416
1412
  print(f"[limited by {limit} objects]")
1417
1413
 
@@ -1692,6 +1688,15 @@ class DatasetQuery:
1692
1688
  storage.timestamp_str,
1693
1689
  )
1694
1690
 
1691
+ def exec(self) -> "Self":
1692
+ """Execute the query."""
1693
+ try:
1694
+ query = self.clone()
1695
+ query.apply_steps()
1696
+ finally:
1697
+ self.cleanup()
1698
+ return query
1699
+
1695
1700
  def save(
1696
1701
  self,
1697
1702
  name: Optional[str] = None,
@@ -1878,3 +1883,24 @@ def _feature_predicate(obj):
1878
1883
 
1879
1884
  def _imports(obj):
1880
1885
  return not source.isfrommain(obj)
1886
+
1887
+
1888
+ def get_imports(m):
1889
+ root = ast.parse(inspect.getsource(m))
1890
+
1891
+ for node in ast.iter_child_nodes(root):
1892
+ if isinstance(node, ast.Import):
1893
+ module = None
1894
+ elif isinstance(node, ast.ImportFrom):
1895
+ module = node.module
1896
+ else:
1897
+ continue
1898
+
1899
+ for n in node.names:
1900
+ import_script = ""
1901
+ if module:
1902
+ import_script += f"from {module} "
1903
+ import_script += f"import {n.name}"
1904
+ if n.asname:
1905
+ import_script += f" as {n.asname}"
1906
+ yield import_script
@@ -370,8 +370,8 @@ class UDFWorker:
370
370
  return WorkerCallback(self.done_queue)
371
371
 
372
372
  def run(self) -> None:
373
- if hasattr(self.udf.func, "bootstrap") and callable(self.udf.func.bootstrap):
374
- self.udf.func.bootstrap()
373
+ if hasattr(self.udf.func, "setup") and callable(self.udf.func.setup):
374
+ self.udf.func.setup()
375
375
  while (batch := get_from_queue(self.task_queue)) != STOP_SIGNAL:
376
376
  n_rows = len(batch.rows) if isinstance(batch, RowBatch) else 1
377
377
  udf_output = self.udf(
datachain/query/schema.py CHANGED
@@ -3,14 +3,12 @@ import json
3
3
  from abc import ABC, abstractmethod
4
4
  from datetime import datetime, timezone
5
5
  from fnmatch import fnmatch
6
- from random import getrandbits
7
6
  from typing import TYPE_CHECKING, Any, Callable, ClassVar, Optional, Union
8
7
 
9
8
  import attrs
10
9
  import sqlalchemy as sa
11
10
  from fsspec.callbacks import DEFAULT_CALLBACK, Callback
12
11
 
13
- from datachain.data_storage.warehouse import RANDOM_BITS
14
12
  from datachain.sql.types import JSON, Boolean, DateTime, Int, Int64, SQLType, String
15
13
 
16
14
  if TYPE_CHECKING:
@@ -217,7 +215,7 @@ class DatasetRow:
217
215
  "source": String,
218
216
  "parent": String,
219
217
  "name": String,
220
- "size": Int,
218
+ "size": Int64,
221
219
  "location": JSON,
222
220
  "vtype": String,
223
221
  "dir_type": Int,
@@ -227,8 +225,6 @@ class DatasetRow:
227
225
  "last_modified": DateTime,
228
226
  "version": String,
229
227
  "etag": String,
230
- # system column
231
- "random": Int64,
232
228
  }
233
229
 
234
230
  @staticmethod
@@ -267,8 +263,6 @@ class DatasetRow:
267
263
 
268
264
  last_modified = last_modified or datetime.now(timezone.utc)
269
265
 
270
- random = getrandbits(RANDOM_BITS)
271
-
272
266
  return ( # type: ignore [return-value]
273
267
  source,
274
268
  parent,
@@ -283,7 +277,6 @@ class DatasetRow:
283
277
  last_modified,
284
278
  version,
285
279
  etag,
286
- random,
287
280
  )
288
281
 
289
282
  @staticmethod
datachain/query/udf.py CHANGED
@@ -14,6 +14,7 @@ from typing import (
14
14
  from fsspec.callbacks import DEFAULT_CALLBACK, Callback
15
15
 
16
16
  from datachain.dataset import RowDict
17
+ from datachain.lib.utils import AbstractUDF
17
18
 
18
19
  from .batch import Batch, BatchingStrategy, NoBatching, Partition, RowBatch
19
20
  from .schema import (
@@ -58,14 +59,6 @@ class UDFProperties:
58
59
  def signal_names(self) -> Iterable[str]:
59
60
  return self.output.keys()
60
61
 
61
- def parameter_parser(self) -> Callable:
62
- """Generate a parameter list from a dataset row."""
63
-
64
- def plist(catalog: "Catalog", row: "RowDict", **kwargs) -> list:
65
- return [p.get_value(catalog, row, **kwargs) for p in self.params]
66
-
67
- return plist
68
-
69
62
 
70
63
  def udf(
71
64
  params: Sequence[UDFParamSpec],
@@ -113,32 +106,37 @@ class UDFBase:
113
106
  self.func = func
114
107
  self.properties = properties
115
108
  self.signal_names = properties.signal_names()
116
- self.parameter_parser = properties.parameter_parser()
117
109
  self.output = properties.output
118
110
 
119
111
  def __call__(
120
112
  self,
121
113
  catalog: "Catalog",
122
- param: "BatchingResult",
114
+ arg: "BatchingResult",
123
115
  is_generator: bool = False,
124
116
  cache: bool = False,
125
117
  cb: Callback = DEFAULT_CALLBACK,
126
118
  ) -> Iterable[UDFResult]:
127
- if isinstance(param, RowBatch):
119
+ if isinstance(self.func, AbstractUDF):
120
+ self.func._catalog = catalog # type: ignore[unreachable]
121
+
122
+ if isinstance(arg, RowBatch):
128
123
  udf_inputs = [
129
- self.parameter_parser(catalog, row, cache=cache, cb=cb)
130
- for row in param.rows
124
+ self.bind_parameters(catalog, row, cache=cache, cb=cb)
125
+ for row in arg.rows
131
126
  ]
132
127
  udf_outputs = self.func(udf_inputs)
133
- return self._process_results(param.rows, udf_outputs, is_generator)
134
- if isinstance(param, RowDict):
135
- udf_inputs = self.parameter_parser(catalog, param, cache=cache, cb=cb)
128
+ return self._process_results(arg.rows, udf_outputs, is_generator)
129
+ if isinstance(arg, RowDict):
130
+ udf_inputs = self.bind_parameters(catalog, arg, cache=cache, cb=cb)
136
131
  udf_outputs = self.func(*udf_inputs)
137
132
  if not is_generator:
138
133
  # udf_outputs is generator already if is_generator=True
139
134
  udf_outputs = [udf_outputs]
140
- return self._process_results([param], udf_outputs, is_generator)
141
- raise ValueError(f"unexpected UDF parameter {param}")
135
+ return self._process_results([arg], udf_outputs, is_generator)
136
+ raise ValueError(f"Unexpected UDF argument: {arg}")
137
+
138
+ def bind_parameters(self, catalog: "Catalog", row: "RowDict", **kwargs) -> list:
139
+ return [p.get_value(catalog, row, **kwargs) for p in self.properties.params]
142
140
 
143
141
  def _process_results(
144
142
  self,
datachain/utils.py CHANGED
@@ -360,3 +360,31 @@ class JSONSerialize(json.JSONEncoder):
360
360
  return str(obj)
361
361
 
362
362
  return super().default(obj)
363
+
364
+
365
+ def inside_colab() -> bool:
366
+ try:
367
+ from google import colab # noqa: F401
368
+ except ImportError:
369
+ return False
370
+ return True
371
+
372
+
373
+ def inside_notebook() -> bool:
374
+ if inside_colab():
375
+ return True
376
+
377
+ try:
378
+ shell = get_ipython().__class__.__name__ # type: ignore[name-defined]
379
+ except NameError:
380
+ return False
381
+
382
+ if shell == "ZMQInteractiveShell":
383
+ try:
384
+ import IPython
385
+
386
+ return IPython.__version__ >= "6.0.0"
387
+ except ImportError:
388
+ return False
389
+
390
+ return False
@@ -1,9 +1,10 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.2.1
3
+ Version: 0.2.3
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
7
+ Project-URL: Documentation, https://datachain.dvc.ai
7
8
  Project-URL: Issues, https://github.com/iterative/dvcx/issues
8
9
  Project-URL: Source, https://github.com/iterative/dvcx
9
10
  Classifier: Programming Language :: Python :: 3
@@ -7,16 +7,16 @@ datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
7
7
  datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
8
8
  datachain/dataset.py,sha256=MZezyuJWNj_3PEtzr0epPMNyWAOTrhTSPI5FmemV6L4,14470
9
9
  datachain/error.py,sha256=GY9KYTmb7GHXn2gGHV9X-PBhgwLj3i7VpK7tGHtAoGM,1279
10
- datachain/listing.py,sha256=1arE_9gpjhHqGQCpQZj_mLoocrZWRNDHJ-bkPc08NQs,8247
10
+ datachain/listing.py,sha256=sX8vZNzAzoTel1li6VJiYeHUJwseUERVEoW9D5P7tII,8192
11
11
  datachain/node.py,sha256=fHe7k5ajI2g2qnzsG-_NQR_T-QdBYctVeEa8c8dsu_Y,5703
12
12
  datachain/nodes_fetcher.py,sha256=kca19yvu11JxoVY1t4_ydp1FmchiV88GnNicNBQ9NIA,831
13
13
  datachain/nodes_thread_pool.py,sha256=ZyzBvUImIPmi4WlKC2SW2msA0UhtembbTdcs2nx29A0,3191
14
14
  datachain/progress.py,sha256=7_8FtJs770ITK9sMq-Lt4k4k18QmYl4yIG_kCoWID3o,4559
15
15
  datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
16
  datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
17
- datachain/utils.py,sha256=DV-_OON2OomEbxuQuK1lE_2qNTf28QByNcNcEhYsilE,10202
17
+ datachain/utils.py,sha256=12yQAV8tfyCHqp_xJcJBeNnr1L_BO8e2bOPyXdM68gs,10759
18
18
  datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
19
- datachain/catalog/catalog.py,sha256=5WkICtTYCN5xSMGDd5djLnEBw8kkcDf-IpFYf7kfeuQ,78654
19
+ datachain/catalog/catalog.py,sha256=pulKGJgAmxqSmFqBhA-J0wCKdBqGX4vqpV0cAvV6vUw,79578
20
20
  datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
21
21
  datachain/catalog/loader.py,sha256=GJ8zhEYkC7TuaPzCsjJQ4LtTdECu-wwYzC12MikPOMQ,7307
22
22
  datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
@@ -28,52 +28,52 @@ datachain/client/gcs.py,sha256=ucX8e6JrqlFY-f80zkv084vxnKdtxpO32QJ-RG8Nv1s,4454
28
28
  datachain/client/local.py,sha256=NQVkLTJQ-a7Udavqbh_4uT-IejfZQYn10j22owz9sis,5150
29
29
  datachain/client/s3.py,sha256=TmW4f7VUM5CMZjSmgyFQFKeMUGrXt2SLoLEbLOUleiU,6296
30
30
  datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZv32Y8,398
31
- datachain/data_storage/db_engine.py,sha256=mxOoWP4ntBMgLeTAk4dlEeIJArAz4x_tFrHytcAfLpo,3341
31
+ datachain/data_storage/db_engine.py,sha256=rgBuqJ-M1j5QyqiUQuJRewctuvRRj8LBDL54-aPEFxE,3287
32
32
  datachain/data_storage/id_generator.py,sha256=VlDALKijggegAnNMJwuMETJgnLoPYxpkrkld5DNTPQw,3839
33
33
  datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
34
34
  datachain/data_storage/metastore.py,sha256=y-4fYvuOPnWeYxAvqhDnw6CdlTvQiurg0Gg4TaG9LR0,54074
35
- datachain/data_storage/schema.py,sha256=t58LexPOCam_vWV0W52otEDNXgtFPHX3QFApEncFy2s,8809
35
+ datachain/data_storage/schema.py,sha256=bY3q2OUaUraos0s5BnwWkhgce8YpeNmIl7M1ifshoes,8074
36
36
  datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
37
37
  datachain/data_storage/sqlite.py,sha256=F68Q_AIqNAObZ5kJ0GnBqRC6e2D2sRehkQo8UzrHgtI,25079
38
- datachain/data_storage/warehouse.py,sha256=tL2mYoXVZe-coKLTRXEJ0sMdEr2BD0GwgIWip5PP5CM,33300
38
+ datachain/data_storage/warehouse.py,sha256=h35JiJoCGtwkMctis_x3NHxkwEejX5sIWvJOluZxrOI,33132
39
39
  datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
40
- datachain/lib/arrow.py,sha256=7lAas8hSh3vL7S7s2KOlkYn4viQpfVbM_FQ_hLCh5oc,2593
40
+ datachain/lib/arrow.py,sha256=FF3WWUOjB6Prw8ygfiLsrVfrdob0S01lPzEazuGqoO8,2556
41
41
  datachain/lib/cached_stream.py,sha256=t2ifK0hZVZiVn0MQ8D3FaFK1-qK84TwJW2Dw1SRsw9g,1066
42
42
  datachain/lib/claude.py,sha256=iAauA1zNVNONpLzUo1t0QN5PZ5Ot6cZkfib7Ka_c638,1969
43
- datachain/lib/dc.py,sha256=PBbEZhSPnbvB6jh2eTgZyDSouAGbjgEv8xabW45_vmk,35460
43
+ datachain/lib/clip.py,sha256=rDeZlFGs0DXBlpmh5ZQJhR9Sz13bWAZGQjfYm1hsUI4,5388
44
+ datachain/lib/dc.py,sha256=Sf99R0oOqf7tlS2gieaG56z3bF7YVcMjhJOZrFRfFs8,34778
44
45
  datachain/lib/feature.py,sha256=QDloA9HE7URf9J_veKrguYBvSg-0cbXZFTswNxrKsB8,12135
45
46
  datachain/lib/feature_registry.py,sha256=K3jGQzBp2HZDjR9hdGe1BZaXOAne8RpkCRRQdTVjkTs,1622
46
- datachain/lib/feature_utils.py,sha256=LIK233IWGWFhuav5Rm8de0xIOSnuwA1ubk6OYrxrfN0,4712
47
- datachain/lib/file.py,sha256=GQrqGgCEHICrUTdzTz_yhXqJWiae9EPTte1sd3hKeEU,8246
47
+ datachain/lib/feature_utils.py,sha256=F4ZENO6tTQvd36a-O1AurYjFSUpoyZaT4qgXsKjQDts,4650
48
+ datachain/lib/file.py,sha256=TdhsPYmG0Atkd_QAO997oA8AuM854wNbjjLLT1uiD2M,8346
48
49
  datachain/lib/gpt4_vision.py,sha256=idyXVZVWzltstGaVIu5RYE5UNbdqcPEjIWy81O1MwkM,2922
49
50
  datachain/lib/hf_image_to_text.py,sha256=HiPSWzJRDT-vnz9DXJbJBNCMNl9wmpxiSS3PbbVz8SE,3310
50
51
  datachain/lib/hf_pipeline.py,sha256=f0AH_XCziOF1OKN3d1w1swTBLaeajMJ8xgdsX37i5-o,2287
51
- datachain/lib/image.py,sha256=l2lgUR3YQzjpBmTJewzUtL5zJsLDQH32lbbaLu9WvWA,3631
52
+ datachain/lib/image.py,sha256=ZYfDqr9p-RRmWBeWFQwXLS1J3vQS616ykfMUvQVpqBY,2717
52
53
  datachain/lib/image_transform.py,sha256=NXWtnVOcofWBgl_YMxb4ABpaT7JTBMx7tLKvErH1IC4,3024
53
54
  datachain/lib/iptc_exif_xmp.py,sha256=xrbxFeY-wRP6T5JsUgE3EXfTxKvZVymRaRD_VIfxD0A,2236
54
55
  datachain/lib/meta_formats.py,sha256=wIVVLRLp45Zk4vjZRd_P1UtD24vpDCb-vILWtcsACwk,6630
55
- datachain/lib/pytorch.py,sha256=oU16XXAyAmiiabe1IoQoID00-u3uZ5GhCN48uAl6WDs,5421
56
- datachain/lib/reader.py,sha256=rPXXNoTUdm6PQwkAlaU-nOBreP_q4ett_EjFStrA_W0,1727
56
+ datachain/lib/pytorch.py,sha256=Z7iZCsqJzUT0PynVo23Xu4Fx7qIuuEZyH83R1tR5mfI,5561
57
57
  datachain/lib/settings.py,sha256=6Nkoh8riETrftYwDp3aniK53Dsjc07MdztL8N0cW1D8,2849
58
- datachain/lib/signal_schema.py,sha256=KaH194dAH8Zt8FtlNAgdVqcZlJc42y7RbcB37ldPPAY,11688
59
- datachain/lib/text.py,sha256=EEZrYohADi5rAGg3aLLRwtvyAV9js_yWAGhr2C3QbwI,2424
60
- datachain/lib/udf.py,sha256=D9TMxkAvj3zPRnZmkCxadEDtiG3B45t2xAEpuO14MOQ,5600
61
- datachain/lib/udf_signature.py,sha256=DAWMQ0dvFkKabpY5MV5K2q9YmOSTKfiV8KuUBs_6kMg,7258
58
+ datachain/lib/signal_schema.py,sha256=KTegbx-yMvtaKEoUxLgDx5MxMA8De-nmdtqnV1932N8,10151
59
+ datachain/lib/text.py,sha256=PUT1O0jNJoQGsuhff2LgDpzTWk2eMdwIKqEDBrE448M,1307
60
+ datachain/lib/udf.py,sha256=kMlOsHCVybnnq4AMtYqjylZH7x2tGE62FsDPOu9qhWM,6612
61
+ datachain/lib/udf_signature.py,sha256=CUKgoVpM_N8CgvMncpAw2RYchoiJdAGdDSdluoP0hIk,7161
62
62
  datachain/lib/unstructured.py,sha256=9Y6rAelXdYqkNbPaqz6DhXjhS8d6qXcP0ieIsWkzvkk,1143
63
- datachain/lib/utils.py,sha256=YQKzuW096SGe7QwHwdyS47k_9l2Rh73b-wBqt1-niw4,213
63
+ datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
64
64
  datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
65
- datachain/lib/webdataset.py,sha256=JouI5WORgkl-am_DwQwWqO8RI1UwgbUPWsauZZj2Fmc,8221
65
+ datachain/lib/webdataset.py,sha256=GWB_pocfRZGoU4Lhd7Wh3hx2Rnm_fJWXX4S_zXJIEmk,8286
66
66
  datachain/lib/webdataset_laion.py,sha256=HAtSCbVvEQqzKkoRamRxDKaQALSB3QmJRU2yWRFNxwY,2147
67
67
  datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
68
68
  datachain/query/batch.py,sha256=sOMxXbaNii7lVyFIEZ2noqbhy_S8qtZ-WWxrka72shc,3474
69
69
  datachain/query/builtins.py,sha256=ZKNs49t8Oa_OaboCBIEqtXZt7c1Qe9OR_C_HpoDriIU,2781
70
- datachain/query/dataset.py,sha256=QYrtZApS8djybkuDfGO0tt8O6sCBlmkg9TE__R4eM-I,64475
71
- datachain/query/dispatch.py,sha256=fEk1qalxAb5JJhN-iq0Mg9MyWve4XoN1Q7uvrX4mJY4,13106
70
+ datachain/query/dataset.py,sha256=vpu2wQYC5uWc-LdZrNV-PV7xQapbYCtqyrXiiIa77DI,64982
71
+ datachain/query/dispatch.py,sha256=ZeL5dga5d4cJDBftK7gAQ_mx4C7zq6t3z0Hdt7mcZYY,13094
72
72
  datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
73
73
  datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
74
- datachain/query/schema.py,sha256=tWlUiu9eiS5y8BTQaPI2raGclt0YzcO3DoUN1OkwnrE,7946
74
+ datachain/query/schema.py,sha256=n1NBOj6JO2I26mZD4vSURmVC2rk3mjIkJQheeLogoy4,7748
75
75
  datachain/query/session.py,sha256=e4_vv4RqAjU-g3KK0avgLd9MEsmJBzRVEj1w8v7fP1k,3663
76
- datachain/query/udf.py,sha256=0WkBPW5ymZbOGMimSXpVWVc8whjTuYfRrnxPWNHabSk,7127
76
+ datachain/query/udf.py,sha256=gnLDM7LKH8_bbdDeVHnlDKaBdbWc_NAbwvYCc4i-OlU,7101
77
77
  datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
78
78
  datachain/remote/studio.py,sha256=bZb85WjtqMNFBoRuPbH-TEGpAyz0afROR7E9UgIef_Y,7438
79
79
  datachain/sql/__init__.py,sha256=A2djrbQwSMUZZEIKGnm-mnRA-NDSbiDJNpAmmwGNyIo,303
@@ -92,9 +92,9 @@ datachain/sql/sqlite/__init__.py,sha256=TAdJX0Bg28XdqPO-QwUVKy8rg78cgMileHvMNot7
92
92
  datachain/sql/sqlite/base.py,sha256=nPMF6_FF04hclDNZev_YfxMgbJAsWEdF-rU2pUhqBtc,12048
93
93
  datachain/sql/sqlite/types.py,sha256=oP93nLfTBaYnN0z_4Dsv-HZm8j9rrUf1esMM-z3JLbg,1754
94
94
  datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
95
- datachain-0.2.1.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
96
- datachain-0.2.1.dist-info/METADATA,sha256=kgX6auIOqU0DtW6dRyGWs1TrlGYLf1kN_By0XFW3t0Q,14346
97
- datachain-0.2.1.dist-info/WHEEL,sha256=y4mX-SOX4fYIkonsAGA5N0Oy-8_gI4FXw5HNI1xqvWg,91
98
- datachain-0.2.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
99
- datachain-0.2.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
100
- datachain-0.2.1.dist-info/RECORD,,
95
+ datachain-0.2.3.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
96
+ datachain-0.2.3.dist-info/METADATA,sha256=NmviJ7UsETesadrJjeyoYjeNqul6GMd9D4zDZLk23Co,14399
97
+ datachain-0.2.3.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
98
+ datachain-0.2.3.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
99
+ datachain-0.2.3.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
100
+ datachain-0.2.3.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (70.2.0)
2
+ Generator: setuptools (70.3.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
datachain/lib/reader.py DELETED
@@ -1,49 +0,0 @@
1
- from abc import ABC, abstractmethod
2
- from typing import TYPE_CHECKING, Any
3
-
4
- if TYPE_CHECKING:
5
- from datachain.lib.feature_utils import FeatureLike
6
-
7
-
8
- class FeatureReader(ABC):
9
- def __init__(self, fr_class: "FeatureLike"):
10
- """
11
- Class to call on feature values to perform post-processing. Used when
12
- iterating over dataset with `ds.to_pytorch()` and `ds.get_values()`.
13
-
14
- The class must include:
15
- - `self.fr_class` to define the feature class to read.
16
- - `self.__call__(self, value)` to call on the feature value returned by
17
- `self.fr_class.get_value()`.
18
-
19
- Examples:
20
- >>> class PrefixReader(FeatureReader):
21
- >>> def __call__(self, value):
22
- >>> return "prefix-" + value
23
- >>> for row in ds.get_values(PrefixReader(MyFeature)):
24
- >>> print(row)
25
-
26
- >>> class SuffixReader(FeatureReader):
27
- >>> def __init__(self, fr_class, suffix):
28
- >>> self.suffix = suffix
29
- >>> super().__init__(fr_class)
30
- >>> def __call__(self, value):
31
- >>> return value + self.suffix
32
- >>> for row in ds.get_values(SuffixReader(MyFeature, "-suffix")):
33
- >>> print(row)
34
- """
35
- self.fr_class = fr_class
36
-
37
- @abstractmethod
38
- def __call__(self, value: Any) -> Any:
39
- pass
40
-
41
-
42
- class LabelReader(FeatureReader):
43
- def __init__(self, fr_class: "FeatureLike", classes: list):
44
- """Get column values as 0-based integer index of classes."""
45
- self.classes = classes
46
- super().__init__(fr_class)
47
-
48
- def __call__(self, value: str) -> int:
49
- return self.classes.index(value)