datachain 0.2.9__py3-none-any.whl → 0.2.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (51)
  1. datachain/__init__.py +17 -8
  2. datachain/catalog/catalog.py +5 -5
  3. datachain/cli.py +0 -2
  4. datachain/data_storage/schema.py +5 -5
  5. datachain/data_storage/sqlite.py +1 -1
  6. datachain/data_storage/warehouse.py +7 -7
  7. datachain/lib/arrow.py +25 -8
  8. datachain/lib/clip.py +6 -11
  9. datachain/lib/convert/__init__.py +0 -0
  10. datachain/lib/convert/flatten.py +67 -0
  11. datachain/lib/convert/type_converter.py +96 -0
  12. datachain/lib/convert/unflatten.py +69 -0
  13. datachain/lib/convert/values_to_tuples.py +85 -0
  14. datachain/lib/data_model.py +74 -0
  15. datachain/lib/dc.py +225 -168
  16. datachain/lib/file.py +41 -41
  17. datachain/lib/gpt4_vision.py +1 -9
  18. datachain/lib/hf_image_to_text.py +9 -17
  19. datachain/lib/hf_pipeline.py +4 -12
  20. datachain/lib/image.py +2 -18
  21. datachain/lib/image_transform.py +0 -1
  22. datachain/lib/iptc_exif_xmp.py +8 -15
  23. datachain/lib/meta_formats.py +1 -5
  24. datachain/lib/model_store.py +77 -0
  25. datachain/lib/pytorch.py +9 -21
  26. datachain/lib/signal_schema.py +139 -60
  27. datachain/lib/text.py +5 -16
  28. datachain/lib/udf.py +114 -30
  29. datachain/lib/udf_signature.py +5 -5
  30. datachain/lib/webdataset.py +3 -3
  31. datachain/lib/webdataset_laion.py +2 -3
  32. datachain/node.py +4 -4
  33. datachain/query/batch.py +1 -1
  34. datachain/query/dataset.py +51 -178
  35. datachain/query/dispatch.py +43 -30
  36. datachain/query/udf.py +46 -26
  37. datachain/remote/studio.py +1 -9
  38. datachain/torch/__init__.py +21 -0
  39. datachain/utils.py +39 -0
  40. {datachain-0.2.9.dist-info → datachain-0.2.11.dist-info}/METADATA +14 -12
  41. {datachain-0.2.9.dist-info → datachain-0.2.11.dist-info}/RECORD +45 -43
  42. {datachain-0.2.9.dist-info → datachain-0.2.11.dist-info}/WHEEL +1 -1
  43. datachain/image/__init__.py +0 -3
  44. datachain/lib/cached_stream.py +0 -38
  45. datachain/lib/claude.py +0 -69
  46. datachain/lib/feature.py +0 -412
  47. datachain/lib/feature_registry.py +0 -51
  48. datachain/lib/feature_utils.py +0 -154
  49. {datachain-0.2.9.dist-info → datachain-0.2.11.dist-info}/LICENSE +0 -0
  50. {datachain-0.2.9.dist-info → datachain-0.2.11.dist-info}/entry_points.txt +0 -0
  51. {datachain-0.2.9.dist-info → datachain-0.2.11.dist-info}/top_level.txt +0 -0
datachain/query/udf.py CHANGED
@@ -1,5 +1,5 @@
1
1
  import typing
2
- from collections.abc import Iterable, Mapping, Sequence
2
+ from collections.abc import Iterable, Iterator, Mapping, Sequence
3
3
  from dataclasses import dataclass
4
4
  from functools import WRAPPER_ASSIGNMENTS
5
5
  from inspect import isclass
@@ -14,7 +14,6 @@ from typing import (
14
14
  from fsspec.callbacks import DEFAULT_CALLBACK, Callback
15
15
 
16
16
  from datachain.dataset import RowDict
17
- from datachain.lib.utils import AbstractUDF
18
17
 
19
18
  from .batch import Batch, BatchingStrategy, NoBatching, Partition, RowBatch
20
19
  from .schema import (
@@ -100,15 +99,28 @@ class UDFBase:
100
99
 
101
100
  def __init__(
102
101
  self,
103
- func: Callable,
104
102
  properties: UDFProperties,
105
103
  ):
106
- self.func = func
107
104
  self.properties = properties
108
105
  self.signal_names = properties.signal_names()
109
106
  self.output = properties.output
110
107
 
111
- def __call__(
108
+ def run(
109
+ self,
110
+ udf_inputs: "Iterable[BatchingResult]",
111
+ catalog: "Catalog",
112
+ is_generator: bool,
113
+ cache: bool,
114
+ download_cb: Callback = DEFAULT_CALLBACK,
115
+ processed_cb: Callback = DEFAULT_CALLBACK,
116
+ ) -> Iterator[Iterable["UDFResult"]]:
117
+ for batch in udf_inputs:
118
+ n_rows = len(batch.rows) if isinstance(batch, RowBatch) else 1
119
+ output = self.run_once(catalog, batch, is_generator, cache, cb=download_cb)
120
+ processed_cb.relative_update(n_rows)
121
+ yield output
122
+
123
+ def run_once(
112
124
  self,
113
125
  catalog: "Catalog",
114
126
  arg: "BatchingResult",
@@ -116,24 +128,7 @@ class UDFBase:
116
128
  cache: bool = False,
117
129
  cb: Callback = DEFAULT_CALLBACK,
118
130
  ) -> Iterable[UDFResult]:
119
- if isinstance(self.func, AbstractUDF):
120
- self.func._catalog = catalog # type: ignore[unreachable]
121
-
122
- if isinstance(arg, RowBatch):
123
- udf_inputs = [
124
- self.bind_parameters(catalog, row, cache=cache, cb=cb)
125
- for row in arg.rows
126
- ]
127
- udf_outputs = self.func(udf_inputs)
128
- return self._process_results(arg.rows, udf_outputs, is_generator)
129
- if isinstance(arg, RowDict):
130
- udf_inputs = self.bind_parameters(catalog, arg, cache=cache, cb=cb)
131
- udf_outputs = self.func(*udf_inputs)
132
- if not is_generator:
133
- # udf_outputs is generator already if is_generator=True
134
- udf_outputs = [udf_outputs]
135
- return self._process_results([arg], udf_outputs, is_generator)
136
- raise ValueError(f"Unexpected UDF argument: {arg}")
131
+ raise NotImplementedError
137
132
 
138
133
  def bind_parameters(self, catalog: "Catalog", row: "RowDict", **kwargs) -> list:
139
134
  return [p.get_value(catalog, row, **kwargs) for p in self.properties.params]
@@ -152,9 +147,9 @@ class UDFBase:
152
147
  return (dict(zip(self.signal_names, row)) for row in results)
153
148
 
154
149
  # outputting signals
155
- row_ids = [row["id"] for row in rows]
150
+ row_ids = [row["sys__id"] for row in rows]
156
151
  return [
157
- dict(id=row_id, **dict(zip(self.signal_names, signals)))
152
+ {"sys__id": row_id} | dict(zip(self.signal_names, signals))
158
153
  for row_id, signals in zip(row_ids, results)
159
154
  if signals is not None # skip rows with no output
160
155
  ]
@@ -194,12 +189,37 @@ class UDFWrapper(UDFBase):
194
189
  func: Callable,
195
190
  properties: UDFProperties,
196
191
  ):
197
- super().__init__(func, properties)
192
+ self.func = func
193
+ super().__init__(properties)
198
194
  # This emulates the behavior of functools.wraps for a class decorator
199
195
  for attr in WRAPPER_ASSIGNMENTS:
200
196
  if hasattr(func, attr):
201
197
  setattr(self, attr, getattr(func, attr))
202
198
 
199
+ def run_once(
200
+ self,
201
+ catalog: "Catalog",
202
+ arg: "BatchingResult",
203
+ is_generator: bool = False,
204
+ cache: bool = False,
205
+ cb: Callback = DEFAULT_CALLBACK,
206
+ ) -> Iterable[UDFResult]:
207
+ if isinstance(arg, RowBatch):
208
+ udf_inputs = [
209
+ self.bind_parameters(catalog, row, cache=cache, cb=cb)
210
+ for row in arg.rows
211
+ ]
212
+ udf_outputs = self.func(udf_inputs)
213
+ return self._process_results(arg.rows, udf_outputs, is_generator)
214
+ if isinstance(arg, RowDict):
215
+ udf_inputs = self.bind_parameters(catalog, arg, cache=cache, cb=cb)
216
+ udf_outputs = self.func(*udf_inputs)
217
+ if not is_generator:
218
+ # udf_outputs is generator already if is_generator=True
219
+ udf_outputs = [udf_outputs]
220
+ return self._process_results([arg], udf_outputs, is_generator)
221
+ raise ValueError(f"Unexpected UDF argument: {arg}")
222
+
203
223
  # This emulates the behavior of functools.wraps for a class decorator
204
224
  def __repr__(self):
205
225
  return repr(self.func)
datachain/remote/studio.py CHANGED
@@ -190,19 +190,11 @@ class StudioClient:
190
190
  def dataset_rows_chunk(
191
191
  self, name: str, version: int, offset: int
192
192
  ) -> Response[DatasetRowsData]:
193
- def _parse_row(row):
194
- row["id"] = int(row["id"])
195
- return row
196
-
197
193
  req_data = {"dataset_name": name, "dataset_version": version}
198
- response = self._send_request_msgpack(
194
+ return self._send_request_msgpack(
199
195
  "dataset-rows",
200
196
  {**req_data, "offset": offset, "limit": DATASET_ROWS_CHUNK_SIZE},
201
197
  )
202
- if response.ok:
203
- response.data = [_parse_row(r) for r in response.data]
204
-
205
- return response
206
198
 
207
199
  def dataset_stats(self, name: str, version: int) -> Response[DatasetStatsData]:
208
200
  response = self._send_request(
datachain/torch/__init__.py ADDED
@@ -0,0 +1,21 @@
1
+ try:
2
+ from datachain.lib.clip import similarity_scores as clip_similarity_scores
3
+ from datachain.lib.image import convert_image, convert_images
4
+ from datachain.lib.pytorch import PytorchDataset, label_to_int
5
+ from datachain.lib.text import convert_text
6
+
7
+ except ImportError as exc:
8
+ raise ImportError(
9
+ "Missing dependencies for torch:\n"
10
+ "To install run:\n\n"
11
+ " pip install 'datachain[torch]'\n"
12
+ ) from exc
13
+
14
+ __all__ = [
15
+ "PytorchDataset",
16
+ "clip_similarity_scores",
17
+ "convert_image",
18
+ "convert_images",
19
+ "convert_text",
20
+ "label_to_int",
21
+ ]
datachain/utils.py CHANGED
@@ -1,5 +1,6 @@
1
1
  import glob
2
2
  import importlib.util
3
+ import io
3
4
  import json
4
5
  import os
5
6
  import os.path as osp
@@ -13,8 +14,10 @@ from itertools import islice
13
14
  from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
14
15
  from uuid import UUID
15
16
 
17
+ import cloudpickle
16
18
  from dateutil import tz
17
19
  from dateutil.parser import isoparse
20
+ from pydantic import BaseModel
18
21
 
19
22
  if TYPE_CHECKING:
20
23
  import pandas as pd
@@ -388,3 +391,39 @@ def inside_notebook() -> bool:
388
391
  return False
389
392
 
390
393
  return False
394
+
395
+
396
+ def get_all_subclasses(cls):
397
+ """Return all subclasses of a given class.
398
+ Can return duplicates due to multiple inheritance."""
399
+ for subclass in cls.__subclasses__():
400
+ yield from get_all_subclasses(subclass)
401
+ yield subclass
402
+
403
+
404
+ def filtered_cloudpickle_dumps(obj: Any) -> bytes:
405
+ """Equivalent to cloudpickle.dumps, but this supports Pydantic models."""
406
+ model_namespaces = {}
407
+
408
+ with io.BytesIO() as f:
409
+ pickler = cloudpickle.CloudPickler(f)
410
+
411
+ for model_class in get_all_subclasses(BaseModel):
412
+ # This "is not None" check is needed, because due to multiple inheritance,
413
+ # it is theoretically possible to get the same class twice from
414
+ # get_all_subclasses.
415
+ if model_class.__pydantic_parent_namespace__ is not None:
416
+ # __pydantic_parent_namespace__ can contain many unnecessary and
417
+ # unpickleable entities, so should be removed for serialization.
418
+ model_namespaces[model_class] = (
419
+ model_class.__pydantic_parent_namespace__
420
+ )
421
+ model_class.__pydantic_parent_namespace__ = None
422
+
423
+ try:
424
+ pickler.dump(obj)
425
+ return f.getvalue()
426
+ finally:
427
+ for model_class, namespace in model_namespaces.items():
428
+ # Restore original __pydantic_parent_namespace__ locally.
429
+ model_class.__pydantic_parent_namespace__ = namespace
{datachain-0.2.9.dist-info → datachain-0.2.11.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.2.9
3
+ Version: 0.2.11
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -34,16 +34,13 @@ Requires-Dist: shtab <2,>=1.3.4
34
34
  Requires-Dist: sqlalchemy >=2
35
35
  Requires-Dist: multiprocess ==0.70.16
36
36
  Requires-Dist: dill ==0.3.8
37
+ Requires-Dist: cloudpickle
37
38
  Requires-Dist: ujson >=5.9.0
38
39
  Requires-Dist: pydantic <3,>=2
39
40
  Requires-Dist: jmespath >=1.0
40
41
  Requires-Dist: datamodel-code-generator >=0.25
42
+ Requires-Dist: Pillow <11,>=10.0.0
41
43
  Requires-Dist: numpy <2,>=1 ; sys_platform == "win32"
42
- Provides-Extra: cv
43
- Requires-Dist: Pillow <11,>=10.0.0 ; extra == 'cv'
44
- Requires-Dist: torch >=2.1.0 ; extra == 'cv'
45
- Requires-Dist: torchvision ; extra == 'cv'
46
- Requires-Dist: transformers >=4.36.0 ; extra == 'cv'
47
44
  Provides-Extra: dev
48
45
  Requires-Dist: datachain[docs,tests] ; extra == 'dev'
49
46
  Requires-Dist: mypy ==1.10.1 ; extra == 'dev'
@@ -63,7 +60,7 @@ Requires-Dist: lz4 ; extra == 'remote'
63
60
  Requires-Dist: msgpack <2,>=1.0.4 ; extra == 'remote'
64
61
  Requires-Dist: requests >=2.22.0 ; extra == 'remote'
65
62
  Provides-Extra: tests
66
- Requires-Dist: datachain[cv,remote,vector] ; extra == 'tests'
63
+ Requires-Dist: datachain[remote,torch,vector] ; extra == 'tests'
67
64
  Requires-Dist: pytest <9,>=8 ; extra == 'tests'
68
65
  Requires-Dist: pytest-sugar >=0.9.6 ; extra == 'tests'
69
66
  Requires-Dist: pytest-cov >=4.1.0 ; extra == 'tests'
@@ -78,6 +75,10 @@ Requires-Dist: hypothesis ; extra == 'tests'
78
75
  Requires-Dist: open-clip-torch ; extra == 'tests'
79
76
  Requires-Dist: aiotools >=1.7.0 ; extra == 'tests'
80
77
  Requires-Dist: requests-mock ; extra == 'tests'
78
+ Provides-Extra: torch
79
+ Requires-Dist: torch >=2.1.0 ; extra == 'torch'
80
+ Requires-Dist: torchvision ; extra == 'torch'
81
+ Requires-Dist: transformers >=4.36.0 ; extra == 'torch'
81
82
  Provides-Extra: vector
82
83
  Requires-Dist: usearch ; extra == 'vector'
83
84
 
@@ -89,11 +90,11 @@ Requires-Dist: usearch ; extra == 'vector'
89
90
  .. |Python Version| image:: https://img.shields.io/pypi/pyversions/datachain
90
91
  :target: https://pypi.org/project/datachain
91
92
  :alt: Python Version
92
- .. |Codecov| image:: https://codecov.io/gh/iterative/dvcx/branch/main/graph/badge.svg?token=VSCP2T9R5X
93
- :target: https://app.codecov.io/gh/iterative/dvcx
93
+ .. |Codecov| image:: https://codecov.io/gh/iterative/datachain/graph/badge.svg?token=byliXGGyGB
94
+ :target: https://codecov.io/gh/iterative/datachain
94
95
  :alt: Codecov
95
- .. |Tests| image:: https://github.com/iterative/dvcx/workflows/Tests/badge.svg
96
- :target: https://github.com/iterative/dvcx/actions?workflow=Tests
96
+ .. |Tests| image:: https://github.com/iterative/datachain/workflows/Tests/badge.svg
97
+ :target: https://github.com/iterative/datachain/actions?workflow=Tests
97
98
  :alt: Tests
98
99
 
99
100
  AI 🔗 DataChain
@@ -397,7 +398,8 @@ Chain results can be exported or passed directly to Pytorch dataloader. For exam
397
398
  Tutorials
398
399
  ------------------
399
400
 
400
- * `Multimodal <examples/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/dvclive/blob/main/examples/multimodal/clip_fine_tuning.ipynb>`__)
401
+ * `Computer Vision <examples/computer_vision/fashion_product_images/1-quick-start.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain/blob/main/examples/computer_vision/fashion_product_images/1-quick-start.ipynb>`__)
402
+ * `Multimodal <examples/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain/blob/main/examples/multimodal/clip_fine_tuning.ipynb>`__)
401
403
 
402
404
  Contributions
403
405
  --------------------
{datachain-0.2.9.dist-info → datachain-0.2.11.dist-info}/RECORD CHANGED
@@ -1,22 +1,22 @@
1
- datachain/__init__.py,sha256=WTZQycUOpP1b-Ry_Qje5HH0EE14ptne-ZiQQ5070UMA,798
1
+ datachain/__init__.py,sha256=L5IlHOD4AaHkV7P5dbUwdq90I3bGFLtOghoZ1WVFGcs,841
2
2
  datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
3
3
  datachain/asyn.py,sha256=CKCFQJ0CbB3r04S7mUTXxriKzPnOvdUaVPXjM8vCtJw,7644
4
4
  datachain/cache.py,sha256=FaPWrqWznPffmskTb1pdPkt2jAMMf__9FC2zEnP0vDU,4022
5
- datachain/cli.py,sha256=lInqYMhk8YuPY-ZWkfWZmE-ZmdIChJgbs305-a_MWpo,32457
5
+ datachain/cli.py,sha256=gikzwEXTDKyzY1xOAUziXN2-OVqnOhDMJTd7SHq0Jxc,32406
6
6
  datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
7
7
  datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
8
8
  datachain/dataset.py,sha256=MZezyuJWNj_3PEtzr0epPMNyWAOTrhTSPI5FmemV6L4,14470
9
9
  datachain/error.py,sha256=GY9KYTmb7GHXn2gGHV9X-PBhgwLj3i7VpK7tGHtAoGM,1279
10
10
  datachain/listing.py,sha256=sX8vZNzAzoTel1li6VJiYeHUJwseUERVEoW9D5P7tII,8192
11
- datachain/node.py,sha256=fHe7k5ajI2g2qnzsG-_NQR_T-QdBYctVeEa8c8dsu_Y,5703
11
+ datachain/node.py,sha256=fsQDJUmRMSRHhL1u6qQlWgreHbH760Ls-yDzFLhbW-U,5724
12
12
  datachain/nodes_fetcher.py,sha256=kca19yvu11JxoVY1t4_ydp1FmchiV88GnNicNBQ9NIA,831
13
13
  datachain/nodes_thread_pool.py,sha256=ZyzBvUImIPmi4WlKC2SW2msA0UhtembbTdcs2nx29A0,3191
14
14
  datachain/progress.py,sha256=7_8FtJs770ITK9sMq-Lt4k4k18QmYl4yIG_kCoWID3o,4559
15
15
  datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
16
  datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
17
- datachain/utils.py,sha256=12yQAV8tfyCHqp_xJcJBeNnr1L_BO8e2bOPyXdM68gs,10759
17
+ datachain/utils.py,sha256=AWUXRk7yvDpHcqzzPWwzv8HtF1-jDVEBHKxAgT7u02E,12288
18
18
  datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
19
- datachain/catalog/catalog.py,sha256=pulKGJgAmxqSmFqBhA-J0wCKdBqGX4vqpV0cAvV6vUw,79578
19
+ datachain/catalog/catalog.py,sha256=A5W9Ffoz1lZkzl6A3igaMC5jrus8VIYVLJLX8JTVKrk,79603
20
20
  datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
21
21
  datachain/catalog/loader.py,sha256=GJ8zhEYkC7TuaPzCsjJQ4LtTdECu-wwYzC12MikPOMQ,7307
22
22
  datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
@@ -32,51 +32,52 @@ datachain/data_storage/db_engine.py,sha256=rgBuqJ-M1j5QyqiUQuJRewctuvRRj8LBDL54-
32
32
  datachain/data_storage/id_generator.py,sha256=VlDALKijggegAnNMJwuMETJgnLoPYxpkrkld5DNTPQw,3839
33
33
  datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
34
34
  datachain/data_storage/metastore.py,sha256=y-4fYvuOPnWeYxAvqhDnw6CdlTvQiurg0Gg4TaG9LR0,54074
35
- datachain/data_storage/schema.py,sha256=bY3q2OUaUraos0s5BnwWkhgce8YpeNmIl7M1ifshoes,8074
35
+ datachain/data_storage/schema.py,sha256=hUykqT-As-__WffMdWTrSZwv9k5EYYowRke3OENQ3aY,8102
36
36
  datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
37
- datachain/data_storage/sqlite.py,sha256=F68Q_AIqNAObZ5kJ0GnBqRC6e2D2sRehkQo8UzrHgtI,25079
38
- datachain/data_storage/warehouse.py,sha256=h35JiJoCGtwkMctis_x3NHxkwEejX5sIWvJOluZxrOI,33132
39
- datachain/image/__init__.py,sha256=g3l7vJFzg0-s5OAmBtGargsxt12TuKU4Ex6S0fOmEeY,101
37
+ datachain/data_storage/sqlite.py,sha256=cIYobczfH72c4l-iMkxpkgcTuuvvT8Xi64iP7Zr3Skw,25084
38
+ datachain/data_storage/warehouse.py,sha256=UbD37_jqaM4BY2SsQaTiJre-eSa7HcPejrTp936L080,33170
40
39
  datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
41
- datachain/lib/arrow.py,sha256=FF3WWUOjB6Prw8ygfiLsrVfrdob0S01lPzEazuGqoO8,2556
42
- datachain/lib/cached_stream.py,sha256=t2ifK0hZVZiVn0MQ8D3FaFK1-qK84TwJW2Dw1SRsw9g,1066
43
- datachain/lib/claude.py,sha256=iAauA1zNVNONpLzUo1t0QN5PZ5Ot6cZkfib7Ka_c638,1969
44
- datachain/lib/clip.py,sha256=rDeZlFGs0DXBlpmh5ZQJhR9Sz13bWAZGQjfYm1hsUI4,5388
45
- datachain/lib/dc.py,sha256=D3cgib-U0Mo0x5wEK1_NfgymAldHqCvooZwtyohi53Q,34426
46
- datachain/lib/feature.py,sha256=iMwbMyQUyjRUeB-vhAucnx59kNSVvX_xEChTW5B9klY,12244
47
- datachain/lib/feature_registry.py,sha256=K3jGQzBp2HZDjR9hdGe1BZaXOAne8RpkCRRQdTVjkTs,1622
48
- datachain/lib/feature_utils.py,sha256=2yLdZd9o4AJ5QQX7kqgbCxCT78aT7HE12CLxQ6QRpbc,4982
49
- datachain/lib/file.py,sha256=LGBwC7tFU7VcSWk5kjPpEWPBQas5me69L2uTDNvYXGM,8326
50
- datachain/lib/gpt4_vision.py,sha256=idyXVZVWzltstGaVIu5RYE5UNbdqcPEjIWy81O1MwkM,2922
51
- datachain/lib/hf_image_to_text.py,sha256=HiPSWzJRDT-vnz9DXJbJBNCMNl9wmpxiSS3PbbVz8SE,3310
52
- datachain/lib/hf_pipeline.py,sha256=f0AH_XCziOF1OKN3d1w1swTBLaeajMJ8xgdsX37i5-o,2287
53
- datachain/lib/image.py,sha256=ZYfDqr9p-RRmWBeWFQwXLS1J3vQS616ykfMUvQVpqBY,2717
54
- datachain/lib/image_transform.py,sha256=NXWtnVOcofWBgl_YMxb4ABpaT7JTBMx7tLKvErH1IC4,3024
55
- datachain/lib/iptc_exif_xmp.py,sha256=xrbxFeY-wRP6T5JsUgE3EXfTxKvZVymRaRD_VIfxD0A,2236
56
- datachain/lib/meta_formats.py,sha256=wIVVLRLp45Zk4vjZRd_P1UtD24vpDCb-vILWtcsACwk,6630
57
- datachain/lib/pytorch.py,sha256=Ea1sXhborF6zcywQjLpXgKnbr1lTez4Bfu3m0Gr78FI,5843
40
+ datachain/lib/arrow.py,sha256=ttSiH8Xr08zxypAa3-BNTxMO2NBuZfYICwmG1qQwvWU,3268
41
+ datachain/lib/clip.py,sha256=YRa15Whnn6C8BMA-OAu0mYjc4h9i_n7pffRGdtfrTBA,5222
42
+ datachain/lib/data_model.py,sha256=DpV_-1JqJptCf0w4cnzPlHm5Yl4FQaveRgVCDZFaHXs,2012
43
+ datachain/lib/dc.py,sha256=rd-7gVcMRZ2M-O8aQhNx85H31w-kRQHpXSwtf26dSk4,35849
44
+ datachain/lib/file.py,sha256=Uik1sq2l-uknpikH4Gdm7ZR0EcQYP2TrNg-urECjbW4,8304
45
+ datachain/lib/gpt4_vision.py,sha256=CZ-a64olZNp9TNmLGngmbN6b02UYImzwK3dPClnjxTI,2716
46
+ datachain/lib/hf_image_to_text.py,sha256=uVl4mnUl8gnHrJ3wfSZlxBevH-cxqOswxLArLAHxRrE,3077
47
+ datachain/lib/hf_pipeline.py,sha256=MBFzixVa25_6QVR9RyOq8Rr9UIQ-sFVcBHducx_sZcY,2069
48
+ datachain/lib/image.py,sha256=K0n_P7kmobWTgxe-rDbr5yY3vBrOPnseziE3DXwFFVo,2325
49
+ datachain/lib/image_transform.py,sha256=hfgvIrSMGBx_MEXECyvrFoO1NyPBHoDb28j2lT2dsf8,2953
50
+ datachain/lib/iptc_exif_xmp.py,sha256=rmlxjOmAP31OCgbGBAwIgd1F_6QVBoSWsOPG6UsBg_w,2007
51
+ datachain/lib/meta_formats.py,sha256=SF7UPPe-U-1HL6DBO1NfwZLIChjkHrHasgHf5ztCUoU,6436
52
+ datachain/lib/model_store.py,sha256=JFpI1P0WFpsO6eAU49AdWmff5T8azqLrqOMB08pYJjg,2331
53
+ datachain/lib/pytorch.py,sha256=7fd2g0dI9zrMfRl3IVwIvXRH0v6TwSAyZGAbqKdEjcI,5505
58
54
  datachain/lib/settings.py,sha256=6Nkoh8riETrftYwDp3aniK53Dsjc07MdztL8N0cW1D8,2849
59
- datachain/lib/signal_schema.py,sha256=hD56hyO1H3A5H2oyTUwPcNu6UOQ_XY0DeA0nrXBqFaU,11492
60
- datachain/lib/text.py,sha256=PUT1O0jNJoQGsuhff2LgDpzTWk2eMdwIKqEDBrE448M,1307
61
- datachain/lib/udf.py,sha256=axMvqYz4tdyg_C3nyuOcDsu3Aqr19jWv2vl54U_8LQM,6595
62
- datachain/lib/udf_signature.py,sha256=CUKgoVpM_N8CgvMncpAw2RYchoiJdAGdDSdluoP0hIk,7161
55
+ datachain/lib/signal_schema.py,sha256=mRdq5qEGnFQgbSawzDPi2MCZ6PULTMigd51B2RuNxpg,14173
56
+ datachain/lib/text.py,sha256=d2V-52cqzVm5OT68BcLYyHrglvFMVR5DPzsbtRRv3D0,1063
57
+ datachain/lib/udf.py,sha256=RqCiGuNKL5P8eS84s_mmVYjK1gvkuRYdnIKm9qe-i2U,9698
58
+ datachain/lib/udf_signature.py,sha256=R81QqZseG_xeBFzJSgt-wrTQeUU-1RrWkHckLm_HEUU,7135
63
59
  datachain/lib/unstructured.py,sha256=9Y6rAelXdYqkNbPaqz6DhXjhS8d6qXcP0ieIsWkzvkk,1143
64
60
  datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
65
61
  datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
66
- datachain/lib/webdataset.py,sha256=GWB_pocfRZGoU4Lhd7Wh3hx2Rnm_fJWXX4S_zXJIEmk,8286
67
- datachain/lib/webdataset_laion.py,sha256=HAtSCbVvEQqzKkoRamRxDKaQALSB3QmJRU2yWRFNxwY,2147
62
+ datachain/lib/webdataset.py,sha256=nIa6ubv94CwnATeeSdE7f_F9Zkz9LuBTfbXvFg3_-Ak,8295
63
+ datachain/lib/webdataset_laion.py,sha256=PQP6tQmUP7Xu9fPuAGK1JDBYA6T5UufYMUTGaxgspJA,2118
64
+ datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
65
+ datachain/lib/convert/flatten.py,sha256=XdAj0f9W32ABjOo8UyYm0y0H_yHDn3qEHERTyXuhJxk,1592
66
+ datachain/lib/convert/type_converter.py,sha256=W-wvCIcb6OwWjRJ3EWJE4-LbpoqxsRBd6gYNpFlm8qo,2643
67
+ datachain/lib/convert/unflatten.py,sha256=Ogvh_5wg2f38_At_1lN0D_e2uZOOpYEvwvB2xdq56Tw,2012
68
+ datachain/lib/convert/values_to_tuples.py,sha256=MWz9pHT-AaPQN8hNMUYfuOHstyuNv0QEckwXlKgFbLA,3088
68
69
  datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
69
- datachain/query/batch.py,sha256=sOMxXbaNii7lVyFIEZ2noqbhy_S8qtZ-WWxrka72shc,3474
70
+ datachain/query/batch.py,sha256=j-_ZcuQra2Ro3Wj4crtqQCg-7xuv-p84hr4QHdvT7as,3479
70
71
  datachain/query/builtins.py,sha256=ZKNs49t8Oa_OaboCBIEqtXZt7c1Qe9OR_C_HpoDriIU,2781
71
- datachain/query/dataset.py,sha256=vpu2wQYC5uWc-LdZrNV-PV7xQapbYCtqyrXiiIa77DI,64982
72
- datachain/query/dispatch.py,sha256=ZeL5dga5d4cJDBftK7gAQ_mx4C7zq6t3z0Hdt7mcZYY,13094
72
+ datachain/query/dataset.py,sha256=P1KBv_R0YnKjNDHzOJwAx9qhwI08l0dLgaXfak3ps7k,60578
73
+ datachain/query/dispatch.py,sha256=oGX9ZuoKWPB_EyqAZD_eULcO3OejY44_keSmFS6SHT0,13315
73
74
  datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
74
75
  datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
75
76
  datachain/query/schema.py,sha256=n1NBOj6JO2I26mZD4vSURmVC2rk3mjIkJQheeLogoy4,7748
76
77
  datachain/query/session.py,sha256=e4_vv4RqAjU-g3KK0avgLd9MEsmJBzRVEj1w8v7fP1k,3663
77
- datachain/query/udf.py,sha256=gnLDM7LKH8_bbdDeVHnlDKaBdbWc_NAbwvYCc4i-OlU,7101
78
+ datachain/query/udf.py,sha256=c0IOTkcedpOQEmX-Idlrrl1__1IecNXL0N9oUO9Dtkg,7755
78
79
  datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
79
- datachain/remote/studio.py,sha256=bZb85WjtqMNFBoRuPbH-TEGpAyz0afROR7E9UgIef_Y,7438
80
+ datachain/remote/studio.py,sha256=f5s6qSZ9uB4URGUoU_8_W1KZRRQQVSm6cgEBkBUEfuE,7226
80
81
  datachain/sql/__init__.py,sha256=A2djrbQwSMUZZEIKGnm-mnRA-NDSbiDJNpAmmwGNyIo,303
81
82
  datachain/sql/selectable.py,sha256=fBM-wS1TUA42kVEAAiwqGtibIevyZAEritwt8PZGyLQ,1589
82
83
  datachain/sql/types.py,sha256=BzUm0nCcMPASvdqpQouX5bdVcK3G3DBfeeNhau7X_hA,10234
@@ -94,9 +95,10 @@ datachain/sql/sqlite/base.py,sha256=nPMF6_FF04hclDNZev_YfxMgbJAsWEdF-rU2pUhqBtc,
94
95
  datachain/sql/sqlite/types.py,sha256=oP93nLfTBaYnN0z_4Dsv-HZm8j9rrUf1esMM-z3JLbg,1754
95
96
  datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
96
97
  datachain/text/__init__.py,sha256=-yxHL2gVl3H0Zxam6iWUO6F1Mc4QAFHX6z-5fjHND74,72
97
- datachain-0.2.9.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
98
- datachain-0.2.9.dist-info/METADATA,sha256=k1Q5NcgOeFu71IXx7Fygiuh8gt7GIU6jzN6X1-oHcPM,16475
99
- datachain-0.2.9.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
100
- datachain-0.2.9.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
101
- datachain-0.2.9.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
102
- datachain-0.2.9.dist-info/RECORD,,
98
+ datachain/torch/__init__.py,sha256=9QJW8h0FevIXEykRsxQ7XzMDXvdIkv3kVf_UY95CTyg,600
99
+ datachain-0.2.11.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
100
+ datachain-0.2.11.dist-info/METADATA,sha256=OVKgVc-Wc75AAQIY6hGL1CEBmnwksfgOXfiUen_xAOM,16759
101
+ datachain-0.2.11.dist-info/WHEEL,sha256=FZ75kcLy9M91ncbIgG8dnpCncbiKXSRGJ_PFILs6SFg,91
102
+ datachain-0.2.11.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
103
+ datachain-0.2.11.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
104
+ datachain-0.2.11.dist-info/RECORD,,
{datachain-0.2.9.dist-info → datachain-0.2.11.dist-info}/WHEEL CHANGED
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (70.3.0)
2
+ Generator: setuptools (71.0.1)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
datachain/image/__init__.py DELETED
@@ -1,3 +0,0 @@
1
- from datachain.lib.image import ImageFile, convert_images
2
-
3
- __all__ = ["ImageFile", "convert_images"]
datachain/lib/cached_stream.py DELETED
@@ -1,38 +0,0 @@
1
- from abc import ABC
2
- from contextlib import AbstractContextManager
3
-
4
- from datachain.cache import UniqueId
5
-
6
-
7
- class AbstractCachedStream(AbstractContextManager, ABC):
8
- def __init__(self, catalog, uid: UniqueId):
9
- self.catalog = catalog
10
- self.uid = uid
11
- self.mode = "rb"
12
-
13
- def set_mode(self, mode):
14
- self.mode = mode
15
-
16
-
17
- class PreCachedStream(AbstractCachedStream):
18
- def __init__(self, catalog, uid: UniqueId):
19
- super().__init__(catalog, uid)
20
- self.client = self.catalog.get_client(self.uid.storage)
21
- self.cached_file = None
22
-
23
- def get_path_in_cache(self):
24
- return self.catalog.cache.path_from_checksum(self.uid.get_hash())
25
-
26
- def __enter__(self):
27
- self.client.download(self.uid)
28
- self.cached_file = open(self.get_path_in_cache(), self.mode)
29
- return self.cached_file
30
-
31
- def __exit__(self, *args):
32
- self.cached_file.close()
33
-
34
-
35
- class PreDownloadStream(PreCachedStream):
36
- def __exit__(self, *args):
37
- super().__exit__(*args)
38
- self.catalog.cache.remove(self.uid)
datachain/lib/claude.py DELETED
@@ -1,69 +0,0 @@
1
- import os
2
- from typing import Callable, Literal, Optional
3
-
4
- import anthropic
5
-
6
- from datachain.lib.feature import Feature
7
- from datachain.lib.file import File
8
-
9
- default_model_name = "claude-3-haiku-20240307"
10
- DEFAULT_OUTPUT_TOKENS = 1024
11
-
12
- # This classes can be auto-generated:
13
- # >> from anthropic.types.message import Message
14
- # >> ClaudeMessage = pydantic_to_feature(Message)
15
- # However, auto-generated pydentic classes do not work in multithreading mode.
16
-
17
-
18
- class UsageFr(Feature):
19
- input_tokens: int = 0
20
- output_tokens: int = 0
21
-
22
-
23
- class TextBlockFr(Feature):
24
- text: str = ""
25
- type: str = "text"
26
-
27
-
28
- class ClaudeMessage(Feature):
29
- id: str = ""
30
- content: list[TextBlockFr]
31
- model: str = ""
32
- role: str = ""
33
- stop_reason: Optional[Literal["end_turn", "max_tokens", "stop_sequence"]] = None
34
- stop_sequence: Optional[str] = None
35
- type: Literal["message"] = "message"
36
- usage: UsageFr = UsageFr()
37
-
38
-
39
- def claude_processor(
40
- prompt: str,
41
- messages: Optional[list] = None,
42
- model: str = "claude-3-haiku-20240307",
43
- api_key: Optional[str] = "",
44
- max_retries: int = 5,
45
- temperature: float = 0.9,
46
- max_tokens: int = 1024,
47
- **kwargs,
48
- ) -> Callable:
49
- if not messages:
50
- messages = []
51
- api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
52
-
53
- def claude_func(file) -> ClaudeMessage:
54
- try:
55
- data = file.get_value() if isinstance(file, File) else file
56
- client = anthropic.Anthropic(api_key=api_key, max_retries=max_retries)
57
- response = client.messages.create(
58
- model=model,
59
- system=prompt,
60
- messages=[{"role": "user", "content": data}, *messages],
61
- temperature=temperature,
62
- max_tokens=max_tokens,
63
- **kwargs,
64
- )
65
- return ClaudeMessage(**response.model_dump())
66
- except Exception: # noqa: BLE001
67
- return ClaudeMessage(content=[])
68
-
69
- return claude_func