datachain 0.2.9__py3-none-any.whl → 0.2.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/__init__.py +17 -8
- datachain/catalog/catalog.py +5 -5
- datachain/cli.py +0 -2
- datachain/data_storage/schema.py +5 -5
- datachain/data_storage/sqlite.py +1 -1
- datachain/data_storage/warehouse.py +7 -7
- datachain/lib/arrow.py +25 -8
- datachain/lib/clip.py +6 -11
- datachain/lib/convert/__init__.py +0 -0
- datachain/lib/convert/flatten.py +67 -0
- datachain/lib/convert/type_converter.py +96 -0
- datachain/lib/convert/unflatten.py +69 -0
- datachain/lib/convert/values_to_tuples.py +85 -0
- datachain/lib/data_model.py +74 -0
- datachain/lib/dc.py +225 -168
- datachain/lib/file.py +41 -41
- datachain/lib/gpt4_vision.py +1 -9
- datachain/lib/hf_image_to_text.py +9 -17
- datachain/lib/hf_pipeline.py +4 -12
- datachain/lib/image.py +2 -18
- datachain/lib/image_transform.py +0 -1
- datachain/lib/iptc_exif_xmp.py +8 -15
- datachain/lib/meta_formats.py +1 -5
- datachain/lib/model_store.py +77 -0
- datachain/lib/pytorch.py +9 -21
- datachain/lib/signal_schema.py +139 -60
- datachain/lib/text.py +5 -16
- datachain/lib/udf.py +114 -30
- datachain/lib/udf_signature.py +5 -5
- datachain/lib/webdataset.py +3 -3
- datachain/lib/webdataset_laion.py +2 -3
- datachain/node.py +4 -4
- datachain/query/batch.py +1 -1
- datachain/query/dataset.py +51 -178
- datachain/query/dispatch.py +43 -30
- datachain/query/udf.py +46 -26
- datachain/remote/studio.py +1 -9
- datachain/torch/__init__.py +21 -0
- datachain/utils.py +39 -0
- {datachain-0.2.9.dist-info → datachain-0.2.11.dist-info}/METADATA +14 -12
- {datachain-0.2.9.dist-info → datachain-0.2.11.dist-info}/RECORD +45 -43
- {datachain-0.2.9.dist-info → datachain-0.2.11.dist-info}/WHEEL +1 -1
- datachain/image/__init__.py +0 -3
- datachain/lib/cached_stream.py +0 -38
- datachain/lib/claude.py +0 -69
- datachain/lib/feature.py +0 -412
- datachain/lib/feature_registry.py +0 -51
- datachain/lib/feature_utils.py +0 -154
- {datachain-0.2.9.dist-info → datachain-0.2.11.dist-info}/LICENSE +0 -0
- {datachain-0.2.9.dist-info → datachain-0.2.11.dist-info}/entry_points.txt +0 -0
- {datachain-0.2.9.dist-info → datachain-0.2.11.dist-info}/top_level.txt +0 -0
datachain/query/udf.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import typing
|
|
2
|
-
from collections.abc import Iterable, Mapping, Sequence
|
|
2
|
+
from collections.abc import Iterable, Iterator, Mapping, Sequence
|
|
3
3
|
from dataclasses import dataclass
|
|
4
4
|
from functools import WRAPPER_ASSIGNMENTS
|
|
5
5
|
from inspect import isclass
|
|
@@ -14,7 +14,6 @@ from typing import (
|
|
|
14
14
|
from fsspec.callbacks import DEFAULT_CALLBACK, Callback
|
|
15
15
|
|
|
16
16
|
from datachain.dataset import RowDict
|
|
17
|
-
from datachain.lib.utils import AbstractUDF
|
|
18
17
|
|
|
19
18
|
from .batch import Batch, BatchingStrategy, NoBatching, Partition, RowBatch
|
|
20
19
|
from .schema import (
|
|
@@ -100,15 +99,28 @@ class UDFBase:
|
|
|
100
99
|
|
|
101
100
|
def __init__(
|
|
102
101
|
self,
|
|
103
|
-
func: Callable,
|
|
104
102
|
properties: UDFProperties,
|
|
105
103
|
):
|
|
106
|
-
self.func = func
|
|
107
104
|
self.properties = properties
|
|
108
105
|
self.signal_names = properties.signal_names()
|
|
109
106
|
self.output = properties.output
|
|
110
107
|
|
|
111
|
-
def
|
|
108
|
+
def run(
|
|
109
|
+
self,
|
|
110
|
+
udf_inputs: "Iterable[BatchingResult]",
|
|
111
|
+
catalog: "Catalog",
|
|
112
|
+
is_generator: bool,
|
|
113
|
+
cache: bool,
|
|
114
|
+
download_cb: Callback = DEFAULT_CALLBACK,
|
|
115
|
+
processed_cb: Callback = DEFAULT_CALLBACK,
|
|
116
|
+
) -> Iterator[Iterable["UDFResult"]]:
|
|
117
|
+
for batch in udf_inputs:
|
|
118
|
+
n_rows = len(batch.rows) if isinstance(batch, RowBatch) else 1
|
|
119
|
+
output = self.run_once(catalog, batch, is_generator, cache, cb=download_cb)
|
|
120
|
+
processed_cb.relative_update(n_rows)
|
|
121
|
+
yield output
|
|
122
|
+
|
|
123
|
+
def run_once(
|
|
112
124
|
self,
|
|
113
125
|
catalog: "Catalog",
|
|
114
126
|
arg: "BatchingResult",
|
|
@@ -116,24 +128,7 @@ class UDFBase:
|
|
|
116
128
|
cache: bool = False,
|
|
117
129
|
cb: Callback = DEFAULT_CALLBACK,
|
|
118
130
|
) -> Iterable[UDFResult]:
|
|
119
|
-
|
|
120
|
-
self.func._catalog = catalog # type: ignore[unreachable]
|
|
121
|
-
|
|
122
|
-
if isinstance(arg, RowBatch):
|
|
123
|
-
udf_inputs = [
|
|
124
|
-
self.bind_parameters(catalog, row, cache=cache, cb=cb)
|
|
125
|
-
for row in arg.rows
|
|
126
|
-
]
|
|
127
|
-
udf_outputs = self.func(udf_inputs)
|
|
128
|
-
return self._process_results(arg.rows, udf_outputs, is_generator)
|
|
129
|
-
if isinstance(arg, RowDict):
|
|
130
|
-
udf_inputs = self.bind_parameters(catalog, arg, cache=cache, cb=cb)
|
|
131
|
-
udf_outputs = self.func(*udf_inputs)
|
|
132
|
-
if not is_generator:
|
|
133
|
-
# udf_outputs is generator already if is_generator=True
|
|
134
|
-
udf_outputs = [udf_outputs]
|
|
135
|
-
return self._process_results([arg], udf_outputs, is_generator)
|
|
136
|
-
raise ValueError(f"Unexpected UDF argument: {arg}")
|
|
131
|
+
raise NotImplementedError
|
|
137
132
|
|
|
138
133
|
def bind_parameters(self, catalog: "Catalog", row: "RowDict", **kwargs) -> list:
|
|
139
134
|
return [p.get_value(catalog, row, **kwargs) for p in self.properties.params]
|
|
@@ -152,9 +147,9 @@ class UDFBase:
|
|
|
152
147
|
return (dict(zip(self.signal_names, row)) for row in results)
|
|
153
148
|
|
|
154
149
|
# outputting signals
|
|
155
|
-
row_ids = [row["
|
|
150
|
+
row_ids = [row["sys__id"] for row in rows]
|
|
156
151
|
return [
|
|
157
|
-
|
|
152
|
+
{"sys__id": row_id} | dict(zip(self.signal_names, signals))
|
|
158
153
|
for row_id, signals in zip(row_ids, results)
|
|
159
154
|
if signals is not None # skip rows with no output
|
|
160
155
|
]
|
|
@@ -194,12 +189,37 @@ class UDFWrapper(UDFBase):
|
|
|
194
189
|
func: Callable,
|
|
195
190
|
properties: UDFProperties,
|
|
196
191
|
):
|
|
197
|
-
|
|
192
|
+
self.func = func
|
|
193
|
+
super().__init__(properties)
|
|
198
194
|
# This emulates the behavior of functools.wraps for a class decorator
|
|
199
195
|
for attr in WRAPPER_ASSIGNMENTS:
|
|
200
196
|
if hasattr(func, attr):
|
|
201
197
|
setattr(self, attr, getattr(func, attr))
|
|
202
198
|
|
|
199
|
+
def run_once(
|
|
200
|
+
self,
|
|
201
|
+
catalog: "Catalog",
|
|
202
|
+
arg: "BatchingResult",
|
|
203
|
+
is_generator: bool = False,
|
|
204
|
+
cache: bool = False,
|
|
205
|
+
cb: Callback = DEFAULT_CALLBACK,
|
|
206
|
+
) -> Iterable[UDFResult]:
|
|
207
|
+
if isinstance(arg, RowBatch):
|
|
208
|
+
udf_inputs = [
|
|
209
|
+
self.bind_parameters(catalog, row, cache=cache, cb=cb)
|
|
210
|
+
for row in arg.rows
|
|
211
|
+
]
|
|
212
|
+
udf_outputs = self.func(udf_inputs)
|
|
213
|
+
return self._process_results(arg.rows, udf_outputs, is_generator)
|
|
214
|
+
if isinstance(arg, RowDict):
|
|
215
|
+
udf_inputs = self.bind_parameters(catalog, arg, cache=cache, cb=cb)
|
|
216
|
+
udf_outputs = self.func(*udf_inputs)
|
|
217
|
+
if not is_generator:
|
|
218
|
+
# udf_outputs is generator already if is_generator=True
|
|
219
|
+
udf_outputs = [udf_outputs]
|
|
220
|
+
return self._process_results([arg], udf_outputs, is_generator)
|
|
221
|
+
raise ValueError(f"Unexpected UDF argument: {arg}")
|
|
222
|
+
|
|
203
223
|
# This emulates the behavior of functools.wraps for a class decorator
|
|
204
224
|
def __repr__(self):
|
|
205
225
|
return repr(self.func)
|
datachain/remote/studio.py
CHANGED
|
@@ -190,19 +190,11 @@ class StudioClient:
|
|
|
190
190
|
def dataset_rows_chunk(
|
|
191
191
|
self, name: str, version: int, offset: int
|
|
192
192
|
) -> Response[DatasetRowsData]:
|
|
193
|
-
def _parse_row(row):
|
|
194
|
-
row["id"] = int(row["id"])
|
|
195
|
-
return row
|
|
196
|
-
|
|
197
193
|
req_data = {"dataset_name": name, "dataset_version": version}
|
|
198
|
-
|
|
194
|
+
return self._send_request_msgpack(
|
|
199
195
|
"dataset-rows",
|
|
200
196
|
{**req_data, "offset": offset, "limit": DATASET_ROWS_CHUNK_SIZE},
|
|
201
197
|
)
|
|
202
|
-
if response.ok:
|
|
203
|
-
response.data = [_parse_row(r) for r in response.data]
|
|
204
|
-
|
|
205
|
-
return response
|
|
206
198
|
|
|
207
199
|
def dataset_stats(self, name: str, version: int) -> Response[DatasetStatsData]:
|
|
208
200
|
response = self._send_request(
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
try:
|
|
2
|
+
from datachain.lib.clip import similarity_scores as clip_similarity_scores
|
|
3
|
+
from datachain.lib.image import convert_image, convert_images
|
|
4
|
+
from datachain.lib.pytorch import PytorchDataset, label_to_int
|
|
5
|
+
from datachain.lib.text import convert_text
|
|
6
|
+
|
|
7
|
+
except ImportError as exc:
|
|
8
|
+
raise ImportError(
|
|
9
|
+
"Missing dependencies for torch:\n"
|
|
10
|
+
"To install run:\n\n"
|
|
11
|
+
" pip install 'datachain[torch]'\n"
|
|
12
|
+
) from exc
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"PytorchDataset",
|
|
16
|
+
"clip_similarity_scores",
|
|
17
|
+
"convert_image",
|
|
18
|
+
"convert_images",
|
|
19
|
+
"convert_text",
|
|
20
|
+
"label_to_int",
|
|
21
|
+
]
|
datachain/utils.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import glob
|
|
2
2
|
import importlib.util
|
|
3
|
+
import io
|
|
3
4
|
import json
|
|
4
5
|
import os
|
|
5
6
|
import os.path as osp
|
|
@@ -13,8 +14,10 @@ from itertools import islice
|
|
|
13
14
|
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
|
|
14
15
|
from uuid import UUID
|
|
15
16
|
|
|
17
|
+
import cloudpickle
|
|
16
18
|
from dateutil import tz
|
|
17
19
|
from dateutil.parser import isoparse
|
|
20
|
+
from pydantic import BaseModel
|
|
18
21
|
|
|
19
22
|
if TYPE_CHECKING:
|
|
20
23
|
import pandas as pd
|
|
@@ -388,3 +391,39 @@ def inside_notebook() -> bool:
|
|
|
388
391
|
return False
|
|
389
392
|
|
|
390
393
|
return False
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
def get_all_subclasses(cls):
|
|
397
|
+
"""Return all subclasses of a given class.
|
|
398
|
+
Can return duplicates due to multiple inheritance."""
|
|
399
|
+
for subclass in cls.__subclasses__():
|
|
400
|
+
yield from get_all_subclasses(subclass)
|
|
401
|
+
yield subclass
|
|
402
|
+
|
|
403
|
+
|
|
404
|
+
def filtered_cloudpickle_dumps(obj: Any) -> bytes:
|
|
405
|
+
"""Equivalent to cloudpickle.dumps, but this supports Pydantic models."""
|
|
406
|
+
model_namespaces = {}
|
|
407
|
+
|
|
408
|
+
with io.BytesIO() as f:
|
|
409
|
+
pickler = cloudpickle.CloudPickler(f)
|
|
410
|
+
|
|
411
|
+
for model_class in get_all_subclasses(BaseModel):
|
|
412
|
+
# This "is not None" check is needed, because due to multiple inheritance,
|
|
413
|
+
# it is theoretically possible to get the same class twice from
|
|
414
|
+
# get_all_subclasses.
|
|
415
|
+
if model_class.__pydantic_parent_namespace__ is not None:
|
|
416
|
+
# __pydantic_parent_namespace__ can contain many unnecessary and
|
|
417
|
+
# unpickleable entities, so should be removed for serialization.
|
|
418
|
+
model_namespaces[model_class] = (
|
|
419
|
+
model_class.__pydantic_parent_namespace__
|
|
420
|
+
)
|
|
421
|
+
model_class.__pydantic_parent_namespace__ = None
|
|
422
|
+
|
|
423
|
+
try:
|
|
424
|
+
pickler.dump(obj)
|
|
425
|
+
return f.getvalue()
|
|
426
|
+
finally:
|
|
427
|
+
for model_class, namespace in model_namespaces.items():
|
|
428
|
+
# Restore original __pydantic_parent_namespace__ locally.
|
|
429
|
+
model_class.__pydantic_parent_namespace__ = namespace
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.11
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -34,16 +34,13 @@ Requires-Dist: shtab <2,>=1.3.4
|
|
|
34
34
|
Requires-Dist: sqlalchemy >=2
|
|
35
35
|
Requires-Dist: multiprocess ==0.70.16
|
|
36
36
|
Requires-Dist: dill ==0.3.8
|
|
37
|
+
Requires-Dist: cloudpickle
|
|
37
38
|
Requires-Dist: ujson >=5.9.0
|
|
38
39
|
Requires-Dist: pydantic <3,>=2
|
|
39
40
|
Requires-Dist: jmespath >=1.0
|
|
40
41
|
Requires-Dist: datamodel-code-generator >=0.25
|
|
42
|
+
Requires-Dist: Pillow <11,>=10.0.0
|
|
41
43
|
Requires-Dist: numpy <2,>=1 ; sys_platform == "win32"
|
|
42
|
-
Provides-Extra: cv
|
|
43
|
-
Requires-Dist: Pillow <11,>=10.0.0 ; extra == 'cv'
|
|
44
|
-
Requires-Dist: torch >=2.1.0 ; extra == 'cv'
|
|
45
|
-
Requires-Dist: torchvision ; extra == 'cv'
|
|
46
|
-
Requires-Dist: transformers >=4.36.0 ; extra == 'cv'
|
|
47
44
|
Provides-Extra: dev
|
|
48
45
|
Requires-Dist: datachain[docs,tests] ; extra == 'dev'
|
|
49
46
|
Requires-Dist: mypy ==1.10.1 ; extra == 'dev'
|
|
@@ -63,7 +60,7 @@ Requires-Dist: lz4 ; extra == 'remote'
|
|
|
63
60
|
Requires-Dist: msgpack <2,>=1.0.4 ; extra == 'remote'
|
|
64
61
|
Requires-Dist: requests >=2.22.0 ; extra == 'remote'
|
|
65
62
|
Provides-Extra: tests
|
|
66
|
-
Requires-Dist: datachain[
|
|
63
|
+
Requires-Dist: datachain[remote,torch,vector] ; extra == 'tests'
|
|
67
64
|
Requires-Dist: pytest <9,>=8 ; extra == 'tests'
|
|
68
65
|
Requires-Dist: pytest-sugar >=0.9.6 ; extra == 'tests'
|
|
69
66
|
Requires-Dist: pytest-cov >=4.1.0 ; extra == 'tests'
|
|
@@ -78,6 +75,10 @@ Requires-Dist: hypothesis ; extra == 'tests'
|
|
|
78
75
|
Requires-Dist: open-clip-torch ; extra == 'tests'
|
|
79
76
|
Requires-Dist: aiotools >=1.7.0 ; extra == 'tests'
|
|
80
77
|
Requires-Dist: requests-mock ; extra == 'tests'
|
|
78
|
+
Provides-Extra: torch
|
|
79
|
+
Requires-Dist: torch >=2.1.0 ; extra == 'torch'
|
|
80
|
+
Requires-Dist: torchvision ; extra == 'torch'
|
|
81
|
+
Requires-Dist: transformers >=4.36.0 ; extra == 'torch'
|
|
81
82
|
Provides-Extra: vector
|
|
82
83
|
Requires-Dist: usearch ; extra == 'vector'
|
|
83
84
|
|
|
@@ -89,11 +90,11 @@ Requires-Dist: usearch ; extra == 'vector'
|
|
|
89
90
|
.. |Python Version| image:: https://img.shields.io/pypi/pyversions/datachain
|
|
90
91
|
:target: https://pypi.org/project/datachain
|
|
91
92
|
:alt: Python Version
|
|
92
|
-
.. |Codecov| image:: https://codecov.io/gh/iterative/
|
|
93
|
-
:target: https://
|
|
93
|
+
.. |Codecov| image:: https://codecov.io/gh/iterative/datachain/graph/badge.svg?token=byliXGGyGB
|
|
94
|
+
:target: https://codecov.io/gh/iterative/datachain
|
|
94
95
|
:alt: Codecov
|
|
95
|
-
.. |Tests| image:: https://github.com/iterative/
|
|
96
|
-
:target: https://github.com/iterative/
|
|
96
|
+
.. |Tests| image:: https://github.com/iterative/datachain/workflows/Tests/badge.svg
|
|
97
|
+
:target: https://github.com/iterative/datachain/actions?workflow=Tests
|
|
97
98
|
:alt: Tests
|
|
98
99
|
|
|
99
100
|
AI 🔗 DataChain
|
|
@@ -397,7 +398,8 @@ Chain results can be exported or passed directly to Pytorch dataloader. For exam
|
|
|
397
398
|
Tutorials
|
|
398
399
|
------------------
|
|
399
400
|
|
|
400
|
-
* `
|
|
401
|
+
* `Computer Vision <examples/computer_vision/fashion_product_images/1-quick-start.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain/blob/main/examples/computer_vision/fashion_product_images/1-quick-start.ipynb>`__)
|
|
402
|
+
* `Multimodal <examples/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain/blob/main/examples/multimodal/clip_fine_tuning.ipynb>`__)
|
|
401
403
|
|
|
402
404
|
Contributions
|
|
403
405
|
--------------------
|
|
@@ -1,22 +1,22 @@
|
|
|
1
|
-
datachain/__init__.py,sha256=
|
|
1
|
+
datachain/__init__.py,sha256=L5IlHOD4AaHkV7P5dbUwdq90I3bGFLtOghoZ1WVFGcs,841
|
|
2
2
|
datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
|
|
3
3
|
datachain/asyn.py,sha256=CKCFQJ0CbB3r04S7mUTXxriKzPnOvdUaVPXjM8vCtJw,7644
|
|
4
4
|
datachain/cache.py,sha256=FaPWrqWznPffmskTb1pdPkt2jAMMf__9FC2zEnP0vDU,4022
|
|
5
|
-
datachain/cli.py,sha256=
|
|
5
|
+
datachain/cli.py,sha256=gikzwEXTDKyzY1xOAUziXN2-OVqnOhDMJTd7SHq0Jxc,32406
|
|
6
6
|
datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
|
|
7
7
|
datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
|
|
8
8
|
datachain/dataset.py,sha256=MZezyuJWNj_3PEtzr0epPMNyWAOTrhTSPI5FmemV6L4,14470
|
|
9
9
|
datachain/error.py,sha256=GY9KYTmb7GHXn2gGHV9X-PBhgwLj3i7VpK7tGHtAoGM,1279
|
|
10
10
|
datachain/listing.py,sha256=sX8vZNzAzoTel1li6VJiYeHUJwseUERVEoW9D5P7tII,8192
|
|
11
|
-
datachain/node.py,sha256=
|
|
11
|
+
datachain/node.py,sha256=fsQDJUmRMSRHhL1u6qQlWgreHbH760Ls-yDzFLhbW-U,5724
|
|
12
12
|
datachain/nodes_fetcher.py,sha256=kca19yvu11JxoVY1t4_ydp1FmchiV88GnNicNBQ9NIA,831
|
|
13
13
|
datachain/nodes_thread_pool.py,sha256=ZyzBvUImIPmi4WlKC2SW2msA0UhtembbTdcs2nx29A0,3191
|
|
14
14
|
datachain/progress.py,sha256=7_8FtJs770ITK9sMq-Lt4k4k18QmYl4yIG_kCoWID3o,4559
|
|
15
15
|
datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
16
16
|
datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
|
|
17
|
-
datachain/utils.py,sha256=
|
|
17
|
+
datachain/utils.py,sha256=AWUXRk7yvDpHcqzzPWwzv8HtF1-jDVEBHKxAgT7u02E,12288
|
|
18
18
|
datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
|
|
19
|
-
datachain/catalog/catalog.py,sha256=
|
|
19
|
+
datachain/catalog/catalog.py,sha256=A5W9Ffoz1lZkzl6A3igaMC5jrus8VIYVLJLX8JTVKrk,79603
|
|
20
20
|
datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
|
|
21
21
|
datachain/catalog/loader.py,sha256=GJ8zhEYkC7TuaPzCsjJQ4LtTdECu-wwYzC12MikPOMQ,7307
|
|
22
22
|
datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
|
|
@@ -32,51 +32,52 @@ datachain/data_storage/db_engine.py,sha256=rgBuqJ-M1j5QyqiUQuJRewctuvRRj8LBDL54-
|
|
|
32
32
|
datachain/data_storage/id_generator.py,sha256=VlDALKijggegAnNMJwuMETJgnLoPYxpkrkld5DNTPQw,3839
|
|
33
33
|
datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
|
|
34
34
|
datachain/data_storage/metastore.py,sha256=y-4fYvuOPnWeYxAvqhDnw6CdlTvQiurg0Gg4TaG9LR0,54074
|
|
35
|
-
datachain/data_storage/schema.py,sha256=
|
|
35
|
+
datachain/data_storage/schema.py,sha256=hUykqT-As-__WffMdWTrSZwv9k5EYYowRke3OENQ3aY,8102
|
|
36
36
|
datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
|
|
37
|
-
datachain/data_storage/sqlite.py,sha256=
|
|
38
|
-
datachain/data_storage/warehouse.py,sha256=
|
|
39
|
-
datachain/image/__init__.py,sha256=g3l7vJFzg0-s5OAmBtGargsxt12TuKU4Ex6S0fOmEeY,101
|
|
37
|
+
datachain/data_storage/sqlite.py,sha256=cIYobczfH72c4l-iMkxpkgcTuuvvT8Xi64iP7Zr3Skw,25084
|
|
38
|
+
datachain/data_storage/warehouse.py,sha256=UbD37_jqaM4BY2SsQaTiJre-eSa7HcPejrTp936L080,33170
|
|
40
39
|
datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
41
|
-
datachain/lib/arrow.py,sha256=
|
|
42
|
-
datachain/lib/
|
|
43
|
-
datachain/lib/
|
|
44
|
-
datachain/lib/
|
|
45
|
-
datachain/lib/
|
|
46
|
-
datachain/lib/
|
|
47
|
-
datachain/lib/
|
|
48
|
-
datachain/lib/
|
|
49
|
-
datachain/lib/
|
|
50
|
-
datachain/lib/
|
|
51
|
-
datachain/lib/
|
|
52
|
-
datachain/lib/
|
|
53
|
-
datachain/lib/
|
|
54
|
-
datachain/lib/
|
|
55
|
-
datachain/lib/iptc_exif_xmp.py,sha256=xrbxFeY-wRP6T5JsUgE3EXfTxKvZVymRaRD_VIfxD0A,2236
|
|
56
|
-
datachain/lib/meta_formats.py,sha256=wIVVLRLp45Zk4vjZRd_P1UtD24vpDCb-vILWtcsACwk,6630
|
|
57
|
-
datachain/lib/pytorch.py,sha256=Ea1sXhborF6zcywQjLpXgKnbr1lTez4Bfu3m0Gr78FI,5843
|
|
40
|
+
datachain/lib/arrow.py,sha256=ttSiH8Xr08zxypAa3-BNTxMO2NBuZfYICwmG1qQwvWU,3268
|
|
41
|
+
datachain/lib/clip.py,sha256=YRa15Whnn6C8BMA-OAu0mYjc4h9i_n7pffRGdtfrTBA,5222
|
|
42
|
+
datachain/lib/data_model.py,sha256=DpV_-1JqJptCf0w4cnzPlHm5Yl4FQaveRgVCDZFaHXs,2012
|
|
43
|
+
datachain/lib/dc.py,sha256=rd-7gVcMRZ2M-O8aQhNx85H31w-kRQHpXSwtf26dSk4,35849
|
|
44
|
+
datachain/lib/file.py,sha256=Uik1sq2l-uknpikH4Gdm7ZR0EcQYP2TrNg-urECjbW4,8304
|
|
45
|
+
datachain/lib/gpt4_vision.py,sha256=CZ-a64olZNp9TNmLGngmbN6b02UYImzwK3dPClnjxTI,2716
|
|
46
|
+
datachain/lib/hf_image_to_text.py,sha256=uVl4mnUl8gnHrJ3wfSZlxBevH-cxqOswxLArLAHxRrE,3077
|
|
47
|
+
datachain/lib/hf_pipeline.py,sha256=MBFzixVa25_6QVR9RyOq8Rr9UIQ-sFVcBHducx_sZcY,2069
|
|
48
|
+
datachain/lib/image.py,sha256=K0n_P7kmobWTgxe-rDbr5yY3vBrOPnseziE3DXwFFVo,2325
|
|
49
|
+
datachain/lib/image_transform.py,sha256=hfgvIrSMGBx_MEXECyvrFoO1NyPBHoDb28j2lT2dsf8,2953
|
|
50
|
+
datachain/lib/iptc_exif_xmp.py,sha256=rmlxjOmAP31OCgbGBAwIgd1F_6QVBoSWsOPG6UsBg_w,2007
|
|
51
|
+
datachain/lib/meta_formats.py,sha256=SF7UPPe-U-1HL6DBO1NfwZLIChjkHrHasgHf5ztCUoU,6436
|
|
52
|
+
datachain/lib/model_store.py,sha256=JFpI1P0WFpsO6eAU49AdWmff5T8azqLrqOMB08pYJjg,2331
|
|
53
|
+
datachain/lib/pytorch.py,sha256=7fd2g0dI9zrMfRl3IVwIvXRH0v6TwSAyZGAbqKdEjcI,5505
|
|
58
54
|
datachain/lib/settings.py,sha256=6Nkoh8riETrftYwDp3aniK53Dsjc07MdztL8N0cW1D8,2849
|
|
59
|
-
datachain/lib/signal_schema.py,sha256=
|
|
60
|
-
datachain/lib/text.py,sha256=
|
|
61
|
-
datachain/lib/udf.py,sha256=
|
|
62
|
-
datachain/lib/udf_signature.py,sha256=
|
|
55
|
+
datachain/lib/signal_schema.py,sha256=mRdq5qEGnFQgbSawzDPi2MCZ6PULTMigd51B2RuNxpg,14173
|
|
56
|
+
datachain/lib/text.py,sha256=d2V-52cqzVm5OT68BcLYyHrglvFMVR5DPzsbtRRv3D0,1063
|
|
57
|
+
datachain/lib/udf.py,sha256=RqCiGuNKL5P8eS84s_mmVYjK1gvkuRYdnIKm9qe-i2U,9698
|
|
58
|
+
datachain/lib/udf_signature.py,sha256=R81QqZseG_xeBFzJSgt-wrTQeUU-1RrWkHckLm_HEUU,7135
|
|
63
59
|
datachain/lib/unstructured.py,sha256=9Y6rAelXdYqkNbPaqz6DhXjhS8d6qXcP0ieIsWkzvkk,1143
|
|
64
60
|
datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
|
|
65
61
|
datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
66
|
-
datachain/lib/webdataset.py,sha256=
|
|
67
|
-
datachain/lib/webdataset_laion.py,sha256=
|
|
62
|
+
datachain/lib/webdataset.py,sha256=nIa6ubv94CwnATeeSdE7f_F9Zkz9LuBTfbXvFg3_-Ak,8295
|
|
63
|
+
datachain/lib/webdataset_laion.py,sha256=PQP6tQmUP7Xu9fPuAGK1JDBYA6T5UufYMUTGaxgspJA,2118
|
|
64
|
+
datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
65
|
+
datachain/lib/convert/flatten.py,sha256=XdAj0f9W32ABjOo8UyYm0y0H_yHDn3qEHERTyXuhJxk,1592
|
|
66
|
+
datachain/lib/convert/type_converter.py,sha256=W-wvCIcb6OwWjRJ3EWJE4-LbpoqxsRBd6gYNpFlm8qo,2643
|
|
67
|
+
datachain/lib/convert/unflatten.py,sha256=Ogvh_5wg2f38_At_1lN0D_e2uZOOpYEvwvB2xdq56Tw,2012
|
|
68
|
+
datachain/lib/convert/values_to_tuples.py,sha256=MWz9pHT-AaPQN8hNMUYfuOHstyuNv0QEckwXlKgFbLA,3088
|
|
68
69
|
datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
|
|
69
|
-
datachain/query/batch.py,sha256=
|
|
70
|
+
datachain/query/batch.py,sha256=j-_ZcuQra2Ro3Wj4crtqQCg-7xuv-p84hr4QHdvT7as,3479
|
|
70
71
|
datachain/query/builtins.py,sha256=ZKNs49t8Oa_OaboCBIEqtXZt7c1Qe9OR_C_HpoDriIU,2781
|
|
71
|
-
datachain/query/dataset.py,sha256=
|
|
72
|
-
datachain/query/dispatch.py,sha256=
|
|
72
|
+
datachain/query/dataset.py,sha256=P1KBv_R0YnKjNDHzOJwAx9qhwI08l0dLgaXfak3ps7k,60578
|
|
73
|
+
datachain/query/dispatch.py,sha256=oGX9ZuoKWPB_EyqAZD_eULcO3OejY44_keSmFS6SHT0,13315
|
|
73
74
|
datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
|
|
74
75
|
datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
|
|
75
76
|
datachain/query/schema.py,sha256=n1NBOj6JO2I26mZD4vSURmVC2rk3mjIkJQheeLogoy4,7748
|
|
76
77
|
datachain/query/session.py,sha256=e4_vv4RqAjU-g3KK0avgLd9MEsmJBzRVEj1w8v7fP1k,3663
|
|
77
|
-
datachain/query/udf.py,sha256=
|
|
78
|
+
datachain/query/udf.py,sha256=c0IOTkcedpOQEmX-Idlrrl1__1IecNXL0N9oUO9Dtkg,7755
|
|
78
79
|
datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
79
|
-
datachain/remote/studio.py,sha256=
|
|
80
|
+
datachain/remote/studio.py,sha256=f5s6qSZ9uB4URGUoU_8_W1KZRRQQVSm6cgEBkBUEfuE,7226
|
|
80
81
|
datachain/sql/__init__.py,sha256=A2djrbQwSMUZZEIKGnm-mnRA-NDSbiDJNpAmmwGNyIo,303
|
|
81
82
|
datachain/sql/selectable.py,sha256=fBM-wS1TUA42kVEAAiwqGtibIevyZAEritwt8PZGyLQ,1589
|
|
82
83
|
datachain/sql/types.py,sha256=BzUm0nCcMPASvdqpQouX5bdVcK3G3DBfeeNhau7X_hA,10234
|
|
@@ -94,9 +95,10 @@ datachain/sql/sqlite/base.py,sha256=nPMF6_FF04hclDNZev_YfxMgbJAsWEdF-rU2pUhqBtc,
|
|
|
94
95
|
datachain/sql/sqlite/types.py,sha256=oP93nLfTBaYnN0z_4Dsv-HZm8j9rrUf1esMM-z3JLbg,1754
|
|
95
96
|
datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
|
|
96
97
|
datachain/text/__init__.py,sha256=-yxHL2gVl3H0Zxam6iWUO6F1Mc4QAFHX6z-5fjHND74,72
|
|
97
|
-
datachain
|
|
98
|
-
datachain-0.2.
|
|
99
|
-
datachain-0.2.
|
|
100
|
-
datachain-0.2.
|
|
101
|
-
datachain-0.2.
|
|
102
|
-
datachain-0.2.
|
|
98
|
+
datachain/torch/__init__.py,sha256=9QJW8h0FevIXEykRsxQ7XzMDXvdIkv3kVf_UY95CTyg,600
|
|
99
|
+
datachain-0.2.11.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
100
|
+
datachain-0.2.11.dist-info/METADATA,sha256=OVKgVc-Wc75AAQIY6hGL1CEBmnwksfgOXfiUen_xAOM,16759
|
|
101
|
+
datachain-0.2.11.dist-info/WHEEL,sha256=FZ75kcLy9M91ncbIgG8dnpCncbiKXSRGJ_PFILs6SFg,91
|
|
102
|
+
datachain-0.2.11.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
|
|
103
|
+
datachain-0.2.11.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
|
|
104
|
+
datachain-0.2.11.dist-info/RECORD,,
|
datachain/image/__init__.py
DELETED
datachain/lib/cached_stream.py
DELETED
|
@@ -1,38 +0,0 @@
|
|
|
1
|
-
from abc import ABC
|
|
2
|
-
from contextlib import AbstractContextManager
|
|
3
|
-
|
|
4
|
-
from datachain.cache import UniqueId
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
class AbstractCachedStream(AbstractContextManager, ABC):
|
|
8
|
-
def __init__(self, catalog, uid: UniqueId):
|
|
9
|
-
self.catalog = catalog
|
|
10
|
-
self.uid = uid
|
|
11
|
-
self.mode = "rb"
|
|
12
|
-
|
|
13
|
-
def set_mode(self, mode):
|
|
14
|
-
self.mode = mode
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
class PreCachedStream(AbstractCachedStream):
|
|
18
|
-
def __init__(self, catalog, uid: UniqueId):
|
|
19
|
-
super().__init__(catalog, uid)
|
|
20
|
-
self.client = self.catalog.get_client(self.uid.storage)
|
|
21
|
-
self.cached_file = None
|
|
22
|
-
|
|
23
|
-
def get_path_in_cache(self):
|
|
24
|
-
return self.catalog.cache.path_from_checksum(self.uid.get_hash())
|
|
25
|
-
|
|
26
|
-
def __enter__(self):
|
|
27
|
-
self.client.download(self.uid)
|
|
28
|
-
self.cached_file = open(self.get_path_in_cache(), self.mode)
|
|
29
|
-
return self.cached_file
|
|
30
|
-
|
|
31
|
-
def __exit__(self, *args):
|
|
32
|
-
self.cached_file.close()
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
class PreDownloadStream(PreCachedStream):
|
|
36
|
-
def __exit__(self, *args):
|
|
37
|
-
super().__exit__(*args)
|
|
38
|
-
self.catalog.cache.remove(self.uid)
|
datachain/lib/claude.py
DELETED
|
@@ -1,69 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
from typing import Callable, Literal, Optional
|
|
3
|
-
|
|
4
|
-
import anthropic
|
|
5
|
-
|
|
6
|
-
from datachain.lib.feature import Feature
|
|
7
|
-
from datachain.lib.file import File
|
|
8
|
-
|
|
9
|
-
default_model_name = "claude-3-haiku-20240307"
|
|
10
|
-
DEFAULT_OUTPUT_TOKENS = 1024
|
|
11
|
-
|
|
12
|
-
# This classes can be auto-generated:
|
|
13
|
-
# >> from anthropic.types.message import Message
|
|
14
|
-
# >> ClaudeMessage = pydantic_to_feature(Message)
|
|
15
|
-
# However, auto-generated pydentic classes do not work in multithreading mode.
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
class UsageFr(Feature):
|
|
19
|
-
input_tokens: int = 0
|
|
20
|
-
output_tokens: int = 0
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
class TextBlockFr(Feature):
|
|
24
|
-
text: str = ""
|
|
25
|
-
type: str = "text"
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
class ClaudeMessage(Feature):
|
|
29
|
-
id: str = ""
|
|
30
|
-
content: list[TextBlockFr]
|
|
31
|
-
model: str = ""
|
|
32
|
-
role: str = ""
|
|
33
|
-
stop_reason: Optional[Literal["end_turn", "max_tokens", "stop_sequence"]] = None
|
|
34
|
-
stop_sequence: Optional[str] = None
|
|
35
|
-
type: Literal["message"] = "message"
|
|
36
|
-
usage: UsageFr = UsageFr()
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
def claude_processor(
|
|
40
|
-
prompt: str,
|
|
41
|
-
messages: Optional[list] = None,
|
|
42
|
-
model: str = "claude-3-haiku-20240307",
|
|
43
|
-
api_key: Optional[str] = "",
|
|
44
|
-
max_retries: int = 5,
|
|
45
|
-
temperature: float = 0.9,
|
|
46
|
-
max_tokens: int = 1024,
|
|
47
|
-
**kwargs,
|
|
48
|
-
) -> Callable:
|
|
49
|
-
if not messages:
|
|
50
|
-
messages = []
|
|
51
|
-
api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
|
|
52
|
-
|
|
53
|
-
def claude_func(file) -> ClaudeMessage:
|
|
54
|
-
try:
|
|
55
|
-
data = file.get_value() if isinstance(file, File) else file
|
|
56
|
-
client = anthropic.Anthropic(api_key=api_key, max_retries=max_retries)
|
|
57
|
-
response = client.messages.create(
|
|
58
|
-
model=model,
|
|
59
|
-
system=prompt,
|
|
60
|
-
messages=[{"role": "user", "content": data}, *messages],
|
|
61
|
-
temperature=temperature,
|
|
62
|
-
max_tokens=max_tokens,
|
|
63
|
-
**kwargs,
|
|
64
|
-
)
|
|
65
|
-
return ClaudeMessage(**response.model_dump())
|
|
66
|
-
except Exception: # noqa: BLE001
|
|
67
|
-
return ClaudeMessage(content=[])
|
|
68
|
-
|
|
69
|
-
return claude_func
|