datachain 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic.
- datachain/catalog/catalog.py +13 -91
- datachain/cli.py +6 -38
- datachain/client/fsspec.py +3 -0
- datachain/client/hf.py +47 -0
- datachain/data_storage/metastore.py +2 -29
- datachain/data_storage/sqlite.py +3 -12
- datachain/data_storage/warehouse.py +20 -29
- datachain/dataset.py +44 -32
- datachain/lib/arrow.py +22 -6
- datachain/lib/dataset_info.py +4 -0
- datachain/lib/dc.py +149 -35
- datachain/lib/file.py +10 -33
- datachain/lib/hf.py +2 -1
- datachain/lib/listing.py +102 -94
- datachain/lib/listing_info.py +32 -0
- datachain/lib/meta_formats.py +4 -4
- datachain/lib/signal_schema.py +5 -2
- datachain/lib/webdataset.py +1 -1
- datachain/node.py +13 -0
- datachain/query/dataset.py +25 -87
- datachain/query/metrics.py +8 -0
- datachain/utils.py +5 -0
- {datachain-0.3.8.dist-info → datachain-0.3.10.dist-info}/METADATA +14 -14
- {datachain-0.3.8.dist-info → datachain-0.3.10.dist-info}/RECORD +28 -26
- {datachain-0.3.8.dist-info → datachain-0.3.10.dist-info}/WHEEL +1 -1
- {datachain-0.3.8.dist-info → datachain-0.3.10.dist-info}/LICENSE +0 -0
- {datachain-0.3.8.dist-info → datachain-0.3.10.dist-info}/entry_points.txt +0 -0
- {datachain-0.3.8.dist-info → datachain-0.3.10.dist-info}/top_level.txt +0 -0
datachain/lib/listing_info.py
ADDED
@@ -0,0 +1,32 @@
+from datetime import datetime, timedelta, timezone
+from typing import Optional
+
+from datachain.client import Client
+from datachain.lib.dataset_info import DatasetInfo
+from datachain.lib.listing import LISTING_PREFIX, LISTING_TTL
+
+
+class ListingInfo(DatasetInfo):
+    @property
+    def uri(self) -> str:
+        return self.name.removeprefix(LISTING_PREFIX)
+
+    @property
+    def storage_uri(self) -> str:
+        client, _ = Client.parse_url(self.uri, None)  # type: ignore[arg-type]
+        return client.uri
+
+    @property
+    def expires(self) -> Optional[datetime]:
+        if not self.finished_at:
+            return None
+        return self.finished_at + timedelta(seconds=LISTING_TTL)
+
+    @property
+    def is_expired(self) -> bool:
+        return datetime.now(timezone.utc) > self.expires if self.expires else False
+
+    @property
+    def last_inserted_at(self):
+        # TODO we need to add updated_at to dataset version or explicit last_inserted_at
+        raise NotImplementedError
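Note: the expiry properties above drive listing-cache invalidation. A minimal standalone sketch of the same computation, assuming LISTING_TTL is a TTL in seconds (the 4-hour value below is illustrative, not taken from this release):

    from datetime import datetime, timedelta, timezone

    LISTING_TTL = 4 * 60 * 60  # assumed TTL in seconds; illustrative value

    # Mirrors ListingInfo.expires / ListingInfo.is_expired above:
    finished_at = datetime.now(timezone.utc) - timedelta(hours=5)
    expires = finished_at + timedelta(seconds=LISTING_TTL)
    is_expired = datetime.now(timezone.utc) > expires
    print(is_expired)  # True: the listing finished more than LISTING_TTL ago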
datachain/lib/meta_formats.py
CHANGED
@@ -54,10 +54,10 @@ def read_schema(source_file, data_type="csv", expr=None, model_name=None):
     try:
         with source_file.open() as fd:  # CSV can be larger than memory
             if data_type == "csv":
-                data_string += fd.readline().
-                data_string += fd.readline().
+                data_string += fd.readline().replace("\r", "")
+                data_string += fd.readline().replace("\r", "")
             elif data_type == "jsonl":
-                data_string = fd.readline().
+                data_string = fd.readline().replace("\r", "")
             else:
                 data_string = fd.read()  # other meta must fit into RAM
     except OSError as e:
@@ -120,7 +120,7 @@ def read_meta(  # noqa: C901
     sys.stdout = captured_output
     try:
         chain = (
-            DataChain.from_storage(schema_from)
+            DataChain.from_storage(schema_from, type="text")
             .limit(1)
             .map(  # dummy column created (#1615)
                 meta_schema=lambda file: read_schema(
datachain/lib/signal_schema.py
CHANGED
@@ -16,7 +16,6 @@ from typing import (
     get_origin,
 )
 
-import sqlalchemy as sa
 from pydantic import BaseModel, create_model
 from typing_extensions import Literal as LiteralEx
 
@@ -341,7 +340,7 @@ class SignalSchema:
         signals = [
             DEFAULT_DELIMITER.join(path)
             if not as_columns
-            else
+            else Column(DEFAULT_DELIMITER.join(path), python_to_sql(_type))
             for path, _type, has_subtree, _ in self.get_flat_tree()
             if not has_subtree
         ]
@@ -415,6 +414,10 @@ class SignalSchema:
                 # renaming existing signal
                 del new_values[value.name]
                 new_values[name] = self.values[value.name]
+            elif name in self.values:
+                # changing the type of existing signal, e.g File -> ImageFile
+                del new_values[name]
+                new_values[name] = args_map[name]
             else:
                 # adding new signal
                 new_values.update(sql_to_python({name: value}))
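The new `elif` branch means a mutation can now replace the type of an existing signal (e.g. File -> ImageFile) instead of only renaming or adding. A standalone toy illustration of the branch's dict mechanics (not the real SignalSchema API):

    # Toy dicts standing in for SignalSchema.values and the mutate args_map.
    values = {"file": "File", "size": "int"}
    args_map = {"file": "ImageFile"}  # re-type an existing signal

    new_values = dict(values)
    for name, value in args_map.items():
        if name in values:
            # The new elif: delete then re-insert, so the re-typed signal
            # moves to the end of the schema's insertion order.
            del new_values[name]
            new_values[name] = value
    print(new_values)  # {'size': 'int', 'file': 'ImageFile'}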
datachain/lib/webdataset.py
CHANGED
@@ -222,7 +222,7 @@ class TarStream(File):
         self._tar = None
 
     def open(self):
-        self._tar = tarfile.open(fileobj=super().open())
+        self._tar = tarfile.open(fileobj=super().open())  # noqa: SIM115
         return self
 
     def getmembers(self) -> list[tarfile.TarInfo]:
datachain/node.py
CHANGED
@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING, Any, Optional
 import attrs
 
 from datachain.cache import UniqueId
+from datachain.lib.file import File
 from datachain.storage import StorageURI
 from datachain.utils import TIME_ZERO, time_to_str
 
@@ -189,6 +190,18 @@ class Entry:
             return ""
         return split[0]
 
+    def to_file(self, source: str) -> File:
+        return File(
+            source=source,
+            path=self.path,
+            size=self.size,
+            version=self.version,
+            etag=self.etag,
+            is_latest=self.is_latest,
+            last_modified=self.last_modified,
+            location=self.location,
+        )
+
 
 def get_path(parent: str, name: str):
     return f"{parent}/{name}" if parent else name
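`Entry.to_file` bridges the older node/listing layer to the `File` model used by the lib layer, copying the entry's metadata and attaching the caller-supplied source URI. A hedged sketch of the kind of object it produces (field values are illustrative):

    from datachain.lib.file import File

    # Roughly what entry.to_file("s3://example-bucket") returns for an
    # entry at images/cat.jpg; all values below are illustrative.
    file = File(
        source="s3://example-bucket",
        path="images/cat.jpg",
        size=1024,
        etag="abc123",
        is_latest=True,
    )
    print(file.path, file.size)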
datachain/query/dataset.py
CHANGED
@@ -1,5 +1,4 @@
 import contextlib
-import datetime
 import inspect
 import json
 import logging
@@ -60,7 +59,6 @@ from datachain.utils import (
     get_datachain_executable,
 )
 
-from .metrics import metrics
 from .schema import C, UDFParamSpec, normalize_param
 from .session import Session
 from .udf import UDFBase, UDFClassWrapper, UDFFactory, UDFType
@@ -219,7 +217,7 @@ class IndexingStep(StartingStep):
             recursive=self.recursive,
         )
 
-        storage = self.catalog.get_storage(uri)
+        storage = self.catalog.metastore.get_storage(uri)
 
         return step_result(q, dataset_rows.c, dependencies=[storage.uri])
 
@@ -296,15 +294,23 @@ class DatasetDiffOperation(Step):
 
 @frozen
 class Subtract(DatasetDiffOperation):
-    on: Sequence[str]
+    on: Sequence[tuple[str, str]]
 
     def query(self, source_query: Select, target_query: Select) -> sa.Selectable:
         sq = source_query.alias("source_query")
         tq = target_query.alias("target_query")
         where_clause = sa.and_(
-
-
-
+            *[
+                getattr(
+                    sq.c, col_name[0] if isinstance(col_name, tuple) else col_name
+                ).is_not_distinct_from(
+                    getattr(
+                        tq.c, col_name[1] if isinstance(col_name, tuple) else col_name
+                    )
+                )
+                for col_name in self.on
+            ]
+        )
         return sq.select().except_(sq.select().where(where_clause))
 
 
@@ -1571,10 +1577,10 @@ class DatasetQuery:
 
     @detach
     def subtract(self, dq: "DatasetQuery") -> "Self":
-        return self._subtract(dq, on=["source", "path"])
+        return self._subtract(dq, on=[("source", "source"), ("path", "path")])
 
     @detach
-    def _subtract(self, dq: "DatasetQuery", on: Sequence[str]) -> "Self":
+    def _subtract(self, dq: "DatasetQuery", on: Sequence[tuple[str, str]]) -> "Self":
         query = self.clone()
         query.steps.append(Subtract(dq, self.catalog, on=on))
         return query
@@ -1626,7 +1632,7 @@ class DatasetQuery:
             )
         else:
             # storage dependency - its name is a valid StorageURI
-            storage = self.catalog.get_storage(dependency)
+            storage = self.catalog.metastore.get_storage(dependency)
             self.catalog.metastore.add_storage_dependency(
                 StorageURI(dataset.name),
                 version,
@@ -1717,54 +1723,6 @@ def _get_output_fd_for_write() -> Union[str, int]:
     return msvcrt.open_osfhandle(int(handle), os.O_WRONLY)  # type: ignore[attr-defined]
 
 
-@attrs.define
-class ExecutionResult:
-    preview: list[dict] = attrs.field(factory=list)
-    dataset: Optional[tuple[str, int]] = None
-    metrics: dict[str, Any] = attrs.field(factory=dict)
-
-
-def _send_result(dataset_query: DatasetQuery) -> None:
-    class JSONSerialize(json.JSONEncoder):
-        def default(self, obj):
-            if isinstance(obj, (datetime.datetime, datetime.date)):
-                return obj.isoformat()
-            if isinstance(obj, bytes):
-                return list(obj[:1024])
-            return super().default(obj)
-
-    try:
-        preview_args: dict[str, Any] = json.loads(
-            os.getenv("DATACHAIN_QUERY_PREVIEW_ARGS", "")
-        )
-    except ValueError:
-        preview_args = {}
-
-    columns = preview_args.get("columns") or []
-
-    if type(dataset_query) is DatasetQuery:
-        preview_query = dataset_query.select(*columns)
-    else:
-        preview_query = dataset_query.select(*columns, _sys=False)
-
-    preview_query = preview_query.limit(preview_args.get("limit", 10)).offset(
-        preview_args.get("offset", 0)
-    )
-
-    dataset: Optional[tuple[str, int]] = None
-    if dataset_query.attached:
-        assert dataset_query.name, "Dataset name should be provided"
-        assert dataset_query.version, "Dataset version should be provided"
-        dataset = dataset_query.name, dataset_query.version
-
-    preview = preview_query.to_db_records()
-    result = ExecutionResult(preview, dataset, metrics)
-    data = attrs.asdict(result)
-
-    with open(_get_output_fd_for_write(), mode="w") as f:
-        json.dump(data, f, cls=JSONSerialize)
-
-
 def query_wrapper(dataset_query: DatasetQuery) -> DatasetQuery:
     """
     Wrapper function that wraps the last statement of user query script.
@@ -1776,41 +1734,21 @@ def query_wrapper(dataset_query: DatasetQuery) -> DatasetQuery:
 
     catalog = dataset_query.catalog
     save = bool(os.getenv("DATACHAIN_QUERY_SAVE"))
-    save_as = os.getenv("DATACHAIN_QUERY_SAVE_AS")
 
     is_session_temp_dataset = dataset_query.name and dataset_query.name.startswith(
         dataset_query.session.get_temp_prefix()
     )
 
-    if
-        if dataset_query.attached:
-            dataset_name = dataset_query.name
-            version = dataset_query.version
-            assert dataset_name, "Dataset name should be provided in attached mode"
-            assert version, "Dataset version should be provided in attached mode"
-
-            dataset = catalog.get_dataset(dataset_name)
-
-            try:
-                target_dataset = catalog.get_dataset(save_as)
-            except DatasetNotFoundError:
-                target_dataset = None
-
-            if target_dataset:
-                dataset = catalog.register_dataset(dataset, version, target_dataset)
-            else:
-                dataset = catalog.register_new_dataset(dataset, version, save_as)
-
-            dataset_query = DatasetQuery(
-                name=dataset.name,
-                version=dataset.latest_version,
-                catalog=catalog,
-            )
-        else:
-            dataset_query = dataset_query.save(save_as)
-    elif save and (is_session_temp_dataset or not dataset_query.attached):
+    if save and (is_session_temp_dataset or not dataset_query.attached):
         name = catalog.generate_query_dataset_name()
         dataset_query = dataset_query.save(name)
 
-
+    dataset: Optional[tuple[str, int]] = None
+    if dataset_query.attached:
+        assert dataset_query.name, "Dataset name should be provided"
+        assert dataset_query.version, "Dataset version should be provided"
+        dataset = dataset_query.name, dataset_query.version
+
+    with open(_get_output_fd_for_write(), mode="w") as f:
+        json.dump(dataset, f)
     return dataset_query
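The `on` parameter of `_subtract` now carries (source_column, target_column) pairs, so a subtract can match differently named columns across the two queries; the public `subtract()` keeps its old behavior via identity pairs. A hedged sketch with placeholder dataset names:

    from datachain.query import DatasetQuery

    # catalog: an initialized datachain Catalog (placeholder); the dataset
    # names below are illustrative.
    source = DatasetQuery(name="all_files", version=1, catalog=catalog)
    done = DatasetQuery(name="processed", version=1, catalog=catalog)

    # Rows of "all_files" whose (source, path) pair also appears in
    # "processed" are removed; per the diff above, the comparison uses
    # is_not_distinct_from, so NULLs compare equal.
    remaining = source.subtract(done)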
datachain/query/metrics.py
CHANGED
@@ -1,3 +1,4 @@
+import os
 from typing import Optional, Union
 
 metrics: dict[str, Union[str, int, float, bool, None]] = {}
@@ -13,6 +14,13 @@ def set(key: str, value: Union[str, int, float, bool, None]) -> None:  # noqa: P
         raise TypeError("Value must be a string, int, float or bool")
     metrics[key] = value
 
+    if job_id := os.getenv("DATACHAIN_JOB_ID"):
+        from datachain.data_storage.job import JobStatus
+        from datachain.query.session import Session
+
+        metastore = Session.get().catalog.metastore
+        metastore.set_job_status(job_id, JobStatus.RUNNING, metrics=metrics)
+
 
 def get(key: str) -> Optional[Union[str, int, float, bool]]:
     """Get a metric value."""
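With this change, `set` still updates the in-process `metrics` dict and, when running inside a job (DATACHAIN_JOB_ID is set), additionally pushes the accumulated metrics to the metastore with the job's RUNNING status. Caller-side usage stays the same:

    from datachain.query import metrics

    # Inside a query script; when DATACHAIN_JOB_ID is set this now also
    # persists the metrics dict via metastore.set_job_status(...).
    metrics.set("rows_processed", 1000)
    print(metrics.get("rows_processed"))  # 1000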
datachain/utils.py
CHANGED
@@ -448,3 +448,8 @@ def get_datachain_executable() -> list[str]:
     if datachain_exec_path := os.getenv("DATACHAIN_EXEC_PATH"):
         return [datachain_exec_path]
     return [sys.executable, "-m", "datachain"]
+
+
+def uses_glob(path: str) -> bool:
+    """Checks if some URI path has glob syntax in it"""
+    return glob.has_magic(os.path.basename(os.path.normpath(path)))
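`uses_glob` only inspects the final path component, so wildcards in the basename are detected while plain directory paths are not. Reproduced standalone with illustrative inputs:

    import glob
    import os

    def uses_glob(path: str) -> bool:
        """Checks if some URI path has glob syntax in it"""
        return glob.has_magic(os.path.basename(os.path.normpath(path)))

    print(uses_glob("s3://bucket/images/*.jpg"))  # True: "*" in the basename
    print(uses_glob("s3://bucket/images/"))       # False: plain directory path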
{datachain-0.3.8.dist-info → datachain-0.3.10.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.3.8
+Version: 0.3.10
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -42,6 +42,7 @@ Requires-Dist: datamodel-code-generator >=0.25
 Requires-Dist: Pillow <11,>=10.0.0
 Requires-Dist: msgpack <2,>=1.0.4
 Requires-Dist: psutil
+Requires-Dist: huggingface-hub
 Requires-Dist: numpy <2,>=1 ; sys_platform == "win32"
 Provides-Extra: dev
 Requires-Dist: datachain[docs,tests] ; extra == 'dev'
@@ -67,7 +68,7 @@ Requires-Dist: pdfplumber ==0.11.4 ; extra == 'examples'
 Requires-Dist: huggingface-hub[hf_transfer] ; extra == 'examples'
 Provides-Extra: hf
 Requires-Dist: numba >=0.60.0 ; extra == 'hf'
-Requires-Dist: datasets[audio,vision] ; extra == 'hf'
+Requires-Dist: datasets[audio,vision] >=2.21.0 ; extra == 'hf'
 Provides-Extra: remote
 Requires-Dist: lz4 ; extra == 'remote'
 Requires-Dist: requests >=2.22.0 ; extra == 'remote'
@@ -115,31 +116,30 @@ AI 🔗 DataChain
 
 DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
 It is made to organize your unstructured data into datasets and wrangle it at scale on
-your local machine.
+your local machine. Datachain does not abstract or hide the AI models and API calls, but helps to integrate them into the postmodern data stack.
 
 Key Features
 ============
 
 📂 **Storage as a Source of Truth.**
-  - Process unstructured data without redundant copies
+  - Process unstructured data without redundant copies from S3, GCP, Azure, and local
     file systems.
-  - Multimodal data: images, video, text, PDFs, JSONs, CSVs, parquet.
-  -
+  - Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet.
+  - Unite files and metadata together into persistent, versioned, columnar datasets.
 
🐍 **Python-friendly data pipelines.**
  - Operate on Python objects and object fields.
-  - Built-in parallelization and out-of-memory compute without
-    Spark jobs.
+  - Built-in parallelization and out-of-memory compute without SQL or Spark.
 
🧠 **Data Enrichment and Processing.**
-  - Generate metadata
-  - Filter, join, and group by
-  - Pass datasets to Pytorch and Tensorflow, or export back into storage.
+  - Generate metadata using local AI models and LLM APIs.
+  - Filter, join, and group by metadata. Search by vector embeddings.
+  - Pass datasets to Pytorch and Tensorflow, or export them back into storage.
 
🚀 **Efficiency.**
  - Parallelization, out-of-memory workloads and data caching.
  - Vectorized operations on Python object fields: sum, count, avg, etc.
-  -
+  - Optimized vector search.
 
@@ -164,7 +164,7 @@ where each image has a matching JSON file like `cat.1009.json`:
     "inference": {"class": "dog", "confidence": 0.68}
 }
 
-Example of downloading only high-confidence cat images using JSON metadata:
+Example of downloading only "high-confidence cat" inferred images using JSON metadata:
 
 
 .. code:: py
@@ -234,7 +234,7 @@ detected are then copied to the local directory.
 LLM judging chatbots
 =============================
 
-LLMs can work as
+LLMs can work as universal classifiers. In the example below,
 we employ a free API from Mistral to judge the `publicly available`_ chatbot dialogs. Please get a free
 Mistral API key at https://console.mistral.ai
{datachain-0.3.8.dist-info → datachain-0.3.10.dist-info}/RECORD
CHANGED
@@ -2,62 +2,64 @@ datachain/__init__.py,sha256=GeyhE-5LgfJav2OKYGaieP2lBvf2Gm-ihj7thnK9zjI,800
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=biF8M8fQujtj5xs0VLi8S16eBtzG6kceWlO_NILbCsg,8197
 datachain/cache.py,sha256=wznC2pge6RhlPTaJfBVGjmBc6bxWCPThu4aTFMltvFU,4076
-datachain/cli.py,sha256=
+datachain/cli.py,sha256=ECf_z5X8ILDJdUn2Cpb_z-ZjSRIzn7skiuMGfM-y9i0,30999
 datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
 datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
-datachain/dataset.py,sha256=
+datachain/dataset.py,sha256=EcYjhHg1dxxPbDwSuIxc-mDRDo3v_pYf79fMy4re1oA,14740
 datachain/error.py,sha256=GY9KYTmb7GHXn2gGHV9X-PBhgwLj3i7VpK7tGHtAoGM,1279
 datachain/job.py,sha256=bk25bIqClhgRPzlXAhxpTtDeewibQe5l3S8Cf7db0gM,1229
 datachain/listing.py,sha256=keLkvPfumDA3gijeIiinH5yGWe71qCxgF5HqqP5AeH4,8299
-datachain/node.py,sha256=
+datachain/node.py,sha256=dcm_7dVurFHpI0EHV2K6SjpJyh-gN4PVWAB-20quk04,6382
 datachain/nodes_fetcher.py,sha256=kca19yvu11JxoVY1t4_ydp1FmchiV88GnNicNBQ9NIA,831
 datachain/nodes_thread_pool.py,sha256=ZyzBvUImIPmi4WlKC2SW2msA0UhtembbTdcs2nx29A0,3191
 datachain/progress.py,sha256=7_8FtJs770ITK9sMq-Lt4k4k18QmYl4yIG_kCoWID3o,4559
 datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
-datachain/utils.py,sha256=
+datachain/utils.py,sha256=HKUdVqreBTzcCULAYRw1sC6z33OaomVD1WoMSoFcPHg,13148
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=
+datachain/catalog/catalog.py,sha256=VO-Otcg3QLbb3E9H8gmgu-xJWQqIbWmLP2QyPg8cUos,75386
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
 datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
 datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
 datachain/client/azure.py,sha256=LXSahE0Z6r4dXqpBkKnq3J5fg7N7ymC1lSn-1SoILGc,2687
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
-datachain/client/fsspec.py,sha256=
+datachain/client/fsspec.py,sha256=LQb5tr-pP9umCFYo3nGJR_dZxUyiSN7IDE8jhp1TXco,13333
 datachain/client/gcs.py,sha256=P_E3mhzhXR9mJ_wc3AYZuczzwOJ0-D3J5qhJXeSU-xk,4518
+datachain/client/hf.py,sha256=R-F6Ks6aVM9wSNkIXOkOnZFwsJlfdRwJjymRa78RLjM,1246
 datachain/client/local.py,sha256=H8TNY8pi2kA8y9_f_1XLUjJF66f229qC_b2y4xGkzdU,5300
 datachain/client/s3.py,sha256=aQxfMH8G8bUjmHF1-6P90MSkXsU5DgOPEVlKWLu459I,6568
 datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZv32Y8,398
 datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
 datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
 datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
-datachain/data_storage/metastore.py,sha256=
+datachain/data_storage/metastore.py,sha256=cHN0xmbUvChyayHHZm3Vqxr87jFqojPSlGBqhTPStlE,54519
 datachain/data_storage/schema.py,sha256=JKpSEz8igpwZ9zkpRPYVXZxEpiXuLKEs2WNhH0KqM6U,8552
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
-datachain/data_storage/sqlite.py,sha256=
-datachain/data_storage/warehouse.py,sha256=
+datachain/data_storage/sqlite.py,sha256=Z4B2KDL4C8Uio2aLMxaKv0t2MoOtCV3bSqWg4X9mTFg,28048
+datachain/data_storage/warehouse.py,sha256=f7ETnYIXx5KMcPfwg_4bh_00QJiMLIliwE_41vmRGUo,33037
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/arrow.py,sha256=
+datachain/lib/arrow.py,sha256=dV17oGiknqEW55ogGK_9T0ycNFwd2z-EFOW0AQiR6TU,5840
 datachain/lib/clip.py,sha256=33RL11OIqfbwyhvBgiMGM8rDAnZx1IRmxk9dY89ls3Q,6130
 datachain/lib/data_model.py,sha256=gHIjlow84GMRDa78yLL1Ud-N18or21fnTyPEwsatpXY,2045
-datachain/lib/dataset_info.py,sha256=
-datachain/lib/dc.py,sha256=
-datachain/lib/file.py,sha256=
-datachain/lib/hf.py,sha256=
+datachain/lib/dataset_info.py,sha256=srPPhI2UHf6hFPBecyFEVw2SS5aPisIIMsvGgKqi7ss,2366
+datachain/lib/dc.py,sha256=TOC5-Ar8GQBkFpWkxVeg1og_iCJt_c0FCqA8IGzUrAk,66929
+datachain/lib/file.py,sha256=WOOYw3LcGROA6wshJ_aZkSgcTqfB4UxTbZDTx9KqAOg,11429
+datachain/lib/hf.py,sha256=ZiMvgy3DYiklGKZv-w7gevrHOgn3bGfpTlpDPOHCNqs,5336
 datachain/lib/image.py,sha256=WbcwSaFzuyqjg4x4hH5CUogeUQjkZFjQHqw_oDEV1nA,2655
-datachain/lib/listing.py,sha256=
-datachain/lib/
+datachain/lib/listing.py,sha256=S9Xn_Saxu4xk3K_01VexkfMZW0INQiATlidt2bzgWKY,3938
+datachain/lib/listing_info.py,sha256=sr5KzCXlCxlPuRmy_pVadD4miLpp5y0btvyaIPcluwI,996
+datachain/lib/meta_formats.py,sha256=0YM7PMcGSLpUKZppyzFi8RvoSwYOqbciFGvzkvYdTXA,7133
 datachain/lib/model_store.py,sha256=c4USXsBBjrGH8VOh4seIgOiav-qHOwdoixtxfLgU63c,2409
 datachain/lib/pytorch.py,sha256=vK3GbWCy7kunN7ubul6w1hrWmJLja56uTCiMG_7XVQA,5623
 datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
-datachain/lib/signal_schema.py,sha256=
+datachain/lib/signal_schema.py,sha256=hqQLwUmt3w8RLa96MtubK9N2CBXqqTPrUkSRXc0ktt4,20275
 datachain/lib/text.py,sha256=vqs1SQdsw1vCzfvOanIeT4xY2R2TmPonElBgYDVeZmY,1241
 datachain/lib/udf.py,sha256=nG7DDuPgZ5ZuijwvDoCq-OZMxlDM8vFNzyxMmik0Y1c,11716
 datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
 datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
 datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/webdataset.py,sha256=
+datachain/lib/webdataset.py,sha256=Q3UlCk66341sq-nvFbBCX4Cv3cYXBK9n12ejG4axPXE,8298
 datachain/lib/webdataset_laion.py,sha256=PQP6tQmUP7Xu9fPuAGK1JDBYA6T5UufYMUTGaxgspJA,2118
 datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/convert/flatten.py,sha256=Uebc5CeqCsacp-nr6IG9i6OGuUavXqdqnoGctZBk3RQ,1384
@@ -68,9 +70,9 @@ datachain/lib/convert/values_to_tuples.py,sha256=YOdbjzHq-uj6-cV2Qq43G72eN2avMND
 datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
 datachain/query/batch.py,sha256=-vlpINJiertlnaoUVv1C95RatU0F6zuhpIYRufJRo1M,3660
 datachain/query/builtins.py,sha256=EmKPYsoQ46zwdyOn54MuCzvYFmfsBn5F8zyF7UBUfrc,2550
-datachain/query/dataset.py,sha256=
+datachain/query/dataset.py,sha256=v5gCAWswv6DoEWkN7DuOc7BL4Afz8p5ZSA_GNxn5_R4,59056
 datachain/query/dispatch.py,sha256=GBh3EZHDp5AaXxrjOpfrpfsuy7Umnqxu-MAXcK9X3gc,12945
-datachain/query/metrics.py,sha256=
+datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
 datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
 datachain/query/schema.py,sha256=BvHipN79CnSTbVFcfIEwzo1npe7HmThnk0iY-CSLEkM,7899
@@ -95,9 +97,9 @@ datachain/sql/sqlite/base.py,sha256=WLPHBhZbXbiqPoRV1VgDrXJqku4UuvJpBhYeQ0k5rI8,
 datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
+datachain-0.3.10.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.3.10.dist-info/METADATA,sha256=eUsgu4Y4iK_rJbx66MCmeKuPaWS1iMKRL6mtbEB6ucY,17056
+datachain-0.3.10.dist-info/WHEEL,sha256=cVxcB9AmuTcXqmwrtPhNK88dr7IR_b6qagTj0UvIEbY,91
+datachain-0.3.10.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.3.10.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.3.10.dist-info/RECORD,,
{datachain-0.3.8.dist-info → datachain-0.3.10.dist-info}/LICENSE
File without changes
{datachain-0.3.8.dist-info → datachain-0.3.10.dist-info}/entry_points.txt
File without changes
{datachain-0.3.8.dist-info → datachain-0.3.10.dist-info}/top_level.txt
File without changes