datachain 0.2.12__py3-none-any.whl → 0.2.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/catalog/catalog.py +7 -1
- datachain/cli.py +11 -0
- datachain/data_storage/metastore.py +0 -4
- datachain/data_storage/schema.py +7 -3
- datachain/data_storage/sqlite.py +1 -4
- datachain/data_storage/warehouse.py +1 -24
- datachain/lib/convert/flatten.py +4 -4
- datachain/lib/convert/values_to_tuples.py +4 -1
- datachain/lib/dc.py +100 -5
- datachain/lib/file.py +23 -22
- datachain/lib/meta_formats.py +6 -5
- datachain/query/dataset.py +29 -23
- datachain/sql/sqlite/base.py +3 -3
- datachain/sql/sqlite/types.py +5 -13
- {datachain-0.2.12.dist-info → datachain-0.2.14.dist-info}/METADATA +42 -44
- {datachain-0.2.12.dist-info → datachain-0.2.14.dist-info}/RECORD +20 -20
- {datachain-0.2.12.dist-info → datachain-0.2.14.dist-info}/LICENSE +0 -0
- {datachain-0.2.12.dist-info → datachain-0.2.14.dist-info}/WHEEL +0 -0
- {datachain-0.2.12.dist-info → datachain-0.2.14.dist-info}/entry_points.txt +0 -0
- {datachain-0.2.12.dist-info → datachain-0.2.14.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED
@@ -1,4 +1,5 @@
 import ast
+import glob
 import io
 import json
 import logging
@@ -709,7 +710,12 @@ class Catalog:

        client_config = client_config or self.client_config
        client, path = self.parse_url(source, **client_config)
-
+       stem = os.path.basename(os.path.normpath(path))
+       prefix = (
+           posixpath.dirname(path)
+           if glob.has_magic(stem) or client.fs.isfile(source)
+           else path
+       )
        storage_dataset_name = Storage.dataset_name(
            client.uri, posixpath.join(prefix, "")
        )
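The new listing prefix keeps whole directories intact and only strips the last path component for glob patterns or single files. A minimal standalone sketch of that check, assuming a plain boolean stands in for the `client.fs.isfile(source)` call:

```py
import glob
import os
import posixpath

def listing_prefix(path: str, is_file: bool) -> str:
    # Strip the last component only for glob patterns ("*.jpg") or single files;
    # a plain directory path is used as the prefix unchanged.
    stem = os.path.basename(os.path.normpath(path))
    return posixpath.dirname(path) if glob.has_magic(stem) or is_file else path

print(listing_prefix("bucket/images/*.jpg", is_file=False))  # bucket/images
print(listing_prefix("bucket/images", is_file=False))        # bucket/images
```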
datachain/cli.py
CHANGED
@@ -491,6 +491,7 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
        type=int,
        help="Dataset version",
    )
+   show_parser.add_argument("--schema", action="store_true", help="Show schema")
    add_show_args(show_parser)

    query_parser = subp.add_parser(
@@ -816,10 +817,15 @@ def show(
    offset: int = 0,
    columns: Sequence[str] = (),
    no_collapse: bool = False,
+   schema: bool = False,
) -> None:
+   from datachain.lib.dc import DataChain
    from datachain.query import DatasetQuery
    from datachain.utils import show_records

+   dataset = catalog.get_dataset(name)
+   dataset_version = dataset.get_version(version or dataset.latest_version)
+
    query = (
        DatasetQuery(name=name, version=version, catalog=catalog)
        .select(*columns)
@@ -828,6 +834,10 @@ def show(
    )
    records = query.to_db_records()
    show_records(records, collapse_columns=not no_collapse)
+   if schema and dataset_version.feature_schema:
+       print("\nSchema:")
+       dc = DataChain(name=name, version=version, catalog=catalog)
+       dc.print_schema()


def query(
@@ -1013,6 +1023,7 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
            offset=args.offset,
            columns=args.columns,
            no_collapse=args.no_collapse,
+           schema=args.schema,
        )
    elif args.command == "rm-dataset":
        rm_dataset(catalog, args.name, version=args.version, force=args.force)
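The new `--schema` flag prints the dataset's feature schema after the records. The same information is reachable from Python; a hedged sketch assuming a dataset named "my_dataset" was saved earlier (`from_dataset` and `print_schema` are the calls used elsewhere in this release):

```py
from datachain.lib.dc import DataChain

# Roughly what `datachain show my_dataset --schema` prints at the end.
chain = DataChain.from_dataset("my_dataset")
chain.print_schema()
```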
datachain/data_storage/metastore.py
CHANGED
@@ -421,10 +421,6 @@ class AbstractMetastore(ABC, Serializable):
    ) -> None:
        """Set the status of the given job and dataset."""

-   @abstractmethod
-   def get_possibly_stale_jobs(self) -> list[tuple[str, str, int]]:
-       """Returns the possibly stale jobs."""
-

class AbstractDBMetastore(AbstractMetastore):
    """
datachain/data_storage/schema.py
CHANGED
@@ -19,8 +19,12 @@ from datachain.sql.types import Int, SQLType, UInt64
 if TYPE_CHECKING:
    from sqlalchemy import Engine
    from sqlalchemy.engine.interfaces import Dialect
-   from sqlalchemy.sql.base import
-
+   from sqlalchemy.sql.base import (
+       ColumnCollection,
+       Executable,
+       ReadOnlyColumnCollection,
+   )
+   from sqlalchemy.sql.elements import ColumnElement


def dedup_columns(columns: Iterable[sa.Column]) -> list[sa.Column]:
@@ -43,7 +47,7 @@ def dedup_columns(columns: Iterable[sa.Column]) -> list[sa.Column]:


def convert_rows_custom_column_types(
-   columns: "
+   columns: "ColumnCollection[str, ColumnElement[Any]]",
    rows: Iterator[tuple[Any, ...]],
    dialect: "Dialect",
):
datachain/data_storage/sqlite.py
CHANGED
@@ -496,9 +496,6 @@ class SQLiteMetastore(AbstractDBMetastore):
    def _jobs_insert(self) -> "Insert":
        return sqlite.insert(self._jobs)

-   def get_possibly_stale_jobs(self) -> list[tuple[str, str, int]]:
-       raise NotImplementedError("get_possibly_stale_jobs not implemented for SQLite")
-

class SQLiteWarehouse(AbstractWarehouse):
    """
@@ -594,7 +591,7 @@ class SQLiteWarehouse(AbstractWarehouse):
    ):
        rows = self.db.execute(select_query, **kwargs)
        yield from convert_rows_custom_column_types(
-           select_query.
+           select_query.selected_columns, rows, sqlite_dialect
        )

    def get_dataset_sources(
datachain/data_storage/warehouse.py
CHANGED
@@ -494,7 +494,7 @@ class AbstractWarehouse(ABC, Serializable):
        This gets nodes based on the provided query, and should be used sparingly,
        as it will be slow on any OLAP database systems.
        """
-       columns = [c.name for c in query.
+       columns = [c.name for c in query.selected_columns]
        for row in self.db.execute(query):
            d = dict(zip(columns, row))
            yield Node(**d)
@@ -912,29 +912,6 @@ class AbstractWarehouse(ABC, Serializable):
        for name in names:
            self.db.drop_table(Table(name, self.db.metadata), if_exists=True)

-   def subtract_query(
-       self,
-       source_query: sa.sql.selectable.Select,
-       target_query: sa.sql.selectable.Select,
-   ) -> sa.sql.selectable.Select:
-       sq = source_query.alias("source_query")
-       tq = target_query.alias("target_query")
-
-       source_target_join = sa.join(
-           sq,
-           tq,
-           (sq.c.source == tq.c.source)
-           & (sq.c.parent == tq.c.parent)
-           & (sq.c.name == tq.c.name),
-           isouter=True,
-       )
-
-       return (
-           select(*sq.c)
-           .select_from(source_target_join)
-           .where((tq.c.name == None) | (tq.c.name == ""))  # noqa: E711
-       )
-
    def changed_query(
        self,
        source_query: sa.sql.selectable.Select,
datachain/lib/convert/flatten.py
CHANGED
@@ -48,10 +48,10 @@ def _flatten_fields_values(fields, obj: BaseModel):
        value = getattr(obj, name)

        if isinstance(value, list):
-
-           val.model_dump()
-
-
+           if value and ModelStore.is_pydantic(type(value[0])):
+               yield [val.model_dump() for val in value]
+           else:
+               yield value
        elif isinstance(value, dict):
            yield {
                key: val.model_dump() if ModelStore.is_pydantic(type(val)) else val
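The new list branch dumps Pydantic elements individually instead of yielding the model objects. Roughly equivalent standalone behaviour, sketched with hypothetical models rather than DataChain internals:

```py
from pydantic import BaseModel

class Point(BaseModel):
    x: int
    y: int

class Route(BaseModel):
    name: str
    points: list[Point]

route = Route(name="r1", points=[Point(x=1, y=2), Point(x=3, y=4)])
value = route.points

# Mirror the new handling: dump Pydantic elements, pass other lists through as-is.
flattened = [p.model_dump() for p in value] if value and isinstance(value[0], BaseModel) else value
print(flattened)  # [{'x': 1, 'y': 2}, {'x': 3, 'y': 4}]
```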
datachain/lib/convert/values_to_tuples.py
CHANGED
@@ -71,7 +71,10 @@ def values_to_tuples(  # noqa: C901, PLR0912
                f"signal '{k}' has unsupported type '{typ.__name__}'."
                f" Please use DataModel types: {DataTypeNames}",
            )
-
+       if typ is list:
+           types_map[k] = list[type(v[0][0])]  # type: ignore[misc]
+       else:
+           types_map[k] = typ

        if length < 0:
            length = len_
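With this change a plain `list` signal gets a parametrized type inferred from its first element. A small illustration of that inference with made-up values:

```py
# v holds the rows for one signal; v[0] is the first row, v[0][0] its first element.
v = [["a", "b"], ["c"]]
typ = type(v[0])                 # list
inferred = list[type(v[0][0])]   # list[str], what ends up in types_map for plain lists
print(typ, inferred)
```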
datachain/lib/dc.py
CHANGED
@@ -342,7 +342,7 @@ class DataChain(DatasetQuery):
        spec: Optional[DataType] = None,
        schema_from: Optional[str] = "auto",
        jmespath: Optional[str] = None,
-       object_name: str = "",
+       object_name: Optional[str] = "",
        model_name: Optional[str] = None,
        show_schema: Optional[bool] = False,
        meta_type: Optional[str] = "json",
@@ -364,12 +364,12 @@ class DataChain(DatasetQuery):
            nrows : optional row limit for jsonl and JSON arrays

        Example:
-           infer JSON schema from data, reduce using JMESPATH
+           infer JSON schema from data, reduce using JMESPATH
            ```py
            chain = DataChain.from_json("gs://json", jmespath="key1.key2")
            ```

-           infer JSON schema from a particular path
+           infer JSON schema from a particular path
            ```py
            chain = DataChain.from_json("gs://json_ds", schema_from="gs://json/my.json")
            ```
@@ -384,7 +384,7 @@ class DataChain(DatasetQuery):
        if (not object_name) and jmespath:
            object_name = jmespath_to_name(jmespath)
        if not object_name:
-           object_name =
+           object_name = meta_type
        chain = DataChain.from_storage(path=path, type=type, **kwargs)
        signal_dict = {
            object_name: read_meta(
@@ -397,7 +397,67 @@ class DataChain(DatasetQuery):
                nrows=nrows,
            )
        }
-       return chain.gen(**signal_dict)  # type: ignore[arg-type]
+       return chain.gen(**signal_dict)  # type: ignore[misc, arg-type]
+
+   @classmethod
+   def from_jsonl(
+       cls,
+       path,
+       type: Literal["binary", "text", "image"] = "text",
+       spec: Optional[DataType] = None,
+       schema_from: Optional[str] = "auto",
+       jmespath: Optional[str] = None,
+       object_name: Optional[str] = "",
+       model_name: Optional[str] = None,
+       show_schema: Optional[bool] = False,
+       meta_type: Optional[str] = "jsonl",
+       nrows=None,
+       **kwargs,
+   ) -> "DataChain":
+       """Get data from JSON lines. It returns the chain itself.
+
+       Parameters:
+           path : storage URI with directory. URI must start with storage prefix such
+               as `s3://`, `gs://`, `az://` or "file:///"
+           type : read file as "binary", "text", or "image" data. Default is "binary".
+           spec : optional Data Model
+           schema_from : path to sample to infer spec (if schema not provided)
+           object_name : generated object column name
+           model_name : optional generated model name
+           show_schema : print auto-generated schema
+           jmespath : optional JMESPATH expression to reduce JSON
+           nrows : optional row limit for jsonl and JSON arrays
+
+       Example:
+           infer JSONl schema from data, limit parsing to 1 row
+           ```py
+           chain = DataChain.from_jsonl("gs://myjsonl", nrows=1)
+           ```
+       """
+       if schema_from == "auto":
+           schema_from = path
+
+       def jmespath_to_name(s: str):
+           name_end = re.search(r"\W", s).start() if re.search(r"\W", s) else len(s)  # type: ignore[union-attr]
+           return s[:name_end]
+
+       if (not object_name) and jmespath:
+           object_name = jmespath_to_name(jmespath)
+       if not object_name:
+           object_name = meta_type
+       chain = DataChain.from_storage(path=path, type=type, **kwargs)
+       signal_dict = {
+           object_name: read_meta(
+               schema_from=schema_from,
+               meta_type=meta_type,
+               spec=spec,
+               model_name=model_name,
+               show_schema=show_schema,
+               jmespath=jmespath,
+               nrows=nrows,
+           )
+       }
+       return chain.gen(**signal_dict)  # type: ignore[misc, arg-type]

    @classmethod
    def datasets(
@@ -951,6 +1011,41 @@ class DataChain(DatasetQuery):

        return ds

+   def subtract(  # type: ignore[override]
+       self,
+       other: "DataChain",
+       on: Optional[Union[str, Sequence[str]]] = None,
+   ) -> "Self":
+       """Remove rows that appear in another chain.
+
+       Parameters:
+           other: chain whose rows will be removed from `self`
+           on: columns to consider for determining row equality. If unspecified,
+               defaults to all common columns between `self` and `other`.
+       """
+       if isinstance(on, str):
+           on = [on]
+       if on is None:
+           other_columns = set(other._effective_signals_schema.db_signals())
+           signals = [
+               c
+               for c in self._effective_signals_schema.db_signals()
+               if c in other_columns
+           ]
+           if not signals:
+               raise DataChainParamsError("subtract(): no common columns")
+       elif not isinstance(on, Sequence):
+           raise TypeError(
+               f"'on' must be 'str' or 'Sequence' object but got type '{type(on)}'",
+           )
+       elif not on:
+           raise DataChainParamsError(
+               "'on' cannot be empty",
+           )
+       else:
+           signals = self.signals_schema.resolve(*on).db_signals()
+       return super()._subtract(other, signals)
+
    @classmethod
    def from_values(
        cls,
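A hedged sketch tying the two new `DataChain` methods together; the bucket path follows the docstring example above and the dataset name is a placeholder:

```py
from datachain.lib.dc import DataChain

# Parse a directory of JSON-lines files, inferring the schema from one row.
chain = DataChain.from_jsonl("gs://myjsonl", nrows=1)

# Drop rows already present in a previously saved chain; with on=None the
# comparison defaults to all columns the two chains have in common.
seen = DataChain.from_dataset("already_processed")
fresh = chain.subtract(seen)
```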
datachain/lib/file.py
CHANGED
@@ -12,7 +12,6 @@ from urllib.parse import unquote, urlparse
 from urllib.request import url2pathname

 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
-from fsspec.implementations.local import LocalFileSystem
 from PIL import Image
 from pydantic import Field, field_validator

@@ -20,7 +19,7 @@ from datachain.cache import UniqueId
 from datachain.client.fileslice import FileSlice
 from datachain.lib.data_model import DataModel
 from datachain.lib.utils import DataChainError
-from datachain.sql.types import JSON, Int, String
+from datachain.sql.types import JSON, Boolean, DateTime, Int, String
 from datachain.utils import TIME_ZERO

 if TYPE_CHECKING:
@@ -126,11 +125,13 @@ class File(DataModel):
        "source": String,
        "parent": String,
        "name": String,
+       "size": Int,
        "version": String,
        "etag": String,
-       "
-       "
+       "is_latest": Boolean,
+       "last_modified": DateTime,
        "location": JSON,
+       "vtype": String,
    }

    _unique_id_keys: ClassVar[list[str]] = [
@@ -214,7 +215,7 @@ class File(DataModel):
        with self.open(mode="r") as stream:
            return stream.read()

-   def
+   def save(self, destination: str):
        """Writes it's content to destination"""
        with open(destination, mode="wb") as f:
            f.write(self.read())
@@ -232,7 +233,7 @@ class File(DataModel):
        dst_dir = os.path.dirname(dst)
        os.makedirs(dst_dir, exist_ok=True)

-       self.
+       self.save(dst)

    def _set_stream(
        self,
@@ -281,9 +282,8 @@ class File(DataModel):
    def get_path(self) -> str:
        """Returns file path."""
        path = unquote(self.get_uri())
-
-       if
-       # Drop file:// protocol
+       source = urlparse(self.source)
+       if source.scheme == "file":
            path = urlparse(path).path
            path = url2pathname(path)
        return path
@@ -298,13 +298,10 @@ class File(DataModel):
        elif placement == "etag":
            path = f"{self.etag}{self.get_file_suffix()}"
        elif placement == "fullpath":
-
-
-
-
-           path = (
-               Path(urlparse(self.source).netloc) / unquote(self.get_full_name())
-           ).as_posix()
+           path = unquote(self.get_full_name())
+           source = urlparse(self.source)
+           if source.scheme and source.scheme != "file":
+               path = posixpath.join(source.netloc, path)
        elif placement == "checksum":
            raise NotImplementedError("Checksum placement not implemented yet")
        else:
@@ -330,7 +327,7 @@ class TextFile(File):
        with self.open() as stream:
            return stream.read()

-   def
+   def save(self, destination: str):
        """Writes it's content to destination"""
        with open(destination, mode="w") as f:
            f.write(self.read_text())
@@ -344,7 +341,7 @@ class ImageFile(File):
        fobj = super().read()
        return Image.open(BytesIO(fobj))

-   def
+   def save(self, destination: str):
        """Writes it's content to destination"""
        self.read().save(destination)

@@ -360,21 +357,25 @@ def get_file(type_: Literal["binary", "text", "image"] = "binary"):
        source: str,
        parent: str,
        name: str,
+       size: int,
        version: str,
        etag: str,
-
-
+       is_latest: bool,
+       last_modified: datetime,
        location: Optional[Union[dict, list[dict]]],
+       vtype: str,
    ) -> file:  # type: ignore[valid-type]
        return file(
            source=source,
            parent=parent,
            name=name,
+           size=size,
            version=version,
            etag=etag,
-
-
+           is_latest=is_latest,
+           last_modified=last_modified,
            location=location,
+           vtype=vtype,
        )

    return get_file_type
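The reworked "fullpath" placement prefixes the bucket or host only for remote sources, so exported local files no longer gain a netloc component. The core of that logic as a standalone sketch:

```py
import posixpath
from urllib.parse import unquote, urlparse

def fullpath_dest(source: str, full_name: str) -> str:
    # Remote sources get "<netloc>/<path>"; file:// sources keep the bare path.
    path = unquote(full_name)
    parsed = urlparse(source)
    if parsed.scheme and parsed.scheme != "file":
        path = posixpath.join(parsed.netloc, path)
    return path

print(fullpath_dest("s3://bucket", "dir/cat.jpg"))   # bucket/dir/cat.jpg
print(fullpath_dest("file:///tmp", "dir/cat.jpg"))   # dir/cat.jpg
```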
datachain/lib/meta_formats.py
CHANGED
@@ -11,9 +11,9 @@ from collections.abc import Iterator
 from typing import Any, Callable

 import jmespath as jsp
-from pydantic import ValidationError
+from pydantic import Field, ValidationError  # noqa: F401

-from datachain.lib.data_model import
+from datachain.lib.data_model import DataModel  # noqa: F401
 from datachain.lib.file import File


@@ -87,7 +87,8 @@ def read_schema(source_file, data_type="csv", expr=None, model_name=None):
    except subprocess.CalledProcessError as e:
        model_output = f"An error occurred in datamodel-codegen: {e.stderr}"
    print(f"{model_output}")
-   print("\n" +
+   print("\n" + "from datachain.lib.data_model import DataModel" + "\n")
+   print("\n" + f"DataModel.register({model_name})" + "\n")
    print("\n" + f"spec={model_name}" + "\n")
    return model_output

@@ -147,7 +148,7 @@ def read_meta(  # noqa: C901

    def parse_data(
        file: File,
-
+       data_model=spec,
        meta_type=meta_type,
        jmespath=jmespath,
        nrows=nrows,
@@ -155,7 +156,7 @@ def read_meta(  # noqa: C901
    def validator(json_object: dict) -> spec:
        json_string = json.dumps(json_object)
        try:
-           data_instance =
+           data_instance = data_model.model_validate_json(json_string)
            yield data_instance
        except ValidationError as e:
            print(f"Validation error occurred in file {file.name}:", e)
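Validation now goes through Pydantic v2's `model_validate_json` on the chosen data model. A minimal illustration with a hypothetical model:

```py
from pydantic import BaseModel, ValidationError

class Dialog(BaseModel):
    user: str
    success: bool

try:
    obj = Dialog.model_validate_json('{"user": "alice", "success": true}')
    print(obj)  # user='alice' success=True
except ValidationError as e:
    print("Validation error:", e)
```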
datachain/query/dataset.py
CHANGED
@@ -25,6 +25,7 @@ from typing import (

 import attrs
 import sqlalchemy
+import sqlalchemy as sa
 from attrs import frozen
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback, TqdmCallback
 from sqlalchemy import Column
@@ -250,7 +251,7 @@ class DatasetDiffOperation(Step):
        self,
        source_query: Select,
        target_query: Select,
-   ) ->
+   ) -> sa.Selectable:
        """
        Should return select query that calculates desired diff between dataset queries
        """
@@ -268,7 +269,7 @@ class DatasetDiffOperation(Step):

        columns = [
            c if isinstance(c, Column) else Column(c.name, c.type)
-           for c in source_query.
+           for c in source_query.selected_columns
        ]
        temp_table = self.catalog.warehouse.create_dataset_rows_table(
            temp_table_name,
@@ -292,23 +293,16 @@ class DatasetDiffOperation(Step):

@frozen
class Subtract(DatasetDiffOperation):
-
-   Calculates rows that are in a source query but are not in target query (diff)
-   This can be used to do delta updates (calculate UDF only on newly added rows)
-   Example:
-       >>> ds = DatasetQuery(name="dogs_cats")  # some older dataset with embeddings
-       >>> ds_updated = (
-           DatasetQuery("gs://dvcx-datalakes/dogs-and-cats")
-           .filter(C.size > 1000)  # we can also filter out source query
-           .subtract(ds)
-           .add_signals(calc_embeddings)  # calculae embeddings only on new rows
-           .union(ds)  # union with old dataset that's missing new rows
-           .save("dogs_cats_updated")
-       )
-   """
+   on: Sequence[str]

-   def query(self, source_query: Select, target_query: Select) ->
-
+   def query(self, source_query: Select, target_query: Select) -> sa.Selectable:
+       sq = source_query.alias("source_query")
+       tq = target_query.alias("target_query")
+       where_clause = sa.and_(
+           getattr(sq.c, col_name).is_not_distinct_from(getattr(tq.c, col_name))
+           for col_name in self.on
+       )  # type: ignore[arg-type]
+       return sq.select().except_(sq.select().where(where_clause))


@frozen
@@ -820,8 +814,16 @@ class SQLMutate(SQLClause):
    args: tuple[ColumnElement, ...]

    def apply_sql_clause(self, query: Select) -> Select:
-
-
+       original_subquery = query.subquery()
+       # this is needed for new column to be used in clauses
+       # like ORDER BY, otherwise new column is not recognized
+       subquery = (
+           sqlalchemy.select(*original_subquery.c, *self.args)
+           .select_from(original_subquery)
+           .subquery()
+       )
+
+       return sqlalchemy.select(*subquery.c).select_from(subquery)


@frozen
@@ -1252,7 +1254,7 @@ class DatasetQuery:
    def as_iterable(self, **kwargs) -> Iterator[ResultIter]:
        try:
            query = self.apply_steps().select()
-           selected_columns = [c.name for c in query.
+           selected_columns = [c.name for c in query.selected_columns]
            yield ResultIter(
                self.catalog.warehouse.dataset_rows_select(query, **kwargs),
                selected_columns,
@@ -1556,8 +1558,12 @@ class DatasetQuery:

    @detach
    def subtract(self, dq: "DatasetQuery") -> "Self":
+       return self._subtract(dq, on=["source", "parent", "name"])
+
+   @detach
+   def _subtract(self, dq: "DatasetQuery", on: Sequence[str]) -> "Self":
        query = self.clone()
-       query.steps.append(Subtract(dq, self.catalog))
+       query.steps.append(Subtract(dq, self.catalog, on=on))
        return query

    @detach
@@ -1676,7 +1682,7 @@ class DatasetQuery:
            f.row_number().over(order_by=q._order_by_clauses).label("sys__id")
        )

-       cols = tuple(c.name for c in q.
+       cols = tuple(c.name for c in q.selected_columns)
        insert_q = sqlalchemy.insert(dr.get_table()).from_select(cols, q)
        self.catalog.warehouse.db.execute(insert_q, **kwargs)
        self.catalog.metastore.update_dataset_status(
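The rewritten `Subtract` step computes the difference with `EXCEPT` and NULL-safe column comparisons instead of the old outer join in the warehouse. An illustrative, self-contained SQLAlchemy sketch of the same pattern on an in-memory SQLite database (not the library's internal API):

```py
import sqlalchemy as sa

engine = sa.create_engine("sqlite://")
meta = sa.MetaData()
src = sa.Table("src", meta, sa.Column("name", sa.String), sa.Column("parent", sa.String))
tgt = sa.Table("tgt", meta, sa.Column("name", sa.String), sa.Column("parent", sa.String))
meta.create_all(engine)

with engine.begin() as conn:
    conn.execute(sa.insert(src), [{"name": "a", "parent": "p"}, {"name": "b", "parent": None}])
    conn.execute(sa.insert(tgt), [{"name": "a", "parent": "p"}])

sq = sa.select(src).alias("source_query")
tq = sa.select(tgt).alias("target_query")
matched = sa.and_(
    sq.c.name.is_not_distinct_from(tq.c.name),      # NULL-safe equality
    sq.c.parent.is_not_distinct_from(tq.c.parent),
)
# Source rows minus the source rows that have a match in the target.
query = sq.select().except_(sq.select().where(matched))
with engine.connect() as conn:
    print(conn.execute(query).all())  # [('b', None)]
```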
datachain/sql/sqlite/base.py
CHANGED
@@ -5,8 +5,8 @@ from datetime import MAXYEAR, MINYEAR, datetime, timezone
 from types import MappingProxyType
 from typing import Callable, Optional

+import orjson
 import sqlalchemy as sa
-import ujson
 from sqlalchemy.dialects import sqlite
 from sqlalchemy.ext.compiler import compiles
 from sqlalchemy.sql.elements import literal
@@ -149,7 +149,7 @@ def missing_vector_function(name, exc):


def sqlite_string_split(string: str, sep: str, maxsplit: int = -1) -> str:
-   return
+   return orjson.dumps(string.split(sep, maxsplit)).decode("utf-8")


def register_user_defined_sql_functions() -> None:
@@ -274,7 +274,7 @@ def compile_euclidean_distance(element, compiler, **kwargs):


def py_json_array_length(arr):
-   return len(
+   return len(orjson.loads(arr))


def compile_array_length(element, compiler, **kwargs):
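One practical difference from `ujson`: `orjson.dumps` returns `bytes`, hence the explicit `.decode("utf-8")` before handing values to SQLite. For example:

```py
import orjson

encoded = orjson.dumps(["a", "b", "c"])   # b'["a","b","c"]'
as_text = encoded.decode("utf-8")         # '["a","b","c"]' - the form stored in SQLite
assert orjson.loads(as_text) == ["a", "b", "c"]
```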
datachain/sql/sqlite/types.py
CHANGED
@@ -1,7 +1,6 @@
-import json
 import sqlite3

-import
+import orjson
 from sqlalchemy import types

 from datachain.sql.types import TypeConverter, TypeReadConverter
@@ -29,22 +28,15 @@ class Array(types.UserDefinedType):


def adapt_array(arr):
-   return
+   return orjson.dumps(arr).decode("utf-8")


def convert_array(arr):
-   return
+   return orjson.loads(arr)


def adapt_np_array(arr):
-
-       if isinstance(obj, np.ndarray):
-           return obj.tolist()
-       return obj
-
-   if np.issubdtype(arr.dtype, np.object_):
-       return json.dumps(arr.tolist(), default=_json_serialize)
-   return ujson.dumps(arr.tolist())
+   return orjson.dumps(arr, option=orjson.OPT_SERIALIZE_NUMPY).decode("utf-8")


def adapt_np_generic(val):
@@ -70,5 +62,5 @@ class SQLiteTypeConverter(TypeConverter):
class SQLiteTypeReadConverter(TypeReadConverter):
    def array(self, value, item_type, dialect):
        if isinstance(value, str):
-           value =
+           value = orjson.loads(value)
        return super().array(value, item_type, dialect)
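`orjson.OPT_SERIALIZE_NUMPY` replaces the old manual ndarray-to-list conversion, handling numeric dtypes natively. A quick check of the behaviour:

```py
import numpy as np
import orjson

arr = np.array([[1, 2], [3, 4]])
encoded = orjson.dumps(arr, option=orjson.OPT_SERIALIZE_NUMPY).decode("utf-8")
print(encoded)                # [[1,2],[3,4]]
print(orjson.loads(encoded))  # nested Python lists
```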
{datachain-0.2.12.dist-info → datachain-0.2.14.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.2.12
+Version: 0.2.14
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -35,7 +35,7 @@ Requires-Dist: sqlalchemy >=2
 Requires-Dist: multiprocess ==0.70.16
 Requires-Dist: dill ==0.3.8
 Requires-Dist: cloudpickle
-Requires-Dist:
+Requires-Dist: orjson >=3.10.5
 Requires-Dist: pydantic <3,>=2
 Requires-Dist: jmespath >=1.0
 Requires-Dist: datamodel-code-generator >=0.25
@@ -45,9 +45,9 @@ Provides-Extra: dev
 Requires-Dist: datachain[docs,tests] ; extra == 'dev'
 Requires-Dist: mypy ==1.10.1 ; extra == 'dev'
 Requires-Dist: types-python-dateutil ; extra == 'dev'
+Requires-Dist: types-pytz ; extra == 'dev'
 Requires-Dist: types-PyYAML ; extra == 'dev'
 Requires-Dist: types-requests ; extra == 'dev'
-Requires-Dist: types-ujson ; extra == 'dev'
 Provides-Extra: docs
 Requires-Dist: mkdocs >=1.5.2 ; extra == 'docs'
 Requires-Dist: mkdocs-gen-files >=0.5.0 ; extra == 'docs'
@@ -103,20 +103,18 @@ AI 🔗 DataChain
 DataChain is an open-source Python library for processing and curating unstructured
 data at scale.

-🤖 AI-Driven Data Curation: Use local ML models
+🤖 AI-Driven Data Curation: Use local ML models or LLM APIs calls to enrich your data.

-🚀 GenAI Dataset scale: Handle
+🚀 GenAI Dataset scale: Handle tens of millions of multimodal files.

-🐍 Python-friendly: Use strictly
+🐍 Python-friendly: Use strictly-typed `Pydantic`_ objects instead of JSON.


-
-downloads, and out-of-memory computing. It excels at optimizing batch operations.
-While most GenAI tools focus on online applications and realtime, DataChain is designed
-for offline data processing, data curation and ETL.
+Datachain supports parallel processing, parallel data
+downloads, and out-of-memory computing. It excels at optimizing offline batch operations.

-The typical use cases
-and validation.
+The typical use cases include Computer Vision data curation, LLM analytics,
+and validation of multimodal AI applications.


 .. code:: console
@@ -128,25 +126,25 @@ and validation.
 Quick Start
 -----------

-
-
+Data curation with a local model
+=================================

 We will evaluate chatbot dialogs stored as text files in Google Cloud Storage
-- 50 files total in
-These dialogs involve users looking for better wireless plans
-Our goal is to identify successful dialogs.
+- 50 files total in this example.
+These dialogs involve users chatting with a bot while looking for better wireless plans.
+Our goal is to identify the successful dialogs.

-The data used in the examples is publicly available.
+The data used in the examples is `publicly available`_. The sample code is designed to run on a local machine.

-First, we'll
+First, we'll show batch inference with a simple sentiment model using the `transformers` library:

 .. code:: shell

     pip install transformers

-The code below downloads files the cloud, applies function
-
-are copied to local directory
+The code below downloads files the cloud, and applies a user-defined function
+to each one of them. All files with a positive sentiment
+detected are then copied to the local directory.

 .. code:: py

@@ -169,7 +167,7 @@ are copied to local directory `output/`.
     )

     positive_chain = chain.filter(Column("is_positive") == True)
-    positive_chain.export_files("./
+    positive_chain.export_files("./output")

     print(f"{positive_chain.count()} files were exported")

@@ -185,11 +183,11 @@ are copied to local directory `output/`.
     13


-LLM judging
-
+LLM judging chatbots
+=============================

-
-we
+LLMs can work as efficient universal classifiers. In the example below,
+we employ a free API from Mistral to judge the chatbot performance. Please get a free
 Mistral API key at https://console.mistral.ai

 .. code:: shell
@@ -197,9 +195,7 @@ Mistral API key at https://console.mistral.ai
     $ pip install mistralai
     $ export MISTRAL_API_KEY=_your_key_

-
-Note, only 4 threads were used in this example `parallel=4` due to a limitation of
-the free LLM service.
+DataChain can parallelize API calls; the free Mistral tier supports up to 4 requests at the same time.

 .. code:: py

@@ -231,7 +227,7 @@ the free LLM service.
     print(f"{successful_chain.count()} files were exported")


-With the
+With the instruction above, the Mistral model considers 31/50 files to hold the successful dialogues:

 .. code:: shell

@@ -245,11 +241,11 @@ With the current prompt, we found 31 files considered successful dialogs:
 Serializing Python-objects
 ==========================

-LLM responses contain valuable information for analytics
-model
+LLM responses may contain valuable information for analytics – such as the number of tokens used, or the
+model performance parameters.

-Instead of extracting this information from the Mistral data structure (class
-`ChatCompletionResponse`),
+Instead of extracting this information from the Mistral response data structure (class
+`ChatCompletionResponse`), DataChain can serialize the entire LLM response to the internal DB:


 .. code:: py

@@ -297,21 +293,23 @@ Output:
     64.0% dialogs were successful


-
+Iterating over Python data structures
 =============================================

-In the previous examples,
-(`SQLite`_ in
-These datasets
+In the previous examples, datasets were saved in the embedded database
+(`SQLite`_ in folder `.datachain` of the working directory).
+These datasets were automatically versioned, and can be accessed using
 `DataChain.from_dataset("dataset_name")`.

+Here is how to retrieve a saved dataset and iterate over the objects:
+
 .. code:: py

     chain = DataChain.from_dataset("response")

-    # Iterating one-by-one: out
+    # Iterating one-by-one: support out-of-memory workflow
     for file, response in chain.limit(5).collect("file", "response"):
-        #
+        # verify the collected Python objects
         assert isinstance(response, ChatCompletionResponse)

         status = response.choices[0].message.content[:7]
@@ -332,9 +330,8 @@ Output:
 Vectorized analytics over Python objects
 ========================================

-Some operations can
-
-Mistral calls cost $2 per 1M input tokens and $6 per 1M output tokens:
+Some operations can run inside the DB without deserialization.
+For instance, let's calculate the total cost of using the LLM APIs, assuming the Mixtral call costs $2 per 1M input tokens and $6 per 1M output tokens:

 .. code:: py

@@ -406,6 +403,7 @@ Community and Support
 .. github-only
 .. _Contributor Guide: CONTRIBUTING.rst
 .. _Pydantic: https://github.com/pydantic/pydantic
+.. _publicly available: https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot
 .. _SQLite: https://www.sqlite.org/
 .. _Getting Started: https://datachain.dvc.ai/
 .. |Flowchart| image:: https://github.com/iterative/datachain/blob/main/docs/assets/flowchart.png?raw=true
{datachain-0.2.12.dist-info → datachain-0.2.14.dist-info}/RECORD
CHANGED
@@ -2,7 +2,7 @@ datachain/__init__.py,sha256=GeyhE-5LgfJav2OKYGaieP2lBvf2Gm-ihj7thnK9zjI,800
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=CKCFQJ0CbB3r04S7mUTXxriKzPnOvdUaVPXjM8vCtJw,7644
 datachain/cache.py,sha256=N6PCEFJlWRpq7f_zeBNoaURFCJFAV7ibsLJqyiMHbBg,4207
-datachain/cli.py,sha256=
+datachain/cli.py,sha256=Twb6BXjNxfAAGj42dUOJ7Ah5etkrTDVfMzAmINWUSOI,33104
 datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
 datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
 datachain/dataset.py,sha256=MZezyuJWNj_3PEtzr0epPMNyWAOTrhTSPI5FmemV6L4,14470
@@ -17,7 +17,7 @@ datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
 datachain/utils.py,sha256=kgH5NPj47eC_KrFTd6ZS206lKVhnJVFt5XsqkK6ppTc,12483
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=
+datachain/catalog/catalog.py,sha256=ab-PLPa9CMeHCo9asHjkqw4mZ6tHM4x8bsswfMtr65w,80575
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=GJ8zhEYkC7TuaPzCsjJQ4LtTdECu-wwYzC12MikPOMQ,7307
 datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
@@ -32,20 +32,20 @@ datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZ
 datachain/data_storage/db_engine.py,sha256=rgBuqJ-M1j5QyqiUQuJRewctuvRRj8LBDL54-aPEFxE,3287
 datachain/data_storage/id_generator.py,sha256=VlDALKijggegAnNMJwuMETJgnLoPYxpkrkld5DNTPQw,3839
 datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
-datachain/data_storage/metastore.py,sha256=
-datachain/data_storage/schema.py,sha256=
+datachain/data_storage/metastore.py,sha256=wVcT8MiSH_paWEXN6eZ8Z3msrHY6vWtVFTH5kwHteRE,54852
+datachain/data_storage/schema.py,sha256=FQvt5MUMSnI5ZAE7Nthae4aaJpt8JC4nH8KiWDuhJkk,8135
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
-datachain/data_storage/sqlite.py,sha256=
-datachain/data_storage/warehouse.py,sha256=
+datachain/data_storage/sqlite.py,sha256=i4h8ZY15A2YNXd2PU5BZPoRaBqqs9lOdPtBjC0BZy3s,24935
+datachain/data_storage/warehouse.py,sha256=fQO6UZc2MFgFPRnpCQW7c1GCl3FJBYE4dtA_ZXWuA8M,32627
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/arrow.py,sha256=WBZ4iVU0CcmCgog1wS-Nrtqhzvf2I4_QqDJtzhaECeA,3641
 datachain/lib/clip.py,sha256=16u4b_y2Y15nUS2UN_8ximMo6r_-_4IQpmct2ol-e-g,5730
 datachain/lib/data_model.py,sha256=jPYDmTYbixy4LhdToOyvldYGYZxblhp6Tn4MF-VAd-o,1495
 datachain/lib/dataset_info.py,sha256=lONGr71ozo1DS4CQEhnpKORaU4qFb6Ketv8Xm8CVm2U,2188
-datachain/lib/dc.py,sha256=
-datachain/lib/file.py,sha256=
+datachain/lib/dc.py,sha256=I3BLJJK17kB8velBSCTjtoR8CcPZOHPgFTibS9OclmY,54155
+datachain/lib/file.py,sha256=n9GBmZ1CjzDjHkbUBsUrs8JOJrAoh3MV2Cc8hBkex20,11957
 datachain/lib/image.py,sha256=TgYhRhzd4nkytfFMeykQkPyzqb5Le_-tU81unVMPn4Q,2328
-datachain/lib/meta_formats.py,sha256=
+datachain/lib/meta_formats.py,sha256=WRjUzaBKo0IJFHhKz7dxzAKXjR4OvuzsLjkdjyewL6Q,7001
 datachain/lib/model_store.py,sha256=c4USXsBBjrGH8VOh4seIgOiav-qHOwdoixtxfLgU63c,2409
 datachain/lib/pytorch.py,sha256=9PsypKseyKfIimTmTQOgb-pbNXgeeAHLdlWx0qRPULY,5660
 datachain/lib/settings.py,sha256=6Nkoh8riETrftYwDp3aniK53Dsjc07MdztL8N0cW1D8,2849
@@ -58,15 +58,15 @@ datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/webdataset.py,sha256=nIa6ubv94CwnATeeSdE7f_F9Zkz9LuBTfbXvFg3_-Ak,8295
 datachain/lib/webdataset_laion.py,sha256=PQP6tQmUP7Xu9fPuAGK1JDBYA6T5UufYMUTGaxgspJA,2118
 datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/convert/flatten.py,sha256=
+datachain/lib/convert/flatten.py,sha256=vrj2Kg-I1YAq2OGAFIwFUqtIesGpweve3c1ipeFOvDQ,1615
 datachain/lib/convert/python_to_sql.py,sha256=54G6dsMhxo1GKCzPziOqCKo2d4VRWmsJhJYRJxt1Thw,2615
 datachain/lib/convert/sql_to_python.py,sha256=HK414fexSQ4Ur-OY7_pKvDKEGdtos1CeeAFa4RxH4nU,532
 datachain/lib/convert/unflatten.py,sha256=Ogvh_5wg2f38_At_1lN0D_e2uZOOpYEvwvB2xdq56Tw,2012
-datachain/lib/convert/values_to_tuples.py,sha256=
+datachain/lib/convert/values_to_tuples.py,sha256=aVoHWMOUGLAiS6_BBwKJqVIne91VffOW6-dWyNE7oHg,3715
 datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
 datachain/query/batch.py,sha256=j-_ZcuQra2Ro3Wj4crtqQCg-7xuv-p84hr4QHdvT7as,3479
 datachain/query/builtins.py,sha256=ZKNs49t8Oa_OaboCBIEqtXZt7c1Qe9OR_C_HpoDriIU,2781
-datachain/query/dataset.py,sha256=
+datachain/query/dataset.py,sha256=VhsbHTOps-E4_trLzkJWGQV3zblN6LdlyHED9-3H5Vo,61388
 datachain/query/dispatch.py,sha256=oGX9ZuoKWPB_EyqAZD_eULcO3OejY44_keSmFS6SHT0,13315
 datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -88,13 +88,13 @@ datachain/sql/functions/path.py,sha256=zixpERotTFP6LZ7I4TiGtyRA8kXOoZmH1yzH9oRW0
 datachain/sql/functions/random.py,sha256=vBwEEj98VH4LjWixUCygQ5Bz1mv1nohsCG0-ZTELlVg,271
 datachain/sql/functions/string.py,sha256=hIrF1fTvlPamDtm8UMnWDcnGfbbjCsHxZXS30U2Rzxo,651
 datachain/sql/sqlite/__init__.py,sha256=TAdJX0Bg28XdqPO-QwUVKy8rg78cgMileHvMNot7d04,166
-datachain/sql/sqlite/base.py,sha256=
-datachain/sql/sqlite/types.py,sha256=
+datachain/sql/sqlite/base.py,sha256=Jb1csbIARjEvwbylnvgNA7ChozSyoL3CQzOGBUf8QAw,12067
+datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.2.
-datachain-0.2.
-datachain-0.2.
-datachain-0.2.
-datachain-0.2.
-datachain-0.2.
+datachain-0.2.14.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.2.14.dist-info/METADATA,sha256=UiBiVmF8nF2aIimMNPn3XB14OhIbRj0w4w5q72qTaRM,14577
+datachain-0.2.14.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
+datachain-0.2.14.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.2.14.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.2.14.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|