datachain 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/catalog/catalog.py +10 -1
- datachain/data_storage/schema.py +22 -8
- datachain/data_storage/sqlite.py +5 -0
- datachain/lib/dc.py +27 -13
- datachain/lib/meta_formats.py +8 -2
- datachain/node.py +1 -1
- datachain/query/schema.py +4 -0
- datachain/sql/default/base.py +3 -0
- datachain/sql/sqlite/base.py +3 -0
- datachain/sql/types.py +120 -11
- {datachain-0.3.1.dist-info → datachain-0.3.2.dist-info}/METADATA +74 -86
- {datachain-0.3.1.dist-info → datachain-0.3.2.dist-info}/RECORD +16 -16
- {datachain-0.3.1.dist-info → datachain-0.3.2.dist-info}/WHEEL +1 -1
- {datachain-0.3.1.dist-info → datachain-0.3.2.dist-info}/LICENSE +0 -0
- {datachain-0.3.1.dist-info → datachain-0.3.2.dist-info}/entry_points.txt +0 -0
- {datachain-0.3.1.dist-info → datachain-0.3.2.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED
|
@@ -1627,8 +1627,17 @@ class Catalog:
|
|
|
1627
1627
|
version = self.get_dataset(dataset_name).get_version(dataset_version)
|
|
1628
1628
|
|
|
1629
1629
|
file_signals_values = {}
|
|
1630
|
+
file_schemas = {}
|
|
1631
|
+
# TODO: To remove after we properly fix deserialization
|
|
1632
|
+
for signal, type_name in version.feature_schema.items():
|
|
1633
|
+
from datachain.lib.model_store import ModelStore
|
|
1630
1634
|
|
|
1631
|
-
|
|
1635
|
+
type_name_parsed, v = ModelStore.parse_name_version(type_name)
|
|
1636
|
+
fr = ModelStore.get(type_name_parsed, v)
|
|
1637
|
+
if fr and issubclass(fr, File):
|
|
1638
|
+
file_schemas[signal] = type_name
|
|
1639
|
+
|
|
1640
|
+
schema = SignalSchema.deserialize(file_schemas)
|
|
1632
1641
|
for file_signals in schema.get_signals(File):
|
|
1633
1642
|
prefix = file_signals.replace(".", DEFAULT_DELIMITER) + DEFAULT_DELIMITER
|
|
1634
1643
|
file_signals_values[file_signals] = {
|
datachain/data_storage/schema.py
CHANGED
|
@@ -67,7 +67,11 @@ def convert_rows_custom_column_types(
|
|
|
67
67
|
for row in rows:
|
|
68
68
|
row_list = list(row)
|
|
69
69
|
for idx, t in custom_columns_types:
|
|
70
|
-
row_list[idx] =
|
|
70
|
+
row_list[idx] = (
|
|
71
|
+
t.default_value(dialect)
|
|
72
|
+
if row_list[idx] is None
|
|
73
|
+
else t.on_read_convert(row_list[idx], dialect)
|
|
74
|
+
)
|
|
71
75
|
|
|
72
76
|
yield tuple(row_list)
|
|
73
77
|
|
|
@@ -136,7 +140,15 @@ class DataTable:
|
|
|
136
140
|
self.column_types: dict[str, SQLType] = column_types or {}
|
|
137
141
|
|
|
138
142
|
@staticmethod
|
|
139
|
-
def copy_column(
|
|
143
|
+
def copy_column(
|
|
144
|
+
column: sa.Column,
|
|
145
|
+
primary_key: Optional[bool] = None,
|
|
146
|
+
index: Optional[bool] = None,
|
|
147
|
+
nullable: Optional[bool] = None,
|
|
148
|
+
default: Optional[Any] = None,
|
|
149
|
+
server_default: Optional[Any] = None,
|
|
150
|
+
unique: Optional[bool] = None,
|
|
151
|
+
) -> sa.Column:
|
|
140
152
|
"""
|
|
141
153
|
Copy a sqlalchemy Column object intended for use as a signal column.
|
|
142
154
|
|
|
@@ -150,12 +162,14 @@ class DataTable:
|
|
|
150
162
|
return sa.Column(
|
|
151
163
|
column.name,
|
|
152
164
|
column.type,
|
|
153
|
-
primary_key=column.primary_key,
|
|
154
|
-
index=column.index,
|
|
155
|
-
nullable=column.nullable,
|
|
156
|
-
default=column.default,
|
|
157
|
-
server_default=
|
|
158
|
-
|
|
165
|
+
primary_key=primary_key if primary_key is not None else column.primary_key,
|
|
166
|
+
index=index if index is not None else column.index,
|
|
167
|
+
nullable=nullable if nullable is not None else column.nullable,
|
|
168
|
+
default=default if default is not None else column.default,
|
|
169
|
+
server_default=(
|
|
170
|
+
server_default if server_default is not None else column.server_default
|
|
171
|
+
),
|
|
172
|
+
unique=unique if unique is not None else column.unique,
|
|
159
173
|
)
|
|
160
174
|
|
|
161
175
|
@classmethod
|
datachain/data_storage/sqlite.py
CHANGED
|
@@ -122,6 +122,11 @@ class SQLiteDatabaseEngine(DatabaseEngine):
|
|
|
122
122
|
engine = sqlalchemy.create_engine(
|
|
123
123
|
"sqlite+pysqlite:///", creator=lambda: db, future=True
|
|
124
124
|
)
|
|
125
|
+
# ensure we run SA on_connect init (e.g. it registers regexp function),
|
|
126
|
+
# also makes sure that it's consistent. Otherwise in some cases it
|
|
127
|
+
# seems we are getting different results if engine object is used in a
|
|
128
|
+
# different thread first and engine is not used in the Main thread.
|
|
129
|
+
engine.connect().close()
|
|
125
130
|
|
|
126
131
|
db.isolation_level = None # Use autocommit mode
|
|
127
132
|
db.execute("PRAGMA foreign_keys = ON")
|
datachain/lib/dc.py
CHANGED
|
@@ -508,7 +508,7 @@ class DataChain(DatasetQuery):
|
|
|
508
508
|
|
|
509
509
|
def print_json_schema( # type: ignore[override]
|
|
510
510
|
self, jmespath: Optional[str] = None, model_name: Optional[str] = None
|
|
511
|
-
) -> "
|
|
511
|
+
) -> "Self":
|
|
512
512
|
"""Print JSON data model and save it. It returns the chain itself.
|
|
513
513
|
|
|
514
514
|
Parameters:
|
|
@@ -533,7 +533,7 @@ class DataChain(DatasetQuery):
|
|
|
533
533
|
|
|
534
534
|
def print_jsonl_schema( # type: ignore[override]
|
|
535
535
|
self, jmespath: Optional[str] = None, model_name: Optional[str] = None
|
|
536
|
-
) -> "
|
|
536
|
+
) -> "Self":
|
|
537
537
|
"""Print JSON data model and save it. It returns the chain itself.
|
|
538
538
|
|
|
539
539
|
Parameters:
|
|
@@ -549,7 +549,7 @@ class DataChain(DatasetQuery):
|
|
|
549
549
|
|
|
550
550
|
def save( # type: ignore[override]
|
|
551
551
|
self, name: Optional[str] = None, version: Optional[int] = None
|
|
552
|
-
) -> "
|
|
552
|
+
) -> "Self":
|
|
553
553
|
"""Save to a Dataset. It returns the chain itself.
|
|
554
554
|
|
|
555
555
|
Parameters:
|
|
@@ -785,7 +785,7 @@ class DataChain(DatasetQuery):
|
|
|
785
785
|
descending (bool): Whether to sort in descending order or not.
|
|
786
786
|
"""
|
|
787
787
|
if descending:
|
|
788
|
-
args = tuple(
|
|
788
|
+
args = tuple(sqlalchemy.desc(a) for a in args)
|
|
789
789
|
|
|
790
790
|
return super().order_by(*args)
|
|
791
791
|
|
|
@@ -1206,14 +1206,14 @@ class DataChain(DatasetQuery):
|
|
|
1206
1206
|
"""
|
|
1207
1207
|
headers, max_length = self._effective_signals_schema.get_headers_with_length()
|
|
1208
1208
|
if flatten or max_length < 2:
|
|
1209
|
-
|
|
1209
|
+
columns = []
|
|
1210
1210
|
if headers:
|
|
1211
|
-
|
|
1212
|
-
return
|
|
1211
|
+
columns = [".".join(filter(None, header)) for header in headers]
|
|
1212
|
+
return pd.DataFrame.from_records(self.to_records(), columns=columns)
|
|
1213
1213
|
|
|
1214
|
-
|
|
1215
|
-
|
|
1216
|
-
|
|
1214
|
+
return pd.DataFrame(
|
|
1215
|
+
self.results(), columns=pd.MultiIndex.from_tuples(map(tuple, headers))
|
|
1216
|
+
)
|
|
1217
1217
|
|
|
1218
1218
|
def show(
|
|
1219
1219
|
self,
|
|
@@ -1232,6 +1232,12 @@ class DataChain(DatasetQuery):
|
|
|
1232
1232
|
"""
|
|
1233
1233
|
dc = self.limit(limit) if limit > 0 else self
|
|
1234
1234
|
df = dc.to_pandas(flatten)
|
|
1235
|
+
|
|
1236
|
+
if df.empty:
|
|
1237
|
+
print("Empty result")
|
|
1238
|
+
print(f"Columns: {list(df.columns)}")
|
|
1239
|
+
return
|
|
1240
|
+
|
|
1235
1241
|
if transpose:
|
|
1236
1242
|
df = df.T
|
|
1237
1243
|
|
|
@@ -1270,7 +1276,7 @@ class DataChain(DatasetQuery):
|
|
|
1270
1276
|
source: bool = True,
|
|
1271
1277
|
nrows: Optional[int] = None,
|
|
1272
1278
|
**kwargs,
|
|
1273
|
-
) -> "
|
|
1279
|
+
) -> "Self":
|
|
1274
1280
|
"""Generate chain from list of tabular files.
|
|
1275
1281
|
|
|
1276
1282
|
Parameters:
|
|
@@ -1390,7 +1396,8 @@ class DataChain(DatasetQuery):
|
|
|
1390
1396
|
dc = DataChain.from_csv("s3://mybucket/dir")
|
|
1391
1397
|
```
|
|
1392
1398
|
"""
|
|
1393
|
-
from
|
|
1399
|
+
from pandas.io.parsers.readers import STR_NA_VALUES
|
|
1400
|
+
from pyarrow.csv import ConvertOptions, ParseOptions, ReadOptions
|
|
1394
1401
|
from pyarrow.dataset import CsvFileFormat
|
|
1395
1402
|
|
|
1396
1403
|
chain = DataChain.from_storage(path, **kwargs)
|
|
@@ -1414,7 +1421,14 @@ class DataChain(DatasetQuery):
|
|
|
1414
1421
|
|
|
1415
1422
|
parse_options = ParseOptions(delimiter=delimiter)
|
|
1416
1423
|
read_options = ReadOptions(column_names=column_names)
|
|
1417
|
-
|
|
1424
|
+
convert_options = ConvertOptions(
|
|
1425
|
+
strings_can_be_null=True, null_values=STR_NA_VALUES
|
|
1426
|
+
)
|
|
1427
|
+
format = CsvFileFormat(
|
|
1428
|
+
parse_options=parse_options,
|
|
1429
|
+
read_options=read_options,
|
|
1430
|
+
convert_options=convert_options,
|
|
1431
|
+
)
|
|
1418
1432
|
return chain.parse_tabular(
|
|
1419
1433
|
output=output,
|
|
1420
1434
|
object_name=object_name,
|
datachain/lib/meta_formats.py
CHANGED
|
@@ -11,12 +11,16 @@ from collections.abc import Iterator
|
|
|
11
11
|
from typing import Any, Callable
|
|
12
12
|
|
|
13
13
|
import jmespath as jsp
|
|
14
|
-
from pydantic import Field, ValidationError # noqa: F401
|
|
14
|
+
from pydantic import BaseModel, ConfigDict, Field, ValidationError # noqa: F401
|
|
15
15
|
|
|
16
16
|
from datachain.lib.data_model import DataModel # noqa: F401
|
|
17
17
|
from datachain.lib.file import File
|
|
18
18
|
|
|
19
19
|
|
|
20
|
+
class UserModel(BaseModel):
|
|
21
|
+
model_config = ConfigDict(populate_by_name=True)
|
|
22
|
+
|
|
23
|
+
|
|
20
24
|
def generate_uuid():
|
|
21
25
|
return uuid.uuid4() # Generates a random UUID.
|
|
22
26
|
|
|
@@ -72,6 +76,8 @@ def read_schema(source_file, data_type="csv", expr=None, model_name=None):
|
|
|
72
76
|
data_type,
|
|
73
77
|
"--class-name",
|
|
74
78
|
model_name,
|
|
79
|
+
"--base-class",
|
|
80
|
+
"datachain.lib.meta_formats.UserModel",
|
|
75
81
|
]
|
|
76
82
|
try:
|
|
77
83
|
result = subprocess.run( # noqa: S603
|
|
@@ -87,7 +93,7 @@ def read_schema(source_file, data_type="csv", expr=None, model_name=None):
|
|
|
87
93
|
except subprocess.CalledProcessError as e:
|
|
88
94
|
model_output = f"An error occurred in datamodel-codegen: {e.stderr}"
|
|
89
95
|
print(f"{model_output}")
|
|
90
|
-
print("
|
|
96
|
+
print("from datachain.lib.data_model import DataModel")
|
|
91
97
|
print("\n" + f"DataModel.register({model_name})" + "\n")
|
|
92
98
|
print("\n" + f"spec={model_name}" + "\n")
|
|
93
99
|
return model_output
|
datachain/node.py
CHANGED
datachain/query/schema.py
CHANGED
|
@@ -45,6 +45,10 @@ class Column(sa.ColumnClause, metaclass=ColumnMeta):
|
|
|
45
45
|
"""Search for matches using glob pattern matching."""
|
|
46
46
|
return self.op("GLOB")(glob_str)
|
|
47
47
|
|
|
48
|
+
def regexp(self, regexp_str):
|
|
49
|
+
"""Search for matches using regexp pattern matching."""
|
|
50
|
+
return self.op("REGEXP")(regexp_str)
|
|
51
|
+
|
|
48
52
|
|
|
49
53
|
class UDFParameter(ABC):
|
|
50
54
|
@abstractmethod
|
datachain/sql/default/base.py
CHANGED
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
from datachain.sql.types import (
|
|
2
|
+
DBDefaults,
|
|
2
3
|
TypeConverter,
|
|
3
4
|
TypeDefaults,
|
|
4
5
|
TypeReadConverter,
|
|
5
6
|
register_backend_types,
|
|
7
|
+
register_db_defaults,
|
|
6
8
|
register_type_defaults,
|
|
7
9
|
register_type_read_converters,
|
|
8
10
|
)
|
|
@@ -18,5 +20,6 @@ def setup() -> None:
|
|
|
18
20
|
register_backend_types("default", TypeConverter())
|
|
19
21
|
register_type_read_converters("default", TypeReadConverter())
|
|
20
22
|
register_type_defaults("default", TypeDefaults())
|
|
23
|
+
register_db_defaults("default", DBDefaults())
|
|
21
24
|
|
|
22
25
|
setup_is_complete = True
|
datachain/sql/sqlite/base.py
CHANGED
|
@@ -22,8 +22,10 @@ from datachain.sql.sqlite.types import (
|
|
|
22
22
|
register_type_converters,
|
|
23
23
|
)
|
|
24
24
|
from datachain.sql.types import (
|
|
25
|
+
DBDefaults,
|
|
25
26
|
TypeDefaults,
|
|
26
27
|
register_backend_types,
|
|
28
|
+
register_db_defaults,
|
|
27
29
|
register_type_defaults,
|
|
28
30
|
register_type_read_converters,
|
|
29
31
|
)
|
|
@@ -66,6 +68,7 @@ def setup():
|
|
|
66
68
|
register_backend_types("sqlite", SQLiteTypeConverter())
|
|
67
69
|
register_type_read_converters("sqlite", SQLiteTypeReadConverter())
|
|
68
70
|
register_type_defaults("sqlite", TypeDefaults())
|
|
71
|
+
register_db_defaults("sqlite", DBDefaults())
|
|
69
72
|
|
|
70
73
|
compiles(sql_path.parent, "sqlite")(compile_path_parent)
|
|
71
74
|
compiles(sql_path.name, "sqlite")(compile_path_name)
|
datachain/sql/types.py
CHANGED
|
@@ -17,6 +17,7 @@ from datetime import datetime
|
|
|
17
17
|
from types import MappingProxyType
|
|
18
18
|
from typing import Any, Union
|
|
19
19
|
|
|
20
|
+
import sqlalchemy as sa
|
|
20
21
|
from sqlalchemy import TypeDecorator, types
|
|
21
22
|
|
|
22
23
|
_registry: dict[str, "TypeConverter"] = {}
|
|
@@ -28,6 +29,9 @@ read_converter_registry = MappingProxyType(_read_converter_registry)
|
|
|
28
29
|
_type_defaults_registry: dict[str, "TypeDefaults"] = {}
|
|
29
30
|
type_defaults_registry = MappingProxyType(_type_defaults_registry)
|
|
30
31
|
|
|
32
|
+
_db_defaults_registry: dict[str, "DBDefaults"] = {}
|
|
33
|
+
db_defaults_registry = MappingProxyType(_db_defaults_registry)
|
|
34
|
+
|
|
31
35
|
NullType = types.NullType
|
|
32
36
|
|
|
33
37
|
|
|
@@ -43,6 +47,10 @@ def register_type_defaults(dialect_name: str, td: "TypeDefaults"):
|
|
|
43
47
|
_type_defaults_registry[dialect_name] = td
|
|
44
48
|
|
|
45
49
|
|
|
50
|
+
def register_db_defaults(dialect_name: str, dbd: "DBDefaults"):
|
|
51
|
+
_db_defaults_registry[dialect_name] = dbd
|
|
52
|
+
|
|
53
|
+
|
|
46
54
|
def converter(dialect) -> "TypeConverter":
|
|
47
55
|
name = dialect.name
|
|
48
56
|
try:
|
|
@@ -71,6 +79,14 @@ def type_defaults(dialect) -> "TypeDefaults":
|
|
|
71
79
|
raise ValueError(f"No type defaults registered for dialect: {name!r}") from None
|
|
72
80
|
|
|
73
81
|
|
|
82
|
+
def db_defaults(dialect) -> "DBDefaults":
|
|
83
|
+
name = dialect.name
|
|
84
|
+
try:
|
|
85
|
+
return db_defaults_registry[name]
|
|
86
|
+
except KeyError:
|
|
87
|
+
raise ValueError(f"No DB defaults registered for dialect: {name!r}") from None
|
|
88
|
+
|
|
89
|
+
|
|
74
90
|
class SQLType(TypeDecorator):
|
|
75
91
|
impl: type[types.TypeEngine[Any]] = types.TypeEngine
|
|
76
92
|
cache_ok = True
|
|
@@ -97,6 +113,10 @@ class String(SQLType):
|
|
|
97
113
|
def default_value(dialect):
|
|
98
114
|
return type_defaults(dialect).string()
|
|
99
115
|
|
|
116
|
+
@staticmethod
|
|
117
|
+
def db_default_value(dialect):
|
|
118
|
+
return db_defaults(dialect).string()
|
|
119
|
+
|
|
100
120
|
def on_read_convert(self, value, dialect):
|
|
101
121
|
return read_converter(dialect).string(value)
|
|
102
122
|
|
|
@@ -115,6 +135,10 @@ class Boolean(SQLType):
|
|
|
115
135
|
def default_value(dialect):
|
|
116
136
|
return type_defaults(dialect).boolean()
|
|
117
137
|
|
|
138
|
+
@staticmethod
|
|
139
|
+
def db_default_value(dialect):
|
|
140
|
+
return db_defaults(dialect).boolean()
|
|
141
|
+
|
|
118
142
|
def on_read_convert(self, value, dialect):
|
|
119
143
|
return read_converter(dialect).boolean(value)
|
|
120
144
|
|
|
@@ -133,6 +157,10 @@ class Int(SQLType):
|
|
|
133
157
|
def default_value(dialect):
|
|
134
158
|
return type_defaults(dialect).int()
|
|
135
159
|
|
|
160
|
+
@staticmethod
|
|
161
|
+
def db_default_value(dialect):
|
|
162
|
+
return db_defaults(dialect).int()
|
|
163
|
+
|
|
136
164
|
def on_read_convert(self, value, dialect):
|
|
137
165
|
return read_converter(dialect).int(value)
|
|
138
166
|
|
|
@@ -145,6 +173,10 @@ class Int32(Int):
|
|
|
145
173
|
def default_value(dialect):
|
|
146
174
|
return type_defaults(dialect).int32()
|
|
147
175
|
|
|
176
|
+
@staticmethod
|
|
177
|
+
def db_default_value(dialect):
|
|
178
|
+
return db_defaults(dialect).int32()
|
|
179
|
+
|
|
148
180
|
def on_read_convert(self, value, dialect):
|
|
149
181
|
return read_converter(dialect).int32(value)
|
|
150
182
|
|
|
@@ -157,6 +189,10 @@ class Int64(Int):
|
|
|
157
189
|
def default_value(dialect):
|
|
158
190
|
return type_defaults(dialect).int64()
|
|
159
191
|
|
|
192
|
+
@staticmethod
|
|
193
|
+
def db_default_value(dialect):
|
|
194
|
+
return db_defaults(dialect).int64()
|
|
195
|
+
|
|
160
196
|
def on_read_convert(self, value, dialect):
|
|
161
197
|
return read_converter(dialect).int64(value)
|
|
162
198
|
|
|
@@ -169,12 +205,16 @@ class UInt64(Int):
|
|
|
169
205
|
def default_value(dialect):
|
|
170
206
|
return type_defaults(dialect).uint64()
|
|
171
207
|
|
|
208
|
+
@staticmethod
|
|
209
|
+
def db_default_value(dialect):
|
|
210
|
+
return db_defaults(dialect).uint64()
|
|
211
|
+
|
|
172
212
|
def on_read_convert(self, value, dialect):
|
|
173
213
|
return read_converter(dialect).uint64(value)
|
|
174
214
|
|
|
175
215
|
|
|
176
216
|
class Float(SQLType):
|
|
177
|
-
impl = types.
|
|
217
|
+
impl = types.FLOAT
|
|
178
218
|
|
|
179
219
|
@property
|
|
180
220
|
def python_type(self):
|
|
@@ -187,6 +227,10 @@ class Float(SQLType):
|
|
|
187
227
|
def default_value(dialect):
|
|
188
228
|
return type_defaults(dialect).float()
|
|
189
229
|
|
|
230
|
+
@staticmethod
|
|
231
|
+
def db_default_value(dialect):
|
|
232
|
+
return db_defaults(dialect).float()
|
|
233
|
+
|
|
190
234
|
def on_read_convert(self, value, dialect):
|
|
191
235
|
return read_converter(dialect).float(value)
|
|
192
236
|
|
|
@@ -199,6 +243,10 @@ class Float32(Float):
|
|
|
199
243
|
def default_value(dialect):
|
|
200
244
|
return type_defaults(dialect).float32()
|
|
201
245
|
|
|
246
|
+
@staticmethod
|
|
247
|
+
def db_default_value(dialect):
|
|
248
|
+
return db_defaults(dialect).float32()
|
|
249
|
+
|
|
202
250
|
def on_read_convert(self, value, dialect):
|
|
203
251
|
return read_converter(dialect).float32(value)
|
|
204
252
|
|
|
@@ -211,6 +259,10 @@ class Float64(Float):
|
|
|
211
259
|
def default_value(dialect):
|
|
212
260
|
return type_defaults(dialect).float64()
|
|
213
261
|
|
|
262
|
+
@staticmethod
|
|
263
|
+
def db_default_value(dialect):
|
|
264
|
+
return db_defaults(dialect).float64()
|
|
265
|
+
|
|
214
266
|
def on_read_convert(self, value, dialect):
|
|
215
267
|
return read_converter(dialect).float64(value)
|
|
216
268
|
|
|
@@ -247,6 +299,10 @@ class Array(SQLType):
|
|
|
247
299
|
def default_value(dialect):
|
|
248
300
|
return type_defaults(dialect).array()
|
|
249
301
|
|
|
302
|
+
@staticmethod
|
|
303
|
+
def db_default_value(dialect):
|
|
304
|
+
return db_defaults(dialect).array()
|
|
305
|
+
|
|
250
306
|
def on_read_convert(self, value, dialect):
|
|
251
307
|
r = read_converter(dialect).array(value, self.item_type, dialect)
|
|
252
308
|
if isinstance(self.item_type, JSON):
|
|
@@ -268,6 +324,10 @@ class JSON(SQLType):
|
|
|
268
324
|
def default_value(dialect):
|
|
269
325
|
return type_defaults(dialect).json()
|
|
270
326
|
|
|
327
|
+
@staticmethod
|
|
328
|
+
def db_default_value(dialect):
|
|
329
|
+
return db_defaults(dialect).json()
|
|
330
|
+
|
|
271
331
|
def on_read_convert(self, value, dialect):
|
|
272
332
|
return read_converter(dialect).json(value)
|
|
273
333
|
|
|
@@ -286,6 +346,10 @@ class DateTime(SQLType):
|
|
|
286
346
|
def default_value(dialect):
|
|
287
347
|
return type_defaults(dialect).datetime()
|
|
288
348
|
|
|
349
|
+
@staticmethod
|
|
350
|
+
def db_default_value(dialect):
|
|
351
|
+
return db_defaults(dialect).datetime()
|
|
352
|
+
|
|
289
353
|
def on_read_convert(self, value, dialect):
|
|
290
354
|
return read_converter(dialect).datetime(value)
|
|
291
355
|
|
|
@@ -304,6 +368,10 @@ class Binary(SQLType):
|
|
|
304
368
|
def default_value(dialect):
|
|
305
369
|
return type_defaults(dialect).binary()
|
|
306
370
|
|
|
371
|
+
@staticmethod
|
|
372
|
+
def db_default_value(dialect):
|
|
373
|
+
return db_defaults(dialect).binary()
|
|
374
|
+
|
|
307
375
|
def on_read_convert(self, value, dialect):
|
|
308
376
|
return read_converter(dialect).binary(value)
|
|
309
377
|
|
|
@@ -328,13 +396,17 @@ class TypeReadConverter:
|
|
|
328
396
|
return value
|
|
329
397
|
|
|
330
398
|
def float(self, value):
|
|
399
|
+
if value is None:
|
|
400
|
+
return float("nan")
|
|
401
|
+
if isinstance(value, str) and value.lower() == "nan":
|
|
402
|
+
return float("nan")
|
|
331
403
|
return value
|
|
332
404
|
|
|
333
405
|
def float32(self, value):
|
|
334
|
-
return value
|
|
406
|
+
return self.float(value)
|
|
335
407
|
|
|
336
408
|
def float64(self, value):
|
|
337
|
-
return value
|
|
409
|
+
return self.float(value)
|
|
338
410
|
|
|
339
411
|
def array(self, value, item_type, dialect):
|
|
340
412
|
if value is None or item_type is None:
|
|
@@ -347,10 +419,9 @@ class TypeReadConverter:
|
|
|
347
419
|
def datetime(self, value):
|
|
348
420
|
return value
|
|
349
421
|
|
|
350
|
-
def uuid(self, value):
|
|
351
|
-
return value
|
|
352
|
-
|
|
353
422
|
def binary(self, value):
|
|
423
|
+
if isinstance(value, str):
|
|
424
|
+
return value.encode()
|
|
354
425
|
return value
|
|
355
426
|
|
|
356
427
|
|
|
@@ -415,13 +486,13 @@ class TypeDefaults:
|
|
|
415
486
|
return None
|
|
416
487
|
|
|
417
488
|
def float(self):
|
|
418
|
-
return
|
|
489
|
+
return float("nan")
|
|
419
490
|
|
|
420
491
|
def float32(self):
|
|
421
|
-
return
|
|
492
|
+
return self.float()
|
|
422
493
|
|
|
423
494
|
def float64(self):
|
|
424
|
-
return
|
|
495
|
+
return self.float()
|
|
425
496
|
|
|
426
497
|
def array(self):
|
|
427
498
|
return None
|
|
@@ -432,11 +503,49 @@ class TypeDefaults:
|
|
|
432
503
|
def datetime(self):
|
|
433
504
|
return None
|
|
434
505
|
|
|
435
|
-
def
|
|
506
|
+
def binary(self):
|
|
436
507
|
return None
|
|
437
508
|
|
|
509
|
+
|
|
510
|
+
class DBDefaults:
|
|
511
|
+
def string(self):
|
|
512
|
+
return sa.text("''")
|
|
513
|
+
|
|
514
|
+
def boolean(self):
|
|
515
|
+
return sa.text("False")
|
|
516
|
+
|
|
517
|
+
def int(self):
|
|
518
|
+
return sa.text("0")
|
|
519
|
+
|
|
520
|
+
def int32(self):
|
|
521
|
+
return self.int()
|
|
522
|
+
|
|
523
|
+
def int64(self):
|
|
524
|
+
return self.int()
|
|
525
|
+
|
|
526
|
+
def uint64(self):
|
|
527
|
+
return self.int()
|
|
528
|
+
|
|
529
|
+
def float(self):
|
|
530
|
+
return sa.text("NaN")
|
|
531
|
+
|
|
532
|
+
def float32(self):
|
|
533
|
+
return self.float()
|
|
534
|
+
|
|
535
|
+
def float64(self):
|
|
536
|
+
return self.float()
|
|
537
|
+
|
|
538
|
+
def array(self):
|
|
539
|
+
return sa.text("'[]'")
|
|
540
|
+
|
|
541
|
+
def json(self):
|
|
542
|
+
return sa.text("'{}'")
|
|
543
|
+
|
|
544
|
+
def datetime(self):
|
|
545
|
+
return sa.text("'1970-01-01 00:00:00'")
|
|
546
|
+
|
|
438
547
|
def binary(self):
|
|
439
|
-
return
|
|
548
|
+
return sa.text("''")
|
|
440
549
|
|
|
441
550
|
|
|
442
551
|
TYPES = [
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.3.
|
|
3
|
+
Version: 0.3.2
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -55,6 +55,15 @@ Requires-Dist: mkdocs-material >=9.3.1 ; extra == 'docs'
|
|
|
55
55
|
Requires-Dist: mkdocs-section-index >=0.3.6 ; extra == 'docs'
|
|
56
56
|
Requires-Dist: mkdocstrings-python >=1.6.3 ; extra == 'docs'
|
|
57
57
|
Requires-Dist: mkdocs-literate-nav >=0.6.1 ; extra == 'docs'
|
|
58
|
+
Provides-Extra: examples
|
|
59
|
+
Requires-Dist: datachain[tests] ; extra == 'examples'
|
|
60
|
+
Requires-Dist: numpy <2,>=1 ; extra == 'examples'
|
|
61
|
+
Requires-Dist: defusedxml ; extra == 'examples'
|
|
62
|
+
Requires-Dist: accelerate ; extra == 'examples'
|
|
63
|
+
Requires-Dist: unstructured[pdf] ; extra == 'examples'
|
|
64
|
+
Requires-Dist: pdfplumber ==0.11.3 ; extra == 'examples'
|
|
65
|
+
Requires-Dist: huggingface-hub[hf_transfer] ; extra == 'examples'
|
|
66
|
+
Requires-Dist: nltk ==3.8.1 ; extra == 'examples'
|
|
58
67
|
Provides-Extra: remote
|
|
59
68
|
Requires-Dist: lz4 ; extra == 'remote'
|
|
60
69
|
Requires-Dist: msgpack <2,>=1.0.4 ; extra == 'remote'
|
|
@@ -100,102 +109,78 @@ Requires-Dist: usearch ; extra == 'vector'
|
|
|
100
109
|
AI 🔗 DataChain
|
|
101
110
|
----------------
|
|
102
111
|
|
|
103
|
-
DataChain is a data-frame library designed for
|
|
104
|
-
|
|
105
|
-
|
|
112
|
+
DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
|
|
113
|
+
It is made to organize your unstructured data into datasets and wrangle it at scale on
|
|
114
|
+
your local machine.
|
|
106
115
|
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
Local), version and update datasets.
|
|
116
|
+
Key Features
|
|
117
|
+
============
|
|
110
118
|
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
119
|
+
📂 **Storage as a Source of Truth.**
|
|
120
|
+
- Process unstructured data without redundant copies: S3, GCP, Azure, and local
|
|
121
|
+
file systems.
|
|
122
|
+
- Multimodal data: images, video, text, PDFs, JSONs, CSVs, parquet.
|
|
123
|
+
- Join files and metadata together into persistent, versioned, columnar datasets.
|
|
114
124
|
|
|
115
|
-
|
|
116
|
-
|
|
125
|
+
🐍 **Python-friendly data pipelines.**
|
|
126
|
+
- Operate on Python objects and object fields.
|
|
127
|
+
- Built-in parallelization and out-of-memory compute without the need for SQL or
|
|
128
|
+
Spark jobs.
|
|
117
129
|
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
130
|
+
🧠 **Data Enrichment and Processing.**
|
|
131
|
+
- Generate metadata columns using local AI models and LLM APIs.
|
|
132
|
+
- Filter, join, and group by AI metadata. Vector similarity search.
|
|
133
|
+
- Pass datasets to Pytorch and Tensorflow, or export back into storage.
|
|
121
134
|
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
135
|
+
🚀 **Efficiency.**
|
|
136
|
+
- Parallelization, out-of-memory workloads and data caching.
|
|
137
|
+
- Vectorized operations on Python object fields: sum, count, avg, etc.
|
|
138
|
+
- Vector search on embeddings.
|
|
125
139
|
|
|
126
140
|
|
|
141
|
+
Quick Start
|
|
142
|
+
-----------
|
|
143
|
+
|
|
127
144
|
.. code:: console
|
|
128
145
|
|
|
129
146
|
$ pip install datachain
|
|
130
147
|
|
|
131
148
|
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
DataChain introduces expressive data structures tailored for AI-specific workload:
|
|
136
|
-
|
|
137
|
-
- **Dataset:** Preserves the file-references and meta-information. Takes care of Python
|
|
138
|
-
object serialization, dataset versioning and difference. Operations on dataset:
|
|
139
|
-
|
|
140
|
-
- **Transformations:** traditional data-frame or SQL operations such as filtering,
|
|
141
|
-
grouping, joining.
|
|
142
|
-
- **Enrichments:** mapping, aggregating and generating using customer’s Python
|
|
143
|
-
code. This is needed to work with ML inference and LLM calls.
|
|
144
|
-
|
|
145
|
-
- **Chain** is a sequence of operations on datasets. Chain executes operations in lazy
|
|
146
|
-
mode - only when needed.
|
|
147
|
-
|
|
148
|
-
DataChain name comes from these major data structures: dataset and chaining.
|
|
149
|
-
|
|
149
|
+
Selecting files using JSON metadata
|
|
150
|
+
======================================
|
|
150
151
|
|
|
151
|
-
|
|
152
|
-
|
|
152
|
+
A storage consists of images of cats and dogs (`dog.1048.jpg`, `cat.1009.jpg`),
|
|
153
|
+
annotated with ground truth and model inferences in the 'json-pairs' format,
|
|
154
|
+
where each image has a matching JSON file like `cat.1009.json`:
|
|
153
155
|
|
|
154
|
-
|
|
155
|
-
use-cases and at the same time to fit it into traditional data infrastructure.
|
|
156
|
+
.. code:: json
|
|
156
157
|
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
needed for distributed computations.
|
|
162
|
-
- **Resuming data processing** (in development). Introduces idempotent operations,
|
|
163
|
-
allowing data processing to resume from the last successful process file/record/batch
|
|
164
|
-
if it fails due to issues like failed LLM calls, ML inference or file download.
|
|
158
|
+
{
|
|
159
|
+
"class": "cat", "id": "1009", "num_annotators": 8,
|
|
160
|
+
"inference": {"class": "dog", "confidence": 0.68}
|
|
161
|
+
}
|
|
165
162
|
|
|
166
|
-
|
|
163
|
+
Example of downloading only high-confidence cat images using JSON metadata:
|
|
167
164
|
|
|
168
|
-
- **Functional style data processing.** Using a functional/chaining approach to data
|
|
169
|
-
processing rather than declarative SQL, inspired by R-dplyr and some Python libraries.
|
|
170
|
-
- **Data Versioning.** Treats raw files in cloud storage as the source of truth for data
|
|
171
|
-
and implements data versioning, extending ideas from DVC (developed by the same team).
|
|
172
165
|
|
|
166
|
+
.. code:: py
|
|
173
167
|
|
|
174
|
-
|
|
175
|
-
======================
|
|
176
|
-
|
|
177
|
-
- **Not a database** (Postgres, MySQL). Instead, it uses databases under the hood:
|
|
178
|
-
`SQLite`_ in open-source and ClickHouse and other data warehouses for the commercial
|
|
179
|
-
version.
|
|
180
|
-
- **Not a data processing tool / data warehouse** (Spark, Snowflake, Big Query) since
|
|
181
|
-
it delegates heavy data transformations to underlying data warehouses and focuses on
|
|
182
|
-
AI specific data enrichments and orchestrating all the pieces together.
|
|
183
|
-
|
|
168
|
+
from datachain import Column, DataChain
|
|
184
169
|
|
|
185
|
-
|
|
186
|
-
|
|
170
|
+
meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta")
|
|
171
|
+
images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg")
|
|
187
172
|
|
|
188
|
-
|
|
189
|
-
|
|
173
|
+
images_id = images.map(id=lambda file: file.path.split('.')[-2])
|
|
174
|
+
annotated = images_id.merge(meta, on="id", right_on="meta.id")
|
|
190
175
|
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
Our goal is to identify the successful dialogs.
|
|
176
|
+
likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
|
|
177
|
+
& (Column("meta.inference.class_") == "cat"))
|
|
178
|
+
likely_cats.export_files("high-confidence-cats/", signal="file")
|
|
195
179
|
|
|
196
|
-
The data used in the examples is `publicly available`_. The sample code is designed to run on a local machine.
|
|
197
180
|
|
|
198
|
-
|
|
181
|
+
Data curation with a local AI model
|
|
182
|
+
===================================
|
|
183
|
+
Batch inference with a simple sentiment model using the `transformers` library:
|
|
199
184
|
|
|
200
185
|
.. code:: shell
|
|
201
186
|
|
|
@@ -246,30 +231,30 @@ LLM judging chatbots
|
|
|
246
231
|
=============================
|
|
247
232
|
|
|
248
233
|
LLMs can work as efficient universal classifiers. In the example below,
|
|
249
|
-
we employ a free API from Mistral to judge the chatbot
|
|
234
|
+
we employ a free API from Mistral to judge the `publicly available`_ chatbot dialogs. Please get a free
|
|
250
235
|
Mistral API key at https://console.mistral.ai
|
|
251
236
|
|
|
237
|
+
|
|
252
238
|
.. code:: shell
|
|
253
239
|
|
|
254
|
-
$ pip install mistralai
|
|
240
|
+
$ pip install mistralai (Requires version >=1.0.0)
|
|
255
241
|
$ export MISTRAL_API_KEY=_your_key_
|
|
256
242
|
|
|
257
243
|
DataChain can parallelize API calls; the free Mistral tier supports up to 4 requests at the same time.
|
|
258
244
|
|
|
259
245
|
.. code:: py
|
|
260
246
|
|
|
261
|
-
from mistralai
|
|
262
|
-
from mistralai.models.chat_completion import ChatMessage
|
|
247
|
+
from mistralai import Mistral
|
|
263
248
|
from datachain import File, DataChain, Column
|
|
264
249
|
|
|
265
250
|
PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
|
|
266
251
|
|
|
267
252
|
def eval_dialogue(file: File) -> bool:
|
|
268
|
-
client =
|
|
269
|
-
response = client.chat(
|
|
253
|
+
client = Mistral()
|
|
254
|
+
response = client.chat.complete(
|
|
270
255
|
model="open-mixtral-8x22b",
|
|
271
|
-
messages=[
|
|
272
|
-
|
|
256
|
+
messages=[{"role": "system", "content": PROMPT},
|
|
257
|
+
{"role": "user", "content": file.read()}])
|
|
273
258
|
result = response.choices[0].message.content
|
|
274
259
|
return result.lower().startswith("success")
|
|
275
260
|
|
|
@@ -309,8 +294,8 @@ Instead of extracting this information from the Mistral response data structure
|
|
|
309
294
|
|
|
310
295
|
.. code:: py
|
|
311
296
|
|
|
312
|
-
from mistralai
|
|
313
|
-
from mistralai.models
|
|
297
|
+
from mistralai import Mistral
|
|
298
|
+
from mistralai.models import ChatCompletionResponse
|
|
314
299
|
from datachain import File, DataChain, Column
|
|
315
300
|
|
|
316
301
|
PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
|
|
@@ -319,8 +304,8 @@ Instead of extracting this information from the Mistral response data structure
|
|
|
319
304
|
client = Mistral()
|
|
320
305
|
return client.chat.complete(
|
|
321
306
|
model="open-mixtral-8x22b",
|
|
322
|
-
messages=[
|
|
323
|
-
|
|
307
|
+
messages=[{"role": "system", "content": PROMPT},
|
|
308
|
+
{"role": "user", "content": file.read()}])
|
|
324
309
|
|
|
325
310
|
chain = (
|
|
326
311
|
DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
|
|
@@ -438,7 +423,10 @@ Tutorials
|
|
|
438
423
|
---------
|
|
439
424
|
|
|
440
425
|
* `Getting Started`_
|
|
441
|
-
* `Multimodal <examples/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain/blob/main/
|
|
426
|
+
* `Multimodal <https://github.com/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb>`__)
|
|
427
|
+
* `LLM evaluations <https://github.com/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb>`__)
|
|
428
|
+
* `Reading JSON metadata <https://github.com/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb>`__)
|
|
429
|
+
|
|
442
430
|
|
|
443
431
|
Contributions
|
|
444
432
|
-------------
|
|
@@ -9,7 +9,7 @@ datachain/dataset.py,sha256=MZezyuJWNj_3PEtzr0epPMNyWAOTrhTSPI5FmemV6L4,14470
|
|
|
9
9
|
datachain/error.py,sha256=GY9KYTmb7GHXn2gGHV9X-PBhgwLj3i7VpK7tGHtAoGM,1279
|
|
10
10
|
datachain/job.py,sha256=bk25bIqClhgRPzlXAhxpTtDeewibQe5l3S8Cf7db0gM,1229
|
|
11
11
|
datachain/listing.py,sha256=keLkvPfumDA3gijeIiinH5yGWe71qCxgF5HqqP5AeH4,8299
|
|
12
|
-
datachain/node.py,sha256=
|
|
12
|
+
datachain/node.py,sha256=ihrP5l9HKpXLR0fR1wyb7QIdb7NR26dX6bB09qGX5B4,6005
|
|
13
13
|
datachain/nodes_fetcher.py,sha256=kca19yvu11JxoVY1t4_ydp1FmchiV88GnNicNBQ9NIA,831
|
|
14
14
|
datachain/nodes_thread_pool.py,sha256=ZyzBvUImIPmi4WlKC2SW2msA0UhtembbTdcs2nx29A0,3191
|
|
15
15
|
datachain/progress.py,sha256=7_8FtJs770ITK9sMq-Lt4k4k18QmYl4yIG_kCoWID3o,4559
|
|
@@ -17,7 +17,7 @@ datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
|
17
17
|
datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
|
|
18
18
|
datachain/utils.py,sha256=ROVCLwb37VmFRzgTlSGUDw4eJNgYGiQ4yMX581HfUX8,12988
|
|
19
19
|
datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
|
|
20
|
-
datachain/catalog/catalog.py,sha256=
|
|
20
|
+
datachain/catalog/catalog.py,sha256=9fxRJjiM8tK3ZePHFErYqY6LkJFA6bvlp-KHq-_kSYk,80703
|
|
21
21
|
datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
|
|
22
22
|
datachain/catalog/loader.py,sha256=GJ8zhEYkC7TuaPzCsjJQ4LtTdECu-wwYzC12MikPOMQ,7307
|
|
23
23
|
datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
|
|
@@ -33,19 +33,19 @@ datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kT
|
|
|
33
33
|
datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
|
|
34
34
|
datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
|
|
35
35
|
datachain/data_storage/metastore.py,sha256=nxcY6nwyEmQWMAo33sNGO-FgUFQs2amBGGnZz2ftEz0,55362
|
|
36
|
-
datachain/data_storage/schema.py,sha256=
|
|
36
|
+
datachain/data_storage/schema.py,sha256=GwJIHkjhrnBxJAV1WvCMM8jiJN5h79LXDyzMmUDtRw0,8523
|
|
37
37
|
datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
|
|
38
|
-
datachain/data_storage/sqlite.py,sha256=
|
|
38
|
+
datachain/data_storage/sqlite.py,sha256=IKd4epEjVxAoQQHsE7WTY4kgOiFyUiWhvaGm-61rJfg,27218
|
|
39
39
|
datachain/data_storage/warehouse.py,sha256=MXYkUG69UK2wbIFsZFvT7rKzXlnSitDMp3Vzj_IIsnA,33089
|
|
40
40
|
datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
41
41
|
datachain/lib/arrow.py,sha256=R8wDUDEa-5hYjI3HW9cqvOYYJpeeah5lbhFIL3gkmcE,4915
|
|
42
42
|
datachain/lib/clip.py,sha256=16u4b_y2Y15nUS2UN_8ximMo6r_-_4IQpmct2ol-e-g,5730
|
|
43
43
|
datachain/lib/data_model.py,sha256=qfTtQNncS5pt9SvXdMEa5kClniaT6XBGBfO7onEz2TI,1632
|
|
44
44
|
datachain/lib/dataset_info.py,sha256=lONGr71ozo1DS4CQEhnpKORaU4qFb6Ketv8Xm8CVm2U,2188
|
|
45
|
-
datachain/lib/dc.py,sha256=
|
|
45
|
+
datachain/lib/dc.py,sha256=Q9HL7Axfo9i5hodlkD2GwklN4i0BVULm9_A11ckuj2A,58352
|
|
46
46
|
datachain/lib/file.py,sha256=ZHpdilDPYCob8uqtwUPtBvBNxVvQRq4AC_0IGg5m-G4,12003
|
|
47
47
|
datachain/lib/image.py,sha256=TgYhRhzd4nkytfFMeykQkPyzqb5Le_-tU81unVMPn4Q,2328
|
|
48
|
-
datachain/lib/meta_formats.py,sha256=
|
|
48
|
+
datachain/lib/meta_formats.py,sha256=Hels85LJmNCz1aYVJvhymNdAt3qdJ2-qoxsIiUezrow,7198
|
|
49
49
|
datachain/lib/model_store.py,sha256=c4USXsBBjrGH8VOh4seIgOiav-qHOwdoixtxfLgU63c,2409
|
|
50
50
|
datachain/lib/pytorch.py,sha256=9PsypKseyKfIimTmTQOgb-pbNXgeeAHLdlWx0qRPULY,5660
|
|
51
51
|
datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
|
|
@@ -71,17 +71,17 @@ datachain/query/dispatch.py,sha256=GBh3EZHDp5AaXxrjOpfrpfsuy7Umnqxu-MAXcK9X3gc,1
|
|
|
71
71
|
datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
|
|
72
72
|
datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
|
|
73
73
|
datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
|
|
74
|
-
datachain/query/schema.py,sha256=
|
|
74
|
+
datachain/query/schema.py,sha256=BvHipN79CnSTbVFcfIEwzo1npe7HmThnk0iY-CSLEkM,7899
|
|
75
75
|
datachain/query/session.py,sha256=qTzkXgwMJdJhal3rVt3hdv3x1EXT1IHuXcwkC-Ex0As,4111
|
|
76
76
|
datachain/query/udf.py,sha256=j3NhmKK5rYG5TclcM2Sr0LhS1tmYLMjzMugx9G9iFLM,8100
|
|
77
77
|
datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
78
78
|
datachain/remote/studio.py,sha256=f5s6qSZ9uB4URGUoU_8_W1KZRRQQVSm6cgEBkBUEfuE,7226
|
|
79
79
|
datachain/sql/__init__.py,sha256=A2djrbQwSMUZZEIKGnm-mnRA-NDSbiDJNpAmmwGNyIo,303
|
|
80
80
|
datachain/sql/selectable.py,sha256=fBM-wS1TUA42kVEAAiwqGtibIevyZAEritwt8PZGyLQ,1589
|
|
81
|
-
datachain/sql/types.py,sha256=
|
|
81
|
+
datachain/sql/types.py,sha256=1MFvECB_5A6QwQKKY3VPhvitgKDlc2aB7iBjY4hv1_s,13034
|
|
82
82
|
datachain/sql/utils.py,sha256=rzlJw08etivdrcuQPqNVvVWhuVSyUPUQEEc6DOhu258,818
|
|
83
83
|
datachain/sql/default/__init__.py,sha256=XQ2cEZpzWiABqjV-6yYHUBGI9vN_UHxbxZENESmVAWw,45
|
|
84
|
-
datachain/sql/default/base.py,sha256=
|
|
84
|
+
datachain/sql/default/base.py,sha256=QD-31C6JnyOXzogyDx90sUhm7QvgXIYpeHEASH84igU,628
|
|
85
85
|
datachain/sql/functions/__init__.py,sha256=Ioyy7nSetrTLVnHGcGcmZU99HxUFcx-5PFbrh2dPNH0,396
|
|
86
86
|
datachain/sql/functions/array.py,sha256=EB7nJSncUc1PuxlHyzU2gVhF8DuXaxpGlxb5e8X2KFY,1297
|
|
87
87
|
datachain/sql/functions/conditional.py,sha256=q7YUKfunXeEldXaxgT-p5pUTcOEVU_tcQ2BJlquTRPs,207
|
|
@@ -89,13 +89,13 @@ datachain/sql/functions/path.py,sha256=zixpERotTFP6LZ7I4TiGtyRA8kXOoZmH1yzH9oRW0
|
|
|
89
89
|
datachain/sql/functions/random.py,sha256=vBwEEj98VH4LjWixUCygQ5Bz1mv1nohsCG0-ZTELlVg,271
|
|
90
90
|
datachain/sql/functions/string.py,sha256=hIrF1fTvlPamDtm8UMnWDcnGfbbjCsHxZXS30U2Rzxo,651
|
|
91
91
|
datachain/sql/sqlite/__init__.py,sha256=TAdJX0Bg28XdqPO-QwUVKy8rg78cgMileHvMNot7d04,166
|
|
92
|
-
datachain/sql/sqlite/base.py,sha256=
|
|
92
|
+
datachain/sql/sqlite/base.py,sha256=w6HbEkGdmNGDnDY3_75E-wDb6qNskVpq0qbHGADsERk,12327
|
|
93
93
|
datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
|
|
94
94
|
datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
|
|
95
95
|
datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
|
|
96
|
-
datachain-0.3.
|
|
97
|
-
datachain-0.3.
|
|
98
|
-
datachain-0.3.
|
|
99
|
-
datachain-0.3.
|
|
100
|
-
datachain-0.3.
|
|
101
|
-
datachain-0.3.
|
|
96
|
+
datachain-0.3.2.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
97
|
+
datachain-0.3.2.dist-info/METADATA,sha256=i8evXYMe4FgBqxV7TYdWTRuh7MxRT6jfqmzL-tbk_JQ,16789
|
|
98
|
+
datachain-0.3.2.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
|
|
99
|
+
datachain-0.3.2.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
|
|
100
|
+
datachain-0.3.2.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
|
|
101
|
+
datachain-0.3.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|