datachain 0.2.16__py3-none-any.whl → 0.2.18__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries; it is provided for informational purposes only.
- datachain/catalog/catalog.py +47 -44
- datachain/data_storage/db_engine.py +6 -2
- datachain/data_storage/id_generator.py +14 -0
- datachain/data_storage/metastore.py +13 -0
- datachain/data_storage/sqlite.py +45 -6
- datachain/data_storage/warehouse.py +13 -0
- datachain/lib/arrow.py +22 -7
- datachain/lib/convert/sql_to_python.py +13 -18
- datachain/lib/dc.py +53 -6
- datachain/lib/file.py +3 -3
- datachain/lib/signal_schema.py +33 -5
- datachain/listing.py +22 -10
- datachain/query/dataset.py +17 -20
- datachain/query/session.py +19 -4
- datachain/sql/functions/__init__.py +3 -2
- datachain/sql/functions/array.py +8 -0
- datachain/sql/sqlite/base.py +5 -0
- {datachain-0.2.16.dist-info → datachain-0.2.18.dist-info}/METADATA +71 -12
- {datachain-0.2.16.dist-info → datachain-0.2.18.dist-info}/RECORD +23 -23
- {datachain-0.2.16.dist-info → datachain-0.2.18.dist-info}/LICENSE +0 -0
- {datachain-0.2.16.dist-info → datachain-0.2.18.dist-info}/WHEEL +0 -0
- {datachain-0.2.16.dist-info → datachain-0.2.18.dist-info}/entry_points.txt +0 -0
- {datachain-0.2.16.dist-info → datachain-0.2.18.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED
@@ -236,36 +236,36 @@ class DatasetRowsFetcher(NodesThreadPool):
         import lz4.frame
         import pandas as pd
 
-        metastore
-
-
-
-        urls = list(urls)
-        while urls:
-            for url in urls:
-                if self.should_check_for_status():
-                    self.check_for_status()
-
-                r = requests.get(url, timeout=PULL_DATASET_CHUNK_TIMEOUT)
-                if r.status_code == 404:
-                    time.sleep(PULL_DATASET_SLEEP_INTERVAL)
-                    # moving to the next url
-                    continue
+        # metastore and warehouse are not thread safe
+        with self.metastore.clone() as metastore, self.warehouse.clone() as warehouse:
+            dataset = metastore.get_dataset(self.dataset_name)
 
-
+            urls = list(urls)
+            while urls:
+                for url in urls:
+                    if self.should_check_for_status():
+                        self.check_for_status()
 
-
+                    r = requests.get(url, timeout=PULL_DATASET_CHUNK_TIMEOUT)
+                    if r.status_code == 404:
+                        time.sleep(PULL_DATASET_SLEEP_INTERVAL)
+                        # moving to the next url
+                        continue
 
-
+                    r.raise_for_status()
 
-
-        df = df.drop("sys__id", axis=1)
+                    df = pd.read_parquet(io.BytesIO(lz4.frame.decompress(r.content)))
 
-
-
-
-
-
+                    self.fix_columns(df)
+
+                    # id will be autogenerated in DB
+                    df = df.drop("sys__id", axis=1)
+
+                    inserted = warehouse.insert_dataset_rows(
+                        df, dataset, self.dataset_version
+                    )
+                    self.increase_counter(inserted)  # type: ignore [arg-type]
+                    urls.remove(url)
 
 
 @dataclass
@@ -720,7 +720,6 @@ class Catalog:
             client.uri, posixpath.join(prefix, "")
         )
         source_metastore = self.metastore.clone(client.uri)
-        source_warehouse = self.warehouse.clone()
 
         columns = [
             Column("vtype", String),
@@ -1835,25 +1834,29 @@ class Catalog:
         if signed_urls:
             shuffle(signed_urls)
 
-
-                self.metastore.clone(),
-                self.warehouse.clone(),
-
-
-
-
-
-
-
-                    signed_urls,
-                    math.ceil(len(signed_urls) / PULL_DATASET_MAX_THREADS),
-                ),
-                dataset_save_progress_bar,
+            with (
+                self.metastore.clone() as metastore,
+                self.warehouse.clone() as warehouse,
+            ):
+                rows_fetcher = DatasetRowsFetcher(
+                    metastore,
+                    warehouse,
+                    remote_config,
+                    dataset.name,
+                    version,
+                    schema,
                 )
-
-
-
+                try:
+                    rows_fetcher.run(
+                        batched(
+                            signed_urls,
+                            math.ceil(len(signed_urls) / PULL_DATASET_MAX_THREADS),
+                        ),
+                        dataset_save_progress_bar,
+                    )
+                except:
+                    self.remove_dataset(dataset.name, version)
+                    raise
 
         dataset = self.metastore.update_dataset_status(
             dataset,
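The net effect of these hunks: each fetcher thread opens its own metastore and warehouse clones through context managers, and a failed pull now removes the partially created dataset instead of leaving it behind. A minimal sketch of the clone-per-worker pattern, with the dataset name and worker body invented for illustration:

from concurrent.futures import ThreadPoolExecutor

def process_chunk(catalog, url):
    # metastore and warehouse are not thread safe, so each worker clones both;
    # the `with` block closes clones that own their connection on exit.
    with catalog.metastore.clone() as metastore, catalog.warehouse.clone() as warehouse:
        dataset = metastore.get_dataset("my-dataset")  # hypothetical dataset name
        ...  # fetch `url` and insert rows via warehouse.insert_dataset_rows()

def pull(catalog, urls):
    with ThreadPoolExecutor() as pool:
        list(pool.map(lambda u: process_chunk(catalog, u), urls))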
datachain/data_storage/db_engine.py
CHANGED

@@ -4,7 +4,6 @@ from collections.abc import Iterator
 from typing import TYPE_CHECKING, Any, ClassVar, Optional, Union
 
 import sqlalchemy as sa
-from attrs import frozen
 from sqlalchemy.sql import FROM_LINTING
 from sqlalchemy.sql.roles import DDLRole
 
@@ -23,13 +22,18 @@ logger = logging.getLogger("datachain")
 SELECT_BATCH_SIZE = 100_000  # number of rows to fetch at a time
 
 
-@frozen
 class DatabaseEngine(ABC, Serializable):
     dialect: ClassVar["Dialect"]
 
     engine: "Engine"
     metadata: "MetaData"
 
+    def __enter__(self) -> "DatabaseEngine":
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        self.close()
+
     @abstractmethod
     def clone(self) -> "DatabaseEngine":
         """Clones DatabaseEngine implementation."""
datachain/data_storage/id_generator.py
CHANGED

@@ -33,6 +33,16 @@ class AbstractIDGenerator(ABC, Serializable):
     def cleanup_for_tests(self):
        """Cleanup for tests."""
 
+    def close(self) -> None:
+        """Closes any active database connections."""
+
+    def close_on_exit(self) -> None:
+        """Closes any active database or HTTP connections, called on Session exit or
+        for test cleanup only, as some ID Generator implementations may handle this
+        differently.
+        """
+        self.close()
+
     @abstractmethod
     def init_id(self, uri: str) -> None:
         """Initializes the ID generator for the given URI with zero last_id."""
@@ -83,6 +93,10 @@ class AbstractDBIDGenerator(AbstractIDGenerator):
     def clone(self) -> "AbstractDBIDGenerator":
         """Clones AbstractIDGenerator implementation."""
 
+    def close(self) -> None:
+        """Closes any active database connections."""
+        self.db.close()
+
     @property
     def db(self) -> "DatabaseEngine":
         return self._db
datachain/data_storage/metastore.py
CHANGED

@@ -78,6 +78,13 @@ class AbstractMetastore(ABC, Serializable):
         self.uri = uri
         self.partial_id: Optional[int] = partial_id
 
+    def __enter__(self) -> "AbstractMetastore":
+        """Returns self upon entering context manager."""
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        """Default behavior is to do nothing, as connections may be shared."""
+
     @abstractmethod
     def clone(
         self,
@@ -97,6 +104,12 @@ class AbstractMetastore(ABC, Serializable):
     def close(self) -> None:
         """Closes any active database or HTTP connections."""
 
+    def close_on_exit(self) -> None:
+        """Closes any active database or HTTP connections, called on Session exit or
+        for test cleanup only, as some Metastore implementations may handle this
+        differently."""
+        self.close()
+
     def cleanup_tables(self, temp_table_names: list[str]) -> None:
         """Cleanup temp tables."""
 
datachain/data_storage/sqlite.py
CHANGED
@@ -15,7 +15,6 @@ from typing import (
 )
 
 import sqlalchemy
-from attrs import frozen
 from sqlalchemy import MetaData, Table, UniqueConstraint, exists, select
 from sqlalchemy.dialects import sqlite
 from sqlalchemy.schema import CreateIndex, CreateTable, DropTable
@@ -40,6 +39,7 @@ from datachain.utils import DataChainDir
 
 if TYPE_CHECKING:
     from sqlalchemy.dialects.sqlite import Insert
+    from sqlalchemy.engine.base import Engine
     from sqlalchemy.schema import SchemaItem
     from sqlalchemy.sql.elements import ColumnClause, ColumnElement, TextClause
     from sqlalchemy.sql.selectable import Select
@@ -52,6 +52,8 @@ RETRY_START_SEC = 0.01
 RETRY_MAX_TIMES = 10
 RETRY_FACTOR = 2
 
+DETECT_TYPES = sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES
+
 Column = Union[str, "ColumnClause[Any]", "TextClause"]
 
 datachain.sql.sqlite.setup()
@@ -80,26 +82,41 @@ def retry_sqlite_locks(func):
     return wrapper
 
 
-@frozen
 class SQLiteDatabaseEngine(DatabaseEngine):
     dialect = sqlite_dialect
 
     db: sqlite3.Connection
     db_file: Optional[str]
+    is_closed: bool
+
+    def __init__(
+        self,
+        engine: "Engine",
+        metadata: "MetaData",
+        db: sqlite3.Connection,
+        db_file: Optional[str] = None,
+    ):
+        self.engine = engine
+        self.metadata = metadata
+        self.db = db
+        self.db_file = db_file
+        self.is_closed = False
 
     @classmethod
     def from_db_file(cls, db_file: Optional[str] = None) -> "SQLiteDatabaseEngine":
-
+        return cls(*cls._connect(db_file=db_file))
 
+    @staticmethod
+    def _connect(db_file: Optional[str] = None):
         try:
             if db_file == ":memory:":
                 # Enable multithreaded usage of the same in-memory db
                 db = sqlite3.connect(
-                    "file::memory:?cache=shared", uri=True, detect_types=
+                    "file::memory:?cache=shared", uri=True, detect_types=DETECT_TYPES
                 )
             else:
                 db = sqlite3.connect(
-                    db_file or DataChainDir.find().db, detect_types=
+                    db_file or DataChainDir.find().db, detect_types=DETECT_TYPES
                 )
             create_user_defined_sql_functions(db)
             engine = sqlalchemy.create_engine(
@@ -118,7 +135,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):
 
             load_usearch_extension(db)
 
-            return
+            return engine, MetaData(), db, db_file
         except RuntimeError:
             raise DataChainError("Can't connect to SQLite DB") from None
 
@@ -138,6 +155,16 @@ class SQLiteDatabaseEngine(DatabaseEngine):
             {},
         )
 
+    def _reconnect(self) -> None:
+        if not self.is_closed:
+            raise RuntimeError("Cannot reconnect on still-open DB!")
+        engine, metadata, db, db_file = self._connect(db_file=self.db_file)
+        self.engine = engine
+        self.metadata = metadata
+        self.db = db
+        self.db_file = db_file
+        self.is_closed = False
+
     @retry_sqlite_locks
     def execute(
         self,
@@ -145,6 +172,9 @@ class SQLiteDatabaseEngine(DatabaseEngine):
         cursor: Optional[sqlite3.Cursor] = None,
         conn=None,
     ) -> sqlite3.Cursor:
+        if self.is_closed:
+            # Reconnect in case of being closed previously.
+            self._reconnect()
         if cursor is not None:
             result = cursor.execute(*self.compile_to_args(query))
         elif conn is not None:
@@ -179,6 +209,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):
 
     def close(self) -> None:
         self.db.close()
+        self.is_closed = True
 
     @contextmanager
     def transaction(self):
@@ -359,6 +390,10 @@ class SQLiteMetastore(AbstractDBMetastore):
 
         self._init_tables()
 
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        """Close connection upon exit from context manager."""
+        self.close()
+
     def clone(
         self,
         uri: StorageURI = StorageURI(""),
@@ -521,6 +556,10 @@ class SQLiteWarehouse(AbstractWarehouse):
 
         self.db = db or SQLiteDatabaseEngine.from_db_file(db_file)
 
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        """Close connection upon exit from context manager."""
+        self.close()
+
     def clone(self, use_new_connection: bool = False) -> "SQLiteWarehouse":
         return SQLiteWarehouse(self.id_generator.clone(), db=self.db.clone())
 
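Note on the `is_closed`/`_reconnect()` pair added above: a closed SQLite engine now transparently reopens on the next `execute()` call instead of failing. A rough usage sketch, assuming `execute()` accepts a SQLAlchemy text clause as its query:

from sqlalchemy import text
from datachain.data_storage.sqlite import SQLiteDatabaseEngine

engine = SQLiteDatabaseEngine.from_db_file(":memory:")
engine.close()                                       # close() now also sets is_closed = True
rows = engine.execute(text("SELECT 1")).fetchall()   # execute() sees is_closed and reconnects
print(rows)  # [(1,)]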
datachain/data_storage/warehouse.py
CHANGED

@@ -70,6 +70,13 @@ class AbstractWarehouse(ABC, Serializable):
     def __init__(self, id_generator: "AbstractIDGenerator"):
         self.id_generator = id_generator
 
+    def __enter__(self) -> "AbstractWarehouse":
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        # Default behavior is to do nothing, as connections may be shared.
+        pass
+
     def cleanup_for_tests(self):
         """Cleanup for tests."""
 
@@ -158,6 +165,12 @@ class AbstractWarehouse(ABC, Serializable):
         """Closes any active database connections."""
         self.db.close()
 
+    def close_on_exit(self) -> None:
+        """Closes any active database or HTTP connections, called on Session exit or
+        for test cleanup only, as some Warehouse implementations may handle this
+        differently."""
+        self.close()
+
     #
     # Query Tables
     #
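Taken together with the metastore and db_engine changes, the exit semantics are deliberately asymmetric: the abstract bases make `__exit__` a no-op because clones may share a connection, while the SQLite subclasses close for real. A small sketch of the resulting cleanup idiom (mirroring the query/dataset.py change later in this diff):

def cleanup_temp_tables(catalog, temp_table_names):
    # Request a dedicated connection so a broken shared one cannot interfere;
    # the SQLite implementations close it when the block exits.
    with catalog.warehouse.clone(use_new_connection=True) as warehouse:
        warehouse.cleanup_tables(temp_table_names)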
datachain/lib/arrow.py
CHANGED
@@ -1,5 +1,6 @@
 import re
 from collections.abc import Sequence
+from tempfile import NamedTemporaryFile
 from typing import TYPE_CHECKING, Optional
 
 import pyarrow as pa
@@ -43,13 +44,17 @@ class ArrowGenerator(Generator):
         self.kwargs = kwargs
 
     def process(self, file: File):
-
-
-
-
+        if self.nrows:
+            path = _nrows_file(file, self.nrows)
+            ds = dataset(path, schema=self.input_schema, **self.kwargs)
+        else:
+            path = file.get_path()
+            ds = dataset(
+                path, filesystem=file.get_fs(), schema=self.input_schema, **self.kwargs
+            )
         index = 0
         with tqdm(desc="Parsed by pyarrow", unit=" rows") as pbar:
-            for record_batch in ds.to_batches(
+            for record_batch in ds.to_batches():
                 for record in record_batch.to_pylist():
                     vals = list(record.values())
                     if self.output_schema:
@@ -60,8 +65,6 @@ class ArrowGenerator(Generator):
                     else:
                         yield vals
                     index += 1
-                    if self.nrows and index >= self.nrows:
-                        return
             pbar.update(len(record_batch))
 
 
@@ -125,3 +128,15 @@ def _arrow_type_mapper(col_type: pa.DataType) -> type:  # noqa: PLR0911
     if isinstance(col_type, pa.lib.DictionaryType):
         return _arrow_type_mapper(col_type.value_type)  # type: ignore[return-value]
     raise TypeError(f"{col_type!r} datatypes not supported")
+
+
+def _nrows_file(file: File, nrows: int) -> str:
+    tf = NamedTemporaryFile(delete=False)
+    with file.open(mode="r") as reader:
+        with open(tf.name, "a") as writer:
+            for row, line in enumerate(reader):
+                if row >= nrows:
+                    break
+                writer.write(line)
+                writer.write("\n")
+    return tf.name
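The `nrows` handling moved out of the batch loop: the first `nrows` lines are copied into a temporary file and pyarrow reads that file instead, which is why the limit only works for line-oriented formats. For the unlimited path, the generator now builds the dataset directly over the file's filesystem; a minimal pyarrow equivalent, with a placeholder file path:

import pyarrow.dataset as pa_ds

ds = pa_ds.dataset("data/example.csv", format="csv")  # hypothetical local file
for record_batch in ds.to_batches():
    for record in record_batch.to_pylist():
        print(list(record.values()))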
datachain/lib/convert/sql_to_python.py
CHANGED

@@ -1,23 +1,18 @@
-from
+from decimal import Decimal
 from typing import Any
 
-from sqlalchemy import
+from sqlalchemy import ColumnElement
 
-from datachain.data_storage.sqlite import Column
 
-
-
-
-
-
-
-
-
+def sql_to_python(args_map: dict[str, ColumnElement]) -> dict[str, Any]:
+    res = {}
+    for name, sql_exp in args_map.items():
+        try:
+            type_ = sql_exp.type.python_type
+            if type_ == Decimal:
+                type_ = float
+        except NotImplementedError:
+            type_ = str
+        res[name] = type_
 
-
-def sql_to_python(args_map: dict[str, Column]) -> dict[str, Any]:
-    return {
-        k: SQL_TO_PYTHON.get(type(v.type), str)  # type: ignore[union-attr]
-        for k, v in args_map.items()
-    }
+    return res
datachain/lib/dc.py
CHANGED
@@ -20,8 +20,10 @@ import pandas as pd
 import sqlalchemy
 from pydantic import BaseModel, create_model
 from sqlalchemy.sql.functions import GenericFunction
+from sqlalchemy.sql.sqltypes import NullType
 
 from datachain import DataModel
+from datachain.lib.convert.python_to_sql import python_to_sql
 from datachain.lib.convert.values_to_tuples import values_to_tuples
 from datachain.lib.data_model import DataType
 from datachain.lib.dataset_info import DatasetInfo
@@ -110,6 +112,11 @@ class DatasetMergeError(DataChainParamsError):  # noqa: D101
         super().__init__(f"Merge error on='{on_str}'{right_on_str}: {msg}")
 
 
+class DataChainColumnError(DataChainParamsError):  # noqa: D101
+    def __init__(self, col_name, msg):  # noqa: D107
+        super().__init__(f"Error for column {col_name}: {msg}")
+
+
 OutputType = Union[None, DataType, Sequence[str], dict[str, DataType]]
 
 
@@ -225,6 +232,17 @@ class DataChain(DatasetQuery):
         """Get schema of the chain."""
         return self._effective_signals_schema.values
 
+    def column(self, name: str) -> Column:
+        """Returns Column instance with a type if name is found in current schema,
+        otherwise raises an exception.
+        """
+        name_path = name.split(".")
+        for path, type_, _, _ in self.signals_schema.get_flat_tree():
+            if path == name_path:
+                return Column(name, python_to_sql(type_))
+
+        raise ValueError(f"Column with name {name} not found in the schema")
+
     def print_schema(self) -> None:
         """Print schema of the chain."""
         self._effective_signals_schema.print_tree()
@@ -829,8 +847,25 @@ class DataChain(DatasetQuery):
         )
         ```
         """
-
-
+        for col_name, expr in kwargs.items():
+            if not isinstance(expr, Column) and isinstance(expr.type, NullType):
+                raise DataChainColumnError(
+                    col_name, f"Cannot infer type with expression {expr}"
+                )
+
+        mutated = {}
+        schema = self.signals_schema
+        for name, value in kwargs.items():
+            if isinstance(value, Column):
+                # renaming existing column
+                for signal in schema.db_signals(name=value.name, as_columns=True):
+                    mutated[signal.name.replace(value.name, name, 1)] = signal
+            else:
+                # adding new signal
+                mutated[name] = value
+
+        chain = super().mutate(**mutated)
+        chain.signals_schema = schema.mutate(kwargs)
         return chain
 
     @property
@@ -1099,7 +1134,7 @@ class DataChain(DatasetQuery):
             )
         else:
             signals = self.signals_schema.resolve(*on).db_signals()
-        return super()._subtract(other, signals)
+        return super()._subtract(other, signals)  # type: ignore[arg-type]
 
     @classmethod
     def from_values(
@@ -1261,8 +1296,21 @@ class DataChain(DatasetQuery):
         dc = dc.parse_tabular(format="json")
         ```
         """
+        from pyarrow.dataset import CsvFileFormat, JsonFileFormat
+
         from datachain.lib.arrow import ArrowGenerator, infer_schema, schema_to_output
 
+        if nrows:
+            format = kwargs.get("format")
+            if format not in ["csv", "json"] and not isinstance(
+                format, (CsvFileFormat, JsonFileFormat)
+            ):
+                raise DatasetPrepareError(
+                    self.name,
+                    "error in `parse_tabular` - "
+                    "`nrows` only supported for csv and json formats.",
+                )
+
         schema = None
         col_names = output if isinstance(output, Sequence) else None
         if col_names or not output:
@@ -1360,6 +1408,8 @@ class DataChain(DatasetQuery):
            else:
                msg = f"error parsing csv - incompatible output type {type(output)}"
                raise DatasetPrepareError(chain.name, msg)
+        elif nrows:
+            nrows += 1
 
         parse_options = ParseOptions(delimiter=delimiter)
         read_options = ReadOptions(column_names=column_names)
@@ -1382,7 +1432,6 @@ class DataChain(DatasetQuery):
         object_name: str = "",
         model_name: str = "",
         source: bool = True,
-        nrows=None,
         **kwargs,
     ) -> "DataChain":
         """Generate chain from parquet files.
@@ -1395,7 +1444,6 @@ class DataChain(DatasetQuery):
             object_name : Created object column name.
             model_name : Generated model name.
             source : Whether to include info about the source file.
-            nrows : Optional row limit.
 
         Example:
             Reading a single file:
@@ -1414,7 +1462,6 @@ class DataChain(DatasetQuery):
             object_name=object_name,
             model_name=model_name,
             source=source,
-            nrows=None,
             format="parquet",
             partitioning=partitioning,
         )
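Together, `column()` and the reworked `mutate()` give typed expressions and schema-aware renames: a bare Column value renames an existing signal, while any other expression adds a new signal with its type inferred. A short sketch of both paths, with chain contents invented for illustration (note the related guard: `parse_tabular(nrows=...)` now raises `DatasetPrepareError` for formats other than CSV and JSON):

from datachain.lib.dc import DataChain

chain = DataChain.from_values(size=[10, 20, 30])
chain = chain.mutate(double=chain.column("size") * 2)  # new signal, inferred type
chain = chain.mutate(length=chain.column("size"))      # rename "size" -> "length"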
datachain/lib/file.py
CHANGED
@@ -317,9 +317,9 @@ class TextFile(File):
     """`DataModel` for reading text files."""
 
     @contextmanager
-    def open(self):
-        """Open the file and return a file object
-        with super().open(mode=
+    def open(self, mode: Literal["rb", "r"] = "r"):
+        """Open the file and return a file object (default to text mode)."""
+        with super().open(mode=mode) as stream:
             yield stream
 
     def read_text(self):
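With the new `mode` parameter, `TextFile.open()` keeps text mode as the default while allowing binary reads. A usage sketch, assuming a `TextFile` instance obtained from a chain:

from datachain.lib.file import TextFile

def preview(f: TextFile) -> None:
    with f.open() as stream:           # text mode, the old behavior
        print(stream.read())
    with f.open(mode="rb") as stream:  # binary mode is now possible
        print(stream.read(16))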
datachain/lib/signal_schema.py
CHANGED
@@ -25,7 +25,7 @@ from datachain.lib.data_model import DataModel, DataType
 from datachain.lib.file import File
 from datachain.lib.model_store import ModelStore
 from datachain.lib.utils import DataChainParamsError
-from datachain.query.schema import DEFAULT_DELIMITER
+from datachain.query.schema import DEFAULT_DELIMITER, Column
 
 if TYPE_CHECKING:
     from datachain.catalog import Catalog
@@ -222,13 +222,30 @@ class SignalSchema:
             res.append(obj)
         return res
 
-    def db_signals(
-
+    def db_signals(
+        self, name: Optional[str] = None, as_columns=False
+    ) -> Union[list[str], list[Column]]:
+        """
+        Returns DB columns as strings or Column objects with proper types
+        Optionally, it can filter results by specific object, returning only his signals
+        """
+        signals = [
             DEFAULT_DELIMITER.join(path)
-
+            if not as_columns
+            else Column(DEFAULT_DELIMITER.join(path), python_to_sql(_type))
+            for path, _type, has_subtree, _ in self.get_flat_tree()
             if not has_subtree
         ]
 
+        if name:
+            signals = [
+                s
+                for s in signals
+                if str(s) == name or str(s).startswith(f"{name}{DEFAULT_DELIMITER}")
+            ]
+
+        return signals  # type: ignore[return-value]
+
     def resolve(self, *names: str) -> "SignalSchema":
         schema = {}
         for field in names:
@@ -282,7 +299,18 @@ class SignalSchema:
         return SignalSchema(schema)
 
     def mutate(self, args_map: dict) -> "SignalSchema":
-
+        new_values = self.values.copy()
+
+        for name, value in args_map.items():
+            if isinstance(value, Column) and value.name in self.values:
+                # renaming existing signal
+                del new_values[value.name]
+                new_values[name] = self.values[value.name]
+            else:
+                # adding new signal
+                new_values.update(sql_to_python({name: value}))
+
+        return SignalSchema(new_values)
 
     def clone_without_sys_signals(self) -> "SignalSchema":
         schema = copy.deepcopy(self.values)
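A sketch of the extended `db_signals()`, with hypothetical schema contents: passing `name` narrows the result to one object's signals, and `as_columns=True` returns typed Column objects, which is what `DataChain.mutate()` relies on for renames:

from datachain.lib.file import File
from datachain.lib.signal_schema import SignalSchema

schema = SignalSchema({"file": File, "score": float})
print(schema.db_signals())                 # flat names, e.g. 'file__path', 'score'
print(schema.db_signals(name="file"))      # only signals under the "file" object
print(schema.db_signals(name="score", as_columns=True))  # typed Column objects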
datachain/listing.py
CHANGED
@@ -44,6 +44,16 @@ class Listing:
             self.dataset,
         )
 
+    def __enter__(self) -> "Listing":
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        self.close()
+
+    def close(self) -> None:
+        self.metastore.close()
+        self.warehouse.close()
+
     @property
     def id(self):
         return self.storage.id
@@ -56,16 +66,18 @@ class Listing:
         sync(get_loop(), self._fetch, start_prefix, method)
 
     async def _fetch(self, start_prefix: str, method: str) -> None:
-
-
-
-
-
-
-
-
-
-
+        with self.clone() as fetch_listing:
+            if start_prefix:
+                start_prefix = start_prefix.rstrip("/")
+            try:
+                async for entries in fetch_listing.client.scandir(
+                    start_prefix, method=method
+                ):
+                    fetch_listing.insert_entries(entries)
+                    if len(entries) > 1:
+                        fetch_listing.metastore.update_last_inserted_at()
+            finally:
+                fetch_listing.insert_entries_done()
 
     def insert_entry(self, entry: Entry) -> None:
         self.warehouse.insert_rows(
datachain/query/dataset.py
CHANGED
@@ -1051,8 +1051,11 @@ class DatasetQuery:
         if anon:
             client_config["anon"] = True
 
+        self.session = Session.get(
+            session, catalog=catalog, client_config=client_config
+        )
+        self.catalog = catalog or self.session.catalog
         self.steps: list[Step] = []
-        self.catalog = catalog or get_catalog(client_config=client_config)
         self._chunk_index: Optional[int] = None
         self._chunk_total: Optional[int] = None
         self.temp_table_names: list[str] = []
@@ -1063,7 +1066,6 @@ class DatasetQuery:
         self.version: Optional[int] = None
         self.feature_schema: Optional[dict] = None
         self.column_types: Optional[dict[str, Any]] = None
-        self.session = Session.get(session, catalog=catalog)
 
         if path:
             kwargs = {"update": True} if update else {}
@@ -1200,12 +1202,10 @@ class DatasetQuery:
         # This is needed to always use a new connection with all metastore and warehouse
         # implementations, as errors may close or render unusable the existing
        # connections.
-
-
-
-
-        warehouse.cleanup_tables(self.temp_table_names)
-        warehouse.close()
+        with self.catalog.metastore.clone(use_new_connection=True) as metastore:
+            metastore.cleanup_tables(self.temp_table_names)
+        with self.catalog.warehouse.clone(use_new_connection=True) as warehouse:
+            warehouse.cleanup_tables(self.temp_table_names)
         self.temp_table_names = []
 
     def db_results(self, row_factory=None, **kwargs):
@@ -1248,19 +1248,12 @@ class DatasetQuery:
         def row_iter() -> Generator[RowDict, None, None]:
             # warehouse isn't threadsafe, we need to clone() it
             # in the thread that uses the results
-            warehouse
-            try:
-                warehouse = self.catalog.warehouse.clone()
+            with self.catalog.warehouse.clone() as warehouse:
                 gen = warehouse.dataset_select_paginated(
                     query, limit=query._limit, order_by=query._order_by_clauses
                 )
                 with contextlib.closing(gen) as rows:
                     yield from rows
-            finally:
-                # clone doesn't necessarily create a new connection
-                # we can't do `warehouse.close()` for now. It is a bad design
-                # in clone / close interface that needs to be fixed.
-                pass
 
         async def get_params(row: RowDict) -> tuple:
             return tuple(
@@ -1383,10 +1376,14 @@ class DatasetQuery:
     @detach
     def limit(self, n: int) -> "Self":
         query = self.clone(new_table=False)
-
-
-
-
+        if (
+            query.steps
+            and (last_step := query.steps[-1])
+            and isinstance(last_step, SQLLimit)
+        ):
+            query.steps[-1] = SQLLimit(min(n, last_step.n))
+        else:
+            query.steps.append(SQLLimit(n))
        return query
 
     @detach
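The new `limit()` logic folds consecutive limits into one SQLLimit step taking the smaller bound, so a later, larger limit can never widen an earlier one. The rule in isolation:

from typing import Optional

def merged_limit(existing: Optional[int], n: int) -> int:
    # mirrors: query.steps[-1] = SQLLimit(min(n, last_step.n))
    return n if existing is None else min(n, existing)

assert merged_limit(None, 10) == 10  # first limit is appended as-is
assert merged_limit(10, 5) == 5      # tighter limit wins
assert merged_limit(5, 10) == 5      # looser limit cannot widen the result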
datachain/query/session.py
CHANGED
@@ -41,7 +41,12 @@ class Session:
     SESSION_UUID_LEN = 6
     TEMP_TABLE_UUID_LEN = 6
 
-    def __init__(
+    def __init__(
+        self,
+        name="",
+        catalog: Optional["Catalog"] = None,
+        client_config: Optional[dict] = None,
+    ):
         if re.match(r"^[0-9a-zA-Z]+$", name) is None:
             raise ValueError(
                 f"Session name can contain only letters or numbers - '{name}' given."
@@ -52,13 +57,18 @@ class Session:
 
         session_uuid = uuid4().hex[: self.SESSION_UUID_LEN]
         self.name = f"{name}_{session_uuid}"
-        self.
+        self.is_new_catalog = not catalog
+        self.catalog = catalog or get_catalog(client_config=client_config)
 
     def __enter__(self):
         return self
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         self._cleanup_temp_datasets()
+        if self.is_new_catalog:
+            self.catalog.metastore.close_on_exit()
+            self.catalog.warehouse.close_on_exit()
+            self.catalog.id_generator.close_on_exit()
 
     def generate_temp_dataset_name(self) -> str:
         tmp_table_uid = uuid4().hex[: self.TEMP_TABLE_UUID_LEN]
@@ -75,7 +85,10 @@ class Session:
 
     @classmethod
     def get(
-        cls,
+        cls,
+        session: Optional["Session"] = None,
+        catalog: Optional["Catalog"] = None,
+        client_config: Optional[dict] = None,
    ) -> "Session":
        """Creates a Session() object from a catalog.
 
@@ -88,7 +101,9 @@ class Session:
            return session
 
        if cls.GLOBAL_SESSION is None:
-            cls.GLOBAL_SESSION_CTX = Session(
+            cls.GLOBAL_SESSION_CTX = Session(
+                cls.GLOBAL_SESSION_NAME, catalog, client_config=client_config
+            )
            cls.GLOBAL_SESSION = cls.GLOBAL_SESSION_CTX.__enter__()
            atexit.register(cls._global_cleanup)
        return cls.GLOBAL_SESSION
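Session now owns catalog construction (tracking `is_new_catalog` so it only closes connections it created), and `Session.get()` forwards `catalog` and `client_config` down to the implicit global session. A sketch using the new signature; the "anon" key mirrors the client_config used in query/dataset.py in this diff:

from datachain.query.session import Session

session = Session.get(client_config={"anon": True})
# Queries created without an explicit session reuse this global one; on exit it
# calls close_on_exit() on the metastore, warehouse and ID generator it created.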
datachain/sql/functions/__init__.py
CHANGED

@@ -1,16 +1,17 @@
 from sqlalchemy.sql.expression import func
 
-from . import path, string
+from . import array, path, string
+from .array import avg
 from .conditional import greatest, least
 from .random import rand
 
 count = func.count
 sum = func.sum
-avg = func.avg
 min = func.min
 max = func.max
 
 __all__ = [
+    "array",
     "avg",
     "count",
     "func",
datachain/sql/functions/array.py
CHANGED
@@ -44,7 +44,15 @@ class sip_hash_64(GenericFunction):  # noqa: N801
     inherit_cache = True
 
 
+class avg(GenericFunction):  # noqa: N801
+    type = Float()
+    package = "array"
+    name = "avg"
+    inherit_cache = True
+
+
 compiler_not_implemented(cosine_distance)
 compiler_not_implemented(euclidean_distance)
 compiler_not_implemented(length)
 compiler_not_implemented(sip_hash_64)
+compiler_not_implemented(avg)
datachain/sql/sqlite/base.py
CHANGED
@@ -78,6 +78,7 @@ def setup():
     compiles(conditional.least, "sqlite")(compile_least)
     compiles(Values, "sqlite")(compile_values)
     compiles(random.rand, "sqlite")(compile_rand)
+    compiles(array.avg, "sqlite")(compile_avg)
 
     if load_usearch_extension(sqlite3.connect(":memory:")):
         compiles(array.cosine_distance, "sqlite")(compile_cosine_distance_ext)
@@ -349,6 +350,10 @@ def compile_rand(element, compiler, **kwargs):
     return compiler.process(func.random(), **kwargs)
 
 
+def compile_avg(element, compiler, **kwargs):
+    return compiler.process(func.avg(*element.clauses.clauses), **kwargs)
+
+
 def load_usearch_extension(conn) -> bool:
     try:
         # usearch is part of the vector optional dependencies
{datachain-0.2.16.dist-info → datachain-0.2.18.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.2.
+Version: 0.2.18
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -100,28 +100,87 @@ Requires-Dist: usearch ; extra == 'vector'
 AI 🔗 DataChain
 ----------------
 
-DataChain is
-data
+DataChain is a data-frame library designed for AI-specific scenarios. It helps ML and
+AI engineers build a metadata layer on top of unstructured files and analyze data using
+this layer.
 
-
+📂 **Raw Files Processing**
+Process raw files (images, video, text, PDFs) directly from storage (S3, GCP, Azure,
+Local), version and update datasets.
 
-
+🌟 **Metadata layer.**
+Build a metadata layer on top of files using structured sources like CSV, Parquet,
+and JSON files.
 
-
+⭐ **Metadata enrichment.**
+Enhance the metadata layer with outputs from local ML model inferences and LLM calls.
 
+🛠️ **Data Transformation.**
+Transform metadata using traditional methods like filtering, grouping, joining, and
+others.
 
-
-
-
-The typical use cases include Computer Vision data curation, LLM analytics,
-and validation of multimodal AI applications.
+🐍 **User-friendly interface.**
+Operate efficiently with familiar Python objects and object fields, eliminating the
+need for SQL.
 
 
 .. code:: console
 
     $ pip install datachain
 
-
+
+Data Structures
+===============
+
+DataChain introduces expressive data structures tailored for AI-specific workload:
+
+- **Dataset:** Preserves the file-references and meta-information. Takes care of Python
+  object serialization, dataset versioning and difference. Operations on dataset:
+
+  - **Transformations:** traditional data-frame or SQL operations such as filtering,
+    grouping, joining.
+  - **Enrichments:** mapping, aggregating and generating using customer’s Python
+    code. This is needed to work with ML inference and LLM calls.
+
+- **Chain** is a sequence of operations on datasets. Chain executes operations in lazy
+  mode - only when needed.
+
+DataChain name comes from these major data structures: dataset and chaining.
+
+
+What’s new in DataChain?
+========================
+
+The project combines multiple ideas from different areas in order to simplify AI
+use-cases and at the same time to fit it into traditional data infrastructure.
+
+- **Python-Native for AI.** Utilizes Python instead of SQL for data manipulation as the
+  native language for AI. It’s powered by `Pydantic`_ data models.
+- **Separation of CPU-GPU workloads.** Distinguishes CPU-heavy transformations (filter,
+  group_by, join) from GPU heavy enrichments (ML-inference or LLM calls). That’s mostly
+  needed for distributed computations.
+- **Resuming data processing** (in development). Introduces idempotent operations,
+  allowing data processing to resume from the last successful process file/record/batch
+  if it fails due to issues like failed LLM calls, ML inference or file download.
+
+Additional relatively new ideas:
+
+- **Functional style data processing.** Using a functional/chaining approach to data
+  processing rather than declarative SQL, inspired by R-dplyr and some Python libraries.
+- **Data Versioning.** Treats raw files in cloud storage as the source of truth for data
+  and implements data versioning, extending ideas from DVC (developed by the same team).
+
+
+What DataChain is NOT?
+======================
+
+- **Not a database** (Postgres, MySQL). Instead, it uses databases under the hood:
+  `SQLite`_ in open-source and ClickHouse and other data warehouses for the commercial
+  version.
+- **Not a data processing tool / data warehouse** (Spark, Snowflake, Big Query) since
+  it delegates heavy data transformations to underlying data warehouses and focuses on
+  AI specific data enrichments and orchestrating all the pieces together.
 
 Quick Start
 -----------
{datachain-0.2.16.dist-info → datachain-0.2.18.dist-info}/RECORD
CHANGED

@@ -8,7 +8,7 @@ datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
 datachain/dataset.py,sha256=MZezyuJWNj_3PEtzr0epPMNyWAOTrhTSPI5FmemV6L4,14470
 datachain/error.py,sha256=GY9KYTmb7GHXn2gGHV9X-PBhgwLj3i7VpK7tGHtAoGM,1279
 datachain/job.py,sha256=bk25bIqClhgRPzlXAhxpTtDeewibQe5l3S8Cf7db0gM,1229
-datachain/listing.py,sha256=
+datachain/listing.py,sha256=JEhi5WOSV2LUqRQgt0-fdmJ8Zb5fNpNFzBQcuTtx63o,8555
 datachain/node.py,sha256=LwzSOSM9SbPLI5RvYDsiEkk7d5rbMX8huzM_m7uWKx4,5917
 datachain/nodes_fetcher.py,sha256=kca19yvu11JxoVY1t4_ydp1FmchiV88GnNicNBQ9NIA,831
 datachain/nodes_thread_pool.py,sha256=ZyzBvUImIPmi4WlKC2SW2msA0UhtembbTdcs2nx29A0,3191
@@ -17,7 +17,7 @@ datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
 datachain/utils.py,sha256=kgH5NPj47eC_KrFTd6ZS206lKVhnJVFt5XsqkK6ppTc,12483
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=
+datachain/catalog/catalog.py,sha256=z0tclel0kNdSzJojNRRnRVhgt-K7ElO3CeuurlwQMGI,80612
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=GJ8zhEYkC7TuaPzCsjJQ4LtTdECu-wwYzC12MikPOMQ,7307
 datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
@@ -29,27 +29,27 @@ datachain/client/gcs.py,sha256=ucX8e6JrqlFY-f80zkv084vxnKdtxpO32QJ-RG8Nv1s,4454
 datachain/client/local.py,sha256=NQVkLTJQ-a7Udavqbh_4uT-IejfZQYn10j22owz9sis,5150
 datachain/client/s3.py,sha256=TmW4f7VUM5CMZjSmgyFQFKeMUGrXt2SLoLEbLOUleiU,6296
 datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZv32Y8,398
-datachain/data_storage/db_engine.py,sha256=
-datachain/data_storage/id_generator.py,sha256=
+datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
+datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
 datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
-datachain/data_storage/metastore.py,sha256=
+datachain/data_storage/metastore.py,sha256=nxcY6nwyEmQWMAo33sNGO-FgUFQs2amBGGnZz2ftEz0,55362
 datachain/data_storage/schema.py,sha256=FQvt5MUMSnI5ZAE7Nthae4aaJpt8JC4nH8KiWDuhJkk,8135
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
-datachain/data_storage/sqlite.py,sha256=
-datachain/data_storage/warehouse.py,sha256=
+datachain/data_storage/sqlite.py,sha256=0r6L_a2hdGRoR_gl06v1qWhEFOS_Q31aldHyk07Yx-M,26857
+datachain/data_storage/warehouse.py,sha256=G79jsQwA6anYPWoiBXngwPyx-uP7yGIWqhZGc4TL5mY,33591
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/arrow.py,sha256=
+datachain/lib/arrow.py,sha256=R8wDUDEa-5hYjI3HW9cqvOYYJpeeah5lbhFIL3gkmcE,4915
 datachain/lib/clip.py,sha256=16u4b_y2Y15nUS2UN_8ximMo6r_-_4IQpmct2ol-e-g,5730
 datachain/lib/data_model.py,sha256=qfTtQNncS5pt9SvXdMEa5kClniaT6XBGBfO7onEz2TI,1632
 datachain/lib/dataset_info.py,sha256=lONGr71ozo1DS4CQEhnpKORaU4qFb6Ketv8Xm8CVm2U,2188
-datachain/lib/dc.py,sha256=
-datachain/lib/file.py,sha256=
+datachain/lib/dc.py,sha256=F2DrvBLxsLDHY7wDVzMFj_-IRscDxb_STTRMqd0gmyw,57971
+datachain/lib/file.py,sha256=MCklths3w9SgQTR0LACnDohfGdEc3t30XD0qNq1oTlI,12000
 datachain/lib/image.py,sha256=TgYhRhzd4nkytfFMeykQkPyzqb5Le_-tU81unVMPn4Q,2328
 datachain/lib/meta_formats.py,sha256=jlSYWRUeDMjun_YCsQ2JxyaDJpEpokzHDPmKUAoCXnU,7034
 datachain/lib/model_store.py,sha256=c4USXsBBjrGH8VOh4seIgOiav-qHOwdoixtxfLgU63c,2409
 datachain/lib/pytorch.py,sha256=9PsypKseyKfIimTmTQOgb-pbNXgeeAHLdlWx0qRPULY,5660
 datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
-datachain/lib/signal_schema.py,sha256=
+datachain/lib/signal_schema.py,sha256=VL9TR0CJ3eRzjIDr-8e-e7cZKuMBbPUZtY2lGAsucc0,15734
 datachain/lib/text.py,sha256=dVe2Ilc_gW2EV0kun0UwegiCkapWcd20cef7CgINWHU,1083
 datachain/lib/udf.py,sha256=IjuDt2B8E3xEHhcJnaK_ZhmivdrOYPXz5uf7ylpktws,11815
 datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
@@ -60,18 +60,18 @@ datachain/lib/webdataset_laion.py,sha256=PQP6tQmUP7Xu9fPuAGK1JDBYA6T5UufYMUTGaxg
 datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/convert/flatten.py,sha256=YMoC00BqEy3zSpvCp6Q0DfxihuPmgjUJj1g2cesWGPs,1790
 datachain/lib/convert/python_to_sql.py,sha256=4gplGlr_Kg-Z40OpJUzJiarDWj7pwbUOk-dPOYYCJ9Q,2629
-datachain/lib/convert/sql_to_python.py,sha256=
+datachain/lib/convert/sql_to_python.py,sha256=lGnKzSF_tz9Y_5SSKkrIU95QEjpcDzvOxIRkEKTQag0,443
 datachain/lib/convert/unflatten.py,sha256=Ogvh_5wg2f38_At_1lN0D_e2uZOOpYEvwvB2xdq56Tw,2012
 datachain/lib/convert/values_to_tuples.py,sha256=aVoHWMOUGLAiS6_BBwKJqVIne91VffOW6-dWyNE7oHg,3715
 datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
 datachain/query/batch.py,sha256=j-_ZcuQra2Ro3Wj4crtqQCg-7xuv-p84hr4QHdvT7as,3479
 datachain/query/builtins.py,sha256=ZKNs49t8Oa_OaboCBIEqtXZt7c1Qe9OR_C_HpoDriIU,2781
-datachain/query/dataset.py,sha256
+datachain/query/dataset.py,sha256=-AGkz3-K_b-2YBJCMqQz-Qq7FKzMcScPty_77S0AQtE,59938
 datachain/query/dispatch.py,sha256=oGX9ZuoKWPB_EyqAZD_eULcO3OejY44_keSmFS6SHT0,13315
 datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
 datachain/query/schema.py,sha256=hAvux_GxUmuG_PwtnKkkizld9f0Gvt2JBzbu3m74fvE,7840
-datachain/query/session.py,sha256=
+datachain/query/session.py,sha256=qTzkXgwMJdJhal3rVt3hdv3x1EXT1IHuXcwkC-Ex0As,4111
 datachain/query/udf.py,sha256=c0IOTkcedpOQEmX-Idlrrl1__1IecNXL0N9oUO9Dtkg,7755
 datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/remote/studio.py,sha256=f5s6qSZ9uB4URGUoU_8_W1KZRRQQVSm6cgEBkBUEfuE,7226
@@ -81,20 +81,20 @@ datachain/sql/types.py,sha256=SShudhdIpdfTKDxWDDqOajYRkTCkIgQbilA94g4i-4E,10389
 datachain/sql/utils.py,sha256=rzlJw08etivdrcuQPqNVvVWhuVSyUPUQEEc6DOhu258,818
 datachain/sql/default/__init__.py,sha256=XQ2cEZpzWiABqjV-6yYHUBGI9vN_UHxbxZENESmVAWw,45
 datachain/sql/default/base.py,sha256=h44005q3qtMc9cjWmRufWwcBr5CfK_dnvG4IrcSQs_8,536
-datachain/sql/functions/__init__.py,sha256=
-datachain/sql/functions/array.py,sha256=
+datachain/sql/functions/__init__.py,sha256=Ioyy7nSetrTLVnHGcGcmZU99HxUFcx-5PFbrh2dPNH0,396
+datachain/sql/functions/array.py,sha256=EB7nJSncUc1PuxlHyzU2gVhF8DuXaxpGlxb5e8X2KFY,1297
 datachain/sql/functions/conditional.py,sha256=q7YUKfunXeEldXaxgT-p5pUTcOEVU_tcQ2BJlquTRPs,207
 datachain/sql/functions/path.py,sha256=zixpERotTFP6LZ7I4TiGtyRA8kXOoZmH1yzH9oRW0mg,1294
 datachain/sql/functions/random.py,sha256=vBwEEj98VH4LjWixUCygQ5Bz1mv1nohsCG0-ZTELlVg,271
 datachain/sql/functions/string.py,sha256=hIrF1fTvlPamDtm8UMnWDcnGfbbjCsHxZXS30U2Rzxo,651
 datachain/sql/sqlite/__init__.py,sha256=TAdJX0Bg28XdqPO-QwUVKy8rg78cgMileHvMNot7d04,166
-datachain/sql/sqlite/base.py,sha256=
+datachain/sql/sqlite/base.py,sha256=LBYmXqXsVF30fbcnR55evCZHbPDCzMdGk_ogPLps63s,12236
 datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.2.
-datachain-0.2.
-datachain-0.2.
-datachain-0.2.
-datachain-0.2.
-datachain-0.2.
+datachain-0.2.18.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.2.18.dist-info/METADATA,sha256=_wZgyu8nS5Ut_kQcIc_n9979rQcvv8fPuSIHbyCGhX0,17269
+datachain-0.2.18.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+datachain-0.2.18.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.2.18.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.2.18.dist-info/RECORD,,
{datachain-0.2.16.dist-info → datachain-0.2.18.dist-info}/LICENSE: file without changes
{datachain-0.2.16.dist-info → datachain-0.2.18.dist-info}/WHEEL: file without changes
{datachain-0.2.16.dist-info → datachain-0.2.18.dist-info}/entry_points.txt: file without changes
{datachain-0.2.16.dist-info → datachain-0.2.18.dist-info}/top_level.txt: file without changes