datachain 0.34.6__py3-none-any.whl → 0.35.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/asyn.py +11 -12
- datachain/cache.py +5 -5
- datachain/catalog/catalog.py +75 -83
- datachain/catalog/loader.py +3 -3
- datachain/checkpoint.py +1 -2
- datachain/cli/__init__.py +2 -4
- datachain/cli/commands/datasets.py +13 -13
- datachain/cli/commands/ls.py +4 -4
- datachain/cli/commands/query.py +3 -3
- datachain/cli/commands/show.py +2 -2
- datachain/cli/parser/job.py +1 -1
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +1 -2
- datachain/client/azure.py +2 -2
- datachain/client/fsspec.py +11 -21
- datachain/client/gcs.py +3 -3
- datachain/client/http.py +4 -4
- datachain/client/local.py +4 -4
- datachain/client/s3.py +3 -3
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +5 -5
- datachain/data_storage/metastore.py +107 -107
- datachain/data_storage/schema.py +18 -24
- datachain/data_storage/sqlite.py +21 -28
- datachain/data_storage/warehouse.py +13 -13
- datachain/dataset.py +64 -70
- datachain/delta.py +21 -18
- datachain/diff/__init__.py +13 -13
- datachain/func/aggregate.py +9 -11
- datachain/func/array.py +12 -12
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +9 -13
- datachain/func/func.py +45 -42
- datachain/func/numeric.py +5 -7
- datachain/func/string.py +2 -2
- datachain/hash_utils.py +54 -81
- datachain/job.py +8 -8
- datachain/lib/arrow.py +17 -14
- datachain/lib/audio.py +6 -6
- datachain/lib/clip.py +5 -4
- datachain/lib/convert/python_to_sql.py +4 -22
- datachain/lib/convert/values_to_tuples.py +4 -9
- datachain/lib/data_model.py +20 -19
- datachain/lib/dataset_info.py +6 -6
- datachain/lib/dc/csv.py +10 -10
- datachain/lib/dc/database.py +28 -29
- datachain/lib/dc/datachain.py +98 -97
- datachain/lib/dc/datasets.py +22 -22
- datachain/lib/dc/hf.py +4 -4
- datachain/lib/dc/json.py +9 -10
- datachain/lib/dc/listings.py +5 -8
- datachain/lib/dc/pandas.py +3 -6
- datachain/lib/dc/parquet.py +5 -5
- datachain/lib/dc/records.py +5 -5
- datachain/lib/dc/storage.py +12 -12
- datachain/lib/dc/storage_pattern.py +2 -2
- datachain/lib/dc/utils.py +11 -14
- datachain/lib/dc/values.py +3 -6
- datachain/lib/file.py +32 -28
- datachain/lib/hf.py +7 -5
- datachain/lib/image.py +13 -13
- datachain/lib/listing.py +5 -5
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +1 -2
- datachain/lib/model_store.py +3 -3
- datachain/lib/namespaces.py +4 -6
- datachain/lib/projects.py +5 -9
- datachain/lib/pytorch.py +10 -10
- datachain/lib/settings.py +23 -23
- datachain/lib/signal_schema.py +52 -44
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +25 -17
- datachain/lib/udf_signature.py +11 -11
- datachain/lib/video.py +3 -4
- datachain/lib/webdataset.py +30 -35
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +4 -4
- datachain/model/bbox.py +3 -1
- datachain/namespace.py +4 -4
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +1 -7
- datachain/project.py +4 -4
- datachain/query/batch.py +7 -8
- datachain/query/dataset.py +80 -87
- datachain/query/dispatch.py +7 -7
- datachain/query/metrics.py +3 -4
- datachain/query/params.py +2 -3
- datachain/query/schema.py +7 -6
- datachain/query/session.py +7 -7
- datachain/query/udf.py +8 -7
- datachain/query/utils.py +3 -5
- datachain/remote/studio.py +33 -39
- datachain/script_meta.py +12 -12
- datachain/sql/sqlite/base.py +6 -9
- datachain/studio.py +30 -30
- datachain/toolkit/split.py +1 -2
- datachain/utils.py +21 -21
- {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/METADATA +2 -3
- datachain-0.35.0.dist-info/RECORD +173 -0
- datachain-0.34.6.dist-info/RECORD +0 -173
- {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/WHEEL +0 -0
- {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/top_level.txt +0 -0
datachain/lib/dc/csv.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import os
|
|
2
|
-
from collections.abc import Sequence
|
|
3
|
-
from typing import TYPE_CHECKING
|
|
2
|
+
from collections.abc import Callable, Sequence
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
4
|
|
|
5
5
|
from datachain.lib.dc.utils import DatasetPrepareError, OutputType
|
|
6
6
|
from datachain.lib.model_store import ModelStore
|
|
@@ -13,18 +13,18 @@ if TYPE_CHECKING:
|
|
|
13
13
|
|
|
14
14
|
|
|
15
15
|
def read_csv(
|
|
16
|
-
path:
|
|
17
|
-
delimiter:
|
|
16
|
+
path: str | os.PathLike[str] | list[str] | list[os.PathLike[str]],
|
|
17
|
+
delimiter: str | None = None,
|
|
18
18
|
header: bool = True,
|
|
19
19
|
output: OutputType = None,
|
|
20
20
|
column: str = "",
|
|
21
21
|
model_name: str = "",
|
|
22
22
|
source: bool = True,
|
|
23
|
-
nrows:
|
|
24
|
-
session:
|
|
25
|
-
settings:
|
|
26
|
-
column_types:
|
|
27
|
-
parse_options:
|
|
23
|
+
nrows: int | None = None,
|
|
24
|
+
session: Session | None = None,
|
|
25
|
+
settings: dict | None = None,
|
|
26
|
+
column_types: dict[str, "str | ArrowDataType"] | None = None,
|
|
27
|
+
parse_options: dict[str, str | bool | Callable] | None = None,
|
|
28
28
|
**kwargs,
|
|
29
29
|
) -> "DataChain":
|
|
30
30
|
"""Generate chain from csv files.
|
|
@@ -63,7 +63,7 @@ def read_csv(
|
|
|
63
63
|
chain = dc.read_csv("s3://mybucket/dir")
|
|
64
64
|
```
|
|
65
65
|
"""
|
|
66
|
-
from pandas.
|
|
66
|
+
from pandas._libs.parsers import STR_NA_VALUES
|
|
67
67
|
from pyarrow.csv import ConvertOptions, ParseOptions, ReadOptions
|
|
68
68
|
from pyarrow.dataset import CsvFileFormat
|
|
69
69
|
from pyarrow.lib import type_for_alias
|
datachain/lib/dc/database.py
CHANGED
|
@@ -2,7 +2,8 @@ import contextlib
|
|
|
2
2
|
import itertools
|
|
3
3
|
import os
|
|
4
4
|
import sqlite3
|
|
5
|
-
from
|
|
5
|
+
from collections.abc import Iterator, Mapping, Sequence
|
|
6
|
+
from typing import TYPE_CHECKING, Any
|
|
6
7
|
|
|
7
8
|
import sqlalchemy
|
|
8
9
|
|
|
@@ -12,8 +13,6 @@ from datachain.utils import batched
|
|
|
12
13
|
DEFAULT_DATABASE_BATCH_SIZE = 10_000
|
|
13
14
|
|
|
14
15
|
if TYPE_CHECKING:
|
|
15
|
-
from collections.abc import Iterator, Mapping, Sequence
|
|
16
|
-
|
|
17
16
|
import sqlalchemy.orm # noqa: TC004
|
|
18
17
|
|
|
19
18
|
from datachain.lib.data_model import DataType
|
|
@@ -21,21 +20,21 @@ if TYPE_CHECKING:
|
|
|
21
20
|
|
|
22
21
|
from .datachain import DataChain
|
|
23
22
|
|
|
24
|
-
ConnectionType =
|
|
25
|
-
str
|
|
26
|
-
sqlalchemy.engine.URL
|
|
27
|
-
sqlalchemy.engine.interfaces.Connectable
|
|
28
|
-
sqlalchemy.engine.Engine
|
|
29
|
-
sqlalchemy.engine.Connection
|
|
30
|
-
sqlalchemy.orm.Session
|
|
31
|
-
sqlite3.Connection
|
|
32
|
-
|
|
23
|
+
ConnectionType = (
|
|
24
|
+
str
|
|
25
|
+
| sqlalchemy.engine.URL
|
|
26
|
+
| sqlalchemy.engine.interfaces.Connectable
|
|
27
|
+
| sqlalchemy.engine.Engine
|
|
28
|
+
| sqlalchemy.engine.Connection
|
|
29
|
+
| sqlalchemy.orm.Session
|
|
30
|
+
| sqlite3.Connection
|
|
31
|
+
)
|
|
33
32
|
|
|
34
33
|
|
|
35
34
|
@contextlib.contextmanager
|
|
36
35
|
def _connect(
|
|
37
36
|
connection: "ConnectionType",
|
|
38
|
-
) ->
|
|
37
|
+
) -> Iterator[sqlalchemy.engine.Connection]:
|
|
39
38
|
import sqlalchemy.orm
|
|
40
39
|
|
|
41
40
|
with contextlib.ExitStack() as stack:
|
|
@@ -74,9 +73,9 @@ def to_database(
|
|
|
74
73
|
connection: "ConnectionType",
|
|
75
74
|
*,
|
|
76
75
|
batch_size: int = DEFAULT_DATABASE_BATCH_SIZE,
|
|
77
|
-
on_conflict:
|
|
78
|
-
conflict_columns:
|
|
79
|
-
column_mapping:
|
|
76
|
+
on_conflict: str | None = None,
|
|
77
|
+
conflict_columns: list[str] | None = None,
|
|
78
|
+
column_mapping: dict[str, str | None] | None = None,
|
|
80
79
|
) -> int:
|
|
81
80
|
"""
|
|
82
81
|
Implementation function for exporting DataChain to database tables.
|
|
@@ -150,8 +149,8 @@ def to_database(
|
|
|
150
149
|
|
|
151
150
|
|
|
152
151
|
def _normalize_column_mapping(
|
|
153
|
-
column_mapping: dict[str,
|
|
154
|
-
) -> dict[str,
|
|
152
|
+
column_mapping: dict[str, str | None],
|
|
153
|
+
) -> dict[str, str | None]:
|
|
155
154
|
"""
|
|
156
155
|
Convert column mapping keys from DataChain format (dots) to database format
|
|
157
156
|
(double underscores).
|
|
@@ -163,7 +162,7 @@ def _normalize_column_mapping(
|
|
|
163
162
|
if not column_mapping:
|
|
164
163
|
return {}
|
|
165
164
|
|
|
166
|
-
normalized_mapping: dict[str,
|
|
165
|
+
normalized_mapping: dict[str, str | None] = {}
|
|
167
166
|
original_keys: dict[str, str] = {}
|
|
168
167
|
for key, value in column_mapping.items():
|
|
169
168
|
db_key = ColumnMeta.to_db_name(key)
|
|
@@ -181,7 +180,7 @@ def _normalize_column_mapping(
|
|
|
181
180
|
from collections import defaultdict
|
|
182
181
|
|
|
183
182
|
default_factory = column_mapping.default_factory
|
|
184
|
-
result: dict[str,
|
|
183
|
+
result: dict[str, str | None] = defaultdict(default_factory)
|
|
185
184
|
result.update(normalized_mapping)
|
|
186
185
|
return result
|
|
187
186
|
|
|
@@ -189,8 +188,8 @@ def _normalize_column_mapping(
|
|
|
189
188
|
|
|
190
189
|
|
|
191
190
|
def _normalize_conflict_columns(
|
|
192
|
-
conflict_columns:
|
|
193
|
-
) ->
|
|
191
|
+
conflict_columns: list[str] | None, column_mapping: dict[str, str | None]
|
|
192
|
+
) -> list[str] | None:
|
|
194
193
|
"""
|
|
195
194
|
Normalize conflict_columns by converting DataChain format to database format
|
|
196
195
|
and applying column mapping.
|
|
@@ -297,15 +296,15 @@ def _process_batch(
|
|
|
297
296
|
|
|
298
297
|
|
|
299
298
|
def read_database(
|
|
300
|
-
query:
|
|
299
|
+
query: "str | sqlalchemy.sql.expression.Executable",
|
|
301
300
|
connection: "ConnectionType",
|
|
302
|
-
params:
|
|
301
|
+
params: Sequence[Mapping[str, Any]] | Mapping[str, Any] | None = None,
|
|
303
302
|
*,
|
|
304
|
-
output:
|
|
305
|
-
session:
|
|
306
|
-
settings:
|
|
303
|
+
output: dict[str, "DataType"] | None = None,
|
|
304
|
+
session: "Session | None" = None,
|
|
305
|
+
settings: dict | None = None,
|
|
307
306
|
in_memory: bool = False,
|
|
308
|
-
infer_schema_length:
|
|
307
|
+
infer_schema_length: int | None = 100,
|
|
309
308
|
) -> "DataChain":
|
|
310
309
|
"""
|
|
311
310
|
Read the results of a SQL query into a DataChain, using a given database connection.
|
|
@@ -382,7 +381,7 @@ def read_database(
|
|
|
382
381
|
def _infer_schema(
|
|
383
382
|
result: "sqlalchemy.engine.Result",
|
|
384
383
|
to_infer: list[str],
|
|
385
|
-
infer_schema_length:
|
|
384
|
+
infer_schema_length: int | None = 100,
|
|
386
385
|
) -> tuple[list["sqlalchemy.Row"], dict[str, "DataType"]]:
|
|
387
386
|
from datachain.lib.convert.values_to_tuples import values_to_tuples
|
|
388
387
|
|