datachain 0.6.2__py3-none-any.whl → 0.6.4__py3-none-any.whl
This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that public registry.
This version of datachain has been marked as potentially problematic.
- datachain/catalog/catalog.py +3 -25
- datachain/cli.py +0 -8
- datachain/client/fsspec.py +10 -5
- datachain/client/local.py +7 -3
- datachain/data_storage/metastore.py +11 -478
- datachain/data_storage/sqlite.py +9 -41
- datachain/data_storage/warehouse.py +1 -2
- datachain/dataset.py +12 -10
- datachain/error.py +0 -4
- datachain/lib/arrow.py +2 -15
- datachain/lib/data_model.py +10 -2
- datachain/lib/utils.py +30 -0
- datachain/node.py +1 -1
- {datachain-0.6.2.dist-info → datachain-0.6.4.dist-info}/METADATA +2 -2
- {datachain-0.6.2.dist-info → datachain-0.6.4.dist-info}/RECORD +19 -20
- {datachain-0.6.2.dist-info → datachain-0.6.4.dist-info}/WHEEL +1 -1
- datachain/storage.py +0 -136
- {datachain-0.6.2.dist-info → datachain-0.6.4.dist-info}/LICENSE +0 -0
- {datachain-0.6.2.dist-info → datachain-0.6.4.dist-info}/entry_points.txt +0 -0
- {datachain-0.6.2.dist-info → datachain-0.6.4.dist-info}/top_level.txt +0 -0
datachain/data_storage/sqlite.py
CHANGED
@@ -29,12 +29,11 @@ from datachain.data_storage import AbstractDBMetastore, AbstractWarehouse
 from datachain.data_storage.db_engine import DatabaseEngine
 from datachain.data_storage.id_generator import AbstractDBIDGenerator
 from datachain.data_storage.schema import DefaultSchema
-from datachain.dataset import DatasetRecord
+from datachain.dataset import DatasetRecord, StorageURI
 from datachain.error import DataChainError
 from datachain.sql.sqlite import create_user_defined_sql_functions, sqlite_dialect
 from datachain.sql.sqlite.base import load_usearch_extension
 from datachain.sql.types import SQLType
-from datachain.storage import StorageURI
 from datachain.utils import DataChainDir, batched_it

 if TYPE_CHECKING:
@@ -392,14 +391,14 @@ class SQLiteMetastore(AbstractDBMetastore):
     def __init__(
         self,
         id_generator: "SQLiteIDGenerator",
-        uri: StorageURI =
-        partial_id: Optional[int] = None,
+        uri: Optional[StorageURI] = None,
         db: Optional["SQLiteDatabaseEngine"] = None,
         db_file: Optional[str] = None,
         in_memory: bool = False,
     ):
+        uri = uri or StorageURI("")
         self.schema: DefaultSchema = DefaultSchema()
-        super().__init__(id_generator, uri
+        super().__init__(id_generator, uri)

         # needed for dropping tables in correct order for tests because of
         # foreign keys
@@ -417,21 +416,16 @@ class SQLiteMetastore(AbstractDBMetastore):

     def clone(
         self,
-        uri: StorageURI =
-        partial_id: Optional[int] = None,
+        uri: Optional[StorageURI] = None,
         use_new_connection: bool = False,
     ) -> "SQLiteMetastore":
-
-
-
-
-        uri = self.uri
-        if self.partial_id:
-            partial_id = self.partial_id
+        uri = uri or StorageURI("")
+        if not uri and self.uri:
+            uri = self.uri
+
         return SQLiteMetastore(
             self.id_generator.clone(),
             uri=uri,
-            partial_id=partial_id,
             db=self.db.clone(),
         )
@@ -446,7 +440,6 @@ class SQLiteMetastore(AbstractDBMetastore):
             {
                 "id_generator_clone_params": self.id_generator.clone_params(),
                 "uri": self.uri,
-                "partial_id": self.partial_id,
                 "db_clone_params": self.db.clone_params(),
             },
         )
@@ -457,7 +450,6 @@ class SQLiteMetastore(AbstractDBMetastore):
         *,
         id_generator_clone_params: tuple[Callable, list, dict[str, Any]],
         uri: StorageURI,
-        partial_id: Optional[int],
         db_clone_params: tuple[Callable, list, dict[str, Any]],
     ) -> "SQLiteMetastore":
         (
@@ -469,14 +461,11 @@ class SQLiteMetastore(AbstractDBMetastore):
         return cls(
             id_generator=id_generator_class(*id_generator_args, **id_generator_kwargs),
             uri=uri,
-            partial_id=partial_id,
             db=db_class(*db_args, **db_kwargs),
         )

     def _init_tables(self) -> None:
         """Initialize tables."""
-        self.db.create_table(self._storages, if_not_exists=True)
-        self.default_table_names.append(self._storages.name)
         self.db.create_table(self._datasets, if_not_exists=True)
         self.default_table_names.append(self._datasets.name)
         self.db.create_table(self._datasets_versions, if_not_exists=True)
@@ -486,28 +475,11 @@ class SQLiteMetastore(AbstractDBMetastore):
         self.db.create_table(self._jobs, if_not_exists=True)
         self.default_table_names.append(self._jobs.name)

-    def init(self, uri: StorageURI) -> None:
-        if not uri:
-            raise ValueError("uri for init() cannot be empty")
-        partials_table = self._partials_table(uri)
-        self.db.create_table(partials_table, if_not_exists=True)
-
-    @classmethod
-    def _buckets_columns(cls) -> list["SchemaItem"]:
-        """Buckets (storages) table columns."""
-        return [*super()._buckets_columns(), UniqueConstraint("uri")]
-
     @classmethod
     def _datasets_columns(cls) -> list["SchemaItem"]:
         """Datasets table columns."""
         return [*super()._datasets_columns(), UniqueConstraint("name")]

-    def _storages_insert(self) -> "Insert":
-        return sqlite.insert(self._storages)
-
-    def _partials_insert(self) -> "Insert":
-        return sqlite.insert(self._partials)
-
     def _datasets_insert(self) -> "Insert":
         return sqlite.insert(self._datasets)

@@ -526,13 +498,9 @@ class SQLiteMetastore(AbstractDBMetastore):
             self._datasets_dependencies.c.id,
             self._datasets_dependencies.c.dataset_id,
             self._datasets_dependencies.c.dataset_version_id,
-            self._datasets_dependencies.c.bucket_id,
-            self._datasets_dependencies.c.bucket_version,
             self._datasets.c.name,
-            self._datasets.c.created_at,
             self._datasets_versions.c.version,
             self._datasets_versions.c.created_at,
-            self._storages.c.uri,
         ]

         #
datachain/data_storage/warehouse.py
CHANGED
@@ -19,11 +19,10 @@ from tqdm import tqdm
 from datachain.client import Client
 from datachain.data_storage.schema import convert_rows_custom_column_types
 from datachain.data_storage.serializer import Serializable
-from datachain.dataset import DatasetRecord
+from datachain.dataset import DatasetRecord, StorageURI
 from datachain.node import DirType, DirTypeGroup, Node, NodeWithPath, get_path
 from datachain.sql.functions import path as pathfunc
 from datachain.sql.types import Int, SQLType
-from datachain.storage import StorageURI
 from datachain.utils import sql_escape_like

 if TYPE_CHECKING:
datachain/dataset.py
CHANGED
@@ -3,21 +3,17 @@ import json
 from dataclasses import dataclass, fields
 from datetime import datetime
 from typing import (
-    TYPE_CHECKING,
     Any,
+    NewType,
     Optional,
     TypeVar,
     Union,
 )
 from urllib.parse import urlparse

-from datachain.client import Client
 from datachain.error import DatasetVersionNotFoundError
 from datachain.sql.types import NAME_TYPES_MAPPING, SQLType

-if TYPE_CHECKING:
-    from datachain.storage import StorageURI
-
 T = TypeVar("T", bound="DatasetRecord")
 V = TypeVar("V", bound="DatasetVersion")
 DD = TypeVar("DD", bound="DatasetDependency")
@@ -27,6 +23,13 @@ QUERY_DATASET_PREFIX = "ds_query_"
 LISTING_PREFIX = "lst__"


+# StorageURI represents a normalised URI to a valid storage location (full bucket or
+# absolute local path).
+# Valid examples: s3://foo, file:///var/data
+# Invalid examples: s3://foo/, s3://foo/bar, file://~
+StorageURI = NewType("StorageURI", str)
+
+
 def parse_dataset_uri(uri: str) -> tuple[str, Optional[int]]:
     """
     Parse dataser uri to extract name and version out of it (if version is defined)
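Note: StorageURI, previously defined in the now-deleted datachain/storage.py, moves here as a typing.NewType. A NewType gives static checkers a distinct type but adds no runtime wrapper, so the normalisation rules in the comment are a convention rather than something the type enforces. A minimal sketch (the head_bucket function is hypothetical):

from datachain.dataset import StorageURI

uri = StorageURI("s3://foo")  # at runtime this is just the plain str "s3://foo"
assert isinstance(uri, str)

def head_bucket(uri: StorageURI) -> None:
    # a type checker accepts head_bucket(StorageURI("s3://foo")) but flags a bare str argument
    ...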
@@ -94,14 +97,11 @@ class DatasetDependency:
         id: int,
         dataset_id: Optional[int],
         dataset_version_id: Optional[int],
-        bucket_id: Optional[int],
-        bucket_version: Optional[str],
         dataset_name: Optional[str],
-        dataset_created_at: Optional[datetime],
         dataset_version: Optional[int],
         dataset_version_created_at: Optional[datetime],
-        bucket_uri: Optional["StorageURI"],
     ) -> Optional["DatasetDependency"]:
+        from datachain.client import Client
         from datachain.lib.listing import is_listing_dataset, listing_uri_from_name

         if not dataset_id:
@@ -124,7 +124,7 @@ class DatasetDependency:
                 if dataset_version
                 else None
             ),
-            dataset_version_created_at
+            dataset_version_created_at,  # type: ignore[arg-type]
             [],
         )

@@ -448,6 +448,8 @@ class DatasetRecord:
         For bucket listing we implicitly create underlying dataset to hold data. This
         method is checking if this is one of those datasets.
         """
+        from datachain.client import Client
+
         # TODO refactor and maybe remove method in
         # https://github.com/iterative/datachain/issues/318
         return Client.is_data_source_uri(self.name) or self.name.startswith(
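Note: the module-level `from datachain.client import Client` import is removed and re-imported inside the functions that use it. Deferring the import to call time is a standard way to break an import cycle, which presumably appears here because datachain.client (directly or indirectly) now needs StorageURI from this module. A hypothetical single-function sketch of the pattern:

def is_source_uri(name: str) -> bool:
    # Imported when the function runs rather than when the module loads,
    # so importing datachain.dataset no longer requires datachain.client first.
    from datachain.client import Client

    return Client.is_data_source_uri(name)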
datachain/error.py
CHANGED
datachain/lib/arrow.py
CHANGED
@@ -1,4 +1,3 @@
-import re
 from collections.abc import Sequence
 from tempfile import NamedTemporaryFile
 from typing import TYPE_CHECKING, Any, Optional
@@ -13,6 +12,7 @@ from datachain.lib.file import ArrowRow, File
 from datachain.lib.model_store import ModelStore
 from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.udf import Generator
+from datachain.lib.utils import normalize_col_names

 if TYPE_CHECKING:
     from datasets.features.features import Features
@@ -128,7 +128,7 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = None
     signal_schema = _get_datachain_schema(schema)
     if signal_schema:
         return signal_schema.values
-    columns =
+    columns = list(normalize_col_names(col_names).keys())  # type: ignore[arg-type]
     hf_schema = _get_hf_schema(schema)
     if hf_schema:
         return {
@@ -143,19 +143,6 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = None
     return output


-def _convert_col_names(col_names: Sequence[str]) -> list[str]:
-    default_column = 0
-    converted_col_names = []
-    for column in col_names:
-        column = column.lower()
-        column = re.sub("[^0-9a-z_]+", "", column)
-        if not column:
-            column = f"c{default_column}"
-            default_column += 1
-        converted_col_names.append(column)
-    return converted_col_names
-
-
 def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type:  # noqa: PLR0911
     """Convert pyarrow types to basic types."""
     from datetime import datetime
datachain/lib/data_model.py
CHANGED
@@ -2,9 +2,10 @@ from collections.abc import Sequence
 from datetime import datetime
 from typing import ClassVar, Union, get_args, get_origin

-from pydantic import BaseModel, create_model
+from pydantic import BaseModel, Field, create_model

 from datachain.lib.model_store import ModelStore
+from datachain.lib.utils import normalize_col_names

 StandardType = Union[
     type[int],
@@ -60,7 +61,14 @@ def is_chain_type(t: type) -> bool:


 def dict_to_data_model(name: str, data_dict: dict[str, DataType]) -> type[BaseModel]:
-
+    # Gets a map of a normalized_name -> original_name
+    columns = normalize_col_names(list(data_dict.keys()))
+    # We reverse if for convenience to original_name -> normalized_name
+    columns = {v: k for k, v in columns.items()}
+
+    fields = {
+        columns[name]: (anno, Field(alias=name)) for name, anno in data_dict.items()
+    }
     return create_model(
         name,
         __base__=(DataModel,),  # type: ignore[call-overload]
datachain/lib/utils.py
CHANGED
@@ -1,4 +1,6 @@
+import re
 from abc import ABC, abstractmethod
+from collections.abc import Sequence


 class AbstractUDF(ABC):
@@ -28,3 +30,31 @@ class DataChainParamsError(DataChainError):
 class DataChainColumnError(DataChainParamsError):
     def __init__(self, col_name, msg):
         super().__init__(f"Error for column {col_name}: {msg}")
+
+
+def normalize_col_names(col_names: Sequence[str]) -> dict[str, str]:
+    gen_col_counter = 0
+    new_col_names = {}
+    org_col_names = set(col_names)
+
+    for org_column in col_names:
+        new_column = org_column.lower()
+        new_column = re.sub("[^0-9a-z]+", "_", new_column)
+        new_column = new_column.strip("_")
+
+        generated_column = new_column
+
+        while (
+            not generated_column.isidentifier()
+            or generated_column in new_col_names
+            or (generated_column != org_column and generated_column in org_col_names)
+        ):
+            if new_column:
+                generated_column = f"c{gen_col_counter}_{new_column}"
+            else:
+                generated_column = f"c{gen_col_counter}"
+            gen_col_counter += 1
+
+        new_col_names[generated_column] = org_column
+
+    return new_col_names
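Note: the helper returns a mapping of normalised name to original name, generating a counter-based name ("cN" or "cN_<name>") whenever the normalised form is empty, is not an identifier, or would collide. Tracing the implementation above on a small input:

from datachain.lib.utils import normalize_col_names

print(normalize_col_names(["First Name", "AGE ", "", "123"]))
# {'first_name': 'First Name', 'age': 'AGE ', 'c0': '', 'c1_123': '123'}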
datachain/node.py
CHANGED
@@ -3,8 +3,8 @@ from typing import TYPE_CHECKING, Any, Optional

 import attrs

+from datachain.dataset import StorageURI
 from datachain.lib.file import File
-from datachain.storage import StorageURI
 from datachain.utils import TIME_ZERO, time_to_str

 if TYPE_CHECKING:
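Note: together with the deletion of datachain/storage.py at the end of this diff, this means code importing StorageURI needs the new import path:

# 0.6.2
from datachain.storage import StorageURI

# 0.6.4
from datachain.dataset import StorageURI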
{datachain-0.6.2.dist-info → datachain-0.6.4.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.6.2
+Version: 0.6.4
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -47,7 +47,7 @@ Requires-Dist: platformdirs
 Requires-Dist: dvc-studio-client <1,>=0.21
 Provides-Extra: dev
 Requires-Dist: datachain[docs,tests] ; extra == 'dev'
-Requires-Dist: mypy ==1.
+Requires-Dist: mypy ==1.13.0 ; extra == 'dev'
 Requires-Dist: types-python-dateutil ; extra == 'dev'
 Requires-Dist: types-pytz ; extra == 'dev'
 Requires-Dist: types-PyYAML ; extra == 'dev'
{datachain-0.6.2.dist-info → datachain-0.6.4.dist-info}/RECORD
CHANGED
@@ -2,47 +2,46 @@ datachain/__init__.py,sha256=OGzc8xZWtwqxiiutjU4AxCRPY0lrX_csgERiTrq4G0o,908
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=Lg3Ck1PQLjQziMx9KU4atzbEnJXTE0924WMYkhgWtGU,8247
 datachain/cache.py,sha256=s0YHN7qurmQv-eC265TjeureK84TebWWAnL07cxchZQ,2997
-datachain/cli.py,sha256=
+datachain/cli.py,sha256=Wl-xMpTRgrkg4drX5I_QxAB1IATyULHCXOdx_wfoLVg,33529
 datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
 datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
-datachain/dataset.py,sha256=
-datachain/error.py,sha256=
+datachain/dataset.py,sha256=lLUbUbJP1TYL9Obkc0f2IDziGcDylZge9ORQjK-WtXs,14717
+datachain/error.py,sha256=bxAAL32lSeMgzsQDEHbGTGORj-mPzzpCRvWDPueJNN4,1092
 datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
 datachain/listing.py,sha256=AV23WZq-k6e2zeeNBhVQP1-2PrwNCYidO0HBDKzpVaA,7152
-datachain/node.py,sha256=
+datachain/node.py,sha256=i7_jC8VcW6W5VYkDszAOu0H-rNBuqXB4UnLEh4wFzjc,5195
 datachain/nodes_fetcher.py,sha256=F-73-h19HHNGtHFBGKk7p3mc0ALm4a9zGnzhtuUjnp4,1107
 datachain/nodes_thread_pool.py,sha256=uPo-xl8zG5m9YgODjPFBpbcqqHjI-dcxH87yAbj_qco,3192
 datachain/progress.py,sha256=5KotcvvzAUL_RF0GEj4JY0IB1lyImnmHxe89YkT1XO4,4330
 datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
 datachain/studio.py,sha256=d-jUsYpfI1LEv3g8KU-lLchVgb9L0TXvlHakieFud_E,3788
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
 datachain/utils.py,sha256=-mSFowjIidJ4_sMXInvNHLn4rK_QnHuIlLuH1_lMGmI,13897
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=
+datachain/catalog/catalog.py,sha256=qFlRrR01_9h1MjK6DEgVSgIwbtZEGV_SdG_E5qUsHmM,57352
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
 datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
 datachain/client/azure.py,sha256=ffxs26zm6KLAL1aUWJm-vtzuZP3LSNha7UDGXynMBKo,2234
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
-datachain/client/fsspec.py,sha256=
+datachain/client/fsspec.py,sha256=C6C5AO6ndkgcoUxCRN9_8fUzqX2cRWJWG6FL6oD9X_Q,12708
 datachain/client/gcs.py,sha256=cnTIr5GS6dbYOEYfqehhyQu3dr6XNjPHSg5U3FkivUk,4124
 datachain/client/hf.py,sha256=k24bpa6FEKNQn9zhoNC9kCigDwFSqobLsCnN_Nuzwh4,922
-datachain/client/local.py,sha256=
+datachain/client/local.py,sha256=vwbgCwZ7IqY2voj2l7tLJjgov7Dp--fEUvUwUBsMbls,4457
 datachain/client/s3.py,sha256=CVHBUZ1Ic2Q3370nl-Bbe69phuWjFlrVv9dTJKBpRT0,6019
 datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZv32Y8,398
 datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
 datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
 datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
-datachain/data_storage/metastore.py,sha256
+datachain/data_storage/metastore.py,sha256=-TJCqG70VofSVOh2yEez4dwjHS3eQL8p7d9uO3WTVwM,35878
 datachain/data_storage/schema.py,sha256=CiRXrDYp5ZZopSyUgZ7MT2ml_6YvqSTYXdybatcbX9M,9849
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
-datachain/data_storage/sqlite.py,sha256=
-datachain/data_storage/warehouse.py,sha256=
+datachain/data_storage/sqlite.py,sha256=wb8xlMJYYyt59wft0psJj587d-AwpNThzIqspVcKnRI,27388
+datachain/data_storage/warehouse.py,sha256=xwMaR4jBpR13vjG3zrhphH4z2_CFLNj0KPF0LJCXCJ8,30727
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/arrow.py,sha256=
+datachain/lib/arrow.py,sha256=M6SM4u2LeHgylzkPZBWckFeZt3CH3ehpBod3nGl6OYY,9138
 datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
-datachain/lib/data_model.py,sha256=
+datachain/lib/data_model.py,sha256=dau4AlZBhOFvF7pEKMeqCeRkcFFg5KFvTBWW_2CdH5g,2371
 datachain/lib/dataset_info.py,sha256=srPPhI2UHf6hFPBecyFEVw2SS5aPisIIMsvGgKqi7ss,2366
 datachain/lib/dc.py,sha256=pOyE8LqIwo86GrZTSpSMUJAYYwep7nCdIxebkSYlMGo,84484
 datachain/lib/file.py,sha256=LjTW_-PDAnoUhvyB4bJ8Y8n__XGqrxvmd9mDOF0Gir8,14875
@@ -59,7 +58,7 @@ datachain/lib/tar.py,sha256=3WIzao6yD5fbLqXLTt9GhPGNonbFIs_fDRu-9vgLgsA,1038
 datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
 datachain/lib/udf.py,sha256=4CqK51n3bntXCmkwoOQIrX34wMKOknkC23HtR4D_2vM,12705
 datachain/lib/udf_signature.py,sha256=GXw24A-Olna6DWCdgy2bC-gZh_gLGPQ-KvjuI6pUjC0,7281
-datachain/lib/utils.py,sha256=
+datachain/lib/utils.py,sha256=6NwgWLl5JrgtD4rsSFEe-yR2ntEwJMJEtAZ3FIxK3fg,1529
 datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/webdataset.py,sha256=o7SHk5HOUWsZ5Ln04xOM04eQqiBHiJNO7xLgyVBrwo8,6924
 datachain/lib/webdataset_laion.py,sha256=aGMWeFmeYNK75ewO9JTA11iB1i3QtTzUfenQA5jajfo,2535
@@ -101,9 +100,9 @@ datachain/sql/sqlite/base.py,sha256=aHSZVvh4XSVkvZ07h3jMoRlHI4sWD8y3SnmGs9xMG9Y,
 datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.6.
-datachain-0.6.
-datachain-0.6.
-datachain-0.6.
-datachain-0.6.
-datachain-0.6.
+datachain-0.6.4.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.6.4.dist-info/METADATA,sha256=zCHryMsrsacIST1qua0PHB6YRNgp1Qayuvsh57SqS9w,17188
+datachain-0.6.4.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+datachain-0.6.4.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.6.4.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.6.4.dist-info/RECORD,,
datachain/storage.py
DELETED
@@ -1,136 +0,0 @@
-import posixpath
-from abc import ABC, abstractmethod
-from datetime import datetime, timedelta, timezone
-from functools import cached_property
-from typing import NamedTuple, NewType, Optional, Union
-from urllib.parse import urlparse
-
-from datachain.utils import is_expired, time_to_local_str, time_to_str
-
-STALE_MINUTES_LIMIT = 15
-
-# StorageURI represents a normalised URI to a valid storage location (full bucket or
-# absolute local path).
-# Valid examples: s3://foo, file:///var/data
-# Invalid examples: s3://foo/, s3://foo/bar, file://~
-StorageURI = NewType("StorageURI", str)
-
-
-class StorageStatus:
-    CREATED = 1
-    PENDING = 2
-    FAILED = 3
-    COMPLETE = 4
-    PARTIAL = 5
-    STALE = 6
-    INDEXING_SCHEDULED = 7
-    DELETE_SCHEDULED = 8
-
-
-class AbstractStorage(ABC):
-    @property
-    @abstractmethod
-    def uri(self) -> StorageURI: ...
-
-    @property
-    @abstractmethod
-    def timestamp(self) -> Optional[Union[datetime, str]]: ...
-
-    @property
-    @abstractmethod
-    def expires(self) -> Optional[Union[datetime, str]]: ...
-
-    @property
-    @abstractmethod
-    def status(self) -> int: ...
-
-    @property
-    def type(self):
-        return self._parsed_uri.scheme
-
-    @property
-    def name(self):
-        return self._parsed_uri.netloc
-
-    @cached_property
-    def _parsed_uri(self):
-        return urlparse(self.uri)
-
-
-class StorageRecord(NamedTuple):
-    id: int
-    uri: StorageURI
-    timestamp: Optional[Union[datetime, str]] = None
-    expires: Optional[Union[datetime, str]] = None
-    started_inserting_at: Optional[Union[datetime, str]] = None
-    last_inserted_at: Optional[Union[datetime, str]] = None
-    status: int = StorageStatus.CREATED
-    error_message: str = ""
-    error_stack: str = ""
-
-
-class Storage(StorageRecord, AbstractStorage):
-    @property
-    def is_indexed(self) -> bool:
-        return self.status == StorageStatus.COMPLETE
-
-    @property
-    def is_expired(self) -> bool:
-        return is_expired(self.expires)
-
-    @property
-    def is_pending(self) -> bool:
-        return self.status == StorageStatus.PENDING
-
-    @property
-    def is_stale(self) -> bool:
-        limit = datetime.now(timezone.utc) - timedelta(minutes=STALE_MINUTES_LIMIT)
-        date_to_check = self.last_inserted_at or self.started_inserting_at
-
-        return self.is_pending and date_to_check < limit  # type: ignore [operator]
-
-    @property
-    def need_indexing(self) -> bool:
-        return self.is_expired or not self.is_indexed
-
-    @property
-    def timestamp_str(self) -> Optional[str]:
-        if not self.timestamp:
-            return None
-        return time_to_str(self.timestamp)
-
-    @property
-    def timestamp_to_local(self) -> Optional[str]:
-        if not self.timestamp:
-            return None
-        return time_to_local_str(self.timestamp)
-
-    @property
-    def expires_to_local(self) -> Optional[str]:
-        if not self.expires:
-            return None
-        return time_to_local_str(self.expires)
-
-    @staticmethod
-    def get_expiration_time(timestamp: datetime, ttl: int):
-        if ttl >= 0:
-            try:
-                return timestamp + timedelta(seconds=ttl)
-            except OverflowError:
-                return datetime.max
-        else:
-            return datetime.max
-
-    @staticmethod
-    def dataset_name(uri: str, partial_path: str) -> str:
-        return f"{uri}/{partial_path}"
-
-    def to_dict(self, file_path=""):
-        uri = self.uri
-        if file_path:
-            uri = posixpath.join(uri, *file_path.rstrip("/").split("/"))
-        return {
-            "uri": uri,
-            "timestamp": time_to_str(self.timestamp) if self.timestamp else None,
-            "expires": time_to_str(self.expires) if self.expires else None,
-        }
{datachain-0.6.2.dist-info → datachain-0.6.4.dist-info}/LICENSE
File without changes
{datachain-0.6.2.dist-info → datachain-0.6.4.dist-info}/entry_points.txt
File without changes
{datachain-0.6.2.dist-info → datachain-0.6.4.dist-info}/top_level.txt
File without changes