datachain 0.6.2__py3-none-any.whl → 0.6.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

datachain/data_storage/sqlite.py CHANGED

@@ -29,12 +29,11 @@ from datachain.data_storage import AbstractDBMetastore, AbstractWarehouse
  from datachain.data_storage.db_engine import DatabaseEngine
  from datachain.data_storage.id_generator import AbstractDBIDGenerator
  from datachain.data_storage.schema import DefaultSchema
- from datachain.dataset import DatasetRecord
+ from datachain.dataset import DatasetRecord, StorageURI
  from datachain.error import DataChainError
  from datachain.sql.sqlite import create_user_defined_sql_functions, sqlite_dialect
  from datachain.sql.sqlite.base import load_usearch_extension
  from datachain.sql.types import SQLType
- from datachain.storage import StorageURI
  from datachain.utils import DataChainDir, batched_it

  if TYPE_CHECKING:
@@ -392,14 +391,14 @@ class SQLiteMetastore(AbstractDBMetastore):
      def __init__(
          self,
          id_generator: "SQLiteIDGenerator",
-         uri: StorageURI = StorageURI(""),
-         partial_id: Optional[int] = None,
+         uri: Optional[StorageURI] = None,
          db: Optional["SQLiteDatabaseEngine"] = None,
          db_file: Optional[str] = None,
          in_memory: bool = False,
      ):
+         uri = uri or StorageURI("")
          self.schema: DefaultSchema = DefaultSchema()
-         super().__init__(id_generator, uri, partial_id)
+         super().__init__(id_generator, uri)

          # needed for dropping tables in correct order for tests because of
          # foreign keys
@@ -417,21 +416,16 @@ class SQLiteMetastore(AbstractDBMetastore):

      def clone(
          self,
-         uri: StorageURI = StorageURI(""),
-         partial_id: Optional[int] = None,
+         uri: Optional[StorageURI] = None,
          use_new_connection: bool = False,
      ) -> "SQLiteMetastore":
-         if not uri:
-             if partial_id is not None:
-                 raise ValueError("if partial_id is used, uri cannot be empty")
-             if self.uri:
-                 uri = self.uri
-             if self.partial_id:
-                 partial_id = self.partial_id
+         uri = uri or StorageURI("")
+         if not uri and self.uri:
+             uri = self.uri
+
          return SQLiteMetastore(
              self.id_generator.clone(),
              uri=uri,
-             partial_id=partial_id,
              db=self.db.clone(),
          )

@@ -446,7 +440,6 @@ class SQLiteMetastore(AbstractDBMetastore):
              {
                  "id_generator_clone_params": self.id_generator.clone_params(),
                  "uri": self.uri,
-                 "partial_id": self.partial_id,
                  "db_clone_params": self.db.clone_params(),
              },
          )
@@ -457,7 +450,6 @@ class SQLiteMetastore(AbstractDBMetastore):
          *,
          id_generator_clone_params: tuple[Callable, list, dict[str, Any]],
          uri: StorageURI,
-         partial_id: Optional[int],
          db_clone_params: tuple[Callable, list, dict[str, Any]],
      ) -> "SQLiteMetastore":
          (
@@ -469,14 +461,11 @@ class SQLiteMetastore(AbstractDBMetastore):
          return cls(
              id_generator=id_generator_class(*id_generator_args, **id_generator_kwargs),
              uri=uri,
-             partial_id=partial_id,
              db=db_class(*db_args, **db_kwargs),
          )

      def _init_tables(self) -> None:
          """Initialize tables."""
-         self.db.create_table(self._storages, if_not_exists=True)
-         self.default_table_names.append(self._storages.name)
          self.db.create_table(self._datasets, if_not_exists=True)
          self.default_table_names.append(self._datasets.name)
          self.db.create_table(self._datasets_versions, if_not_exists=True)
@@ -486,28 +475,11 @@ class SQLiteMetastore(AbstractDBMetastore):
          self.db.create_table(self._jobs, if_not_exists=True)
          self.default_table_names.append(self._jobs.name)

-     def init(self, uri: StorageURI) -> None:
-         if not uri:
-             raise ValueError("uri for init() cannot be empty")
-         partials_table = self._partials_table(uri)
-         self.db.create_table(partials_table, if_not_exists=True)
-
-     @classmethod
-     def _buckets_columns(cls) -> list["SchemaItem"]:
-         """Buckets (storages) table columns."""
-         return [*super()._buckets_columns(), UniqueConstraint("uri")]
-
      @classmethod
      def _datasets_columns(cls) -> list["SchemaItem"]:
          """Datasets table columns."""
          return [*super()._datasets_columns(), UniqueConstraint("name")]

-     def _storages_insert(self) -> "Insert":
-         return sqlite.insert(self._storages)
-
-     def _partials_insert(self) -> "Insert":
-         return sqlite.insert(self._partials)
-
      def _datasets_insert(self) -> "Insert":
          return sqlite.insert(self._datasets)

@@ -526,13 +498,9 @@ class SQLiteMetastore(AbstractDBMetastore):
              self._datasets_dependencies.c.id,
              self._datasets_dependencies.c.dataset_id,
              self._datasets_dependencies.c.dataset_version_id,
-             self._datasets_dependencies.c.bucket_id,
-             self._datasets_dependencies.c.bucket_version,
              self._datasets.c.name,
-             self._datasets.c.created_at,
              self._datasets_versions.c.version,
              self._datasets_versions.c.created_at,
-             self._storages.c.uri,
          ]

          #
datachain/data_storage/warehouse.py CHANGED

@@ -19,11 +19,10 @@ from tqdm import tqdm
  from datachain.client import Client
  from datachain.data_storage.schema import convert_rows_custom_column_types
  from datachain.data_storage.serializer import Serializable
- from datachain.dataset import DatasetRecord
+ from datachain.dataset import DatasetRecord, StorageURI
  from datachain.node import DirType, DirTypeGroup, Node, NodeWithPath, get_path
  from datachain.sql.functions import path as pathfunc
  from datachain.sql.types import Int, SQLType
- from datachain.storage import StorageURI
  from datachain.utils import sql_escape_like

  if TYPE_CHECKING:
datachain/dataset.py CHANGED
@@ -3,21 +3,17 @@ import json
  from dataclasses import dataclass, fields
  from datetime import datetime
  from typing import (
-     TYPE_CHECKING,
      Any,
+     NewType,
      Optional,
      TypeVar,
      Union,
  )
  from urllib.parse import urlparse

- from datachain.client import Client
  from datachain.error import DatasetVersionNotFoundError
  from datachain.sql.types import NAME_TYPES_MAPPING, SQLType

- if TYPE_CHECKING:
-     from datachain.storage import StorageURI
-
  T = TypeVar("T", bound="DatasetRecord")
  V = TypeVar("V", bound="DatasetVersion")
  DD = TypeVar("DD", bound="DatasetDependency")
@@ -27,6 +23,13 @@ QUERY_DATASET_PREFIX = "ds_query_"
  LISTING_PREFIX = "lst__"


+ # StorageURI represents a normalised URI to a valid storage location (full bucket or
+ # absolute local path).
+ # Valid examples: s3://foo, file:///var/data
+ # Invalid examples: s3://foo/, s3://foo/bar, file://~
+ StorageURI = NewType("StorageURI", str)
+
+
  def parse_dataset_uri(uri: str) -> tuple[str, Optional[int]]:
      """
      Parse dataser uri to extract name and version out of it (if version is defined)
@@ -94,14 +97,11 @@ class DatasetDependency:
          id: int,
          dataset_id: Optional[int],
          dataset_version_id: Optional[int],
-         bucket_id: Optional[int],
-         bucket_version: Optional[str],
          dataset_name: Optional[str],
-         dataset_created_at: Optional[datetime],
          dataset_version: Optional[int],
          dataset_version_created_at: Optional[datetime],
-         bucket_uri: Optional["StorageURI"],
      ) -> Optional["DatasetDependency"]:
+         from datachain.client import Client
          from datachain.lib.listing import is_listing_dataset, listing_uri_from_name

          if not dataset_id:
@@ -124,7 +124,7 @@ class DatasetDependency:
              if dataset_version
              else None
          ),
-         dataset_version_created_at or dataset_created_at,  # type: ignore[arg-type]
+         dataset_version_created_at,  # type: ignore[arg-type]
          [],
      )

@@ -448,6 +448,8 @@ class DatasetRecord:
          For bucket listing we implicitly create underlying dataset to hold data. This
          method is checking if this is one of those datasets.
          """
+         from datachain.client import Client
+
          # TODO refactor and maybe remove method in
          # https://github.com/iterative/datachain/issues/318
          return Client.is_data_source_uri(self.name) or self.name.startswith(
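
StorageURI is now defined directly in datachain.dataset as a typing NewType over str (the standalone datachain/storage.py module is deleted below), so call sites only need to change the import. A minimal sketch of what this means at runtime, assuming datachain 0.6.4 is installed:

    from datachain.dataset import StorageURI  # import moved here in 0.6.4

    # NewType only introduces a distinct name for static type checkers;
    # at runtime a StorageURI is a plain str.
    uri = StorageURI("s3://my-bucket")
    assert isinstance(uri, str)
    assert uri == "s3://my-bucket"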
datachain/error.py CHANGED
@@ -18,10 +18,6 @@ class DatasetInvalidVersionError(Exception):
      pass


- class StorageNotFoundError(NotFoundError):
-     pass
-
-
  class PendingIndexingError(Exception):
      """An indexing operation is already in progress."""

datachain/lib/arrow.py CHANGED
@@ -1,4 +1,3 @@
- import re
  from collections.abc import Sequence
  from tempfile import NamedTemporaryFile
  from typing import TYPE_CHECKING, Any, Optional
@@ -13,6 +12,7 @@ from datachain.lib.file import ArrowRow, File
  from datachain.lib.model_store import ModelStore
  from datachain.lib.signal_schema import SignalSchema
  from datachain.lib.udf import Generator
+ from datachain.lib.utils import normalize_col_names

  if TYPE_CHECKING:
      from datasets.features.features import Features
@@ -128,7 +128,7 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = Non
      signal_schema = _get_datachain_schema(schema)
      if signal_schema:
          return signal_schema.values
-     columns = _convert_col_names(col_names)  # type: ignore[arg-type]
+     columns = list(normalize_col_names(col_names).keys())  # type: ignore[arg-type]
      hf_schema = _get_hf_schema(schema)
      if hf_schema:
          return {
@@ -143,19 +143,6 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = Non
      return output


- def _convert_col_names(col_names: Sequence[str]) -> list[str]:
-     default_column = 0
-     converted_col_names = []
-     for column in col_names:
-         column = column.lower()
-         column = re.sub("[^0-9a-z_]+", "", column)
-         if not column:
-             column = f"c{default_column}"
-             default_column += 1
-         converted_col_names.append(column)
-     return converted_col_names
-
-
  def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type:  # noqa: PLR0911
      """Convert pyarrow types to basic types."""
      from datetime import datetime
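
The practical effect of this hunk is that schema_to_output now derives output column names through the shared normalize_col_names helper instead of the removed local _convert_col_names. A rough usage sketch with hypothetical column names (assumes datachain 0.6.4 and pyarrow are installed; the exact return value depends on the rest of schema_to_output, which is not shown in this diff):

    import pyarrow as pa

    from datachain.lib.arrow import schema_to_output

    schema = pa.schema([("First Name", pa.string()), ("Age", pa.int64())])
    # Column names such as "First Name" are normalized to identifiers like "first_name".
    print(schema_to_output(schema, ["First Name", "Age"]))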
datachain/lib/data_model.py CHANGED

@@ -2,9 +2,10 @@ from collections.abc import Sequence
  from datetime import datetime
  from typing import ClassVar, Union, get_args, get_origin

- from pydantic import BaseModel, create_model
+ from pydantic import BaseModel, Field, create_model

  from datachain.lib.model_store import ModelStore
+ from datachain.lib.utils import normalize_col_names

  StandardType = Union[
      type[int],
@@ -60,7 +61,14 @@ def is_chain_type(t: type) -> bool:


  def dict_to_data_model(name: str, data_dict: dict[str, DataType]) -> type[BaseModel]:
-     fields = {name: (anno, ...) for name, anno in data_dict.items()}
+     # Gets a map of a normalized_name -> original_name
+     columns = normalize_col_names(list(data_dict.keys()))
+     # We reverse if for convenience to original_name -> normalized_name
+     columns = {v: k for k, v in columns.items()}
+
+     fields = {
+         columns[name]: (anno, Field(alias=name)) for name, anno in data_dict.items()
+     }
      return create_model(
          name,
          __base__=(DataModel,),  # type: ignore[call-overload]
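
With this change, dictionary keys that are not valid Python identifiers are mapped to normalized field names, while the original key is kept as a pydantic alias so incoming data can still use it. A minimal sketch, assuming this hunk is datachain/lib/data_model.py (consistent with the RECORD changes below) and that datachain 0.6.4 is installed; the model name and fields are illustrative:

    from datachain.lib.data_model import dict_to_data_model

    Person = dict_to_data_model("Person", {"First Name": str, "age": int})

    # The original key "First Name" survives as an alias for initialization...
    p = Person(**{"First Name": "Ada", "age": 36})
    # ...while attribute access uses the normalized field name.
    print(p.first_name, p.age)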
datachain/lib/utils.py CHANGED
@@ -1,4 +1,6 @@
+ import re
  from abc import ABC, abstractmethod
+ from collections.abc import Sequence


  class AbstractUDF(ABC):
@@ -28,3 +30,31 @@ class DataChainParamsError(DataChainError):
  class DataChainColumnError(DataChainParamsError):
      def __init__(self, col_name, msg):
          super().__init__(f"Error for column {col_name}: {msg}")
+
+
+ def normalize_col_names(col_names: Sequence[str]) -> dict[str, str]:
+     gen_col_counter = 0
+     new_col_names = {}
+     org_col_names = set(col_names)
+
+     for org_column in col_names:
+         new_column = org_column.lower()
+         new_column = re.sub("[^0-9a-z]+", "_", new_column)
+         new_column = new_column.strip("_")
+
+         generated_column = new_column
+
+         while (
+             not generated_column.isidentifier()
+             or generated_column in new_col_names
+             or (generated_column != org_column and generated_column in org_col_names)
+         ):
+             if new_column:
+                 generated_column = f"c{gen_col_counter}_{new_column}"
+             else:
+                 generated_column = f"c{gen_col_counter}"
+             gen_col_counter += 1
+
+         new_col_names[generated_column] = org_column
+
+     return new_col_names
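
The new helper returns a mapping of generated (normalized, unique, identifier-safe) name to original name. A rough usage sketch with hypothetical column names; the expected output is derived by reading the implementation above and assumes datachain 0.6.4 is installed:

    from datachain.lib.utils import normalize_col_names

    cols = ["First Name", "first_name", "", "123"]
    print(normalize_col_names(cols))
    # Expected, per the implementation above:
    # {'c0_first_name': 'First Name', 'first_name': 'first_name', 'c1': '', 'c2_123': '123'}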
datachain/node.py CHANGED
@@ -3,8 +3,8 @@ from typing import TYPE_CHECKING, Any, Optional

  import attrs

+ from datachain.dataset import StorageURI
  from datachain.lib.file import File
- from datachain.storage import StorageURI
  from datachain.utils import TIME_ZERO, time_to_str

  if TYPE_CHECKING:
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: datachain
- Version: 0.6.2
+ Version: 0.6.4
  Summary: Wrangle unstructured AI data at scale
  Author-email: Dmitry Petrov <support@dvc.org>
  License: Apache-2.0
@@ -47,7 +47,7 @@ Requires-Dist: platformdirs
  Requires-Dist: dvc-studio-client <1,>=0.21
  Provides-Extra: dev
  Requires-Dist: datachain[docs,tests] ; extra == 'dev'
- Requires-Dist: mypy ==1.12.1 ; extra == 'dev'
+ Requires-Dist: mypy ==1.13.0 ; extra == 'dev'
  Requires-Dist: types-python-dateutil ; extra == 'dev'
  Requires-Dist: types-pytz ; extra == 'dev'
  Requires-Dist: types-PyYAML ; extra == 'dev'
@@ -2,47 +2,46 @@ datachain/__init__.py,sha256=OGzc8xZWtwqxiiutjU4AxCRPY0lrX_csgERiTrq4G0o,908
  datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
  datachain/asyn.py,sha256=Lg3Ck1PQLjQziMx9KU4atzbEnJXTE0924WMYkhgWtGU,8247
  datachain/cache.py,sha256=s0YHN7qurmQv-eC265TjeureK84TebWWAnL07cxchZQ,2997
- datachain/cli.py,sha256=EM6jlc9zunOJQi7-GwCyVtlumHmLM8NwN9Y6jqVGzyY,33769
+ datachain/cli.py,sha256=Wl-xMpTRgrkg4drX5I_QxAB1IATyULHCXOdx_wfoLVg,33529
  datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
  datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
- datachain/dataset.py,sha256=w7qqJP7xYrm9CmBSmSezSxUQHZDsHKkwviF8AYUob7o,14671
- datachain/error.py,sha256=vbIbamnFMIojh1UpmxWoA6Omup7WFAFNJnf8xAkGWwI,1146
+ datachain/dataset.py,sha256=lLUbUbJP1TYL9Obkc0f2IDziGcDylZge9ORQjK-WtXs,14717
+ datachain/error.py,sha256=bxAAL32lSeMgzsQDEHbGTGORj-mPzzpCRvWDPueJNN4,1092
  datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
  datachain/listing.py,sha256=AV23WZq-k6e2zeeNBhVQP1-2PrwNCYidO0HBDKzpVaA,7152
- datachain/node.py,sha256=ThE6Ue4BqpaBvrkFFJW_ljLxchixUX2aWz3l_nbwY54,5195
+ datachain/node.py,sha256=i7_jC8VcW6W5VYkDszAOu0H-rNBuqXB4UnLEh4wFzjc,5195
  datachain/nodes_fetcher.py,sha256=F-73-h19HHNGtHFBGKk7p3mc0ALm4a9zGnzhtuUjnp4,1107
  datachain/nodes_thread_pool.py,sha256=uPo-xl8zG5m9YgODjPFBpbcqqHjI-dcxH87yAbj_qco,3192
  datachain/progress.py,sha256=5KotcvvzAUL_RF0GEj4JY0IB1lyImnmHxe89YkT1XO4,4330
  datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
  datachain/studio.py,sha256=d-jUsYpfI1LEv3g8KU-lLchVgb9L0TXvlHakieFud_E,3788
  datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
  datachain/utils.py,sha256=-mSFowjIidJ4_sMXInvNHLn4rK_QnHuIlLuH1_lMGmI,13897
  datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
- datachain/catalog/catalog.py,sha256=PvJ-BRoSuI_FRCrXJ6tjMhYZD6L8Beq-ynrdPYRrwiw,58270
+ datachain/catalog/catalog.py,sha256=qFlRrR01_9h1MjK6DEgVSgIwbtZEGV_SdG_E5qUsHmM,57352
  datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
  datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
  datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
  datachain/client/azure.py,sha256=ffxs26zm6KLAL1aUWJm-vtzuZP3LSNha7UDGXynMBKo,2234
  datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
- datachain/client/fsspec.py,sha256=sB98CO7covhmFZg36hsnyv9UwUI8J94AD1QWgGdcBlY,12595
+ datachain/client/fsspec.py,sha256=C6C5AO6ndkgcoUxCRN9_8fUzqX2cRWJWG6FL6oD9X_Q,12708
  datachain/client/gcs.py,sha256=cnTIr5GS6dbYOEYfqehhyQu3dr6XNjPHSg5U3FkivUk,4124
  datachain/client/hf.py,sha256=k24bpa6FEKNQn9zhoNC9kCigDwFSqobLsCnN_Nuzwh4,922
- datachain/client/local.py,sha256=Uaf_y_UGspOgprDysUTI9wDo334MLjGPUudqVtvef0c,4367
+ datachain/client/local.py,sha256=vwbgCwZ7IqY2voj2l7tLJjgov7Dp--fEUvUwUBsMbls,4457
  datachain/client/s3.py,sha256=CVHBUZ1Ic2Q3370nl-Bbe69phuWjFlrVv9dTJKBpRT0,6019
  datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZv32Y8,398
  datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
  datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
  datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
- datachain/data_storage/metastore.py,sha256=HfCxk4lmDUg2Q4WsFNQGMWxllP0mToA00fxkFTwdNIE,52919
+ datachain/data_storage/metastore.py,sha256=-TJCqG70VofSVOh2yEez4dwjHS3eQL8p7d9uO3WTVwM,35878
  datachain/data_storage/schema.py,sha256=CiRXrDYp5ZZopSyUgZ7MT2ml_6YvqSTYXdybatcbX9M,9849
  datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
- datachain/data_storage/sqlite.py,sha256=jopfVftng157TVcBKMB_QPlbkE6fTatiY4GYSSLNkig,28737
- datachain/data_storage/warehouse.py,sha256=iIjFOutYxhLev3CcUhUTwMJOkHeAEBwXZ2y3wmjrF1s,30756
+ datachain/data_storage/sqlite.py,sha256=wb8xlMJYYyt59wft0psJj587d-AwpNThzIqspVcKnRI,27388
+ datachain/data_storage/warehouse.py,sha256=xwMaR4jBpR13vjG3zrhphH4z2_CFLNj0KPF0LJCXCJ8,30727
  datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datachain/lib/arrow.py,sha256=0R2CYsN82nNa5_03iS6jVix9EKeeqNZNAMgpSQP2hfo,9482
+ datachain/lib/arrow.py,sha256=M6SM4u2LeHgylzkPZBWckFeZt3CH3ehpBod3nGl6OYY,9138
  datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
- datachain/lib/data_model.py,sha256=ECTbvlnzM98hp2mZ4fo82Yi0-MuoqTIQasQKGIyd89I,2040
+ datachain/lib/data_model.py,sha256=dau4AlZBhOFvF7pEKMeqCeRkcFFg5KFvTBWW_2CdH5g,2371
  datachain/lib/dataset_info.py,sha256=srPPhI2UHf6hFPBecyFEVw2SS5aPisIIMsvGgKqi7ss,2366
  datachain/lib/dc.py,sha256=pOyE8LqIwo86GrZTSpSMUJAYYwep7nCdIxebkSYlMGo,84484
  datachain/lib/file.py,sha256=LjTW_-PDAnoUhvyB4bJ8Y8n__XGqrxvmd9mDOF0Gir8,14875
@@ -59,7 +58,7 @@ datachain/lib/tar.py,sha256=3WIzao6yD5fbLqXLTt9GhPGNonbFIs_fDRu-9vgLgsA,1038
  datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
  datachain/lib/udf.py,sha256=4CqK51n3bntXCmkwoOQIrX34wMKOknkC23HtR4D_2vM,12705
  datachain/lib/udf_signature.py,sha256=GXw24A-Olna6DWCdgy2bC-gZh_gLGPQ-KvjuI6pUjC0,7281
- datachain/lib/utils.py,sha256=12elAX6eTFgMGKIf2UfZ4IW07kRwjK6wz8yGE41RtNM,618
+ datachain/lib/utils.py,sha256=6NwgWLl5JrgtD4rsSFEe-yR2ntEwJMJEtAZ3FIxK3fg,1529
  datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datachain/lib/webdataset.py,sha256=o7SHk5HOUWsZ5Ln04xOM04eQqiBHiJNO7xLgyVBrwo8,6924
  datachain/lib/webdataset_laion.py,sha256=aGMWeFmeYNK75ewO9JTA11iB1i3QtTzUfenQA5jajfo,2535
@@ -101,9 +100,9 @@ datachain/sql/sqlite/base.py,sha256=aHSZVvh4XSVkvZ07h3jMoRlHI4sWD8y3SnmGs9xMG9Y,
  datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
  datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
- datachain-0.6.2.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
- datachain-0.6.2.dist-info/METADATA,sha256=QJGHTrGZapho1am27dPKQCOKG_FiEMsvWNLloeU8qVQ,17188
- datachain-0.6.2.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
- datachain-0.6.2.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
- datachain-0.6.2.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
- datachain-0.6.2.dist-info/RECORD,,
+ datachain-0.6.4.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+ datachain-0.6.4.dist-info/METADATA,sha256=zCHryMsrsacIST1qua0PHB6YRNgp1Qayuvsh57SqS9w,17188
+ datachain-0.6.4.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+ datachain-0.6.4.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+ datachain-0.6.4.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+ datachain-0.6.4.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (75.2.0)
+ Generator: setuptools (75.3.0)
  Root-Is-Purelib: true
  Tag: py3-none-any

datachain/storage.py DELETED
@@ -1,136 +0,0 @@
- import posixpath
- from abc import ABC, abstractmethod
- from datetime import datetime, timedelta, timezone
- from functools import cached_property
- from typing import NamedTuple, NewType, Optional, Union
- from urllib.parse import urlparse
-
- from datachain.utils import is_expired, time_to_local_str, time_to_str
-
- STALE_MINUTES_LIMIT = 15
-
- # StorageURI represents a normalised URI to a valid storage location (full bucket or
- # absolute local path).
- # Valid examples: s3://foo, file:///var/data
- # Invalid examples: s3://foo/, s3://foo/bar, file://~
- StorageURI = NewType("StorageURI", str)
-
-
- class StorageStatus:
-     CREATED = 1
-     PENDING = 2
-     FAILED = 3
-     COMPLETE = 4
-     PARTIAL = 5
-     STALE = 6
-     INDEXING_SCHEDULED = 7
-     DELETE_SCHEDULED = 8
-
-
- class AbstractStorage(ABC):
-     @property
-     @abstractmethod
-     def uri(self) -> StorageURI: ...
-
-     @property
-     @abstractmethod
-     def timestamp(self) -> Optional[Union[datetime, str]]: ...
-
-     @property
-     @abstractmethod
-     def expires(self) -> Optional[Union[datetime, str]]: ...
-
-     @property
-     @abstractmethod
-     def status(self) -> int: ...
-
-     @property
-     def type(self):
-         return self._parsed_uri.scheme
-
-     @property
-     def name(self):
-         return self._parsed_uri.netloc
-
-     @cached_property
-     def _parsed_uri(self):
-         return urlparse(self.uri)
-
-
- class StorageRecord(NamedTuple):
-     id: int
-     uri: StorageURI
-     timestamp: Optional[Union[datetime, str]] = None
-     expires: Optional[Union[datetime, str]] = None
-     started_inserting_at: Optional[Union[datetime, str]] = None
-     last_inserted_at: Optional[Union[datetime, str]] = None
-     status: int = StorageStatus.CREATED
-     error_message: str = ""
-     error_stack: str = ""
-
-
- class Storage(StorageRecord, AbstractStorage):
-     @property
-     def is_indexed(self) -> bool:
-         return self.status == StorageStatus.COMPLETE
-
-     @property
-     def is_expired(self) -> bool:
-         return is_expired(self.expires)
-
-     @property
-     def is_pending(self) -> bool:
-         return self.status == StorageStatus.PENDING
-
-     @property
-     def is_stale(self) -> bool:
-         limit = datetime.now(timezone.utc) - timedelta(minutes=STALE_MINUTES_LIMIT)
-         date_to_check = self.last_inserted_at or self.started_inserting_at
-
-         return self.is_pending and date_to_check < limit  # type: ignore [operator]
-
-     @property
-     def need_indexing(self) -> bool:
-         return self.is_expired or not self.is_indexed
-
-     @property
-     def timestamp_str(self) -> Optional[str]:
-         if not self.timestamp:
-             return None
-         return time_to_str(self.timestamp)
-
-     @property
-     def timestamp_to_local(self) -> Optional[str]:
-         if not self.timestamp:
-             return None
-         return time_to_local_str(self.timestamp)
-
-     @property
-     def expires_to_local(self) -> Optional[str]:
-         if not self.expires:
-             return None
-         return time_to_local_str(self.expires)
-
-     @staticmethod
-     def get_expiration_time(timestamp: datetime, ttl: int):
-         if ttl >= 0:
-             try:
-                 return timestamp + timedelta(seconds=ttl)
-             except OverflowError:
-                 return datetime.max
-         else:
-             return datetime.max
-
-     @staticmethod
-     def dataset_name(uri: str, partial_path: str) -> str:
-         return f"{uri}/{partial_path}"
-
-     def to_dict(self, file_path=""):
-         uri = self.uri
-         if file_path:
-             uri = posixpath.join(uri, *file_path.rstrip("/").split("/"))
-         return {
-             "uri": uri,
-             "timestamp": time_to_str(self.timestamp) if self.timestamp else None,
-             "expires": time_to_str(self.expires) if self.expires else None,
-         }