datachain 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic.
- datachain/catalog/catalog.py +57 -212
- datachain/cli.py +6 -38
- datachain/client/fsspec.py +3 -0
- datachain/client/hf.py +47 -0
- datachain/data_storage/metastore.py +2 -29
- datachain/data_storage/sqlite.py +3 -12
- datachain/data_storage/warehouse.py +20 -29
- datachain/dataset.py +44 -32
- datachain/job.py +4 -3
- datachain/lib/arrow.py +21 -5
- datachain/lib/dataset_info.py +4 -0
- datachain/lib/dc.py +183 -59
- datachain/lib/file.py +10 -33
- datachain/lib/hf.py +2 -1
- datachain/lib/listing.py +102 -94
- datachain/lib/listing_info.py +32 -0
- datachain/lib/meta_formats.py +39 -56
- datachain/lib/signal_schema.py +5 -2
- datachain/node.py +13 -0
- datachain/query/dataset.py +12 -105
- datachain/query/metrics.py +8 -0
- datachain/utils.py +5 -0
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/METADATA +7 -3
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/RECORD +28 -27
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/WHEEL +1 -1
- datachain/catalog/subclass.py +0 -60
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/LICENSE +0 -0
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/entry_points.txt +0 -0
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/top_level.txt +0 -0
datachain/data_storage/sqlite.py
CHANGED
```diff
@@ -143,7 +143,9 @@ class SQLiteDatabaseEngine(DatabaseEngine):
         db.execute("PRAGMA synchronous = NORMAL")
         db.execute("PRAGMA case_sensitive_like = ON")
         if os.environ.get("DEBUG_SHOW_SQL_QUERIES"):
-            ...
+            import sys
+
+            db.set_trace_callback(sys.stderr.write)
 
         load_usearch_extension(db)
 
```
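With `DEBUG_SHOW_SQL_QUERIES` set, the engine now routes every executed statement to stderr through the standard library's statement-tracing hook. A minimal sketch of the same mechanism outside datachain:

```python
import sqlite3
import sys

db = sqlite3.connect(":memory:")
# sqlite3 passes each completed SQL statement to the callback as a string;
# writing to sys.stderr keeps the query log out of stdout.
db.set_trace_callback(sys.stderr.write)
db.execute("CREATE TABLE t (x INTEGER)")
db.execute("INSERT INTO t VALUES (1)")
```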
```diff
@@ -515,17 +517,6 @@ class SQLiteMetastore(AbstractDBMetastore):
     def _datasets_dependencies_insert(self) -> "Insert":
         return sqlite.insert(self._datasets_dependencies)
 
-    #
-    # Storages
-    #
-
-    def mark_storage_not_indexed(self, uri: StorageURI) -> None:
-        """
-        Mark storage as not indexed.
-        This method should be called when storage index is deleted.
-        """
-        self.db.execute(self._storages_delete().where(self._storages.c.uri == uri))
-
     #
     # Dataset dependencies
     #
```
datachain/data_storage/warehouse.py
CHANGED

```diff
@@ -218,35 +218,26 @@ class AbstractWarehouse(ABC, Serializable):
         results = None
         offset = 0
         num_yielded = 0
-        try:
-            while True:
-                if limit is not None:
-                    limit -= num_yielded
-                    if limit == 0:
-                        break
-                    if limit < page_size:
-                        paginated_query = paginated_query.limit(None).limit(limit)
-
-                results = self.dataset_rows_select(paginated_query.offset(offset))
-
-                processed = False
-                for row in results:
-                    processed = True
-                    yield row
-                    num_yielded += 1
-
-                if not processed:
-                    break  # no more results
-                offset += page_size
-        finally:
-            # https://www2.sqlite.org/cvstrac/wiki?p=DatabaseIsLocked (SELECT not
-            # finalized or reset) to prevent database table is locked error when an
-            # exception is raised in the middle of processing the results (e.g.
-            # https://github.com/iterative/dvcx/issues/924). Connections close
-            # apparently is not enough in some cases, at least on sqlite
-            # https://www.sqlite.org/c3ref/close.html
-            if results and hasattr(results, "close"):
-                results.close()
+
+        while True:
+            if limit is not None:
+                limit -= num_yielded
+                if limit == 0:
+                    break
+                if limit < page_size:
+                    paginated_query = paginated_query.limit(None).limit(limit)
+
+            results = self.dataset_rows_select(paginated_query.offset(offset))
+
+            processed = False
+            for row in results:
+                processed = True
+                yield row
+                num_yielded += 1
+
+            if not processed:
+                break  # no more results
+            offset += page_size
 
     #
     # Table Name Internal Functions
```
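The rewritten pagination loop drops the `try`/`finally` and its explicit `results.close()`. The core idea is offset pagination with an optional overall row limit, stopping on the first empty page. A simplified, self-contained sketch of that idea (`fetch_page` is a hypothetical stand-in for `self.dataset_rows_select(paginated_query.offset(...))`):

```python
from collections.abc import Iterator
from typing import Optional

ROWS = list(range(23))  # fake table standing in for a dataset

def fetch_page(offset: int, count: int) -> list[int]:
    # Stand-in for running the paginated SELECT against the warehouse.
    return ROWS[offset : offset + count]

def paginate(page_size: int, limit: Optional[int] = None) -> Iterator[int]:
    offset = 0
    remaining = limit
    while True:
        # Never request more rows than the remaining overall limit.
        count = page_size if remaining is None else min(page_size, remaining)
        if count == 0:
            return  # limit reached
        page = fetch_page(offset, count)
        if not page:
            return  # empty page: no more results
        for row in page:
            yield row
            if remaining is not None:
                remaining -= 1
        offset += page_size

assert list(paginate(page_size=10, limit=5)) == list(range(5))
assert len(list(paginate(page_size=10))) == 23
```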
datachain/dataset.py
CHANGED
```diff
@@ -11,8 +11,6 @@ from typing import (
 )
 from urllib.parse import urlparse
 
-from dateutil.parser import isoparse
-
 from datachain.client import Client
 from datachain.sql.types import NAME_TYPES_MAPPING, SQLType
 
@@ -25,6 +23,7 @@ DD = TypeVar("DD", bound="DatasetDependency")
 
 DATASET_PREFIX = "ds://"
 QUERY_DATASET_PREFIX = "ds_query_"
+LISTING_PREFIX = "lst__"
 
 
 def parse_dataset_uri(uri: str) -> tuple[str, Optional[int]]:
@@ -72,11 +71,22 @@ class DatasetDependencyType:
 class DatasetDependency:
     id: int
     type: str
-    name: str
-    version: str  #
+    name: str
+    version: str  # TODO change to int
     created_at: datetime
     dependencies: list[Optional["DatasetDependency"]]
 
+    @property
+    def dataset_name(self) -> str:
+        """Returns clean dependency dataset name"""
+        from datachain.lib.listing import parse_listing_uri
+
+        if self.type == DatasetDependencyType.DATASET:
+            return self.name
+
+        list_dataset_name, _, _ = parse_listing_uri(self.name.strip("/"), None, {})
+        return list_dataset_name
+
     @classmethod
     def parse(
         cls: builtins.type[DD],
@@ -91,33 +101,31 @@ class DatasetDependency:
         dataset_version_created_at: Optional[datetime],
         bucket_uri: Optional["StorageURI"],
     ) -> Optional["DatasetDependency"]:
-        ...
-        # removing them from tables so that we can still have references
-        return None
+        from datachain.lib.listing import is_listing_dataset, listing_uri_from_name
+
+        if not dataset_id:
+            return None
+
+        assert dataset_name is not None
+        dependency_type = DatasetDependencyType.DATASET
+        dependency_name = dataset_name
+
+        if is_listing_dataset(dataset_name):
+            dependency_type = DatasetDependencyType.STORAGE  # type: ignore[arg-type]
+            dependency_name = listing_uri_from_name(dataset_name)
+
+        return cls(
+            id,
+            dependency_type,
+            dependency_name,
+            (
+                str(dataset_version)  # type: ignore[arg-type]
+                if dataset_version
+                else None
+            ),
+            dataset_version_created_at or dataset_created_at,  # type: ignore[arg-type]
+            [],
+        )
 
     @property
     def is_dataset(self) -> bool:
@@ -443,7 +451,11 @@ class DatasetRecord:
         For bucket listing we implicitly create underlying dataset to hold data. This
         method is checking if this is one of those datasets.
         """
-        ...
+        # TODO refactor and maybe remove method in
+        # https://github.com/iterative/datachain/issues/318
+        return Client.is_data_source_uri(self.name) or self.name.startswith(
+            LISTING_PREFIX
+        )
 
     @property
     def versions_values(self) -> list[int]:
```
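`LISTING_PREFIX` gives the implicit listing datasets a recognizable name, and `DatasetRecord.is_bucket_listing` now matches on it in addition to data-source URIs. A hedged sketch of just the prefix check (`looks_like_listing` is a hypothetical helper; the real method also accepts URIs via `Client.is_data_source_uri`):

```python
LISTING_PREFIX = "lst__"

def looks_like_listing(name: str) -> bool:
    # Mirrors only the startswith() branch of is_bucket_listing.
    return name.startswith(LISTING_PREFIX)

assert looks_like_listing("lst__s3://my-bucket/images/")
assert not looks_like_listing("my_dataset")
```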
datachain/job.py
CHANGED
```diff
@@ -1,7 +1,8 @@
 import json
+import uuid
 from dataclasses import dataclass
 from datetime import datetime
-from typing import Any, Optional, TypeVar
+from typing import Any, Optional, TypeVar, Union
 
 J = TypeVar("J", bound="Job")
 
@@ -25,7 +26,7 @@ class Job:
     @classmethod
     def parse(
         cls: type[J],
-        id: str,
+        id: Union[str, uuid.UUID],
         name: str,
         status: int,
         created_at: datetime,
@@ -40,7 +41,7 @@ class Job:
         metrics: str,
     ) -> "Job":
         return cls(
-            id,
+            str(id),
             name,
            status,
            created_at,
```
datachain/lib/arrow.py
CHANGED
```diff
@@ -7,7 +7,9 @@ import pyarrow as pa
 from pyarrow.dataset import dataset
 from tqdm import tqdm
 
+from datachain.lib.data_model import dict_to_data_model
 from datachain.lib.file import File, IndexedFile
+from datachain.lib.model_store import ModelStore
 from datachain.lib.udf import Generator
 
 if TYPE_CHECKING:
@@ -59,7 +61,13 @@ class ArrowGenerator(Generator):
         vals = list(record.values())
         if self.output_schema:
             fields = self.output_schema.model_fields
-            ...
+            vals_dict = {}
+            for (field, field_info), val in zip(fields.items(), vals):
+                if ModelStore.is_pydantic(field_info.annotation):
+                    vals_dict[field] = field_info.annotation(**val)  # type: ignore[misc]
+                else:
+                    vals_dict[field] = val
+            vals = [self.output_schema(**vals_dict)]
         if self.source:
             yield [IndexedFile(file=file, index=index), *vals]
         else:
@@ -95,15 +103,15 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = None):
         if not column:
             column = f"c{default_column}"
             default_column += 1
-        dtype = arrow_type_mapper(field.type)  # type: ignore[assignment]
-        if field.nullable:
+        dtype = arrow_type_mapper(field.type, column)  # type: ignore[assignment]
+        if field.nullable and not ModelStore.is_pydantic(dtype):
             dtype = Optional[dtype]  # type: ignore[assignment]
         output[column] = dtype
 
     return output
 
 
-def arrow_type_mapper(col_type: pa.DataType) -> type:  # noqa: PLR0911
+def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type:  # noqa: PLR0911
     """Convert pyarrow types to basic types."""
     from datetime import datetime
 
@@ -123,7 +131,15 @@ def arrow_type_mapper(col_type: pa.DataType) -> type:  # noqa: PLR0911
         return str
     if pa.types.is_list(col_type):
         return list[arrow_type_mapper(col_type.value_type)]  # type: ignore[return-value, misc]
-    if pa.types.is_struct(col_type)
+    if pa.types.is_struct(col_type):
+        type_dict = {}
+        for field in col_type:
+            dtype = arrow_type_mapper(field.type, field.name)
+            if field.nullable and not ModelStore.is_pydantic(dtype):
+                dtype = Optional[dtype]  # type: ignore[assignment]
+            type_dict[field.name] = dtype
+        return dict_to_data_model(column, type_dict)
+
     if pa.types.is_map(col_type):
         return dict
     if isinstance(col_type, pa.lib.DictionaryType):
         return arrow_type_mapper(col_type.value_type)  # type: ignore[return-value]
```
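For struct columns, `arrow_type_mapper` now produces a dynamic data model (one field per struct member) via `dict_to_data_model` instead of falling through to `dict`. A sketch of the general pattern using plain pydantic (`dict_to_data_model` is datachain's helper; `create_model` stands in for it here):

```python
from typing import Optional

import pyarrow as pa
from pydantic import create_model

struct = pa.struct([("lat", pa.float64()), ("lon", pa.float64())])

# One (type, default) pair per struct member; both members are float64 here,
# so Optional[float] stands in for a full arrow-to-Python type mapping.
fields = {f.name: (Optional[float], None) for f in struct}
Point = create_model("point", **fields)

print(Point(lat=1.5, lon=-2.0))  # lat=1.5 lon=-2.0
```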
datachain/lib/dataset_info.py
CHANGED
```diff
@@ -23,6 +23,8 @@ class DatasetInfo(DataModel):
     size: Optional[int] = Field(default=None)
     params: dict[str, str] = Field(default=dict)
     metrics: dict[str, Any] = Field(default=dict)
+    error_message: str = Field(default="")
+    error_stack: str = Field(default="")
 
     @staticmethod
     def _validate_dict(
@@ -67,4 +69,6 @@ class DatasetInfo(DataModel):
             size=version.size,
             params=job.params if job else {},
             metrics=job.metrics if job else {},
+            error_message=version.error_message,
+            error_stack=version.error_stack,
         )
```
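The new fields surface a failed run's error details on `DatasetInfo`. A hedged sketch of how such message/stack pairs are typically captured (the generic pattern, not necessarily datachain's exact code):

```python
import traceback

error_message, error_stack = "", ""
try:
    raise ValueError("boom")  # stand-in for a failing job step
except ValueError as exc:
    error_message = str(exc)
    error_stack = traceback.format_exc()

assert error_message == "boom"
assert error_stack.rstrip().endswith("ValueError: boom")
```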