datachain 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl

This diff compares the contents of two package versions publicly released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in that registry.

Note: this release of datachain has been flagged as potentially problematic.

@@ -143,7 +143,9 @@ class SQLiteDatabaseEngine(DatabaseEngine):
         db.execute("PRAGMA synchronous = NORMAL")
         db.execute("PRAGMA case_sensitive_like = ON")
         if os.environ.get("DEBUG_SHOW_SQL_QUERIES"):
-            db.set_trace_callback(print)
+            import sys
+
+            db.set_trace_callback(sys.stderr.write)
 
         load_usearch_extension(db)
 
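With this change, enabling DEBUG_SHOW_SQL_QUERIES routes SQL trace output to stderr instead of stdout. A minimal sketch of the same mechanism using only the standard sqlite3 module (not datachain's wrapper; the environment-variable handling is omitted):

import sqlite3
import sys

db = sqlite3.connect(":memory:")
# Each executed statement is passed to the callback as a string; writing it to
# sys.stderr keeps query traces out of stdout, mirroring the change above.
# Note that sys.stderr.write does not append a newline.
db.set_trace_callback(sys.stderr.write)
db.execute("CREATE TABLE t (x INTEGER)")
db.execute("INSERT INTO t VALUES (1)")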
@@ -515,17 +517,6 @@ class SQLiteMetastore(AbstractDBMetastore):
     def _datasets_dependencies_insert(self) -> "Insert":
         return sqlite.insert(self._datasets_dependencies)
 
-    #
-    # Storages
-    #
-
-    def mark_storage_not_indexed(self, uri: StorageURI) -> None:
-        """
-        Mark storage as not indexed.
-        This method should be called when storage index is deleted.
-        """
-        self.db.execute(self._storages_delete().where(self._storages.c.uri == uri))
-
     #
     # Dataset dependencies
     #
@@ -218,35 +218,26 @@ class AbstractWarehouse(ABC, Serializable):
         results = None
         offset = 0
         num_yielded = 0
-        try:
-            while True:
-                if limit is not None:
-                    limit -= num_yielded
-                    if limit == 0:
-                        break
-                    if limit < page_size:
-                        paginated_query = paginated_query.limit(None).limit(limit)
-
-                results = self.dataset_rows_select(paginated_query.offset(offset))
-
-                processed = False
-                for row in results:
-                    processed = True
-                    yield row
-                    num_yielded += 1
-
-                if not processed:
-                    break  # no more results
-                offset += page_size
-        finally:
-            # https://www2.sqlite.org/cvstrac/wiki?p=DatabaseIsLocked (SELECT not
-            # finalized or reset) to prevent database table is locked error when an
-            # exception is raised in the middle of processing the results (e.g.
-            # https://github.com/iterative/dvcx/issues/924). Connections close
-            # apparently is not enough in some cases, at least on sqlite
-            # https://www.sqlite.org/c3ref/close.html
-            if results and hasattr(results, "close"):
-                results.close()
+
+        while True:
+            if limit is not None:
+                limit -= num_yielded
+                if limit == 0:
+                    break
+                if limit < page_size:
+                    paginated_query = paginated_query.limit(None).limit(limit)
+
+            results = self.dataset_rows_select(paginated_query.offset(offset))
+
+            processed = False
+            for row in results:
+                processed = True
+                yield row
+                num_yielded += 1
+
+            if not processed:
+                break  # no more results
+            offset += page_size
 
     #
     # Table Name Internal Functions
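The hunk above removes the try/finally that explicitly closed the result cursor; the pagination loop itself is unchanged. For reference, a standalone sketch of that loop's shape, using a hypothetical fetch_page callable rather than datachain's dataset_rows_select:

from typing import Callable, Iterator, Optional, Sequence

def paginate(
    fetch_page: Callable[[int, int], Sequence],  # (offset, page_size) -> rows
    page_size: int,
    limit: Optional[int] = None,
) -> Iterator:
    offset = 0
    num_yielded = 0
    while True:
        if limit is not None:
            remaining = limit - num_yielded
            if remaining == 0:
                break
            # Shrink the last page so we never fetch more rows than requested.
            page_size = min(page_size, remaining)
        rows = fetch_page(offset, page_size)
        if not rows:
            break  # no more results
        for row in rows:
            yield row
            num_yielded += 1
        offset += page_size

data = list(range(25))
assert list(paginate(lambda off, size: data[off : off + size], page_size=10, limit=12)) == list(range(12))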
datachain/dataset.py CHANGED
@@ -11,8 +11,6 @@ from typing import (
 )
 from urllib.parse import urlparse
 
-from dateutil.parser import isoparse
-
 from datachain.client import Client
 from datachain.sql.types import NAME_TYPES_MAPPING, SQLType
 
@@ -25,6 +23,7 @@ DD = TypeVar("DD", bound="DatasetDependency")
 
 DATASET_PREFIX = "ds://"
 QUERY_DATASET_PREFIX = "ds_query_"
+LISTING_PREFIX = "lst__"
 
 
 def parse_dataset_uri(uri: str) -> tuple[str, Optional[int]]:
@@ -72,11 +71,22 @@ class DatasetDependencyType:
 class DatasetDependency:
     id: int
     type: str
-    name: str  # when the type is STORAGE, this is actually StorageURI
-    version: str  # string until we'll have proper bucket listing versions
+    name: str
+    version: str  # TODO change to int
     created_at: datetime
     dependencies: list[Optional["DatasetDependency"]]
 
+    @property
+    def dataset_name(self) -> str:
+        """Returns clean dependency dataset name"""
+        from datachain.lib.listing import parse_listing_uri
+
+        if self.type == DatasetDependencyType.DATASET:
+            return self.name
+
+        list_dataset_name, _, _ = parse_listing_uri(self.name.strip("/"), None, {})
+        return list_dataset_name
+
     @classmethod
     def parse(
         cls: builtins.type[DD],
@@ -91,33 +101,31 @@ class DatasetDependency:
         dataset_version_created_at: Optional[datetime],
         bucket_uri: Optional["StorageURI"],
     ) -> Optional["DatasetDependency"]:
-        if dataset_id:
-            assert dataset_name is not None
-            return cls(
-                id,
-                DatasetDependencyType.DATASET,
-                dataset_name,
-                (
-                    str(dataset_version)  # type: ignore[arg-type]
-                    if dataset_version
-                    else None
-                ),
-                dataset_version_created_at or dataset_created_at,  # type: ignore[arg-type]
-                [],
-            )
-        if bucket_uri:
-            return cls(
-                id,
-                DatasetDependencyType.STORAGE,
-                bucket_uri,
-                bucket_version,  # type: ignore[arg-type]
-                isoparse(bucket_version),  # type: ignore[arg-type]
-                [],
-            )
-        # dependency has been removed
-        # TODO we should introduce flags for removed datasets, instead of
-        # removing them from tables so that we can still have references
-        return None
+        from datachain.lib.listing import is_listing_dataset, listing_uri_from_name
+
+        if not dataset_id:
+            return None
+
+        assert dataset_name is not None
+        dependency_type = DatasetDependencyType.DATASET
+        dependency_name = dataset_name
+
+        if is_listing_dataset(dataset_name):
+            dependency_type = DatasetDependencyType.STORAGE  # type: ignore[arg-type]
+            dependency_name = listing_uri_from_name(dataset_name)
+
+        return cls(
+            id,
+            dependency_type,
+            dependency_name,
+            (
+                str(dataset_version)  # type: ignore[arg-type]
+                if dataset_version
+                else None
+            ),
+            dataset_version_created_at or dataset_created_at,  # type: ignore[arg-type]
+            [],
+        )
 
     @property
     def is_dataset(self) -> bool:
@@ -443,7 +451,11 @@ class DatasetRecord:
         For bucket listing we implicitly create underlying dataset to hold data. This
         method is checking if this is one of those datasets.
         """
-        return Client.is_data_source_uri(self.name)
+        # TODO refactor and maybe remove method in
+        # https://github.com/iterative/datachain/issues/318
+        return Client.is_data_source_uri(self.name) or self.name.startswith(
+            LISTING_PREFIX
+        )
 
     @property
     def versions_values(self) -> list[int]:
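For illustration, a rough sketch of the broadened is_bucket_listing check above. LISTING_PREFIX comes from the hunk earlier in this file; the set of URI schemes recognized by Client.is_data_source_uri is an assumption here, not datachain's authoritative list:

LISTING_PREFIX = "lst__"

def is_bucket_listing_name(name: str) -> bool:
    # Hypothetical stand-in for Client.is_data_source_uri plus the new prefix check;
    # the scheme list is illustrative only.
    looks_like_uri = name.startswith(("s3://", "gs://", "az://", "file://"))
    return looks_like_uri or name.startswith(LISTING_PREFIX)

assert is_bucket_listing_name("lst__s3://my-bucket/images/")
assert is_bucket_listing_name("s3://my-bucket/images/")
assert not is_bucket_listing_name("my_dataset")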
datachain/job.py CHANGED
@@ -1,7 +1,8 @@
 import json
+import uuid
 from dataclasses import dataclass
 from datetime import datetime
-from typing import Any, Optional, TypeVar
+from typing import Any, Optional, TypeVar, Union
 
 J = TypeVar("J", bound="Job")
 
@@ -25,7 +26,7 @@ class Job:
     @classmethod
     def parse(
         cls: type[J],
-        id: str,
+        id: Union[str, uuid.UUID],
         name: str,
         status: int,
         created_at: datetime,
@@ -40,7 +41,7 @@
         metrics: str,
     ) -> "Job":
         return cls(
-            id,
+            str(id),
             name,
             status,
             created_at,
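Taken together, the two hunks above let Job.parse accept either a string or a uuid.UUID for id and normalize it to a string. A trivial illustration of that coercion (standalone helper, not datachain's code):

import uuid
from typing import Union

def normalize_id(id: Union[str, uuid.UUID]) -> str:
    # str() is a no-op for strings and renders UUIDs in canonical hyphenated form.
    return str(id)

raw = uuid.UUID("12345678-1234-5678-1234-567812345678")
assert normalize_id(raw) == "12345678-1234-5678-1234-567812345678"
assert normalize_id("job-1") == "job-1"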
datachain/lib/arrow.py CHANGED
@@ -7,7 +7,9 @@ import pyarrow as pa
 from pyarrow.dataset import dataset
 from tqdm import tqdm
 
+from datachain.lib.data_model import dict_to_data_model
 from datachain.lib.file import File, IndexedFile
+from datachain.lib.model_store import ModelStore
 from datachain.lib.udf import Generator
 
 if TYPE_CHECKING:
@@ -59,7 +61,13 @@ class ArrowGenerator(Generator):
             vals = list(record.values())
             if self.output_schema:
                 fields = self.output_schema.model_fields
-                vals = [self.output_schema(**dict(zip(fields, vals)))]
+                vals_dict = {}
+                for (field, field_info), val in zip(fields.items(), vals):
+                    if ModelStore.is_pydantic(field_info.annotation):
+                        vals_dict[field] = field_info.annotation(**val)  # type: ignore[misc]
+                    else:
+                        vals_dict[field] = val
+                vals = [self.output_schema(**vals_dict)]
             if self.source:
                 yield [IndexedFile(file=file, index=index), *vals]
             else:
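The rewritten block above instantiates pydantic-typed fields from their dict values before building the output model. A self-contained sketch of the same idea with plain pydantic v2, where an issubclass check stands in for ModelStore.is_pydantic (an assumption about its behavior):

from pydantic import BaseModel

class Point(BaseModel):
    x: int
    y: int

class Row(BaseModel):
    point: Point
    label: str

record = {"point": {"x": 1, "y": 2}, "label": "a"}

vals_dict = {}
for (field, field_info), val in zip(Row.model_fields.items(), record.values()):
    annotation = field_info.annotation
    if isinstance(annotation, type) and issubclass(annotation, BaseModel):
        vals_dict[field] = annotation(**val)  # build the nested model from its dict
    else:
        vals_dict[field] = val

row = Row(**vals_dict)
assert row.point.y == 2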
@@ -95,15 +103,15 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = Non
         if not column:
             column = f"c{default_column}"
             default_column += 1
-        dtype = arrow_type_mapper(field.type)  # type: ignore[assignment]
-        if field.nullable:
+        dtype = arrow_type_mapper(field.type, column)  # type: ignore[assignment]
+        if field.nullable and not ModelStore.is_pydantic(dtype):
             dtype = Optional[dtype]  # type: ignore[assignment]
         output[column] = dtype
 
     return output
 
 
-def arrow_type_mapper(col_type: pa.DataType) -> type:  # noqa: PLR0911
+def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type:  # noqa: PLR0911
     """Convert pyarrow types to basic types."""
     from datetime import datetime
 
@@ -123,7 +131,15 @@ def arrow_type_mapper(col_type: pa.DataType) -> type:  # noqa: PLR0911
         return str
     if pa.types.is_list(col_type):
         return list[arrow_type_mapper(col_type.value_type)]  # type: ignore[return-value, misc]
-    if pa.types.is_struct(col_type) or pa.types.is_map(col_type):
+    if pa.types.is_struct(col_type):
+        type_dict = {}
+        for field in col_type:
+            dtype = arrow_type_mapper(field.type, field.name)
+            if field.nullable and not ModelStore.is_pydantic(dtype):
+                dtype = Optional[dtype]  # type: ignore[assignment]
+            type_dict[field.name] = dtype
+        return dict_to_data_model(column, type_dict)
+    if pa.types.is_map(col_type):
         return dict
     if isinstance(col_type, pa.lib.DictionaryType):
         return arrow_type_mapper(col_type.value_type)  # type: ignore[return-value]
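The new struct branch above turns each pyarrow struct into a generated data model via dict_to_data_model. A rough standalone equivalent using pydantic.create_model in place of datachain's helper; the simple type table is illustrative, not the full arrow_type_mapper:

from typing import Optional

import pyarrow as pa
from pydantic import create_model

SIMPLE_TYPES = {pa.int64(): int, pa.float64(): float, pa.string(): str, pa.bool_(): bool}

def struct_to_model(name: str, struct: pa.StructType):
    fields = {}
    for field in struct:
        py_type = SIMPLE_TYPES[field.type]
        if field.nullable:
            py_type = Optional[py_type]  # nullable arrow fields become Optional
        fields[field.name] = (py_type, None if field.nullable else ...)
    return create_model(name, **fields)

Point = struct_to_model("point", pa.struct([("x", pa.int64()), ("y", pa.float64())]))
print(Point(x=1, y=2.5))  # x=1 y=2.5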
@@ -23,6 +23,8 @@ class DatasetInfo(DataModel):
     size: Optional[int] = Field(default=None)
     params: dict[str, str] = Field(default=dict)
     metrics: dict[str, Any] = Field(default=dict)
+    error_message: str = Field(default="")
+    error_stack: str = Field(default="")
 
     @staticmethod
     def _validate_dict(
@@ -67,4 +69,6 @@ class DatasetInfo(DataModel):
             size=version.size,
             params=job.params if job else {},
             metrics=job.metrics if job else {},
+            error_message=version.error_message,
+            error_stack=version.error_stack,
         )