datachain 0.16.4__py3-none-any.whl → 0.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/catalog/catalog.py +25 -92
- datachain/cli/__init__.py +11 -9
- datachain/cli/commands/datasets.py +1 -1
- datachain/cli/commands/query.py +1 -0
- datachain/cli/commands/show.py +1 -1
- datachain/cli/parser/__init__.py +11 -3
- datachain/data_storage/job.py +1 -0
- datachain/data_storage/metastore.py +105 -94
- datachain/data_storage/sqlite.py +8 -7
- datachain/data_storage/warehouse.py +58 -46
- datachain/dataset.py +88 -45
- datachain/lib/arrow.py +23 -1
- datachain/lib/dataset_info.py +2 -1
- datachain/lib/dc/csv.py +1 -0
- datachain/lib/dc/datachain.py +38 -16
- datachain/lib/dc/datasets.py +28 -7
- datachain/lib/dc/storage.py +10 -2
- datachain/lib/listing.py +2 -0
- datachain/lib/pytorch.py +2 -2
- datachain/lib/udf.py +17 -5
- datachain/listing.py +1 -1
- datachain/query/batch.py +40 -39
- datachain/query/dataset.py +42 -41
- datachain/query/dispatch.py +137 -75
- datachain/query/metrics.py +1 -2
- datachain/query/queue.py +1 -11
- datachain/query/session.py +2 -2
- datachain/query/udf.py +1 -1
- datachain/query/utils.py +8 -14
- datachain/remote/studio.py +4 -4
- datachain/semver.py +58 -0
- datachain/studio.py +1 -1
- datachain/utils.py +3 -0
- {datachain-0.16.4.dist-info → datachain-0.17.0.dist-info}/METADATA +1 -1
- {datachain-0.16.4.dist-info → datachain-0.17.0.dist-info}/RECORD +39 -38
- {datachain-0.16.4.dist-info → datachain-0.17.0.dist-info}/WHEEL +1 -1
- {datachain-0.16.4.dist-info → datachain-0.17.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.16.4.dist-info → datachain-0.17.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.16.4.dist-info → datachain-0.17.0.dist-info}/top_level.txt +0 -0
datachain/data_storage/metastore.py
CHANGED

@@ -128,7 +128,7 @@ class AbstractMetastore(ABC, Serializable):
     def create_dataset_version(  # noqa: PLR0913
         self,
         dataset: DatasetRecord,
-        version: int,
+        version: str,
         status: int,
         sources: str = "",
         feature_schema: Optional[dict] = None,
@@ -158,13 +158,13 @@ class AbstractMetastore(ABC, Serializable):
 
     @abstractmethod
     def update_dataset_version(
-        self, dataset: DatasetRecord, version: int, **kwargs
+        self, dataset: DatasetRecord, version: str, **kwargs
     ) -> DatasetVersion:
         """Updates dataset version fields."""
 
     @abstractmethod
     def remove_dataset_version(
-        self, dataset: DatasetRecord, version: int
+        self, dataset: DatasetRecord, version: str
     ) -> DatasetRecord:
         """
         Deletes one single dataset version.
@@ -188,7 +188,7 @@ class AbstractMetastore(ABC, Serializable):
         self,
         dataset: DatasetRecord,
         status: int,
-        version: Optional[int] = None,
+        version: Optional[str] = None,
         error_message="",
         error_stack="",
         script_output="",
@@ -202,9 +202,9 @@ class AbstractMetastore(ABC, Serializable):
     def add_dataset_dependency(
         self,
         source_dataset_name: str,
-        source_dataset_version: int,
+        source_dataset_version: str,
         dataset_name: str,
-        dataset_version: int,
+        dataset_version: str,
     ) -> None:
         """Adds dataset dependency to dataset."""
 
@@ -212,21 +212,21 @@ class AbstractMetastore(ABC, Serializable):
     def update_dataset_dependency_source(
         self,
         source_dataset: DatasetRecord,
-        source_dataset_version: int,
+        source_dataset_version: str,
         new_source_dataset: Optional[DatasetRecord] = None,
-        new_source_dataset_version: Optional[int] = None,
+        new_source_dataset_version: Optional[str] = None,
     ) -> None:
         """Updates dataset dependency source."""
 
     @abstractmethod
     def get_direct_dataset_dependencies(
-        self, dataset: DatasetRecord, version: int
+        self, dataset: DatasetRecord, version: str
     ) -> list[Optional[DatasetDependency]]:
         """Gets direct dataset dependencies."""
 
     @abstractmethod
     def remove_dataset_dependencies(
-        self, dataset: DatasetRecord, version: Optional[int] = None
+        self, dataset: DatasetRecord, version: Optional[str] = None
     ) -> None:
         """
         When we remove dataset, we need to clean up it's dependencies as well.
@@ -234,7 +234,7 @@ class AbstractMetastore(ABC, Serializable):
 
     @abstractmethod
     def remove_dataset_dependants(
-        self, dataset: DatasetRecord, version: Optional[int] = None
+        self, dataset: DatasetRecord, version: Optional[str] = None
     ) -> None:
         """
         When we remove dataset, we need to clear its references in other dataset
@@ -254,6 +254,7 @@ class AbstractMetastore(ABC, Serializable):
         name: str,
         query: str,
         query_type: JobQueryType = JobQueryType.PYTHON,
+        status: JobStatus = JobStatus.CREATED,
         workers: int = 1,
         python_version: Optional[str] = None,
         params: Optional[dict[str, str]] = None,
@@ -264,33 +265,35 @@ class AbstractMetastore(ABC, Serializable):
         """
 
     @abstractmethod
-    def set_job_status(
+    def get_job(self, job_id: str) -> Optional[Job]:
+        """Returns the job with the given ID."""
+
+    @abstractmethod
+    def update_job(
         self,
         job_id: str,
-        status: JobStatus,
+        status: Optional[JobStatus] = None,
+        exit_code: Optional[int] = None,
         error_message: Optional[str] = None,
         error_stack: Optional[str] = None,
+        finished_at: Optional[datetime] = None,
         metrics: Optional[dict[str, Any]] = None,
-    ) -> None:
-        """Set the status of the given job."""
+    ) -> Optional["Job"]:
+        """Updates job fields."""
 
     @abstractmethod
-    def get_job_status(self, job_id: str) -> Optional[JobStatus]:
-        """Returns the status of the given job."""
-
-    @abstractmethod
-    def set_job_and_dataset_status(
+    def set_job_status(
         self,
         job_id: str,
-        job_status: JobStatus,
-        dataset_status: DatasetStatus,
+        status: JobStatus,
+        error_message: Optional[str] = None,
+        error_stack: Optional[str] = None,
     ) -> None:
-        """Set the status of the given job and dataset."""
+        """Set the status of the given job."""
 
     @abstractmethod
-    def get_job_dataset_versions(self, job_id: str) -> list[tuple[str, int]]:
-        """Returns dataset names and versions for the job."""
-        raise NotImplementedError
+    def get_job_status(self, job_id: str) -> Optional[JobStatus]:
+        """Returns the status of the given job."""
 
 
 class AbstractDBMetastore(AbstractMetastore):
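In short, the abstract job API moves from status-only setters to a fetch/patch pair: get_job returns the full record and update_job writes only the fields that are actually passed. A minimal caller sketch under those signatures (the `metastore` object, `job_id` value, and the JobStatus.COMPLETE member are assumptions for illustration, not shown in this diff):

    # Sketch only: assumes a concrete AbstractMetastore bound to `metastore`,
    # an existing `job_id`, and a JobStatus.COMPLETE member.
    from datetime import datetime, timezone

    job = metastore.get_job(job_id)  # Optional[Job]; None if the ID is unknown
    if job is not None:
        metastore.update_job(
            job_id,
            status=JobStatus.COMPLETE,  # omitted fields are left untouched
            exit_code=0,
            finished_at=datetime.now(timezone.utc),
        )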
@@ -367,7 +370,7 @@ class AbstractDBMetastore(AbstractMetastore):
             ForeignKey(f"{cls.DATASET_TABLE}.id", ondelete="CASCADE"),
             nullable=False,
         ),
-        Column("version", Integer, nullable=False, default=1),
+        Column("version", Text, nullable=False, default="1.0.0"),
         Column(
             "status",
             Integer,
@@ -551,7 +554,7 @@ class AbstractDBMetastore(AbstractMetastore):
     def create_dataset_version(  # noqa: PLR0913
         self,
         dataset: DatasetRecord,
-        version: int,
+        version: str,
         status: int,
         sources: str = "",
         feature_schema: Optional[dict] = None,
@@ -645,36 +648,37 @@ class AbstractDBMetastore(AbstractMetastore):
         return result_ds
 
     def update_dataset_version(
-        self, dataset: DatasetRecord, version: int, **kwargs
+        self, dataset: DatasetRecord, version: str, conn=None, **kwargs
     ) -> DatasetVersion:
         """Updates dataset fields."""
         dataset_version = dataset.get_version(version)
 
         values = {}
+        version_values: dict = {}
         for field, value in kwargs.items():
             if field in self._dataset_version_fields[1:]:
                 if field == "schema":
-                    dataset_version.update(**{field: DatasetRecord.parse_schema(value)})
                     values[field] = json.dumps(value) if value else None
+                    version_values[field] = DatasetRecord.parse_schema(value)
                 elif field == "feature_schema":
                     values[field] = json.dumps(value) if value else None
+                    version_values[field] = value
                 elif field == "preview" and isinstance(value, list):
                     values[field] = json.dumps(value, cls=JSONSerialize)
+                    version_values[field] = value
                 else:
                     values[field] = value
-                if field != "schema":
-                    dataset_version.update(**{field: value})
-
-        if not values:
-            # Nothing to update
-            return dataset_version
+                    version_values[field] = value
 
-        dv = self._datasets_versions
-        query = (
-            self._datasets_versions_update()
-            .where(dv.c.dataset_id == dataset.id and dv.c.version == version)
-            .values(values)
-        )
-        self.db.execute(query)  # type: ignore [attr-defined]
+        if values:
+            dv = self._datasets_versions
+            self.db.execute(
+                self._datasets_versions_update()
+                .where(dv.c.dataset_id == dataset.id and dv.c.version == version)
+                .values(values),
+                conn=conn,
+            )  # type: ignore [attr-defined]
+            dataset_version.update(**version_values)
 
         return dataset_version
 
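The caller-visible changes in this hunk: `version` is now a semver string, `conn` lets the update join an outer transaction, and the in-memory DatasetVersion is only mutated after the DB write goes through. A hedged usage sketch (the `metastore` and `dataset` objects are assumed to exist):

    # Sketch only: `metastore` is a concrete AbstractDBMetastore and `dataset`
    # a DatasetRecord that already has a "1.2.0" version.
    dataset_version = metastore.update_dataset_version(
        dataset,
        "1.2.0",           # previously an int such as 3
        error_message="",  # only fields in _dataset_version_fields are applied
    )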
@@ -702,7 +706,7 @@ class AbstractDBMetastore(AbstractMetastore):
         dataset_fields: list[str],
         dataset_version_fields: list[str],
         isouter: bool = True,
-    ):
+    ) -> "Select":
         if not (
             self.db.has_table(self._datasets.name)
             and self.db.has_table(self._datasets_versions.name)
|
|
|
719
723
|
j = d.join(dv, d.c.id == dv.c.dataset_id, isouter=isouter)
|
|
720
724
|
return query.select_from(j)
|
|
721
725
|
|
|
722
|
-
def _base_dataset_query(self):
|
|
726
|
+
def _base_dataset_query(self) -> "Select":
|
|
723
727
|
return self._get_dataset_query(
|
|
724
728
|
self._dataset_fields, self._dataset_version_fields
|
|
725
729
|
)
|
|
726
730
|
|
|
727
|
-
def _base_list_datasets_query(self):
|
|
731
|
+
def _base_list_datasets_query(self) -> "Select":
|
|
728
732
|
return self._get_dataset_query(
|
|
729
733
|
self._dataset_list_fields, self._dataset_list_version_fields, isouter=False
|
|
730
734
|
)
|
|
@@ -754,7 +758,7 @@ class AbstractDBMetastore(AbstractMetastore):
         return ds
 
     def remove_dataset_version(
-        self, dataset: DatasetRecord, version: int
+        self, dataset: DatasetRecord, version: str
     ) -> DatasetRecord:
         """
         Deletes one single dataset version.
@@ -787,7 +791,7 @@ class AbstractDBMetastore(AbstractMetastore):
         self,
         dataset: DatasetRecord,
         status: int,
-        version: Optional[int] = None,
+        version: Optional[str] = None,
         error_message="",
         error_stack="",
         script_output="",
@@ -821,9 +825,9 @@ class AbstractDBMetastore(AbstractMetastore):
     def add_dataset_dependency(
         self,
         source_dataset_name: str,
-        source_dataset_version: int,
+        source_dataset_version: str,
         dataset_name: str,
-        dataset_version: int,
+        dataset_version: str,
     ) -> None:
         """Adds dataset dependency to dataset."""
         source_dataset = self.get_dataset(source_dataset_name)
@@ -843,9 +847,9 @@ class AbstractDBMetastore(AbstractMetastore):
     def update_dataset_dependency_source(
         self,
         source_dataset: DatasetRecord,
-        source_dataset_version: int,
+        source_dataset_version: str,
         new_source_dataset: Optional[DatasetRecord] = None,
-        new_source_dataset_version: Optional[int] = None,
+        new_source_dataset_version: Optional[str] = None,
     ) -> None:
         dd = self._datasets_dependencies
 
@@ -876,7 +880,7 @@ class AbstractDBMetastore(AbstractMetastore):
         """
 
     def get_direct_dataset_dependencies(
-        self, dataset: DatasetRecord, version: int
+        self, dataset: DatasetRecord, version: str
     ) -> list[Optional[DatasetDependency]]:
         d = self._datasets
         dd = self._datasets_dependencies
@@ -905,7 +909,7 @@ class AbstractDBMetastore(AbstractMetastore):
         return [self.dependency_class.parse(*r) for r in self.db.execute(query)]
 
     def remove_dataset_dependencies(
-        self, dataset: DatasetRecord, version: Optional[int] = None
+        self, dataset: DatasetRecord, version: Optional[str] = None
     ) -> None:
         """
         When we remove dataset, we need to clean up it's dependencies as well
@@ -924,7 +928,7 @@ class AbstractDBMetastore(AbstractMetastore):
         self.db.execute(q)
 
     def remove_dataset_dependants(
-        self, dataset: DatasetRecord, version: Optional[int] = None
+        self, dataset: DatasetRecord, version: Optional[str] = None
     ) -> None:
         """
         When we remove dataset, we need to clear its references in other dataset
@@ -1018,6 +1022,7 @@ class AbstractDBMetastore(AbstractMetastore):
         name: str,
         query: str,
         query_type: JobQueryType = JobQueryType.PYTHON,
+        status: JobStatus = JobStatus.CREATED,
         workers: int = 1,
         python_version: Optional[str] = None,
         params: Optional[dict[str, str]] = None,
@@ -1032,7 +1037,7 @@ class AbstractDBMetastore(AbstractMetastore):
             self._jobs_insert().values(
                 id=job_id,
                 name=name,
-                status=JobStatus.CREATED,
+                status=status,
                 created_at=datetime.now(timezone.utc),
                 query=query,
                 query_type=query_type.value,
@@ -1047,25 +1052,65 @@ class AbstractDBMetastore(AbstractMetastore):
         )
         return job_id
 
+    def get_job(self, job_id: str, conn=None) -> Optional[Job]:
+        """Returns the job with the given ID."""
+        query = self._jobs_select(self._jobs).where(self._jobs.c.id == job_id)
+        results = list(self.db.execute(query, conn=conn))
+        if not results:
+            return None
+        return self._parse_job(results[0])
+
+    def update_job(
+        self,
+        job_id: str,
+        status: Optional[JobStatus] = None,
+        exit_code: Optional[int] = None,
+        error_message: Optional[str] = None,
+        error_stack: Optional[str] = None,
+        finished_at: Optional[datetime] = None,
+        metrics: Optional[dict[str, Any]] = None,
+        conn: Optional[Any] = None,
+    ) -> Optional["Job"]:
+        """Updates job fields."""
+        values: dict = {}
+        if status is not None:
+            values["status"] = status
+        if exit_code is not None:
+            values["exit_code"] = exit_code
+        if error_message is not None:
+            values["error_message"] = error_message
+        if error_stack is not None:
+            values["error_stack"] = error_stack
+        if finished_at is not None:
+            values["finished_at"] = finished_at
+        if metrics:
+            values["metrics"] = json.dumps(metrics)
+
+        if values:
+            j = self._jobs
+            self.db.execute(
+                self._jobs_update().where(j.c.id == job_id).values(**values),
+                conn=conn,
+            )  # type: ignore [attr-defined]
+
+        return self.get_job(job_id, conn=conn)
+
     def set_job_status(
         self,
         job_id: str,
         status: JobStatus,
         error_message: Optional[str] = None,
         error_stack: Optional[str] = None,
-        metrics: Optional[dict[str, Any]] = None,
         conn: Optional[Any] = None,
     ) -> None:
         """Set the status of the given job."""
-        values: dict = {"status": status.value}
-        if status.value in JobStatus.finished():
+        values: dict = {"status": status}
+        if status in JobStatus.finished():
             values["finished_at"] = datetime.now(timezone.utc)
         if error_message:
             values["error_message"] = error_message
         if error_stack:
             values["error_stack"] = error_stack
-        if metrics:
-            values["metrics"] = json.dumps(metrics)
         self.db.execute(
             self._jobs_update(self._jobs.c.id == job_id).values(**values),
             conn=conn,
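The concrete update_job builds its UPDATE payload only from the arguments that were actually supplied, so a call that passes nothing is a read-back no-op. A standalone restatement of that values-building logic (the helper name is ours, not from the package):

    import json
    from typing import Any, Optional

    def build_job_update_values(
        status: Optional[int] = None,
        exit_code: Optional[int] = None,
        metrics: Optional[dict[str, Any]] = None,
    ) -> dict:
        # Mirrors update_job above: omitted fields never reach the UPDATE.
        values: dict = {}
        if status is not None:
            values["status"] = status
        if exit_code is not None:
            values["exit_code"] = exit_code
        if metrics:
            values["metrics"] = json.dumps(metrics)  # stored as JSON text
        return values

    assert build_job_update_values(exit_code=0) == {"exit_code": 0}
    assert build_job_update_values() == {}  # empty: no UPDATE is issued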
@@ -1086,37 +1131,3 @@ class AbstractDBMetastore(AbstractMetastore):
         if not results:
             return None
         return results[0][0]
-
-    def set_job_and_dataset_status(
-        self,
-        job_id: str,
-        job_status: JobStatus,
-        dataset_status: DatasetStatus,
-    ) -> None:
-        """Set the status of the given job and dataset."""
-        with self.db.transaction() as conn:
-            self.set_job_status(job_id, status=job_status, conn=conn)
-            dv = self._datasets_versions
-            query = (
-                self._datasets_versions_update()
-                .where(
-                    (dv.c.job_id == job_id) & (dv.c.status != DatasetStatus.COMPLETE)
-                )
-                .values(status=dataset_status)
-            )
-            self.db.execute(query, conn=conn)  # type: ignore[attr-defined]
-
-    def get_job_dataset_versions(self, job_id: str) -> list[tuple[str, int]]:
-        """Returns dataset names and versions for the job."""
-        dv = self._datasets_versions
-        ds = self._datasets
-
-        join_condition = dv.c.dataset_id == ds.c.id
-
-        query = (
-            self._datasets_versions_select(ds.c.name, dv.c.version)
-            .select_from(dv.join(ds, join_condition))
-            .where(dv.c.job_id == job_id)
-        )
-
-        return list(self.db.execute(query))
datachain/data_storage/sqlite.py
CHANGED
@@ -25,6 +25,7 @@ from sqlalchemy.sql.selectable import Select
 from tqdm.auto import tqdm
 
 import datachain.sql.sqlite
+from datachain import semver
 from datachain.data_storage import AbstractDBMetastore, AbstractWarehouse
 from datachain.data_storage.db_engine import DatabaseEngine
 from datachain.data_storage.schema import DefaultSchema
@@ -486,7 +487,7 @@ class SQLiteWarehouse(AbstractWarehouse):
         return table
 
     def get_dataset_sources(
-        self, dataset: DatasetRecord, version: int
+        self, dataset: DatasetRecord, version: str
     ) -> list[StorageURI]:
         dr = self.dataset_rows(dataset, version)
         query = dr.select(dr.c("source", column="file")).distinct()
@@ -502,8 +503,8 @@ class SQLiteWarehouse(AbstractWarehouse):
         self,
         src: DatasetRecord,
         dst: DatasetRecord,
-        src_version: int,
-        dst_version: int,
+        src_version: str,
+        dst_version: str,
     ) -> None:
         dst_empty = False
 
@@ -534,7 +535,7 @@ class SQLiteWarehouse(AbstractWarehouse):
         dst_previous_versions = [
             v.version
             for v in dst.versions  # type: ignore [union-attr]
-            if v.version < dst_version
+            if semver.compare(v.version, dst_version) == -1
         ]
         if dst_previous_versions:
             dst_version_latest = max(dst_previous_versions)
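This is the spot where the int-to-string migration would silently break ordering: "10.0.0" sorts before "2.0.0" lexicographically, so previous versions are now filtered with the new datachain.semver helper instead of `<`. A stand-in sketch of what such a comparison does (the real semver.py is added in this release but not shown in this diff; this assumes plain MAJOR.MINOR.PATCH strings with no pre-release tags):

    # Lexicographic ordering mis-sorts multi-digit components:
    assert max(["2.0.0", "10.0.0"]) == "2.0.0"  # wrong answer for semver

    def compare(a: str, b: str) -> int:
        # Hypothetical stand-in for datachain.semver.compare; returns -1/0/1.
        ta, tb = (tuple(map(int, v.split("."))) for v in (a, b))
        return (ta > tb) - (ta < tb)

    assert compare("2.0.0", "10.0.0") == -1  # semver: 2.0.0 precedes 10.0.0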
@@ -570,7 +571,7 @@ class SQLiteWarehouse(AbstractWarehouse):
             conn=conn,
         )
 
-    def insert_dataset_rows(self, df, dataset: DatasetRecord, version: int) -> int:
+    def insert_dataset_rows(self, df, dataset: DatasetRecord, version: str) -> int:
         dr = self.dataset_rows(dataset, version)
         return self.db.insert_dataframe(dr.table.name, df)
 
@@ -595,7 +596,7 @@ class SQLiteWarehouse(AbstractWarehouse):
         return col_type.python_type
 
     def dataset_table_export_file_names(
-        self, dataset: DatasetRecord, version: int
+        self, dataset: DatasetRecord, version: str
     ) -> list[str]:
         raise NotImplementedError("Exporting dataset table not implemented for SQLite")
 
@@ -603,7 +604,7 @@ class SQLiteWarehouse(AbstractWarehouse):
         self,
         bucket_uri: str,
         dataset: DatasetRecord,
-        version: int,
+        version: str,
         client_config=None,
     ) -> list[str]:
         raise NotImplementedError("Exporting dataset table not implemented for SQLite")