datachain 0.16.4__py3-none-any.whl → 0.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain has been flagged as potentially problematic.

Files changed (39)
  1. datachain/catalog/catalog.py +25 -92
  2. datachain/cli/__init__.py +11 -9
  3. datachain/cli/commands/datasets.py +1 -1
  4. datachain/cli/commands/query.py +1 -0
  5. datachain/cli/commands/show.py +1 -1
  6. datachain/cli/parser/__init__.py +11 -3
  7. datachain/data_storage/job.py +1 -0
  8. datachain/data_storage/metastore.py +105 -94
  9. datachain/data_storage/sqlite.py +8 -7
  10. datachain/data_storage/warehouse.py +58 -46
  11. datachain/dataset.py +88 -45
  12. datachain/lib/arrow.py +23 -1
  13. datachain/lib/dataset_info.py +2 -1
  14. datachain/lib/dc/csv.py +1 -0
  15. datachain/lib/dc/datachain.py +38 -16
  16. datachain/lib/dc/datasets.py +28 -7
  17. datachain/lib/dc/storage.py +10 -2
  18. datachain/lib/listing.py +2 -0
  19. datachain/lib/pytorch.py +2 -2
  20. datachain/lib/udf.py +17 -5
  21. datachain/listing.py +1 -1
  22. datachain/query/batch.py +40 -39
  23. datachain/query/dataset.py +42 -41
  24. datachain/query/dispatch.py +137 -75
  25. datachain/query/metrics.py +1 -2
  26. datachain/query/queue.py +1 -11
  27. datachain/query/session.py +2 -2
  28. datachain/query/udf.py +1 -1
  29. datachain/query/utils.py +8 -14
  30. datachain/remote/studio.py +4 -4
  31. datachain/semver.py +58 -0
  32. datachain/studio.py +1 -1
  33. datachain/utils.py +3 -0
  34. {datachain-0.16.4.dist-info → datachain-0.17.0.dist-info}/METADATA +1 -1
  35. {datachain-0.16.4.dist-info → datachain-0.17.0.dist-info}/RECORD +39 -38
  36. {datachain-0.16.4.dist-info → datachain-0.17.0.dist-info}/WHEEL +1 -1
  37. {datachain-0.16.4.dist-info → datachain-0.17.0.dist-info}/entry_points.txt +0 -0
  38. {datachain-0.16.4.dist-info → datachain-0.17.0.dist-info}/licenses/LICENSE +0 -0
  39. {datachain-0.16.4.dist-info → datachain-0.17.0.dist-info}/top_level.txt +0 -0

--- a/datachain/data_storage/metastore.py
+++ b/datachain/data_storage/metastore.py
@@ -128,7 +128,7 @@ class AbstractMetastore(ABC, Serializable):
     def create_dataset_version(  # noqa: PLR0913
         self,
         dataset: DatasetRecord,
-        version: int,
+        version: str,
         status: int,
         sources: str = "",
         feature_schema: Optional[dict] = None,
@@ -158,13 +158,13 @@ class AbstractMetastore(ABC, Serializable):
 
     @abstractmethod
     def update_dataset_version(
-        self, dataset: DatasetRecord, version: int, **kwargs
+        self, dataset: DatasetRecord, version: str, **kwargs
     ) -> DatasetVersion:
         """Updates dataset version fields."""
 
     @abstractmethod
     def remove_dataset_version(
-        self, dataset: DatasetRecord, version: int
+        self, dataset: DatasetRecord, version: str
     ) -> DatasetRecord:
         """
         Deletes one single dataset version.
@@ -188,7 +188,7 @@ class AbstractMetastore(ABC, Serializable):
         self,
         dataset: DatasetRecord,
         status: int,
-        version: Optional[int] = None,
+        version: Optional[str] = None,
         error_message="",
         error_stack="",
         script_output="",
@@ -202,9 +202,9 @@ class AbstractMetastore(ABC, Serializable):
     def add_dataset_dependency(
         self,
         source_dataset_name: str,
-        source_dataset_version: int,
+        source_dataset_version: str,
         dataset_name: str,
-        dataset_version: int,
+        dataset_version: str,
     ) -> None:
         """Adds dataset dependency to dataset."""
 
@@ -212,21 +212,21 @@ class AbstractMetastore(ABC, Serializable):
     def update_dataset_dependency_source(
         self,
         source_dataset: DatasetRecord,
-        source_dataset_version: int,
+        source_dataset_version: str,
         new_source_dataset: Optional[DatasetRecord] = None,
-        new_source_dataset_version: Optional[int] = None,
+        new_source_dataset_version: Optional[str] = None,
     ) -> None:
         """Updates dataset dependency source."""
 
     @abstractmethod
     def get_direct_dataset_dependencies(
-        self, dataset: DatasetRecord, version: int
+        self, dataset: DatasetRecord, version: str
     ) -> list[Optional[DatasetDependency]]:
         """Gets direct dataset dependencies."""
 
     @abstractmethod
     def remove_dataset_dependencies(
-        self, dataset: DatasetRecord, version: Optional[int] = None
+        self, dataset: DatasetRecord, version: Optional[str] = None
     ) -> None:
         """
         When we remove dataset, we need to clean up it's dependencies as well.
@@ -234,7 +234,7 @@ class AbstractMetastore(ABC, Serializable):
 
     @abstractmethod
     def remove_dataset_dependants(
-        self, dataset: DatasetRecord, version: Optional[int] = None
+        self, dataset: DatasetRecord, version: Optional[str] = None
     ) -> None:
         """
         When we remove dataset, we need to clear its references in other dataset
@@ -254,6 +254,7 @@ class AbstractMetastore(ABC, Serializable):
         name: str,
         query: str,
         query_type: JobQueryType = JobQueryType.PYTHON,
+        status: JobStatus = JobStatus.CREATED,
         workers: int = 1,
         python_version: Optional[str] = None,
         params: Optional[dict[str, str]] = None,
@@ -264,33 +265,35 @@ class AbstractMetastore(ABC, Serializable):
         """
 
     @abstractmethod
-    def set_job_status(
+    def get_job(self, job_id: str) -> Optional[Job]:
+        """Returns the job with the given ID."""
+
+    @abstractmethod
+    def update_job(
         self,
         job_id: str,
-        status: JobStatus,
+        status: Optional[JobStatus] = None,
+        exit_code: Optional[int] = None,
         error_message: Optional[str] = None,
         error_stack: Optional[str] = None,
+        finished_at: Optional[datetime] = None,
         metrics: Optional[dict[str, Any]] = None,
-    ) -> None:
-        """Set the status of the given job."""
+    ) -> Optional["Job"]:
+        """Updates job fields."""
 
     @abstractmethod
-    def get_job_status(self, job_id: str) -> Optional[JobStatus]:
-        """Returns the status of the given job."""
-
-    @abstractmethod
-    def set_job_and_dataset_status(
+    def set_job_status(
         self,
         job_id: str,
-        job_status: JobStatus,
-        dataset_status: DatasetStatus,
+        status: JobStatus,
+        error_message: Optional[str] = None,
+        error_stack: Optional[str] = None,
     ) -> None:
-        """Set the status of the given job and dataset."""
+        """Set the status of the given job."""
 
     @abstractmethod
-    def get_job_dataset_versions(self, job_id: str) -> list[tuple[str, int]]:
-        """Returns dataset names and versions for the job."""
-        raise NotImplementedError
+    def get_job_status(self, job_id: str) -> Optional[JobStatus]:
+        """Returns the status of the given job."""
 
 
 class AbstractDBMetastore(AbstractMetastore):
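
Note: the abstract job API above is reshaped in 0.17.0. set_job_and_dataset_status and get_job_dataset_versions are removed, get_job and update_job are added, and metrics move from set_job_status to update_job. A minimal migration sketch follows; `metastore`, `job_id`, and the import path are illustrative assumptions, not taken from this diff:

    from datachain.data_storage.job import JobStatus  # assumed import path

    def finish_job(metastore, job_id: str) -> None:
        # 0.16.4 attached metrics while setting the status:
        #   metastore.set_job_status(job_id, JobStatus.COMPLETE, metrics={"rows": 42})
        # 0.17.0 splits this: update_job writes only the fields passed and
        # returns the refreshed Job (or None for an unknown id).
        metastore.update_job(
            job_id,
            status=JobStatus.COMPLETE,  # COMPLETE member assumed to exist
            metrics={"rows": 42},
        )
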
@@ -367,7 +370,7 @@ class AbstractDBMetastore(AbstractMetastore):
                 ForeignKey(f"{cls.DATASET_TABLE}.id", ondelete="CASCADE"),
                 nullable=False,
             ),
-            Column("version", Integer, nullable=False),
+            Column("version", Text, nullable=False, default="1.0.0"),
             Column(
                 "status",
                 Integer,
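
Note: with the column change above ("version" becomes Text with a default of "1.0.0"), every version argument in the int to str renames throughout this diff now carries a semantic-version string. A hedged sketch of a post-0.17.0 call site; `metastore` and `dataset` are illustrative names:

    def create_first_version(metastore, dataset) -> None:
        # Versions are semver strings now, not auto-incrementing ints;
        # `status` is still an int (a DatasetStatus value; 1 assumed CREATED).
        metastore.create_dataset_version(dataset, version="1.0.0", status=1)
        metastore.get_direct_dataset_dependencies(dataset, "1.0.0")
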
@@ -551,7 +554,7 @@ class AbstractDBMetastore(AbstractMetastore):
     def create_dataset_version(  # noqa: PLR0913
         self,
         dataset: DatasetRecord,
-        version: int,
+        version: str,
         status: int,
         sources: str = "",
         feature_schema: Optional[dict] = None,
@@ -645,36 +648,37 @@ class AbstractDBMetastore(AbstractMetastore):
         return result_ds
 
     def update_dataset_version(
-        self, dataset: DatasetRecord, version: int, conn=None, **kwargs
+        self, dataset: DatasetRecord, version: str, conn=None, **kwargs
     ) -> DatasetVersion:
         """Updates dataset fields."""
         dataset_version = dataset.get_version(version)
 
         values = {}
+        version_values: dict = {}
         for field, value in kwargs.items():
             if field in self._dataset_version_fields[1:]:
                 if field == "schema":
-                    dataset_version.update(**{field: DatasetRecord.parse_schema(value)})
                     values[field] = json.dumps(value) if value else None
+                    version_values[field] = DatasetRecord.parse_schema(value)
                 elif field == "feature_schema":
                     values[field] = json.dumps(value) if value else None
+                    version_values[field] = value
                 elif field == "preview" and isinstance(value, list):
                     values[field] = json.dumps(value, cls=JSONSerialize)
+                    version_values[field] = value
                 else:
                     values[field] = value
-                    dataset_version.update(**{field: value})
-
-        if not values:
-            # Nothing to update
-            return dataset_version
+                    version_values[field] = value
 
-        dv = self._datasets_versions
-        self.db.execute(
-            self._datasets_versions_update()
-            .where(dv.c.id == dataset_version.id)
-            .values(values),
-            conn=conn,
-        )  # type: ignore [attr-defined]
+        if values:
+            dv = self._datasets_versions
+            self.db.execute(
+                self._datasets_versions_update()
+                .where(dv.c.dataset_id == dataset.id and dv.c.version == version)
+                .values(values),
+                conn=conn,
+            )  # type: ignore [attr-defined]
+            dataset_version.update(**version_values)
 
         return dataset_version
 
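Note: in the rewritten WHERE clause above, the two SQLAlchemy comparisons are combined with Python's `and`, which resolves to a single operand by truthiness, so only one of the two filters reaches the emitted SQL. The conventional conjunction, shown as a self-contained sketch with a stand-in table:

    import sqlalchemy as sa

    meta = sa.MetaData()
    dv = sa.Table(
        "datasets_versions",
        meta,
        sa.Column("dataset_id", sa.Integer),
        sa.Column("version", sa.Text),
        sa.Column("status", sa.Integer),
    )

    # and_() (or passing both conditions to .where()) keeps both filters;
    # `a and b` is evaluated by Python before SQLAlchemy ever sees it.
    stmt = (
        sa.update(dv)
        .where(sa.and_(dv.c.dataset_id == 1, dv.c.version == "1.0.0"))
        .values(status=4)
    )
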
@@ -702,7 +706,7 @@ class AbstractDBMetastore(AbstractMetastore):
         dataset_fields: list[str],
         dataset_version_fields: list[str],
         isouter: bool = True,
-    ):
+    ) -> "Select":
         if not (
             self.db.has_table(self._datasets.name)
             and self.db.has_table(self._datasets_versions.name)
@@ -719,12 +723,12 @@ class AbstractDBMetastore(AbstractMetastore):
         j = d.join(dv, d.c.id == dv.c.dataset_id, isouter=isouter)
         return query.select_from(j)
 
-    def _base_dataset_query(self):
+    def _base_dataset_query(self) -> "Select":
         return self._get_dataset_query(
             self._dataset_fields, self._dataset_version_fields
         )
 
-    def _base_list_datasets_query(self):
+    def _base_list_datasets_query(self) -> "Select":
         return self._get_dataset_query(
             self._dataset_list_fields, self._dataset_list_version_fields, isouter=False
         )
@@ -754,7 +758,7 @@ class AbstractDBMetastore(AbstractMetastore):
         return ds
 
     def remove_dataset_version(
-        self, dataset: DatasetRecord, version: int
+        self, dataset: DatasetRecord, version: str
     ) -> DatasetRecord:
         """
         Deletes one single dataset version.
@@ -787,7 +791,7 @@ class AbstractDBMetastore(AbstractMetastore):
         self,
         dataset: DatasetRecord,
         status: int,
-        version: Optional[int] = None,
+        version: Optional[str] = None,
         error_message="",
         error_stack="",
         script_output="",
@@ -821,9 +825,9 @@ class AbstractDBMetastore(AbstractMetastore):
     def add_dataset_dependency(
         self,
         source_dataset_name: str,
-        source_dataset_version: int,
+        source_dataset_version: str,
         dataset_name: str,
-        dataset_version: int,
+        dataset_version: str,
     ) -> None:
         """Adds dataset dependency to dataset."""
         source_dataset = self.get_dataset(source_dataset_name)
@@ -843,9 +847,9 @@ class AbstractDBMetastore(AbstractMetastore):
     def update_dataset_dependency_source(
         self,
         source_dataset: DatasetRecord,
-        source_dataset_version: int,
+        source_dataset_version: str,
         new_source_dataset: Optional[DatasetRecord] = None,
-        new_source_dataset_version: Optional[int] = None,
+        new_source_dataset_version: Optional[str] = None,
     ) -> None:
         dd = self._datasets_dependencies
 
@@ -876,7 +880,7 @@ class AbstractDBMetastore(AbstractMetastore):
         """
 
     def get_direct_dataset_dependencies(
-        self, dataset: DatasetRecord, version: int
+        self, dataset: DatasetRecord, version: str
     ) -> list[Optional[DatasetDependency]]:
         d = self._datasets
         dd = self._datasets_dependencies
@@ -905,7 +909,7 @@ class AbstractDBMetastore(AbstractMetastore):
         return [self.dependency_class.parse(*r) for r in self.db.execute(query)]
 
     def remove_dataset_dependencies(
-        self, dataset: DatasetRecord, version: Optional[int] = None
+        self, dataset: DatasetRecord, version: Optional[str] = None
     ) -> None:
         """
         When we remove dataset, we need to clean up it's dependencies as well
@@ -924,7 +928,7 @@ class AbstractDBMetastore(AbstractMetastore):
         self.db.execute(q)
 
     def remove_dataset_dependants(
-        self, dataset: DatasetRecord, version: Optional[int] = None
+        self, dataset: DatasetRecord, version: Optional[str] = None
     ) -> None:
         """
         When we remove dataset, we need to clear its references in other dataset
@@ -1018,6 +1022,7 @@ class AbstractDBMetastore(AbstractMetastore):
         name: str,
         query: str,
         query_type: JobQueryType = JobQueryType.PYTHON,
+        status: JobStatus = JobStatus.CREATED,
         workers: int = 1,
         python_version: Optional[str] = None,
         params: Optional[dict[str, str]] = None,
@@ -1032,7 +1037,7 @@ class AbstractDBMetastore(AbstractMetastore):
             self._jobs_insert().values(
                 id=job_id,
                 name=name,
-                status=JobStatus.CREATED,
+                status=status,
                 created_at=datetime.now(timezone.utc),
                 query=query,
                 query_type=query_type.value,
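
Note: create_job now takes the initial status as a parameter; 0.16.4 hard-coded JobStatus.CREATED into the insert. A sketch of a call site that registers an already-running job; the names and import path are assumptions:

    from datachain.data_storage.job import JobQueryType, JobStatus  # assumed path

    def register_running_job(metastore) -> str:
        return metastore.create_job(
            name="my-query",                 # illustrative values
            query="print('hello')",
            query_type=JobQueryType.PYTHON,
            status=JobStatus.RUNNING,        # new parameter; RUNNING member assumed
        )
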
@@ -1047,25 +1052,65 @@ class AbstractDBMetastore(AbstractMetastore):
         )
         return job_id
 
+    def get_job(self, job_id: str, conn=None) -> Optional[Job]:
+        """Returns the job with the given ID."""
+        query = self._jobs_select(self._jobs).where(self._jobs.c.id == job_id)
+        results = list(self.db.execute(query, conn=conn))
+        if not results:
+            return None
+        return self._parse_job(results[0])
+
+    def update_job(
+        self,
+        job_id: str,
+        status: Optional[JobStatus] = None,
+        exit_code: Optional[int] = None,
+        error_message: Optional[str] = None,
+        error_stack: Optional[str] = None,
+        finished_at: Optional[datetime] = None,
+        metrics: Optional[dict[str, Any]] = None,
+        conn: Optional[Any] = None,
+    ) -> Optional["Job"]:
+        """Updates job fields."""
+        values: dict = {}
+        if status is not None:
+            values["status"] = status
+        if exit_code is not None:
+            values["exit_code"] = exit_code
+        if error_message is not None:
+            values["error_message"] = error_message
+        if error_stack is not None:
+            values["error_stack"] = error_stack
+        if finished_at is not None:
+            values["finished_at"] = finished_at
+        if metrics:
+            values["metrics"] = json.dumps(metrics)
+
+        if values:
+            j = self._jobs
+            self.db.execute(
+                self._jobs_update().where(j.c.id == job_id).values(**values),
+                conn=conn,
+            )  # type: ignore [attr-defined]
+
+        return self.get_job(job_id, conn=conn)
+
     def set_job_status(
         self,
         job_id: str,
         status: JobStatus,
         error_message: Optional[str] = None,
         error_stack: Optional[str] = None,
-        metrics: Optional[dict[str, Any]] = None,
         conn: Optional[Any] = None,
     ) -> None:
         """Set the status of the given job."""
-        values: dict = {"status": status.value}
-        if status.value in JobStatus.finished():
+        values: dict = {"status": status}
+        if status in JobStatus.finished():
             values["finished_at"] = datetime.now(timezone.utc)
         if error_message:
             values["error_message"] = error_message
         if error_stack:
             values["error_stack"] = error_stack
-        if metrics:
-            values["metrics"] = json.dumps(metrics)
         self.db.execute(
             self._jobs_update(self._jobs.c.id == job_id).values(**values),
             conn=conn,
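
Note the division of labor above: update_job is a partial update that writes only the supplied fields and does not stamp finished_at itself, while set_job_status still fills finished_at automatically for statuses in JobStatus.finished(). A sketch of recording a failure via update_job; the names are assumptions as before:

    from datetime import datetime, timezone

    from datachain.data_storage.job import JobStatus  # assumed import path

    def record_failure(metastore, job_id: str):
        return metastore.update_job(
            job_id,
            status=JobStatus.FAILED,  # FAILED member assumed
            exit_code=1,
            error_message="boom",
            finished_at=datetime.now(timezone.utc),  # caller supplies this now
        )
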
@@ -1086,37 +1131,3 @@ class AbstractDBMetastore(AbstractMetastore):
         if not results:
             return None
         return results[0][0]
-
-    def set_job_and_dataset_status(
-        self,
-        job_id: str,
-        job_status: JobStatus,
-        dataset_status: DatasetStatus,
-    ) -> None:
-        """Set the status of the given job and dataset."""
-        with self.db.transaction() as conn:
-            self.set_job_status(job_id, status=job_status, conn=conn)
-            dv = self._datasets_versions
-            query = (
-                self._datasets_versions_update()
-                .where(
-                    (dv.c.job_id == job_id) & (dv.c.status != DatasetStatus.COMPLETE)
-                )
-                .values(status=dataset_status)
-            )
-            self.db.execute(query, conn=conn)  # type: ignore[attr-defined]
-
-    def get_job_dataset_versions(self, job_id: str) -> list[tuple[str, int]]:
-        """Returns dataset names and versions for the job."""
-        dv = self._datasets_versions
-        ds = self._datasets
-
-        join_condition = dv.c.dataset_id == ds.c.id
-
-        query = (
-            self._datasets_versions_select(ds.c.name, dv.c.version)
-            .select_from(dv.join(ds, join_condition))
-            .where(dv.c.job_id == job_id)
-        )
-
-        return list(self.db.execute(query))

--- a/datachain/data_storage/sqlite.py
+++ b/datachain/data_storage/sqlite.py
@@ -25,6 +25,7 @@ from sqlalchemy.sql.selectable import Select
 from tqdm.auto import tqdm
 
 import datachain.sql.sqlite
+from datachain import semver
 from datachain.data_storage import AbstractDBMetastore, AbstractWarehouse
 from datachain.data_storage.db_engine import DatabaseEngine
 from datachain.data_storage.schema import DefaultSchema
@@ -486,7 +487,7 @@ class SQLiteWarehouse(AbstractWarehouse):
         return table
 
     def get_dataset_sources(
-        self, dataset: DatasetRecord, version: int
+        self, dataset: DatasetRecord, version: str
     ) -> list[StorageURI]:
         dr = self.dataset_rows(dataset, version)
         query = dr.select(dr.c("source", column="file")).distinct()
@@ -502,8 +503,8 @@
         self,
         src: DatasetRecord,
         dst: DatasetRecord,
-        src_version: int,
-        dst_version: int,
+        src_version: str,
+        dst_version: str,
     ) -> None:
         dst_empty = False
 
@@ -534,7 +535,7 @@
         dst_previous_versions = [
             v.version
             for v in dst.versions  # type: ignore [union-attr]
-            if v.version < dst_version
+            if semver.compare(v.version, dst_version) == -1
         ]
         if dst_previous_versions:
             dst_version_latest = max(dst_previous_versions)
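
Note: version ordering is delegated to the new datachain/semver.py module (+58 lines, not shown in this diff). From this call site, semver.compare evidently follows cmp-style semantics, returning -1, 0, or 1. A sketch of the assumed contract; note also that the max() over version strings on the following line compares lexicographically, which diverges from semver ordering once a component reaches two digits (as strings, "10.0.0" < "9.0.0"):

    from datachain import semver

    # cmp-style contract inferred from `semver.compare(v, dst_version) == -1`:
    assert semver.compare("1.0.0", "1.1.0") == -1  # left is older
    assert semver.compare("1.1.0", "1.1.0") == 0
    assert semver.compare("2.0.0", "1.9.9") == 1   # left is newer
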
@@ -570,7 +571,7 @@
             conn=conn,
         )
 
-    def insert_dataset_rows(self, df, dataset: DatasetRecord, version: int) -> int:
+    def insert_dataset_rows(self, df, dataset: DatasetRecord, version: str) -> int:
         dr = self.dataset_rows(dataset, version)
         return self.db.insert_dataframe(dr.table.name, df)
 
@@ -595,7 +596,7 @@
         return col_type.python_type
 
     def dataset_table_export_file_names(
-        self, dataset: DatasetRecord, version: int
+        self, dataset: DatasetRecord, version: str
     ) -> list[str]:
         raise NotImplementedError("Exporting dataset table not implemented for SQLite")
 
@@ -603,7 +604,7 @@
         self,
         bucket_uri: str,
         dataset: DatasetRecord,
-        version: int,
+        version: str,
         client_config=None,
     ) -> list[str]:
         raise NotImplementedError("Exporting dataset table not implemented for SQLite")