datachain 0.30.5__py3-none-any.whl → 0.39.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. datachain/__init__.py +4 -0
  2. datachain/asyn.py +11 -12
  3. datachain/cache.py +5 -5
  4. datachain/catalog/__init__.py +0 -2
  5. datachain/catalog/catalog.py +276 -354
  6. datachain/catalog/dependency.py +164 -0
  7. datachain/catalog/loader.py +8 -3
  8. datachain/checkpoint.py +43 -0
  9. datachain/cli/__init__.py +10 -17
  10. datachain/cli/commands/__init__.py +1 -8
  11. datachain/cli/commands/datasets.py +42 -27
  12. datachain/cli/commands/ls.py +15 -15
  13. datachain/cli/commands/show.py +2 -2
  14. datachain/cli/parser/__init__.py +3 -43
  15. datachain/cli/parser/job.py +1 -1
  16. datachain/cli/parser/utils.py +1 -2
  17. datachain/cli/utils.py +2 -15
  18. datachain/client/azure.py +2 -2
  19. datachain/client/fsspec.py +34 -23
  20. datachain/client/gcs.py +3 -3
  21. datachain/client/http.py +157 -0
  22. datachain/client/local.py +11 -7
  23. datachain/client/s3.py +3 -3
  24. datachain/config.py +4 -8
  25. datachain/data_storage/db_engine.py +12 -6
  26. datachain/data_storage/job.py +2 -0
  27. datachain/data_storage/metastore.py +716 -137
  28. datachain/data_storage/schema.py +20 -27
  29. datachain/data_storage/serializer.py +105 -15
  30. datachain/data_storage/sqlite.py +114 -114
  31. datachain/data_storage/warehouse.py +140 -48
  32. datachain/dataset.py +109 -89
  33. datachain/delta.py +117 -42
  34. datachain/diff/__init__.py +25 -33
  35. datachain/error.py +24 -0
  36. datachain/func/aggregate.py +9 -11
  37. datachain/func/array.py +12 -12
  38. datachain/func/base.py +7 -4
  39. datachain/func/conditional.py +9 -13
  40. datachain/func/func.py +63 -45
  41. datachain/func/numeric.py +5 -7
  42. datachain/func/string.py +2 -2
  43. datachain/hash_utils.py +123 -0
  44. datachain/job.py +11 -7
  45. datachain/json.py +138 -0
  46. datachain/lib/arrow.py +18 -15
  47. datachain/lib/audio.py +60 -59
  48. datachain/lib/clip.py +14 -13
  49. datachain/lib/convert/python_to_sql.py +6 -10
  50. datachain/lib/convert/values_to_tuples.py +151 -53
  51. datachain/lib/data_model.py +23 -19
  52. datachain/lib/dataset_info.py +7 -7
  53. datachain/lib/dc/__init__.py +2 -1
  54. datachain/lib/dc/csv.py +22 -26
  55. datachain/lib/dc/database.py +37 -34
  56. datachain/lib/dc/datachain.py +518 -324
  57. datachain/lib/dc/datasets.py +38 -30
  58. datachain/lib/dc/hf.py +16 -20
  59. datachain/lib/dc/json.py +17 -18
  60. datachain/lib/dc/listings.py +5 -8
  61. datachain/lib/dc/pandas.py +3 -6
  62. datachain/lib/dc/parquet.py +33 -21
  63. datachain/lib/dc/records.py +9 -13
  64. datachain/lib/dc/storage.py +103 -65
  65. datachain/lib/dc/storage_pattern.py +251 -0
  66. datachain/lib/dc/utils.py +17 -14
  67. datachain/lib/dc/values.py +3 -6
  68. datachain/lib/file.py +187 -50
  69. datachain/lib/hf.py +7 -5
  70. datachain/lib/image.py +13 -13
  71. datachain/lib/listing.py +5 -5
  72. datachain/lib/listing_info.py +1 -2
  73. datachain/lib/meta_formats.py +2 -3
  74. datachain/lib/model_store.py +20 -8
  75. datachain/lib/namespaces.py +59 -7
  76. datachain/lib/projects.py +51 -9
  77. datachain/lib/pytorch.py +31 -23
  78. datachain/lib/settings.py +188 -85
  79. datachain/lib/signal_schema.py +302 -64
  80. datachain/lib/text.py +8 -7
  81. datachain/lib/udf.py +103 -63
  82. datachain/lib/udf_signature.py +59 -34
  83. datachain/lib/utils.py +20 -0
  84. datachain/lib/video.py +3 -4
  85. datachain/lib/webdataset.py +31 -36
  86. datachain/lib/webdataset_laion.py +15 -16
  87. datachain/listing.py +12 -5
  88. datachain/model/bbox.py +3 -1
  89. datachain/namespace.py +22 -3
  90. datachain/node.py +6 -6
  91. datachain/nodes_thread_pool.py +0 -1
  92. datachain/plugins.py +24 -0
  93. datachain/project.py +4 -4
  94. datachain/query/batch.py +10 -12
  95. datachain/query/dataset.py +376 -194
  96. datachain/query/dispatch.py +112 -84
  97. datachain/query/metrics.py +3 -4
  98. datachain/query/params.py +2 -3
  99. datachain/query/queue.py +2 -1
  100. datachain/query/schema.py +7 -6
  101. datachain/query/session.py +190 -33
  102. datachain/query/udf.py +9 -6
  103. datachain/remote/studio.py +90 -53
  104. datachain/script_meta.py +12 -12
  105. datachain/sql/sqlite/base.py +37 -25
  106. datachain/sql/sqlite/types.py +1 -1
  107. datachain/sql/types.py +36 -5
  108. datachain/studio.py +49 -40
  109. datachain/toolkit/split.py +31 -10
  110. datachain/utils.py +39 -48
  111. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/METADATA +26 -38
  112. datachain-0.39.0.dist-info/RECORD +173 -0
  113. datachain/cli/commands/query.py +0 -54
  114. datachain/query/utils.py +0 -36
  115. datachain-0.30.5.dist-info/RECORD +0 -168
  116. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/WHEEL +0 -0
  117. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
  118. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
  119. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/dataset.py CHANGED
@@ -1,21 +1,14 @@
1
1
  import builtins
2
- import json
3
2
  from dataclasses import dataclass, fields
4
3
  from datetime import datetime
5
4
  from functools import cached_property
6
- from typing import (
7
- Any,
8
- NewType,
9
- Optional,
10
- TypeVar,
11
- Union,
12
- )
5
+ from typing import Any, NewType, TypeVar
13
6
  from urllib.parse import urlparse
14
7
 
15
8
  from packaging.specifiers import SpecifierSet
16
9
  from packaging.version import Version
17
10
 
18
- from datachain import semver
11
+ from datachain import json, semver
19
12
  from datachain.error import DatasetVersionNotFoundError, InvalidDatasetNameError
20
13
  from datachain.namespace import Namespace
21
14
  from datachain.project import Project
@@ -43,7 +36,7 @@ DATASET_NAME_REPLACEMENT_CHAR = "_"
43
36
  StorageURI = NewType("StorageURI", str)
44
37
 
45
38
 
46
- def parse_dataset_uri(uri: str) -> tuple[str, Optional[str]]:
39
+ def parse_dataset_uri(uri: str) -> tuple[str, str | None]:
47
40
  """
48
41
  Parse dataser uri to extract name and version out of it (if version is defined)
49
42
  Example:
@@ -65,7 +58,7 @@ def parse_dataset_uri(uri: str) -> tuple[str, Optional[str]]:
65
58
 
66
59
 
67
60
  def create_dataset_uri(
68
- name: str, namespace: str, project: str, version: Optional[str] = None
61
+ name: str, namespace: str, project: str, version: str | None = None
69
62
  ) -> str:
70
63
  """
71
64
  Creates a dataset uri based on namespace, project, dataset name and optionally
@@ -81,7 +74,7 @@ def create_dataset_uri(
81
74
  return uri
82
75
 
83
76
 
84
- def parse_dataset_name(name: str) -> tuple[Optional[str], Optional[str], str]:
77
+ def parse_dataset_name(name: str) -> tuple[str | None, str | None, str]:
85
78
  """Parses dataset name and returns namespace, project and name"""
86
79
  if not name:
87
80
  raise InvalidDatasetNameError("Name must be defined to parse it")
@@ -95,6 +88,40 @@ def parse_dataset_name(name: str) -> tuple[Optional[str], Optional[str], str]:
95
88
  return namespace_name, project_name, name
96
89
 
97
90
 
91
+ def parse_schema(ct: dict[str, Any]) -> dict[str, SQLType | type[SQLType]]:
92
+ """Parse dataset schema from dictionary representation.
93
+
94
+ Args:
95
+ ct: Dictionary with column definitions
96
+
97
+ Returns:
98
+ Dictionary mapping column names to SQL types
99
+
100
+ Raises:
101
+ TypeError: If schema format is invalid
102
+ ValueError: If column type is not defined or not supported
103
+ """
104
+ if not isinstance(ct, dict):
105
+ raise TypeError("Schema definition must be a dictionary")
106
+ res = {}
107
+ for c_name, c_type in ct.items():
108
+ if not isinstance(c_type, dict):
109
+ raise TypeError(f"Schema column '{c_name}' type must be a dictionary")
110
+ if "type" not in c_type:
111
+ raise ValueError(f"Schema column '{c_name}' type is not defined")
112
+ if c_type["type"] not in NAME_TYPES_MAPPING:
113
+ raise ValueError(
114
+ f"Schema column '{c_name}' type '{c_type['type']}' is not supported"
115
+ )
116
+ try:
117
+ res[c_name] = NAME_TYPES_MAPPING[c_type["type"]].from_dict(c_type) # type: ignore [attr-defined]
118
+ except Exception as e:
119
+ raise ValueError(
120
+ f"Schema column '{c_name}' type '{c_type['type']}' parsing error: {e}"
121
+ ) from e
122
+ return res
123
+
124
+
98
125
  class DatasetDependencyType:
99
126
  DATASET = "dataset"
100
127
  STORAGE = "storage"
@@ -111,7 +138,7 @@ class DatasetDependency:
111
138
  name: str
112
139
  version: str
113
140
  created_at: datetime
114
- dependencies: list[Optional["DatasetDependency"]]
141
+ dependencies: list["DatasetDependency | None"]
115
142
 
116
143
  @property
117
144
  def dataset_name(self) -> str:
@@ -131,12 +158,12 @@ class DatasetDependency:
131
158
  namespace_name: str,
132
159
  project_name: str,
133
160
  id: int,
134
- dataset_id: Optional[int],
135
- dataset_version_id: Optional[int],
136
- dataset_name: Optional[str],
137
- dataset_version: Optional[str],
138
- dataset_version_created_at: Optional[datetime],
139
- ) -> Optional["DatasetDependency"]:
161
+ dataset_id: int | None,
162
+ dataset_version_id: int | None,
163
+ dataset_name: str | None,
164
+ dataset_version: str | None,
165
+ dataset_version_created_at: datetime | None,
166
+ ) -> "DatasetDependency | None":
140
167
  from datachain.lib.listing import is_listing_dataset
141
168
 
142
169
  if not dataset_id:
@@ -198,17 +225,17 @@ class DatasetVersion:
198
225
  status: int
199
226
  feature_schema: dict
200
227
  created_at: datetime
201
- finished_at: Optional[datetime]
228
+ finished_at: datetime | None
202
229
  error_message: str
203
230
  error_stack: str
204
231
  script_output: str
205
- schema: dict[str, Union[SQLType, type[SQLType]]]
206
- num_objects: Optional[int]
207
- size: Optional[int]
208
- _preview_data: Optional[Union[str, list[dict]]]
232
+ schema: dict[str, SQLType | type[SQLType]]
233
+ num_objects: int | None
234
+ size: int | None
235
+ _preview_data: str | list[dict] | None
209
236
  sources: str = ""
210
237
  query_script: str = ""
211
- job_id: Optional[str] = None
238
+ job_id: str | None = None
212
239
 
213
240
  @classmethod
214
241
  def parse( # noqa: PLR0913
@@ -218,20 +245,25 @@ class DatasetVersion:
218
245
  dataset_id: int,
219
246
  version: str,
220
247
  status: int,
221
- feature_schema: Optional[str],
248
+ feature_schema: str | None,
222
249
  created_at: datetime,
223
- finished_at: Optional[datetime],
250
+ finished_at: datetime | None,
224
251
  error_message: str,
225
252
  error_stack: str,
226
253
  script_output: str,
227
- num_objects: Optional[int],
228
- size: Optional[int],
229
- preview: Optional[Union[str, list[dict]]],
230
- schema: dict[str, Union[SQLType, type[SQLType]]],
254
+ num_objects: int | None,
255
+ size: int | None,
256
+ preview: str | list[dict] | None,
257
+ schema: str | dict[str, SQLType | type[SQLType]],
231
258
  sources: str = "",
232
259
  query_script: str = "",
233
- job_id: Optional[str] = None,
260
+ job_id: str | None = None,
234
261
  ):
262
+ if isinstance(schema, str):
263
+ schema_parsed = parse_schema(json.loads(schema) if schema else {})
264
+ else:
265
+ schema_parsed = schema
266
+
235
267
  return cls(
236
268
  id,
237
269
  uuid,
@@ -244,7 +276,7 @@ class DatasetVersion:
244
276
  error_message,
245
277
  error_stack,
246
278
  script_output,
247
- schema,
279
+ schema_parsed,
248
280
  num_objects,
249
281
  size,
250
282
  preview,
@@ -292,7 +324,7 @@ class DatasetVersion:
292
324
  }
293
325
 
294
326
  @cached_property
295
- def preview(self) -> Optional[list[dict]]:
327
+ def preview(self) -> list[dict] | None:
296
328
  if isinstance(self._preview_data, str):
297
329
  return json.loads(self._preview_data)
298
330
  return self._preview_data if self._preview_data else None
@@ -313,13 +345,13 @@ class DatasetListVersion:
313
345
  version: str
314
346
  status: int
315
347
  created_at: datetime
316
- finished_at: Optional[datetime]
348
+ finished_at: datetime | None
317
349
  error_message: str
318
350
  error_stack: str
319
- num_objects: Optional[int]
320
- size: Optional[int]
351
+ num_objects: int | None
352
+ size: int | None
321
353
  query_script: str = ""
322
- job_id: Optional[str] = None
354
+ job_id: str | None = None
323
355
 
324
356
  @classmethod
325
357
  def parse(
@@ -330,13 +362,13 @@ class DatasetListVersion:
330
362
  version: str,
331
363
  status: int,
332
364
  created_at: datetime,
333
- finished_at: Optional[datetime],
365
+ finished_at: datetime | None,
334
366
  error_message: str,
335
367
  error_stack: str,
336
- num_objects: Optional[int],
337
- size: Optional[int],
368
+ num_objects: int | None,
369
+ size: int | None,
338
370
  query_script: str = "",
339
- job_id: Optional[str] = None,
371
+ job_id: str | None = None,
340
372
  **kwargs,
341
373
  ):
342
374
  return cls(
@@ -368,14 +400,14 @@ class DatasetRecord:
368
400
  id: int
369
401
  name: str
370
402
  project: Project
371
- description: Optional[str]
403
+ description: str | None
372
404
  attrs: list[str]
373
- schema: dict[str, Union[SQLType, type[SQLType]]]
405
+ schema: dict[str, SQLType | type[SQLType]]
374
406
  feature_schema: dict
375
407
  versions: list[DatasetVersion]
376
408
  status: int = DatasetStatus.CREATED
377
- created_at: Optional[datetime] = None
378
- finished_at: Optional[datetime] = None
409
+ created_at: datetime | None = None
410
+ finished_at: datetime | None = None
379
411
  error_message: str = ""
380
412
  error_stack: str = ""
381
413
  script_output: str = ""
@@ -385,15 +417,6 @@ class DatasetRecord:
385
417
  def __hash__(self):
386
418
  return hash(f"{self.id}")
387
419
 
388
- @staticmethod
389
- def parse_schema(
390
- ct: dict[str, Any],
391
- ) -> dict[str, Union[SQLType, type[SQLType]]]:
392
- return {
393
- c_name: NAME_TYPES_MAPPING[c_type["type"]].from_dict(c_type) # type: ignore [attr-defined]
394
- for c_name, c_type in ct.items()
395
- }
396
-
397
420
  @staticmethod
398
421
  def validate_name(name: str) -> None:
399
422
  """Throws exception if name has reserved characters"""
@@ -409,23 +432,23 @@ class DatasetRecord:
409
432
  namespace_id: int,
410
433
  namespace_uuid: str,
411
434
  namespace_name: str,
412
- namespace_description: Optional[str],
435
+ namespace_description: str | None,
413
436
  namespace_created_at: datetime,
414
437
  project_id: int,
415
438
  project_uuid: str,
416
439
  project_name: str,
417
- project_description: Optional[str],
440
+ project_description: str | None,
418
441
  project_created_at: datetime,
419
442
  project_namespace_id: int,
420
443
  dataset_id: int,
421
444
  dataset_project_id: int,
422
445
  name: str,
423
- description: Optional[str],
446
+ description: str | None,
424
447
  attrs: str,
425
448
  status: int,
426
- feature_schema: Optional[str],
449
+ feature_schema: str | None,
427
450
  created_at: datetime,
428
- finished_at: Optional[datetime],
451
+ finished_at: datetime | None,
429
452
  error_message: str,
430
453
  error_stack: str,
431
454
  script_output: str,
@@ -437,25 +460,22 @@ class DatasetRecord:
437
460
  version_dataset_id: int,
438
461
  version: str,
439
462
  version_status: int,
440
- version_feature_schema: Optional[str],
463
+ version_feature_schema: str | None,
441
464
  version_created_at: datetime,
442
- version_finished_at: Optional[datetime],
465
+ version_finished_at: datetime | None,
443
466
  version_error_message: str,
444
467
  version_error_stack: str,
445
468
  version_script_output: str,
446
- version_num_objects: Optional[int],
447
- version_size: Optional[int],
448
- version_preview: Optional[str],
449
- version_sources: Optional[str],
450
- version_query_script: Optional[str],
469
+ version_num_objects: int | None,
470
+ version_size: int | None,
471
+ version_preview: str | None,
472
+ version_sources: str | None,
473
+ version_query_script: str | None,
451
474
  version_schema: str,
452
- version_job_id: Optional[str] = None,
475
+ version_job_id: str | None = None,
453
476
  ) -> "DatasetRecord":
454
477
  attrs_lst: list[str] = json.loads(attrs) if attrs else []
455
478
  schema_dct: dict[str, Any] = json.loads(schema) if schema else {}
456
- version_schema_dct: dict[str, str] = (
457
- json.loads(version_schema) if version_schema else {}
458
- )
459
479
 
460
480
  namespace = Namespace(
461
481
  namespace_id,
@@ -489,7 +509,7 @@ class DatasetRecord:
489
509
  version_num_objects,
490
510
  version_size,
491
511
  version_preview,
492
- cls.parse_schema(version_schema_dct), # type: ignore[arg-type]
512
+ version_schema,
493
513
  version_sources, # type: ignore[arg-type]
494
514
  version_query_script, # type: ignore[arg-type]
495
515
  version_job_id,
@@ -501,7 +521,7 @@ class DatasetRecord:
501
521
  project,
502
522
  description,
503
523
  attrs_lst,
504
- cls.parse_schema(schema_dct), # type: ignore[arg-type]
524
+ parse_schema(schema_dct), # type: ignore[arg-type]
505
525
  json.loads(feature_schema) if feature_schema else {},
506
526
  [dataset_version],
507
527
  status,
@@ -527,7 +547,7 @@ class DatasetRecord:
527
547
  def full_name(self) -> str:
528
548
  return f"{self.project.namespace.name}.{self.project.name}.{self.name}"
529
549
 
530
- def get_schema(self, version: str) -> dict[str, Union[SQLType, type[SQLType]]]:
550
+ def get_schema(self, version: str) -> dict[str, SQLType | type[SQLType]]:
531
551
  return self.get_version(version).schema if version else self.schema
532
552
 
533
553
  def update(self, **kwargs):
@@ -619,7 +639,7 @@ class DatasetRecord:
619
639
  if not self.versions:
620
640
  return "1.0.0"
621
641
 
622
- major, minor, patch = semver.parse(self.latest_version)
642
+ major, _, _ = semver.parse(self.latest_version)
623
643
  return semver.create(major + 1, 0, 0)
624
644
 
625
645
  @property
@@ -630,7 +650,7 @@ class DatasetRecord:
630
650
  if not self.versions:
631
651
  return "1.0.0"
632
652
 
633
- major, minor, patch = semver.parse(self.latest_version)
653
+ major, minor, _ = semver.parse(self.latest_version)
634
654
  return semver.create(major, minor + 1, 0)
635
655
 
636
656
  @property
@@ -649,7 +669,7 @@ class DatasetRecord:
649
669
  """Returns latest version of a dataset"""
650
670
  return max(self.versions).version
651
671
 
652
- def latest_major_version(self, major: int) -> Optional[str]:
672
+ def latest_major_version(self, major: int) -> str | None:
653
673
  """
654
674
  Returns latest specific major version, e.g if dataset has versions:
655
675
  - 1.4.1
@@ -664,7 +684,7 @@ class DatasetRecord:
664
684
  return None
665
685
  return max(versions).version
666
686
 
667
- def latest_compatible_version(self, version_spec: str) -> Optional[str]:
687
+ def latest_compatible_version(self, version_spec: str) -> str | None:
668
688
  """
669
689
  Returns the latest version that matches the given version specifier.
670
690
 
@@ -711,10 +731,10 @@ class DatasetListRecord:
711
731
  id: int
712
732
  name: str
713
733
  project: Project
714
- description: Optional[str]
734
+ description: str | None
715
735
  attrs: list[str]
716
736
  versions: list[DatasetListVersion]
717
- created_at: Optional[datetime] = None
737
+ created_at: datetime | None = None
718
738
 
719
739
  @classmethod
720
740
  def parse( # noqa: PLR0913
@@ -722,17 +742,17 @@ class DatasetListRecord:
722
742
  namespace_id: int,
723
743
  namespace_uuid: str,
724
744
  namespace_name: str,
725
- namespace_description: Optional[str],
745
+ namespace_description: str | None,
726
746
  namespace_created_at: datetime,
727
747
  project_id: int,
728
748
  project_uuid: str,
729
749
  project_name: str,
730
- project_description: Optional[str],
750
+ project_description: str | None,
731
751
  project_created_at: datetime,
732
752
  project_namespace_id: int,
733
753
  dataset_id: int,
734
754
  name: str,
735
- description: Optional[str],
755
+ description: str | None,
736
756
  attrs: str,
737
757
  created_at: datetime,
738
758
  version_id: int,
@@ -741,13 +761,13 @@ class DatasetListRecord:
741
761
  version: str,
742
762
  version_status: int,
743
763
  version_created_at: datetime,
744
- version_finished_at: Optional[datetime],
764
+ version_finished_at: datetime | None,
745
765
  version_error_message: str,
746
766
  version_error_stack: str,
747
- version_num_objects: Optional[int],
748
- version_size: Optional[int],
749
- version_query_script: Optional[str],
750
- version_job_id: Optional[str] = None,
767
+ version_num_objects: int | None,
768
+ version_size: int | None,
769
+ version_query_script: str | None,
770
+ version_job_id: str | None = None,
751
771
  ) -> "DatasetListRecord":
752
772
  attrs_lst: list[str] = json.loads(attrs) if attrs else []
753
773
 
@@ -824,7 +844,7 @@ class DatasetListRecord:
824
844
  from datachain.client import Client
825
845
 
826
846
  # TODO refactor and maybe remove method in
827
- # https://github.com/iterative/datachain/issues/318
847
+ # https://github.com/datachain-ai/datachain/issues/318
828
848
  return Client.is_data_source_uri(self.name) or self.name.startswith(
829
849
  LISTING_PREFIX
830
850
  )