datachain 0.30.5__py3-none-any.whl → 0.39.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. datachain/__init__.py +4 -0
  2. datachain/asyn.py +11 -12
  3. datachain/cache.py +5 -5
  4. datachain/catalog/__init__.py +0 -2
  5. datachain/catalog/catalog.py +276 -354
  6. datachain/catalog/dependency.py +164 -0
  7. datachain/catalog/loader.py +8 -3
  8. datachain/checkpoint.py +43 -0
  9. datachain/cli/__init__.py +10 -17
  10. datachain/cli/commands/__init__.py +1 -8
  11. datachain/cli/commands/datasets.py +42 -27
  12. datachain/cli/commands/ls.py +15 -15
  13. datachain/cli/commands/show.py +2 -2
  14. datachain/cli/parser/__init__.py +3 -43
  15. datachain/cli/parser/job.py +1 -1
  16. datachain/cli/parser/utils.py +1 -2
  17. datachain/cli/utils.py +2 -15
  18. datachain/client/azure.py +2 -2
  19. datachain/client/fsspec.py +34 -23
  20. datachain/client/gcs.py +3 -3
  21. datachain/client/http.py +157 -0
  22. datachain/client/local.py +11 -7
  23. datachain/client/s3.py +3 -3
  24. datachain/config.py +4 -8
  25. datachain/data_storage/db_engine.py +12 -6
  26. datachain/data_storage/job.py +2 -0
  27. datachain/data_storage/metastore.py +716 -137
  28. datachain/data_storage/schema.py +20 -27
  29. datachain/data_storage/serializer.py +105 -15
  30. datachain/data_storage/sqlite.py +114 -114
  31. datachain/data_storage/warehouse.py +140 -48
  32. datachain/dataset.py +109 -89
  33. datachain/delta.py +117 -42
  34. datachain/diff/__init__.py +25 -33
  35. datachain/error.py +24 -0
  36. datachain/func/aggregate.py +9 -11
  37. datachain/func/array.py +12 -12
  38. datachain/func/base.py +7 -4
  39. datachain/func/conditional.py +9 -13
  40. datachain/func/func.py +63 -45
  41. datachain/func/numeric.py +5 -7
  42. datachain/func/string.py +2 -2
  43. datachain/hash_utils.py +123 -0
  44. datachain/job.py +11 -7
  45. datachain/json.py +138 -0
  46. datachain/lib/arrow.py +18 -15
  47. datachain/lib/audio.py +60 -59
  48. datachain/lib/clip.py +14 -13
  49. datachain/lib/convert/python_to_sql.py +6 -10
  50. datachain/lib/convert/values_to_tuples.py +151 -53
  51. datachain/lib/data_model.py +23 -19
  52. datachain/lib/dataset_info.py +7 -7
  53. datachain/lib/dc/__init__.py +2 -1
  54. datachain/lib/dc/csv.py +22 -26
  55. datachain/lib/dc/database.py +37 -34
  56. datachain/lib/dc/datachain.py +518 -324
  57. datachain/lib/dc/datasets.py +38 -30
  58. datachain/lib/dc/hf.py +16 -20
  59. datachain/lib/dc/json.py +17 -18
  60. datachain/lib/dc/listings.py +5 -8
  61. datachain/lib/dc/pandas.py +3 -6
  62. datachain/lib/dc/parquet.py +33 -21
  63. datachain/lib/dc/records.py +9 -13
  64. datachain/lib/dc/storage.py +103 -65
  65. datachain/lib/dc/storage_pattern.py +251 -0
  66. datachain/lib/dc/utils.py +17 -14
  67. datachain/lib/dc/values.py +3 -6
  68. datachain/lib/file.py +187 -50
  69. datachain/lib/hf.py +7 -5
  70. datachain/lib/image.py +13 -13
  71. datachain/lib/listing.py +5 -5
  72. datachain/lib/listing_info.py +1 -2
  73. datachain/lib/meta_formats.py +2 -3
  74. datachain/lib/model_store.py +20 -8
  75. datachain/lib/namespaces.py +59 -7
  76. datachain/lib/projects.py +51 -9
  77. datachain/lib/pytorch.py +31 -23
  78. datachain/lib/settings.py +188 -85
  79. datachain/lib/signal_schema.py +302 -64
  80. datachain/lib/text.py +8 -7
  81. datachain/lib/udf.py +103 -63
  82. datachain/lib/udf_signature.py +59 -34
  83. datachain/lib/utils.py +20 -0
  84. datachain/lib/video.py +3 -4
  85. datachain/lib/webdataset.py +31 -36
  86. datachain/lib/webdataset_laion.py +15 -16
  87. datachain/listing.py +12 -5
  88. datachain/model/bbox.py +3 -1
  89. datachain/namespace.py +22 -3
  90. datachain/node.py +6 -6
  91. datachain/nodes_thread_pool.py +0 -1
  92. datachain/plugins.py +24 -0
  93. datachain/project.py +4 -4
  94. datachain/query/batch.py +10 -12
  95. datachain/query/dataset.py +376 -194
  96. datachain/query/dispatch.py +112 -84
  97. datachain/query/metrics.py +3 -4
  98. datachain/query/params.py +2 -3
  99. datachain/query/queue.py +2 -1
  100. datachain/query/schema.py +7 -6
  101. datachain/query/session.py +190 -33
  102. datachain/query/udf.py +9 -6
  103. datachain/remote/studio.py +90 -53
  104. datachain/script_meta.py +12 -12
  105. datachain/sql/sqlite/base.py +37 -25
  106. datachain/sql/sqlite/types.py +1 -1
  107. datachain/sql/types.py +36 -5
  108. datachain/studio.py +49 -40
  109. datachain/toolkit/split.py +31 -10
  110. datachain/utils.py +39 -48
  111. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/METADATA +26 -38
  112. datachain-0.39.0.dist-info/RECORD +173 -0
  113. datachain/cli/commands/query.py +0 -54
  114. datachain/query/utils.py +0 -36
  115. datachain-0.30.5.dist-info/RECORD +0 -168
  116. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/WHEEL +0 -0
  117. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
  118. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
  119. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/data_storage/warehouse.py
@@ -1,30 +1,30 @@
  import glob
  import logging
  import posixpath
- import random
+ import secrets
  import string
  from abc import ABC, abstractmethod
- from collections.abc import Generator, Iterable, Iterator, Sequence
- from typing import TYPE_CHECKING, Any, Callable, Optional, Union
+ from collections.abc import Callable, Generator, Iterable, Iterator, Sequence
+ from typing import TYPE_CHECKING, Any, Union, cast
  from urllib.parse import urlparse

  import attrs
  import sqlalchemy as sa
- import ujson as json
  from sqlalchemy.sql.expression import true

+ from datachain import json
  from datachain.client import Client
  from datachain.data_storage.schema import convert_rows_custom_column_types
  from datachain.data_storage.serializer import Serializable
  from datachain.dataset import DatasetRecord, StorageURI
  from datachain.lib.file import File
+ from datachain.lib.model_store import ModelStore
  from datachain.lib.signal_schema import SignalSchema
  from datachain.node import DirType, DirTypeGroup, Node, NodeWithPath, get_path
  from datachain.query.batch import RowsOutput
  from datachain.query.schema import ColumnMeta
- from datachain.query.utils import get_query_id_column
  from datachain.sql.functions import path as pathfunc
- from datachain.sql.types import Int, SQLType
+ from datachain.sql.types import SQLType
  from datachain.utils import sql_escape_like

  if TYPE_CHECKING:
@@ -33,6 +33,7 @@ if TYPE_CHECKING:
          _FromClauseArgument,
          _OnClauseArgument,
      )
+     from sqlalchemy.sql.selectable import FromClause
      from sqlalchemy.types import TypeEngine

      from datachain.data_storage import schema
@@ -43,6 +44,7 @@ if TYPE_CHECKING:
  logger = logging.getLogger("datachain")

  SELECT_BATCH_SIZE = 100_000  # number of rows to fetch at a time
+ INSERT_BATCH_SIZE = 10_000  # number of rows to insert at a time


  class AbstractWarehouse(ABC, Serializable):
@@ -69,12 +71,36 @@ class AbstractWarehouse(ABC, Serializable):
          return self

      def __exit__(self, exc_type, exc_value, traceback) -> None:
-         # Default behavior is to do nothing, as connections may be shared.
-         pass
+         """Default behavior is to do nothing, as connections may be shared."""

      def cleanup_for_tests(self):
          """Cleanup for tests."""

+     def _to_jsonable(self, obj: Any) -> Any:
+         """Recursively convert Python/Pydantic structures into JSON-serializable
+         objects.
+         """
+
+         if ModelStore.is_pydantic(type(obj)):
+             # Use Pydantic's JSON mode to ensure datetime and other non-JSON
+             # native types are serialized in a compatible way.
+             return obj.model_dump(mode="json")
+
+         if isinstance(obj, dict):
+             out: dict[str, Any] = {}
+             for k, v in obj.items():
+                 if not isinstance(k, str):
+                     key_str = json.dumps(self._to_jsonable(k), ensure_ascii=False)
+                 else:
+                     key_str = k
+                 out[key_str] = self._to_jsonable(v)
+             return out
+
+         if isinstance(obj, (list, tuple, set)):
+             return [self._to_jsonable(i) for i in obj]
+
+         return obj
+
      def convert_type(  # noqa: PLR0911
          self,
          val: Any,
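The new `_to_jsonable` helper above recursively normalizes values before they reach `json.dumps`: Pydantic models are dumped in JSON mode, non-string dict keys are themselves serialized, and lists, tuples, and sets become lists. A minimal standalone sketch of the same pattern, using a hypothetical `Location` model and sample payload (illustrative only, not datachain code):

    # Standalone sketch only; `Location` and the sample payload are made up.
    import json
    from typing import Any

    from pydantic import BaseModel


    class Location(BaseModel):  # hypothetical example model
        lat: float
        lon: float


    def to_jsonable(obj: Any) -> Any:
        if isinstance(obj, BaseModel):
            # mode="json" renders datetimes and similar types as JSON-native values
            return obj.model_dump(mode="json")
        if isinstance(obj, dict):
            # non-string keys are themselves serialized to JSON strings
            return {
                k if isinstance(k, str) else json.dumps(to_jsonable(k)): to_jsonable(v)
                for k, v in obj.items()
            }
        if isinstance(obj, (list, tuple, set)):
            return [to_jsonable(i) for i in obj]
        return obj


    payload = {1: Location(lat=40.7, lon=-74.0), "tags": {"a", "b"}}
    print(json.dumps(to_jsonable(payload), ensure_ascii=False))
    # {"1": {"lat": 40.7, "lon": -74.0}, "tags": ["a", "b"]}  (set order may vary)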
@@ -121,11 +147,13 @@ class AbstractWarehouse(ABC, Serializable):
          if col_python_type is dict or col_type_name == "JSON":
              if value_type is str:
                  return val
-             if value_type in (dict, list):
-                 return json.dumps(val, ensure_ascii=False)
-             raise ValueError(
-                 f"Cannot convert value {val!r} with type {value_type} to JSON"
-             )
+             try:
+                 json_ready = self._to_jsonable(val)
+                 return json.dumps(json_ready, ensure_ascii=False)
+             except Exception as e:
+                 raise ValueError(
+                     f"Cannot convert value {val!r} with type {value_type} to JSON"
+                 ) from e

          if isinstance(val, col_python_type):
              return val
@@ -173,12 +201,12 @@ class AbstractWarehouse(ABC, Serializable):
      #

      @abstractmethod
-     def is_ready(self, timeout: Optional[int] = None) -> bool: ...
+     def is_ready(self, timeout: int | None = None) -> bool: ...

      def dataset_rows(
          self,
          dataset: DatasetRecord,
-         version: Optional[str] = None,
+         version: str | None = None,
          column: str = "file",
      ):
          version = version or dataset.latest_version
@@ -227,7 +255,8 @@ class AbstractWarehouse(ABC, Serializable):
          while True:
              if limit is not None:
                  limit -= num_yielded
-                 if limit == 0:
+                 num_yielded = 0
+                 if limit <= 0:
                      break
                  if limit < page_size:
                      paginated_query = paginated_query.limit(None).limit(limit)
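The pagination change above resets `num_yielded` once it has been subtracted from `limit` and breaks on `limit <= 0` instead of requiring an exact zero. A self-contained sketch of the same bookkeeping over a plain list (names are illustrative, not the warehouse API):

    # Hedged sketch of the corrected limit/num_yielded bookkeeping.
    from collections.abc import Iterator


    def paginate(items: list[int], page_size: int, limit: int | None = None) -> Iterator[int]:
        offset = 0
        num_yielded = 0
        while True:
            if limit is not None:
                limit -= num_yielded
                num_yielded = 0  # reset so the next pass subtracts only newly yielded rows
                if limit <= 0:
                    break
                page_size = min(page_size, limit)
            page = items[offset : offset + page_size]
            if not page:
                break  # no more results
            for item in page:
                num_yielded += 1
                yield item
            if len(page) < page_size:
                break
            offset += page_size


    print(list(paginate(list(range(10)), page_size=3, limit=7)))  # [0, 1, 2, 3, 4, 5, 6]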
@@ -245,6 +274,71 @@ class AbstractWarehouse(ABC, Serializable):
                  break  # no more results
              offset += page_size

+     def _regenerate_system_columns(
+         self,
+         selectable: sa.Select,
+         keep_existing_columns: bool = False,
+         regenerate_columns: Iterable[str] | None = None,
+     ) -> sa.Select:
+         """
+         Return a SELECT that regenerates system columns deterministically.
+
+         If keep_existing_columns is True, existing system columns will be kept as-is
+         even when they are listed in ``regenerate_columns``.
+
+         Args:
+             selectable: Base SELECT
+             keep_existing_columns: When True, reuse existing system columns even if
+                 they are part of the regeneration set.
+             regenerate_columns: Names of system columns to regenerate. Defaults to
+                 {"sys__id", "sys__rand"}. Columns not listed are left untouched.
+         """
+         system_columns = {
+             sys_col.name: sys_col.type
+             for sys_col in self.schema.dataset_row_cls.sys_columns()
+         }
+         regenerate = set(regenerate_columns or system_columns)
+         generators = {
+             "sys__id": self._system_row_number_expr,
+             "sys__rand": self._system_random_expr,
+         }
+
+         base = cast("FromClause", selectable.subquery())
+
+         def build(name: str) -> sa.ColumnElement:
+             expr = generators[name]()
+             return sa.cast(expr, system_columns[name]).label(name)
+
+         columns: list[sa.ColumnElement] = []
+         present: set[str] = set()
+         changed = False
+
+         for col in base.c:
+             present.add(col.name)
+             regen = col.name in regenerate and not keep_existing_columns
+             columns.append(build(col.name) if regen else col)
+             changed |= regen
+
+         for name in regenerate - present:
+             columns.append(build(name))
+             changed = True
+
+         if not changed:
+             return selectable
+
+         inner = sa.select(*columns).select_from(base).subquery()
+         return sa.select(*inner.c).select_from(inner)
+
+     def _system_row_number_expr(self):
+         """Return an expression that produces deterministic row numbers."""
+
+         raise NotImplementedError
+
+     def _system_random_expr(self):
+         """Return an expression that produces deterministic random values."""
+
+         raise NotImplementedError
+
      #
      # Table Name Internal Functions
      #
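`_regenerate_system_columns` delegates the actual SQL expressions to the two hooks above, which raise `NotImplementedError` in the base class so each backend supplies its own deterministic generators. A hedged sketch of one possible implementation using SQLAlchemy window functions; the shipped backends may compute `sys__id` and `sys__rand` differently:

    # Illustrative only; the concrete datachain warehouses may use different expressions.
    import sqlalchemy as sa


    class ExampleWarehouse:  # hypothetical subclass stand-in
        def _system_row_number_expr(self):
            # Deterministic, monotonically increasing row numbers.
            return sa.func.row_number().over()

        def _system_random_expr(self):
            # Reproducible pseudo-random values derived from the row number via a
            # multiplicative hash, kept well within a signed 64-bit range.
            return (sa.func.row_number().over() * 1103515245 + 12345) % (2**62)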
@@ -303,21 +397,9 @@ class AbstractWarehouse(ABC, Serializable):
          table_name = self.dataset_table_name(dataset, version)
          table = sa.Table(table_name, self.db.metadata)
          self.db.drop_table(table, if_exists=if_exists)
-
-     @abstractmethod
-     def merge_dataset_rows(
-         self,
-         src: "DatasetRecord",
-         dst: "DatasetRecord",
-         src_version: str,
-         dst_version: str,
-     ) -> None:
-         """
-         Merges source dataset rows and current latest destination dataset rows
-         into a new rows table created for new destination dataset version.
-         Note that table for new destination version must be created upfront.
-         Merge results should not contain duplicates.
-         """
+         # Remove from metadata cache to allow recreation
+         if table_name in self.db.metadata.tables:
+             self.db.metadata.remove(self.db.metadata.tables[table_name])

      def dataset_rows_select(
          self,
@@ -341,7 +423,7 @@ class AbstractWarehouse(ABC, Serializable):
          """
          Fetch dataset rows from database using a list of IDs.
          """
-         if (id_col := get_query_id_column(query)) is None:
+         if (id_col := query.selected_columns.get("sys__id")) is None:
              raise RuntimeError("sys__id column not found in query")

          query = query._clone().offset(None).limit(None).order_by(None)
@@ -385,7 +467,7 @@ class AbstractWarehouse(ABC, Serializable):

      def dataset_stats(
          self, dataset: DatasetRecord, version: str
-     ) -> tuple[Optional[int], Optional[int]]:
+     ) -> tuple[int | None, int | None]:
          """
          Returns tuple with dataset stats: total number of rows and total dataset size.
          """
@@ -415,7 +497,12 @@ class AbstractWarehouse(ABC, Serializable):
          """Convert File entries so they can be passed on to `insert_rows()`"""

      @abstractmethod
-     def insert_rows(self, table: sa.Table, rows: Iterable[dict[str, Any]]) -> None:
+     def insert_rows(
+         self,
+         table: sa.Table,
+         rows: Iterable[dict[str, Any]],
+         batch_size: int = INSERT_BATCH_SIZE,
+     ) -> None:
          """Does batch inserts of any kind of rows into table"""

      def insert_rows_done(self, table: sa.Table) -> None:
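`insert_rows` now accepts a `batch_size` keyword defaulting to the new `INSERT_BATCH_SIZE` constant. A hedged sketch of how an implementation might chunk an arbitrary iterable of row dicts; the `batched` helper and the execute call are illustrative, not datachain's actual code:

    # Sketch only, assuming SQLAlchemy-style execution of one chunk at a time.
    from collections.abc import Iterable, Iterator
    from itertools import islice
    from typing import Any

    import sqlalchemy as sa

    INSERT_BATCH_SIZE = 10_000


    def batched(rows: Iterable[dict[str, Any]], size: int) -> Iterator[list[dict[str, Any]]]:
        it = iter(rows)
        while chunk := list(islice(it, size)):
            yield chunk


    def insert_rows(
        conn: sa.engine.Connection,
        table: sa.Table,
        rows: Iterable[dict[str, Any]],
        batch_size: int = INSERT_BATCH_SIZE,
    ) -> None:
        for chunk in batched(rows, batch_size):
            # executemany-style insert keeps memory bounded for large row streams
            conn.execute(table.insert(), chunk)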
@@ -505,7 +592,7 @@ class AbstractWarehouse(ABC, Serializable):
          dr = dataset_rows
          columns = [c.name for c in query.selected_columns]
          for row in self.db.execute(query):
-             d = dict(zip(columns, row))
+             d = dict(zip(columns, row, strict=False))
              yield Node(**{dr.without_object(k): v for k, v in d.items()})

      def get_dirs_by_parent_path(
@@ -742,7 +829,7 @@ class AbstractWarehouse(ABC, Serializable):
      def size(
          self,
          dataset_rows: "DataTable",
-         node: Union[Node, dict[str, Any]],
+         node: Node | dict[str, Any],
          count_files: bool = False,
      ) -> tuple[int, int]:
          """
@@ -784,10 +871,10 @@ class AbstractWarehouse(ABC, Serializable):
          self,
          dataset_rows: "DataTable",
          parent_path: str,
-         fields: Optional[Sequence[str]] = None,
-         type: Optional[str] = None,
+         fields: Sequence[str] | None = None,
+         type: str | None = None,
          conds=None,
-         order_by: Optional[Union[str, list[str]]] = None,
+         order_by: str | list[str] | None = None,
          include_subobjects: bool = True,
      ) -> sa.Select:
          if not conds:
@@ -825,7 +912,7 @@ class AbstractWarehouse(ABC, Serializable):
          self,
          dataset_rows: "DataTable",
          node: Node,
-         sort: Union[list[str], str, None] = None,
+         sort: list[str] | str | None = None,
          include_subobjects: bool = True,
      ) -> Iterator[NodeWithPath]:
          """
@@ -883,20 +970,25 @@ class AbstractWarehouse(ABC, Serializable):
      def create_udf_table(
          self,
          columns: Sequence["sa.Column"] = (),
-         name: Optional[str] = None,
+         name: str | None = None,
      ) -> sa.Table:
          """
          Create a temporary table for storing custom signals generated by a UDF.
          SQLite TEMPORARY tables cannot be directly used as they are process-specific,
          and UDFs are run in other processes when run in parallel.
          """
+         columns = [
+             c
+             for c in columns
+             if c.name not in [col.name for col in self.dataset_row_cls.sys_columns()]
+         ]
          tbl = sa.Table(
              name or self.udf_table_name(),
              sa.MetaData(),
-             sa.Column("sys__id", Int, primary_key=True),
+             *self.dataset_row_cls.sys_columns(),
              *columns,
          )
-         self.db.create_table(tbl, if_not_exists=True)
+         self.db.create_table(tbl, if_not_exists=True, kind="udf")
          return tbl

      @abstractmethod
@@ -904,7 +996,7 @@ class AbstractWarehouse(ABC, Serializable):
          self,
          table: sa.Table,
          query: sa.Select,
-         progress_cb: Optional[Callable[[int], None]] = None,
+         progress_cb: Callable[[int], None] | None = None,
      ) -> None:
          """
          Copy the results of a query into a table.
@@ -917,6 +1009,8 @@ class AbstractWarehouse(ABC, Serializable):
          right: "_FromClauseArgument",
          onclause: "_OnClauseArgument",
          inner: bool = True,
+         full: bool = False,
+         columns=None,
      ) -> sa.Select:
          """
          Join two tables together.
@@ -953,7 +1047,5 @@


  def _random_string(length: int) -> str:
-     return "".join(
-         random.choice(string.ascii_letters + string.digits)  # noqa: S311
-         for i in range(length)
-     )
+     alphabet = string.ascii_letters + string.digits
+     return "".join(secrets.choice(alphabet) for _ in range(length))
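Replacing `random.choice` with `secrets.choice` in `_random_string` also drops the `# noqa: S311` suppression the old pseudo-random call carried; for strings this short, the extra cost of the CSPRNG is negligible.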