datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. datachain/__init__.py +20 -0
  2. datachain/asyn.py +11 -12
  3. datachain/cache.py +7 -7
  4. datachain/catalog/__init__.py +2 -2
  5. datachain/catalog/catalog.py +621 -507
  6. datachain/catalog/dependency.py +164 -0
  7. datachain/catalog/loader.py +28 -18
  8. datachain/checkpoint.py +43 -0
  9. datachain/cli/__init__.py +24 -33
  10. datachain/cli/commands/__init__.py +1 -8
  11. datachain/cli/commands/datasets.py +83 -52
  12. datachain/cli/commands/ls.py +17 -17
  13. datachain/cli/commands/show.py +4 -4
  14. datachain/cli/parser/__init__.py +8 -74
  15. datachain/cli/parser/job.py +95 -3
  16. datachain/cli/parser/studio.py +11 -4
  17. datachain/cli/parser/utils.py +1 -2
  18. datachain/cli/utils.py +2 -15
  19. datachain/client/azure.py +4 -4
  20. datachain/client/fsspec.py +45 -28
  21. datachain/client/gcs.py +6 -6
  22. datachain/client/hf.py +29 -2
  23. datachain/client/http.py +157 -0
  24. datachain/client/local.py +15 -11
  25. datachain/client/s3.py +17 -9
  26. datachain/config.py +4 -8
  27. datachain/data_storage/db_engine.py +12 -6
  28. datachain/data_storage/job.py +5 -1
  29. datachain/data_storage/metastore.py +1252 -186
  30. datachain/data_storage/schema.py +58 -45
  31. datachain/data_storage/serializer.py +105 -15
  32. datachain/data_storage/sqlite.py +286 -127
  33. datachain/data_storage/warehouse.py +250 -113
  34. datachain/dataset.py +353 -148
  35. datachain/delta.py +391 -0
  36. datachain/diff/__init__.py +27 -29
  37. datachain/error.py +60 -0
  38. datachain/func/__init__.py +2 -1
  39. datachain/func/aggregate.py +66 -42
  40. datachain/func/array.py +242 -38
  41. datachain/func/base.py +7 -4
  42. datachain/func/conditional.py +110 -60
  43. datachain/func/func.py +96 -45
  44. datachain/func/numeric.py +55 -38
  45. datachain/func/path.py +32 -20
  46. datachain/func/random.py +2 -2
  47. datachain/func/string.py +67 -37
  48. datachain/func/window.py +7 -8
  49. datachain/hash_utils.py +123 -0
  50. datachain/job.py +11 -7
  51. datachain/json.py +138 -0
  52. datachain/lib/arrow.py +58 -22
  53. datachain/lib/audio.py +245 -0
  54. datachain/lib/clip.py +14 -13
  55. datachain/lib/convert/flatten.py +5 -3
  56. datachain/lib/convert/python_to_sql.py +6 -10
  57. datachain/lib/convert/sql_to_python.py +8 -0
  58. datachain/lib/convert/values_to_tuples.py +156 -51
  59. datachain/lib/data_model.py +42 -20
  60. datachain/lib/dataset_info.py +36 -8
  61. datachain/lib/dc/__init__.py +8 -2
  62. datachain/lib/dc/csv.py +25 -28
  63. datachain/lib/dc/database.py +398 -0
  64. datachain/lib/dc/datachain.py +1289 -425
  65. datachain/lib/dc/datasets.py +320 -38
  66. datachain/lib/dc/hf.py +38 -24
  67. datachain/lib/dc/json.py +29 -32
  68. datachain/lib/dc/listings.py +112 -8
  69. datachain/lib/dc/pandas.py +16 -12
  70. datachain/lib/dc/parquet.py +35 -23
  71. datachain/lib/dc/records.py +31 -23
  72. datachain/lib/dc/storage.py +154 -64
  73. datachain/lib/dc/storage_pattern.py +251 -0
  74. datachain/lib/dc/utils.py +24 -16
  75. datachain/lib/dc/values.py +8 -9
  76. datachain/lib/file.py +622 -89
  77. datachain/lib/hf.py +69 -39
  78. datachain/lib/image.py +14 -14
  79. datachain/lib/listing.py +14 -11
  80. datachain/lib/listing_info.py +1 -2
  81. datachain/lib/meta_formats.py +3 -4
  82. datachain/lib/model_store.py +39 -7
  83. datachain/lib/namespaces.py +125 -0
  84. datachain/lib/projects.py +130 -0
  85. datachain/lib/pytorch.py +32 -21
  86. datachain/lib/settings.py +192 -56
  87. datachain/lib/signal_schema.py +427 -104
  88. datachain/lib/tar.py +1 -2
  89. datachain/lib/text.py +8 -7
  90. datachain/lib/udf.py +164 -76
  91. datachain/lib/udf_signature.py +60 -35
  92. datachain/lib/utils.py +118 -4
  93. datachain/lib/video.py +17 -9
  94. datachain/lib/webdataset.py +61 -56
  95. datachain/lib/webdataset_laion.py +15 -16
  96. datachain/listing.py +22 -10
  97. datachain/model/bbox.py +3 -1
  98. datachain/model/ultralytics/bbox.py +16 -12
  99. datachain/model/ultralytics/pose.py +16 -12
  100. datachain/model/ultralytics/segment.py +16 -12
  101. datachain/namespace.py +84 -0
  102. datachain/node.py +6 -6
  103. datachain/nodes_thread_pool.py +0 -1
  104. datachain/plugins.py +24 -0
  105. datachain/project.py +78 -0
  106. datachain/query/batch.py +40 -41
  107. datachain/query/dataset.py +604 -322
  108. datachain/query/dispatch.py +261 -154
  109. datachain/query/metrics.py +4 -6
  110. datachain/query/params.py +2 -3
  111. datachain/query/queue.py +3 -12
  112. datachain/query/schema.py +11 -6
  113. datachain/query/session.py +200 -33
  114. datachain/query/udf.py +34 -2
  115. datachain/remote/studio.py +171 -69
  116. datachain/script_meta.py +12 -12
  117. datachain/semver.py +68 -0
  118. datachain/sql/__init__.py +2 -0
  119. datachain/sql/functions/array.py +33 -1
  120. datachain/sql/postgresql_dialect.py +9 -0
  121. datachain/sql/postgresql_types.py +21 -0
  122. datachain/sql/sqlite/__init__.py +5 -1
  123. datachain/sql/sqlite/base.py +102 -29
  124. datachain/sql/sqlite/types.py +8 -13
  125. datachain/sql/types.py +70 -15
  126. datachain/studio.py +223 -46
  127. datachain/toolkit/split.py +31 -10
  128. datachain/utils.py +101 -59
  129. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
  130. datachain-0.39.0.dist-info/RECORD +173 -0
  131. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
  132. datachain/cli/commands/query.py +0 -53
  133. datachain/query/utils.py +0 -42
  134. datachain-0.14.2.dist-info/RECORD +0 -158
  135. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
  136. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
  137. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/func/path.py CHANGED
@@ -8,23 +8,26 @@ def parent(col: ColT) -> Func:
8
8
  Returns the directory component of a posix-style path.
9
9
 
10
10
  Args:
11
- col (str | literal | Func): String to compute the path parent of.
11
+ col (str | Column | Func | literal): String to compute the path parent of.
12
12
  If a string is provided, it is assumed to be the name of the column.
13
- If a literal is provided, it is assumed to be a string literal.
13
+ If a Column is provided, it is assumed to be a column object.
14
14
  If a Func is provided, it is assumed to be a function returning a string.
15
+ If a literal is provided, it is assumed to be a string literal.
15
16
 
16
17
  Returns:
17
- Func: A Func object that represents the path parent function.
18
+ Func: A `Func` object that represents the path parent function.
18
19
 
19
20
  Example:
20
21
  ```py
21
22
  dc.mutate(
22
- parent=func.path.parent("file.path"),
23
+ parent1=func.path.parent("file.path"),
24
+ parent2=func.path.parent(dc.C("file.path")),
25
+ parent3=func.path.parent(dc.func.literal("/path/to/file.txt")),
23
26
  )
24
27
  ```
25
28
 
26
29
  Note:
27
- - Result column will always be of type string.
30
+ - The result column will always be of type string.
28
31
  """
29
32
  return Func("parent", inner=path.parent, cols=[col], result_type=str)
30
33
 
@@ -34,23 +37,26 @@ def name(col: ColT) -> Func:
34
37
  Returns the final component of a posix-style path.
35
38
 
36
39
  Args:
37
- col (str | literal): String to compute the path name of.
40
+ col (str | Column | Func | literal): String to compute the path name of.
38
41
  If a string is provided, it is assumed to be the name of the column.
39
- If a literal is provided, it is assumed to be a string literal.
42
+ If a Column is provided, it is assumed to be a column object.
40
43
  If a Func is provided, it is assumed to be a function returning a string.
44
+ If a literal is provided, it is assumed to be a string literal.
41
45
 
42
46
  Returns:
43
- Func: A Func object that represents the path name function.
47
+ Func: A `Func` object that represents the path name function.
44
48
 
45
49
  Example:
46
50
  ```py
47
51
  dc.mutate(
48
- file_name=func.path.name("file.path"),
52
+ filename1=func.path.name("file.path"),
53
+ filename2=func.path.name(dc.C("file.path")),
54
+ filename3=func.path.name(dc.func.literal("/path/to/file.txt")),
49
55
  )
50
56
  ```
51
57
 
52
58
  Note:
53
- - Result column will always be of type string.
59
+ - The result column will always be of type string.
54
60
  """
55
61
 
56
62
  return Func("name", inner=path.name, cols=[col], result_type=str)
@@ -61,23 +67,26 @@ def file_stem(col: ColT) -> Func:
61
67
  Returns the path without the extension.
62
68
 
63
69
  Args:
64
- col (str | literal): String to compute the file stem of.
70
+ col (str | Column | Func | literal): String to compute the file stem of.
65
71
  If a string is provided, it is assumed to be the name of the column.
66
- If a literal is provided, it is assumed to be a string literal.
72
+ If a Column is provided, it is assumed to be a column object.
67
73
  If a Func is provided, it is assumed to be a function returning a string.
74
+ If a literal is provided, it is assumed to be a string literal.
68
75
 
69
76
  Returns:
70
- Func: A Func object that represents the file stem function.
77
+ Func: A `Func` object that represents the file stem function.
71
78
 
72
79
  Example:
73
80
  ```py
74
81
  dc.mutate(
75
- file_stem=func.path.file_stem("file.path"),
82
+ filestem1=func.path.file_stem("file.path"),
83
+ filestem2=func.path.file_stem(dc.C("file.path")),
84
+ filestem3=func.path.file_stem(dc.func.literal("/path/to/file.txt")),
76
85
  )
77
86
  ```
78
87
 
79
88
  Note:
80
- - Result column will always be of type string.
89
+ - The result column will always be of type string.
81
90
  """
82
91
 
83
92
  return Func("file_stem", inner=path.file_stem, cols=[col], result_type=str)
@@ -88,23 +97,26 @@ def file_ext(col: ColT) -> Func:
88
97
  Returns the extension of the given path.
89
98
 
90
99
  Args:
91
- col (str | literal): String to compute the file extension of.
100
+ col (str | Column | Func | literal): String to compute the file extension of.
92
101
  If a string is provided, it is assumed to be the name of the column.
93
- If a literal is provided, it is assumed to be a string literal.
102
+ If a Column is provided, it is assumed to be a column object.
94
103
  If a Func is provided, it is assumed to be a function returning a string.
104
+ If a literal is provided, it is assumed to be a string literal.
95
105
 
96
106
  Returns:
97
- Func: A Func object that represents the file extension function.
107
+ Func: A `Func` object that represents the file extension function.
98
108
 
99
109
  Example:
100
110
  ```py
101
111
  dc.mutate(
102
- file_stem=func.path.file_ext("file.path"),
112
+ fileext1=func.path.file_ext("file.path"),
113
+ fileext2=func.path.file_ext(dc.C("file.path")),
114
+ fileext3=func.path.file_ext(dc.func.literal("/path/to/file.txt")),
103
115
  )
104
116
  ```
105
117
 
106
118
  Note:
107
- - Result column will always be of type string.
119
+ - The result column will always be of type string.
108
120
  """
109
121
 
110
122
  return Func("file_ext", inner=path.file_ext, cols=[col], result_type=str)
datachain/func/random.py CHANGED
@@ -8,7 +8,7 @@ def rand() -> Func:
8
8
  Returns the random integer value.
9
9
 
10
10
  Returns:
11
- Func: A Func object that represents the rand function.
11
+ Func: A `Func` object that represents the rand function.
12
12
 
13
13
  Example:
14
14
  ```py
@@ -18,6 +18,6 @@ def rand() -> Func:
18
18
  ```
19
19
 
20
20
  Note:
21
- - Result column will always be of type integer.
21
+ - The result column will always be of type integer.
22
22
  """
23
23
  return Func("rand", inner=random.rand, result_type=int)
datachain/func/string.py CHANGED
@@ -1,64 +1,76 @@
1
- from typing import Optional, Union, get_origin
1
+ from typing import get_origin
2
2
 
3
3
  from sqlalchemy import literal
4
4
 
5
5
  from datachain.sql.functions import string
6
6
 
7
- from .func import Func
7
+ from .func import ColT, Func
8
8
 
9
+ __all__ = [
10
+ "byte_hamming_distance",
11
+ "length",
12
+ "regexp_replace",
13
+ "replace",
14
+ "split",
15
+ ]
9
16
 
10
- def length(col: Union[str, Func]) -> Func:
17
+
18
+ def length(col: ColT) -> Func:
11
19
  """
12
20
  Returns the length of the string.
13
21
 
14
22
  Args:
15
- col (str | literal | Func): String to compute the length of.
23
+ col (str | Column | Func | literal): String to compute the length of.
16
24
  If a string is provided, it is assumed to be the name of the column.
17
- If a literal is provided, it is assumed to be a string literal.
25
+ If a Column is provided, it is assumed to be a column in the dataset.
18
26
  If a Func is provided, it is assumed to be a function returning a string.
27
+ If a literal is provided, it is assumed to be a string literal.
19
28
 
20
29
  Returns:
21
- Func: A Func object that represents the string length function.
30
+ Func: A `Func` object that represents the string length function.
22
31
 
23
32
  Example:
24
33
  ```py
25
34
  dc.mutate(
26
35
  len1=func.string.length("file.path"),
27
- len2=func.string.length("Random string"),
36
+ len2=func.string.length(dc.C("file.path")),
37
+ len3=func.string.length(dc.func.literal("Random string")),
28
38
  )
29
39
  ```
30
40
 
31
- Note:
32
- - Result column will always be of type int.
41
+ Notes:
42
+ - The result column will always be of type int.
33
43
  """
34
44
  return Func("length", inner=string.length, cols=[col], result_type=int)
35
45
 
36
46
 
37
- def split(col: Union[str, Func], sep: str, limit: Optional[int] = None) -> Func:
47
+ def split(col: ColT, sep: str, limit: int | None = None) -> Func:
38
48
  """
39
49
  Takes a column and split character and returns an array of the parts.
40
50
 
41
51
  Args:
42
- col (str | literal): Column to split.
52
+ col (str | Column | Func | literal): Column to split.
43
53
  If a string is provided, it is assumed to be the name of the column.
44
- If a literal is provided, it is assumed to be a string literal.
54
+ If a Column is provided, it is assumed to be a column in the dataset.
45
55
  If a Func is provided, it is assumed to be a function returning a string.
56
+ If a literal is provided, it is assumed to be a string literal.
46
57
  sep (str): Separator to split the string.
47
58
  limit (int, optional): Maximum number of splits to perform.
48
59
 
49
60
  Returns:
50
- Func: A Func object that represents the split function.
61
+ Func: A `Func` object that represents the split function.
51
62
 
52
63
  Example:
53
64
  ```py
54
65
  dc.mutate(
55
66
  path_parts=func.string.split("file.path", "/"),
56
- str_words=func.string.length("Random string", " "),
67
+ signal_values=func.string.split(dc.C("signal.value"), ","),
68
+ str_words=func.string.split(dc.func.literal("Random string"), " "),
57
69
  )
58
70
  ```
59
71
 
60
- Note:
61
- - Result column will always be of type array of strings.
72
+ Notes:
73
+ - The result column will always be of type array of strings.
62
74
  """
63
75
 
64
76
  def inner(arg):
@@ -76,30 +88,33 @@ def split(col: Union[str, Func], sep: str, limit: Optional[int] = None) -> Func:
76
88
  return Func("split", inner=inner, cols=cols, args=args, result_type=list[str])
77
89
 
78
90
 
79
- def replace(col: Union[str, Func], pattern: str, replacement: str) -> Func:
91
+ def replace(col: ColT, pattern: str, replacement: str) -> Func:
80
92
  """
81
93
  Replaces substring with another string.
82
94
 
83
95
  Args:
84
- col (str | literal): Column to split.
96
+ col (str | Column | Func | literal): Column to perform replacement on.
85
97
  If a string is provided, it is assumed to be the name of the column.
86
- If a literal is provided, it is assumed to be a string literal.
98
+ If a Column is provided, it is assumed to be a column in the dataset.
87
99
  If a Func is provided, it is assumed to be a function returning a string.
100
+ If a literal is provided, it is assumed to be a string literal.
88
101
  pattern (str): Pattern to replace.
89
102
  replacement (str): Replacement string.
90
103
 
91
104
  Returns:
92
- Func: A Func object that represents the replace function.
105
+ Func: A `Func` object that represents the replace function.
93
106
 
94
107
  Example:
95
108
  ```py
96
109
  dc.mutate(
97
- signal=func.string.replace("signal.name", "pattern", "replacement),
110
+ s1=func.string.replace("signal.name", "pattern", "replacement"),
111
+ s2=func.string.replace(dc.C("signal.name"), "pattern", "replacement"),
112
+ s3=func.string.replace(dc.func.literal("Random string"), "Random", "New"),
98
113
  )
99
114
  ```
100
115
 
101
- Note:
102
- - Result column will always be of type string.
116
+ Notes:
117
+ - The result column will always be of type string.
103
118
  """
104
119
 
105
120
  def inner(arg):
@@ -115,30 +130,37 @@ def replace(col: Union[str, Func], pattern: str, replacement: str) -> Func:
115
130
  return Func("replace", inner=inner, cols=cols, args=args, result_type=str)
116
131
 
117
132
 
118
- def regexp_replace(col: Union[str, Func], regex: str, replacement: str) -> Func:
133
+ def regexp_replace(col: ColT, regex: str, replacement: str) -> Func:
119
134
  r"""
120
135
  Replaces substrings that match a regular expression.
121
136
 
122
137
  Args:
123
- col (str | literal): Column to split.
138
+ col (str | Column | Func | literal): Column to perform replacement on.
124
139
  If a string is provided, it is assumed to be the name of the column.
125
- If a literal is provided, it is assumed to be a string literal.
140
+ If a Column is provided, it is assumed to be a column in the dataset.
126
141
  If a Func is provided, it is assumed to be a function returning a string.
142
+ If a literal is provided, it is assumed to be a string literal.
127
143
  regex (str): Regular expression pattern to replace.
128
144
  replacement (str): Replacement string.
129
145
 
130
146
  Returns:
131
- Func: A Func object that represents the regexp_replace function.
147
+ Func: A `Func` object that represents the regexp_replace function.
132
148
 
133
149
  Example:
134
150
  ```py
135
151
  dc.mutate(
136
- signal=func.string.regexp_replace("signal.name", r"\d+", "X"),
152
+ s1=func.string.regexp_replace("signal.name", r"\d+", "X"),
153
+ s2=func.string.regexp_replace(dc.C("signal.name"), r"\d+", "X"),
154
+ s3=func.string.regexp_replace(
155
+ dc.func.literal("Random string"),
156
+ r"\s+",
157
+ "_",
158
+ ),
137
159
  )
138
160
  ```
139
161
 
140
- Note:
141
- - Result column will always be of type string.
162
+ Notes:
163
+ - The result column will always be of type string.
142
164
  """
143
165
 
144
166
  def inner(arg):
@@ -154,7 +176,7 @@ def regexp_replace(col: Union[str, Func], regex: str, replacement: str) -> Func:
154
176
  return Func("regexp_replace", inner=inner, cols=cols, args=args, result_type=str)
155
177
 
156
178
 
157
- def byte_hamming_distance(*args: Union[str, Func]) -> Func:
179
+ def byte_hamming_distance(*args: ColT) -> Func:
158
180
  """
159
181
  Computes the Hamming distance between two strings.
160
182
 
@@ -164,22 +186,30 @@ def byte_hamming_distance(*args: Union[str, Func]) -> Func:
164
186
  of the strings indicate higher dissimilarity.
165
187
 
166
188
  Args:
167
- args (str | literal): Two strings to compute the Hamming distance between.
168
- If a str is provided, it is assumed to be the name of the column.
169
- If a Literal is provided, it is assumed to be a string literal.
189
+ args (str | Column | Func | literal): Two strings to compute
190
+ the Hamming distance between.
191
+ If a string is provided, it is assumed to be the name of the column.
192
+ If a Column is provided, it is assumed to be a column in the dataset.
193
+ If a Func is provided, it is assumed to be a function returning a string.
194
+ If a literal is provided, it is assumed to be a string literal.
170
195
 
171
196
  Returns:
172
- Func: A Func object that represents the Hamming distance function.
197
+ Func: A `Func` object that represents the Hamming distance function.
173
198
 
174
199
  Example:
175
200
  ```py
176
201
  dc.mutate(
177
- ham_dist=func.byte_hamming_distance("file.phash", literal("hello")),
202
+ hd1=func.byte_hamming_distance("file.phash", literal("hello")),
203
+ hd2=func.byte_hamming_distance(dc.C("file.phash"), "hello"),
204
+ hd3=func.byte_hamming_distance(
205
+ dc.func.literal("hi"),
206
+ dc.func.literal("hello"),
207
+ ),
178
208
  )
179
209
  ```
180
210
 
181
211
  Notes:
182
- - Result column will always be of type int.
212
+ - The result column will always be of type int.
183
213
  """
184
214
  cols, func_args = [], []
185
215
  for arg in args:
datachain/func/window.py CHANGED
@@ -22,17 +22,16 @@ def window(partition_by: str, order_by: str, desc: bool = False) -> Window:
22
22
 
23
23
  Args:
24
24
  partition_by (str): The column name by which to partition the result set.
25
- Rows with the same value in the partition column
26
- will be grouped together for the window function.
27
- order_by (str): The column name by which to order the rows
28
- within each partition. This determines the sequence in which
29
- the window function is applied.
25
+ Rows with the same value in the partition column will be grouped together
26
+ for the window function.
27
+ order_by (str): The column name by which to order the rows within
28
+ each partition. This determines the sequence in which the window function
29
+ is applied.
30
30
  desc (bool, optional): If True, the rows will be ordered in descending order.
31
- Defaults to False, which orders the rows
32
- in ascending order.
31
+ Defaults to False, which orders the rows in ascending order.
33
32
 
34
33
  Returns:
35
- Window: A Window object representing the window specification.
34
+ Window: A `Window` object representing the window specification.
36
35
 
37
36
  Example:
38
37
  ```py
@@ -0,0 +1,123 @@
1
+ import hashlib
2
+ import inspect
3
+ import textwrap
4
+ from collections.abc import Sequence
5
+ from typing import TypeAlias, TypeVar
6
+
7
+ from sqlalchemy.sql.elements import ClauseElement, ColumnElement
8
+
9
+ from datachain import json
10
+
11
+ T = TypeVar("T", bound=ColumnElement)
12
+ ColumnLike: TypeAlias = str | T
13
+
14
+
15
def _serialize_value(val):  # noqa: PLR0911
    """Recursively convert ``val`` into a structure built from plain values.

    Handling, in order:
      * ``None``, ``str``, ``int``, ``float`` and ``bool`` are returned as-is.
      * SQLAlchemy ``ClauseElement`` objects are delegated to
        ``serialize_column_element``.
      * ``dict`` values are rebuilt with their items in sorted order and each
        value serialized recursively.
      * ``list`` / ``tuple`` values become a list of serialized items.
      * Other callables are represented by their ``__name__`` when they have
        one, otherwise by ``str(val)``.
      * Anything else is stringified.
    """
    if val is None or isinstance(val, (str, int, float, bool)):
        return val
    if isinstance(val, ClauseElement):
        return serialize_column_element(val)
    if isinstance(val, dict):
        # Sorted items keep the serialized form independent of insertion order.
        return {key: _serialize_value(item) for key, item in sorted(val.items())}
    if isinstance(val, (list, tuple)):
        return list(map(_serialize_value, val))
    if callable(val):
        try:
            return val.__name__
        except AttributeError:
            return str(val)
    return str(val)
31
+
32
+
33
def serialize_column_element(expr: str | ColumnElement) -> dict:
    """
    Turn a SQLAlchemy expression into a nested dict suitable for stable hashing.

    The expression's ``_traverse_internals`` listing is used to discover its
    attributes, so no per-type handling is needed for standard SQLAlchemy
    constructs.

    Args:
        expr: The expression to serialize. Values that are not a
            ``ClauseElement`` (for example a plain string) are stringified.

    Returns:
        dict: Always contains a ``"type"`` key. Bind parameters add
        ``"value"``; traversable clause elements add one key per traversed
        attribute; everything else adds ``"repr"``.
    """
    from sqlalchemy.sql.elements import BindParameter

    # A BindParameter's 'key' attribute is non-deterministic, so only its
    # value takes part in the serialized form.
    if isinstance(expr, BindParameter):
        return {"type": "bind", "value": _serialize_value(expr.value)}

    # Completely unknown types: stringify.
    if not isinstance(expr, ClauseElement):
        return {"type": "other", "repr": str(expr)}

    type_name = expr.__class__.__name__

    # Custom ClauseElement without _traverse_internals: its structure is
    # unknown, so fall back to its string form.
    if not hasattr(expr, "_traverse_internals"):
        return {"type": type_name, "repr": str(expr)}

    serialized = {"type": type_name}
    for attr_name, _ in expr._traverse_internals:
        # 'table' is skipped: table names can be auto-generated/random and
        # are not semantically important for hashing.
        if attr_name == "table":
            continue
        if not hasattr(expr, attr_name):
            continue
        serialized[attr_name] = _serialize_value(getattr(expr, attr_name))
    return serialized
64
+
65
+
66
def hash_column_elements(columns: ColumnLike | Sequence[ColumnLike]) -> str:
    """
    Compute a SHA-256 hex digest for one column expression or an ordered
    sequence of them.

    Each element is serialized with ``serialize_column_element``, the list of
    results is dumped to JSON with sorted keys, and the UTF-8 bytes of that
    JSON are hashed.

    Args:
        columns: A single string / ``ColumnElement``, or an ordered iterable
            (such as a list or tuple) of them. Element order affects the hash.

    Returns:
        str: Hexadecimal SHA-256 digest.
    """
    # A lone column (or string) is treated as a one-element sequence.
    if isinstance(columns, (ColumnElement, str)):
        columns = (columns,)

    payload = json.dumps(
        [serialize_column_element(col) for col in columns],
        sort_keys=True,
        separators=(", ", ": "),
    )
    digest = hashlib.sha256(payload.encode("utf-8"))
    return digest.hexdigest()
80
+
81
+
82
def hash_callable(func):
    """
    Calculate a SHA-256 hex digest that identifies a callable.

    Rules:
    - Named functions and methods (def) → dedented source code when it can be
      retrieved, otherwise bytecode.
    - Lambdas → bytecode.
    - Instances whose class defines ``__call__`` → hashed through their bound
      ``__call__`` method (instance state is not part of the hash).
    - ``functools.partial`` objects → hash of the wrapped callable combined
      with the bound positional and keyword arguments.
    - Callables with neither source nor bytecode (e.g. builtins) → their
      ``module.qualname``.

    The name, default values and annotation names are mixed into the digest
    so that callables with the same code but different metadata differ.
    Plain functions and lambdas hash exactly as they did before the handling
    of the other callable kinds was added.

    Args:
        func: Any callable.

    Returns:
        str: Hexadecimal SHA-256 digest.

    Raises:
        TypeError: If ``func`` is not callable.
    """
    # Function-scope import: functools is only needed for the partial check.
    import functools

    if not callable(func):
        raise TypeError("Expected a callable")

    if isinstance(func, functools.partial):
        # partial objects expose no __name__/__code__; identify them by the
        # wrapped callable plus the arguments bound to it.
        bound = (func.args, sorted(func.keywords.items()))
        partial_hash = hashlib.sha256()
        partial_hash.update(hash_callable(func.func).encode())
        partial_hash.update(str(bound).encode())
        return partial_hash.hexdigest()

    if not (inspect.isfunction(func) or inspect.ismethod(func)):
        # Callable instance: its bound __call__ carries the name, source and
        # bytecode that the instance itself lacks.
        call = getattr(func, "__call__", None)
        if inspect.ismethod(call):
            func = call

    name = getattr(func, "__name__", type(func).__name__)

    payload = None
    if name != "<lambda>":
        # Try to get exact source of the named callable.
        try:
            lines, _ = inspect.getsourcelines(func)
            payload = textwrap.dedent("".join(lines)).strip()
        except (OSError, TypeError):
            payload = None

    if payload is None:
        # Lambdas, and named callables whose source is unavailable.
        code = getattr(func, "__code__", None)
        if code is not None:
            payload = code.co_code
        else:
            # No bytecode either (e.g. builtins): use the qualified name.
            module = getattr(func, "__module__", None) or type(func).__module__
            qualname = (
                getattr(func, "__qualname__", None) or type(func).__qualname__
            )
            payload = f"{module}.{qualname}"

    # Normalize annotations to their names.
    raw_annotations = getattr(func, "__annotations__", None) or {}
    annotations = {k: getattr(v, "__name__", str(v)) for k, v in raw_annotations.items()}

    # Extras to distinguish functions with same code but different metadata.
    extras = {
        "name": name,
        "defaults": getattr(func, "__defaults__", None),
        "annotations": annotations,
    }

    h = hashlib.sha256()
    h.update(payload.encode() if isinstance(payload, str) else payload)
    h.update(str(extras).encode())
    return h.hexdigest()
datachain/job.py CHANGED
@@ -1,8 +1,9 @@
1
- import json
2
1
  import uuid
3
2
  from dataclasses import dataclass
4
3
  from datetime import datetime
5
- from typing import Any, Optional, TypeVar, Union
4
+ from typing import Any, TypeVar
5
+
6
+ from datachain import json
6
7
 
7
8
  J = TypeVar("J", bound="Job")
8
9
 
@@ -18,27 +19,29 @@ class Job:
18
19
  workers: int
19
20
  params: dict[str, str]
20
21
  metrics: dict[str, Any]
21
- finished_at: Optional[datetime] = None
22
- python_version: Optional[str] = None
22
+ finished_at: datetime | None = None
23
+ python_version: str | None = None
23
24
  error_message: str = ""
24
25
  error_stack: str = ""
26
+ parent_job_id: str | None = None
25
27
 
26
28
  @classmethod
27
29
  def parse(
28
30
  cls,
29
- id: Union[str, uuid.UUID],
31
+ id: str | uuid.UUID,
30
32
  name: str,
31
33
  status: int,
32
34
  created_at: datetime,
33
- finished_at: Optional[datetime],
35
+ finished_at: datetime | None,
34
36
  query: str,
35
37
  query_type: int,
36
38
  workers: int,
37
- python_version: Optional[str],
39
+ python_version: str | None,
38
40
  error_message: str,
39
41
  error_stack: str,
40
42
  params: str,
41
43
  metrics: str,
44
+ parent_job_id: str | None,
42
45
  ) -> "Job":
43
46
  return cls(
44
47
  str(id),
@@ -54,4 +57,5 @@ class Job:
54
57
  python_version,
55
58
  error_message,
56
59
  error_stack,
60
+ str(parent_job_id) if parent_job_id else None,
57
61
  )