datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +20 -0
- datachain/asyn.py +11 -12
- datachain/cache.py +7 -7
- datachain/catalog/__init__.py +2 -2
- datachain/catalog/catalog.py +621 -507
- datachain/catalog/dependency.py +164 -0
- datachain/catalog/loader.py +28 -18
- datachain/checkpoint.py +43 -0
- datachain/cli/__init__.py +24 -33
- datachain/cli/commands/__init__.py +1 -8
- datachain/cli/commands/datasets.py +83 -52
- datachain/cli/commands/ls.py +17 -17
- datachain/cli/commands/show.py +4 -4
- datachain/cli/parser/__init__.py +8 -74
- datachain/cli/parser/job.py +95 -3
- datachain/cli/parser/studio.py +11 -4
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +2 -15
- datachain/client/azure.py +4 -4
- datachain/client/fsspec.py +45 -28
- datachain/client/gcs.py +6 -6
- datachain/client/hf.py +29 -2
- datachain/client/http.py +157 -0
- datachain/client/local.py +15 -11
- datachain/client/s3.py +17 -9
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +12 -6
- datachain/data_storage/job.py +5 -1
- datachain/data_storage/metastore.py +1252 -186
- datachain/data_storage/schema.py +58 -45
- datachain/data_storage/serializer.py +105 -15
- datachain/data_storage/sqlite.py +286 -127
- datachain/data_storage/warehouse.py +250 -113
- datachain/dataset.py +353 -148
- datachain/delta.py +391 -0
- datachain/diff/__init__.py +27 -29
- datachain/error.py +60 -0
- datachain/func/__init__.py +2 -1
- datachain/func/aggregate.py +66 -42
- datachain/func/array.py +242 -38
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +110 -60
- datachain/func/func.py +96 -45
- datachain/func/numeric.py +55 -38
- datachain/func/path.py +32 -20
- datachain/func/random.py +2 -2
- datachain/func/string.py +67 -37
- datachain/func/window.py +7 -8
- datachain/hash_utils.py +123 -0
- datachain/job.py +11 -7
- datachain/json.py +138 -0
- datachain/lib/arrow.py +58 -22
- datachain/lib/audio.py +245 -0
- datachain/lib/clip.py +14 -13
- datachain/lib/convert/flatten.py +5 -3
- datachain/lib/convert/python_to_sql.py +6 -10
- datachain/lib/convert/sql_to_python.py +8 -0
- datachain/lib/convert/values_to_tuples.py +156 -51
- datachain/lib/data_model.py +42 -20
- datachain/lib/dataset_info.py +36 -8
- datachain/lib/dc/__init__.py +8 -2
- datachain/lib/dc/csv.py +25 -28
- datachain/lib/dc/database.py +398 -0
- datachain/lib/dc/datachain.py +1289 -425
- datachain/lib/dc/datasets.py +320 -38
- datachain/lib/dc/hf.py +38 -24
- datachain/lib/dc/json.py +29 -32
- datachain/lib/dc/listings.py +112 -8
- datachain/lib/dc/pandas.py +16 -12
- datachain/lib/dc/parquet.py +35 -23
- datachain/lib/dc/records.py +31 -23
- datachain/lib/dc/storage.py +154 -64
- datachain/lib/dc/storage_pattern.py +251 -0
- datachain/lib/dc/utils.py +24 -16
- datachain/lib/dc/values.py +8 -9
- datachain/lib/file.py +622 -89
- datachain/lib/hf.py +69 -39
- datachain/lib/image.py +14 -14
- datachain/lib/listing.py +14 -11
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +3 -4
- datachain/lib/model_store.py +39 -7
- datachain/lib/namespaces.py +125 -0
- datachain/lib/projects.py +130 -0
- datachain/lib/pytorch.py +32 -21
- datachain/lib/settings.py +192 -56
- datachain/lib/signal_schema.py +427 -104
- datachain/lib/tar.py +1 -2
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +164 -76
- datachain/lib/udf_signature.py +60 -35
- datachain/lib/utils.py +118 -4
- datachain/lib/video.py +17 -9
- datachain/lib/webdataset.py +61 -56
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +22 -10
- datachain/model/bbox.py +3 -1
- datachain/model/ultralytics/bbox.py +16 -12
- datachain/model/ultralytics/pose.py +16 -12
- datachain/model/ultralytics/segment.py +16 -12
- datachain/namespace.py +84 -0
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +24 -0
- datachain/project.py +78 -0
- datachain/query/batch.py +40 -41
- datachain/query/dataset.py +604 -322
- datachain/query/dispatch.py +261 -154
- datachain/query/metrics.py +4 -6
- datachain/query/params.py +2 -3
- datachain/query/queue.py +3 -12
- datachain/query/schema.py +11 -6
- datachain/query/session.py +200 -33
- datachain/query/udf.py +34 -2
- datachain/remote/studio.py +171 -69
- datachain/script_meta.py +12 -12
- datachain/semver.py +68 -0
- datachain/sql/__init__.py +2 -0
- datachain/sql/functions/array.py +33 -1
- datachain/sql/postgresql_dialect.py +9 -0
- datachain/sql/postgresql_types.py +21 -0
- datachain/sql/sqlite/__init__.py +5 -1
- datachain/sql/sqlite/base.py +102 -29
- datachain/sql/sqlite/types.py +8 -13
- datachain/sql/types.py +70 -15
- datachain/studio.py +223 -46
- datachain/toolkit/split.py +31 -10
- datachain/utils.py +101 -59
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
- datachain-0.39.0.dist-info/RECORD +173 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
- datachain/cli/commands/query.py +0 -53
- datachain/query/utils.py +0 -42
- datachain-0.14.2.dist-info/RECORD +0 -158
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/func/path.py
CHANGED
|
@@ -8,23 +8,26 @@ def parent(col: ColT) -> Func:
|
|
|
8
8
|
Returns the directory component of a posix-style path.
|
|
9
9
|
|
|
10
10
|
Args:
|
|
11
|
-
col (str |
|
|
11
|
+
col (str | Column | Func | literal): String to compute the path parent of.
|
|
12
12
|
If a string is provided, it is assumed to be the name of the column.
|
|
13
|
-
If a
|
|
13
|
+
If a Column is provided, it is assumed to be a column object.
|
|
14
14
|
If a Func is provided, it is assumed to be a function returning a string.
|
|
15
|
+
If a literal is provided, it is assumed to be a string literal.
|
|
15
16
|
|
|
16
17
|
Returns:
|
|
17
|
-
Func: A Func object that represents the path parent function.
|
|
18
|
+
Func: A `Func` object that represents the path parent function.
|
|
18
19
|
|
|
19
20
|
Example:
|
|
20
21
|
```py
|
|
21
22
|
dc.mutate(
|
|
22
|
-
|
|
23
|
+
parent1=func.path.parent("file.path"),
|
|
24
|
+
parent2=func.path.parent(dc.C("file.path")),
|
|
25
|
+
parent3=func.path.parent(dc.func.literal("/path/to/file.txt")),
|
|
23
26
|
)
|
|
24
27
|
```
|
|
25
28
|
|
|
26
29
|
Note:
|
|
27
|
-
-
|
|
30
|
+
- The result column will always be of type string.
|
|
28
31
|
"""
|
|
29
32
|
return Func("parent", inner=path.parent, cols=[col], result_type=str)
|
|
30
33
|
|
|
@@ -34,23 +37,26 @@ def name(col: ColT) -> Func:
|
|
|
34
37
|
Returns the final component of a posix-style path.
|
|
35
38
|
|
|
36
39
|
Args:
|
|
37
|
-
col (str | literal): String to compute the path name of.
|
|
40
|
+
col (str | Column | Func | literal): String to compute the path name of.
|
|
38
41
|
If a string is provided, it is assumed to be the name of the column.
|
|
39
|
-
If a
|
|
42
|
+
If a Column is provided, it is assumed to be a column object.
|
|
40
43
|
If a Func is provided, it is assumed to be a function returning a string.
|
|
44
|
+
If a literal is provided, it is assumed to be a string literal.
|
|
41
45
|
|
|
42
46
|
Returns:
|
|
43
|
-
Func: A Func object that represents the path name function.
|
|
47
|
+
Func: A `Func` object that represents the path name function.
|
|
44
48
|
|
|
45
49
|
Example:
|
|
46
50
|
```py
|
|
47
51
|
dc.mutate(
|
|
48
|
-
|
|
52
|
+
filename1=func.path.name("file.path"),
|
|
53
|
+
filename2=func.path.name(dc.C("file.path")),
|
|
54
|
+
filename3=func.path.name(dc.func.literal("/path/to/file.txt")),
|
|
49
55
|
)
|
|
50
56
|
```
|
|
51
57
|
|
|
52
58
|
Note:
|
|
53
|
-
-
|
|
59
|
+
- The result column will always be of type string.
|
|
54
60
|
"""
|
|
55
61
|
|
|
56
62
|
return Func("name", inner=path.name, cols=[col], result_type=str)
|
|
@@ -61,23 +67,26 @@ def file_stem(col: ColT) -> Func:
|
|
|
61
67
|
Returns the path without the extension.
|
|
62
68
|
|
|
63
69
|
Args:
|
|
64
|
-
col (str | literal): String to compute the file stem of.
|
|
70
|
+
col (str | Column | Func | literal): String to compute the file stem of.
|
|
65
71
|
If a string is provided, it is assumed to be the name of the column.
|
|
66
|
-
If a
|
|
72
|
+
If a Column is provided, it is assumed to be a column object.
|
|
67
73
|
If a Func is provided, it is assumed to be a function returning a string.
|
|
74
|
+
If a literal is provided, it is assumed to be a string literal.
|
|
68
75
|
|
|
69
76
|
Returns:
|
|
70
|
-
Func: A Func object that represents the file stem function.
|
|
77
|
+
Func: A `Func` object that represents the file stem function.
|
|
71
78
|
|
|
72
79
|
Example:
|
|
73
80
|
```py
|
|
74
81
|
dc.mutate(
|
|
75
|
-
|
|
82
|
+
filestem1=func.path.file_stem("file.path"),
|
|
83
|
+
filestem2=func.path.file_stem(dc.C("file.path")),
|
|
84
|
+
filestem3=func.path.file_stem(dc.func.literal("/path/to/file.txt")),
|
|
76
85
|
)
|
|
77
86
|
```
|
|
78
87
|
|
|
79
88
|
Note:
|
|
80
|
-
-
|
|
89
|
+
- The result column will always be of type string.
|
|
81
90
|
"""
|
|
82
91
|
|
|
83
92
|
return Func("file_stem", inner=path.file_stem, cols=[col], result_type=str)
|
|
@@ -88,23 +97,26 @@ def file_ext(col: ColT) -> Func:
|
|
|
88
97
|
Returns the extension of the given path.
|
|
89
98
|
|
|
90
99
|
Args:
|
|
91
|
-
col (str | literal): String to compute the file extension of.
|
|
100
|
+
col (str | Column | Func | literal): String to compute the file extension of.
|
|
92
101
|
If a string is provided, it is assumed to be the name of the column.
|
|
93
|
-
If a
|
|
102
|
+
If a Column is provided, it is assumed to be a column object.
|
|
94
103
|
If a Func is provided, it is assumed to be a function returning a string.
|
|
104
|
+
If a literal is provided, it is assumed to be a string literal.
|
|
95
105
|
|
|
96
106
|
Returns:
|
|
97
|
-
Func: A Func object that represents the file extension function.
|
|
107
|
+
Func: A `Func` object that represents the file extension function.
|
|
98
108
|
|
|
99
109
|
Example:
|
|
100
110
|
```py
|
|
101
111
|
dc.mutate(
|
|
102
|
-
|
|
112
|
+
fileext1=func.path.file_ext("file.path"),
|
|
113
|
+
fileext2=func.path.file_ext(dc.C("file.path")),
|
|
114
|
+
fileext3=func.path.file_ext(dc.func.literal("/path/to/file.txt")),
|
|
103
115
|
)
|
|
104
116
|
```
|
|
105
117
|
|
|
106
118
|
Note:
|
|
107
|
-
-
|
|
119
|
+
- The result column will always be of type string.
|
|
108
120
|
"""
|
|
109
121
|
|
|
110
122
|
return Func("file_ext", inner=path.file_ext, cols=[col], result_type=str)
|
datachain/func/random.py
CHANGED
|
@@ -8,7 +8,7 @@ def rand() -> Func:
|
|
|
8
8
|
Returns the random integer value.
|
|
9
9
|
|
|
10
10
|
Returns:
|
|
11
|
-
Func: A Func object that represents the rand function.
|
|
11
|
+
Func: A `Func` object that represents the rand function.
|
|
12
12
|
|
|
13
13
|
Example:
|
|
14
14
|
```py
|
|
@@ -18,6 +18,6 @@ def rand() -> Func:
|
|
|
18
18
|
```
|
|
19
19
|
|
|
20
20
|
Note:
|
|
21
|
-
-
|
|
21
|
+
- The result column will always be of type integer.
|
|
22
22
|
"""
|
|
23
23
|
return Func("rand", inner=random.rand, result_type=int)
|
datachain/func/string.py
CHANGED
|
@@ -1,64 +1,76 @@
|
|
|
1
|
-
from typing import
|
|
1
|
+
from typing import get_origin
|
|
2
2
|
|
|
3
3
|
from sqlalchemy import literal
|
|
4
4
|
|
|
5
5
|
from datachain.sql.functions import string
|
|
6
6
|
|
|
7
|
-
from .func import Func
|
|
7
|
+
from .func import ColT, Func
|
|
8
8
|
|
|
9
|
+
__all__ = [
|
|
10
|
+
"byte_hamming_distance",
|
|
11
|
+
"length",
|
|
12
|
+
"regexp_replace",
|
|
13
|
+
"replace",
|
|
14
|
+
"split",
|
|
15
|
+
]
|
|
9
16
|
|
|
10
|
-
|
|
17
|
+
|
|
18
|
+
def length(col: ColT) -> Func:
|
|
11
19
|
"""
|
|
12
20
|
Returns the length of the string.
|
|
13
21
|
|
|
14
22
|
Args:
|
|
15
|
-
col (str |
|
|
23
|
+
col (str | Column | Func | literal): String to compute the length of.
|
|
16
24
|
If a string is provided, it is assumed to be the name of the column.
|
|
17
|
-
If a
|
|
25
|
+
If a Column is provided, it is assumed to be a column in the dataset.
|
|
18
26
|
If a Func is provided, it is assumed to be a function returning a string.
|
|
27
|
+
If a literal is provided, it is assumed to be a string literal.
|
|
19
28
|
|
|
20
29
|
Returns:
|
|
21
|
-
Func: A Func object that represents the string length function.
|
|
30
|
+
Func: A `Func` object that represents the string length function.
|
|
22
31
|
|
|
23
32
|
Example:
|
|
24
33
|
```py
|
|
25
34
|
dc.mutate(
|
|
26
35
|
len1=func.string.length("file.path"),
|
|
27
|
-
len2=func.string.length("
|
|
36
|
+
len2=func.string.length(dc.C("file.path")),
|
|
37
|
+
len3=func.string.length(dc.func.literal("Random string")),
|
|
28
38
|
)
|
|
29
39
|
```
|
|
30
40
|
|
|
31
|
-
|
|
32
|
-
-
|
|
41
|
+
Notes:
|
|
42
|
+
- The result column will always be of type int.
|
|
33
43
|
"""
|
|
34
44
|
return Func("length", inner=string.length, cols=[col], result_type=int)
|
|
35
45
|
|
|
36
46
|
|
|
37
|
-
def split(col:
|
|
47
|
+
def split(col: ColT, sep: str, limit: int | None = None) -> Func:
|
|
38
48
|
"""
|
|
39
49
|
Takes a column and split character and returns an array of the parts.
|
|
40
50
|
|
|
41
51
|
Args:
|
|
42
|
-
col (str | literal): Column to split.
|
|
52
|
+
col (str | Column | Func | literal): Column to split.
|
|
43
53
|
If a string is provided, it is assumed to be the name of the column.
|
|
44
|
-
If a
|
|
54
|
+
If a Column is provided, it is assumed to be a column in the dataset.
|
|
45
55
|
If a Func is provided, it is assumed to be a function returning a string.
|
|
56
|
+
If a literal is provided, it is assumed to be a string literal.
|
|
46
57
|
sep (str): Separator to split the string.
|
|
47
58
|
limit (int, optional): Maximum number of splits to perform.
|
|
48
59
|
|
|
49
60
|
Returns:
|
|
50
|
-
Func: A Func object that represents the split function.
|
|
61
|
+
Func: A `Func` object that represents the split function.
|
|
51
62
|
|
|
52
63
|
Example:
|
|
53
64
|
```py
|
|
54
65
|
dc.mutate(
|
|
55
66
|
path_parts=func.string.split("file.path", "/"),
|
|
56
|
-
|
|
67
|
+
signal_values=func.string.split(dc.C("signal.value"), ","),
|
|
68
|
+
str_words=func.string.split(dc.func.literal("Random string"), " "),
|
|
57
69
|
)
|
|
58
70
|
```
|
|
59
71
|
|
|
60
|
-
|
|
61
|
-
-
|
|
72
|
+
Notes:
|
|
73
|
+
- The result column will always be of type array of strings.
|
|
62
74
|
"""
|
|
63
75
|
|
|
64
76
|
def inner(arg):
|
|
@@ -76,30 +88,33 @@ def split(col: Union[str, Func], sep: str, limit: Optional[int] = None) -> Func:
|
|
|
76
88
|
return Func("split", inner=inner, cols=cols, args=args, result_type=list[str])
|
|
77
89
|
|
|
78
90
|
|
|
79
|
-
def replace(col:
|
|
91
|
+
def replace(col: ColT, pattern: str, replacement: str) -> Func:
|
|
80
92
|
"""
|
|
81
93
|
Replaces substring with another string.
|
|
82
94
|
|
|
83
95
|
Args:
|
|
84
|
-
col (str | literal): Column to
|
|
96
|
+
col (str | Column | Func | literal): Column to perform replacement on.
|
|
85
97
|
If a string is provided, it is assumed to be the name of the column.
|
|
86
|
-
If a
|
|
98
|
+
If a Column is provided, it is assumed to be a column in the dataset.
|
|
87
99
|
If a Func is provided, it is assumed to be a function returning a string.
|
|
100
|
+
If a literal is provided, it is assumed to be a string literal.
|
|
88
101
|
pattern (str): Pattern to replace.
|
|
89
102
|
replacement (str): Replacement string.
|
|
90
103
|
|
|
91
104
|
Returns:
|
|
92
|
-
Func: A Func object that represents the replace function.
|
|
105
|
+
Func: A `Func` object that represents the replace function.
|
|
93
106
|
|
|
94
107
|
Example:
|
|
95
108
|
```py
|
|
96
109
|
dc.mutate(
|
|
97
|
-
|
|
110
|
+
s1=func.string.replace("signal.name", "pattern", "replacement"),
|
|
111
|
+
s2=func.string.replace(dc.C("signal.name"), "pattern", "replacement"),
|
|
112
|
+
s3=func.string.replace(dc.func.literal("Random string"), "Random", "New"),
|
|
98
113
|
)
|
|
99
114
|
```
|
|
100
115
|
|
|
101
|
-
|
|
102
|
-
-
|
|
116
|
+
Notes:
|
|
117
|
+
- The result column will always be of type string.
|
|
103
118
|
"""
|
|
104
119
|
|
|
105
120
|
def inner(arg):
|
|
@@ -115,30 +130,37 @@ def replace(col: Union[str, Func], pattern: str, replacement: str) -> Func:
|
|
|
115
130
|
return Func("replace", inner=inner, cols=cols, args=args, result_type=str)
|
|
116
131
|
|
|
117
132
|
|
|
118
|
-
def regexp_replace(col:
|
|
133
|
+
def regexp_replace(col: ColT, regex: str, replacement: str) -> Func:
|
|
119
134
|
r"""
|
|
120
135
|
Replaces substring that match a regular expression.
|
|
121
136
|
|
|
122
137
|
Args:
|
|
123
|
-
col (str | literal): Column to
|
|
138
|
+
col (str | Column | Func | literal): Column to perform replacement on.
|
|
124
139
|
If a string is provided, it is assumed to be the name of the column.
|
|
125
|
-
If a
|
|
140
|
+
If a Column is provided, it is assumed to be a column in the dataset.
|
|
126
141
|
If a Func is provided, it is assumed to be a function returning a string.
|
|
142
|
+
If a literal is provided, it is assumed to be a string literal.
|
|
127
143
|
regex (str): Regular expression pattern to replace.
|
|
128
144
|
replacement (str): Replacement string.
|
|
129
145
|
|
|
130
146
|
Returns:
|
|
131
|
-
Func: A Func object that represents the regexp_replace function.
|
|
147
|
+
Func: A `Func` object that represents the regexp_replace function.
|
|
132
148
|
|
|
133
149
|
Example:
|
|
134
150
|
```py
|
|
135
151
|
dc.mutate(
|
|
136
|
-
|
|
152
|
+
s1=func.string.regexp_replace("signal.name", r"\d+", "X"),
|
|
153
|
+
s2=func.string.regexp_replace(dc.C("signal.name"), r"\d+", "X"),
|
|
154
|
+
s3=func.string.regexp_replace(
|
|
155
|
+
dc.func.literal("Random string"),
|
|
156
|
+
r"\s+",
|
|
157
|
+
"_",
|
|
158
|
+
),
|
|
137
159
|
)
|
|
138
160
|
```
|
|
139
161
|
|
|
140
|
-
|
|
141
|
-
-
|
|
162
|
+
Notes:
|
|
163
|
+
- The result column will always be of type string.
|
|
142
164
|
"""
|
|
143
165
|
|
|
144
166
|
def inner(arg):
|
|
@@ -154,7 +176,7 @@ def regexp_replace(col: Union[str, Func], regex: str, replacement: str) -> Func:
|
|
|
154
176
|
return Func("regexp_replace", inner=inner, cols=cols, args=args, result_type=str)
|
|
155
177
|
|
|
156
178
|
|
|
157
|
-
def byte_hamming_distance(*args:
|
|
179
|
+
def byte_hamming_distance(*args: ColT) -> Func:
|
|
158
180
|
"""
|
|
159
181
|
Computes the Hamming distance between two strings.
|
|
160
182
|
|
|
@@ -164,22 +186,30 @@ def byte_hamming_distance(*args: Union[str, Func]) -> Func:
|
|
|
164
186
|
of the strings indicate higher dissimilarity.
|
|
165
187
|
|
|
166
188
|
Args:
|
|
167
|
-
args (str | literal): Two strings to compute
|
|
168
|
-
|
|
169
|
-
If a
|
|
189
|
+
args (str | Column | Func | literal): Two strings to compute
|
|
190
|
+
the Hamming distance between.
|
|
191
|
+
If a string is provided, it is assumed to be the name of the column.
|
|
192
|
+
If a Column is provided, it is assumed to be a column in the dataset.
|
|
193
|
+
If a Func is provided, it is assumed to be a function returning a string.
|
|
194
|
+
If a literal is provided, it is assumed to be a string literal.
|
|
170
195
|
|
|
171
196
|
Returns:
|
|
172
|
-
Func: A Func object that represents the Hamming distance function.
|
|
197
|
+
Func: A `Func` object that represents the Hamming distance function.
|
|
173
198
|
|
|
174
199
|
Example:
|
|
175
200
|
```py
|
|
176
201
|
dc.mutate(
|
|
177
|
-
|
|
202
|
+
hd1=func.byte_hamming_distance("file.phash", literal("hello")),
|
|
203
|
+
hd2=func.byte_hamming_distance(dc.C("file.phash"), "hello"),
|
|
204
|
+
hd3=func.byte_hamming_distance(
|
|
205
|
+
dc.func.literal("hi"),
|
|
206
|
+
dc.func.literal("hello"),
|
|
207
|
+
),
|
|
178
208
|
)
|
|
179
209
|
```
|
|
180
210
|
|
|
181
211
|
Notes:
|
|
182
|
-
-
|
|
212
|
+
- The result column will always be of type int.
|
|
183
213
|
"""
|
|
184
214
|
cols, func_args = [], []
|
|
185
215
|
for arg in args:
|
datachain/func/window.py
CHANGED
|
@@ -22,17 +22,16 @@ def window(partition_by: str, order_by: str, desc: bool = False) -> Window:
|
|
|
22
22
|
|
|
23
23
|
Args:
|
|
24
24
|
partition_by (str): The column name by which to partition the result set.
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
order_by (str): The column name by which to order the rows
|
|
28
|
-
|
|
29
|
-
|
|
25
|
+
Rows with the same value in the partition column will be grouped together
|
|
26
|
+
for the window function.
|
|
27
|
+
order_by (str): The column name by which to order the rows within
|
|
28
|
+
each partition. This determines the sequence in which the window function
|
|
29
|
+
is applied.
|
|
30
30
|
desc (bool, optional): If True, the rows will be ordered in descending order.
|
|
31
|
-
|
|
32
|
-
in ascending order.
|
|
31
|
+
Defaults to False, which orders the rows in ascending order.
|
|
33
32
|
|
|
34
33
|
Returns:
|
|
35
|
-
Window: A Window object representing the window specification.
|
|
34
|
+
Window: A `Window` object representing the window specification.
|
|
36
35
|
|
|
37
36
|
Example:
|
|
38
37
|
```py
|
datachain/hash_utils.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import inspect
|
|
3
|
+
import textwrap
|
|
4
|
+
from collections.abc import Sequence
|
|
5
|
+
from typing import TypeAlias, TypeVar
|
|
6
|
+
|
|
7
|
+
from sqlalchemy.sql.elements import ClauseElement, ColumnElement
|
|
8
|
+
|
|
9
|
+
from datachain import json
|
|
10
|
+
|
|
11
|
+
T = TypeVar("T", bound=ColumnElement)
|
|
12
|
+
ColumnLike: TypeAlias = str | T
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _serialize_value(val): # noqa: PLR0911
|
|
16
|
+
"""Helper to serialize arbitrary values recursively."""
|
|
17
|
+
if val is None:
|
|
18
|
+
return None
|
|
19
|
+
if isinstance(val, (str, int, float, bool)):
|
|
20
|
+
return val
|
|
21
|
+
if isinstance(val, ClauseElement):
|
|
22
|
+
return serialize_column_element(val)
|
|
23
|
+
if isinstance(val, dict):
|
|
24
|
+
# Sort dict keys for deterministic serialization
|
|
25
|
+
return {k: _serialize_value(v) for k, v in sorted(val.items())}
|
|
26
|
+
if isinstance(val, (list, tuple)):
|
|
27
|
+
return [_serialize_value(v) for v in val]
|
|
28
|
+
if callable(val):
|
|
29
|
+
return val.__name__ if hasattr(val, "__name__") else str(val)
|
|
30
|
+
return str(val)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def serialize_column_element(expr: str | ColumnElement) -> dict:
    """
    Turn a SQLAlchemy expression into a deterministic, nested dict.

    The attributes listed in the expression's ``_traverse_internals`` are
    walked and each value is serialized recursively, so no per-expression-type
    handling is needed here.

    Args:
        expr: Expression (or plain string) to serialize.

    Returns:
        dict: Always contains a ``"type"`` key; the remaining keys depend on
        which branch handled ``expr`` (``"value"``, ``"repr"``, or one key per
        traversed attribute).
    """
    from sqlalchemy.sql.elements import BindParameter

    # Bind parameters: only the value is kept, because their 'key' attribute
    # is non-deterministic.
    if isinstance(expr, BindParameter):
        return {"type": "bind", "value": _serialize_value(expr.value)}

    # Anything that is not a clause element (e.g. a plain string) is stringified.
    if not isinstance(expr, ClauseElement):
        return {"type": "other", "repr": str(expr)}

    type_name = expr.__class__.__name__

    # Custom clause elements without traversal metadata have unknown structure,
    # so their string form is the best available representation.
    if not hasattr(expr, "_traverse_internals"):
        return {"type": type_name, "repr": str(expr)}

    serialized = {"type": type_name}
    for attr_name, _ in expr._traverse_internals:
        # 'table' is skipped: table names can be auto-generated/random and are
        # not semantically important for hashing.
        if attr_name == "table" or not hasattr(expr, attr_name):
            continue
        serialized[attr_name] = _serialize_value(getattr(expr, attr_name))
    return serialized
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def hash_column_elements(columns: ColumnLike | Sequence[ColumnLike]) -> str:
    """
    Compute a SHA-256 hex digest for one column expression or a sequence of them.

    Every element is serialized with ``serialize_column_element`` and the
    resulting list is dumped as JSON with sorted keys before hashing, so the
    digest does not depend on any SQL dialect. Element order is significant:
    pass an ordered iterable such as a list or tuple.
    """
    # A lone column (or column name) is treated as a one-element sequence.
    items = (columns,) if isinstance(columns, (ColumnElement, str)) else columns

    payload = json.dumps(
        list(map(serialize_column_element, items)),
        sort_keys=True,
        separators=(", ", ": "),
    )  # stable JSON
    digest = hashlib.sha256(payload.encode("utf-8"))
    return digest.hexdigest()
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def hash_callable(func):
    """
    Calculate a SHA-256 hex digest from a callable.

    Rules:
    - Named functions (def) → use source code for stable, cross-version hashing
    - Lambdas → use bytecode (deterministic in same Python runtime)
    - ``functools.partial`` → hash the wrapped callable and add the bound
      positional/keyword arguments to the digest
    - Callable instances → hash the ``__call__`` function defined on their class
    - Callables with neither source nor bytecode (e.g. builtins) → hash their
      ``module.qualname``

    Digests for plain functions, methods and lambdas are the same as before
    the extra cases were added; the extra cases previously raised
    ``AttributeError``.

    Args:
        func: Any callable.

    Returns:
        str: Hex digest of the SHA-256 hash.

    Raises:
        TypeError: If ``func`` is not callable.
    """
    if not callable(func):
        raise TypeError("Expected a callable")

    from functools import partial

    # Unwrap partials: they carry no __name__/__code__ of their own, but the
    # bound arguments must still influence the digest.
    target = func
    bound = None
    if isinstance(target, partial):
        bound = (target.args, sorted(target.keywords.items()))
        target = target.func

    # Callable instances have no __name__; use their class name instead.
    name = getattr(target, "__name__", type(target).__qualname__)

    # For callable instances, the code to hash lives on the class's __call__.
    # Classes themselves are left alone so a metaclass __call__ is not hashed
    # in place of the class.
    if not hasattr(target, "__code__") and not isinstance(target, type):
        call = getattr(type(target), "__call__", None)
        if hasattr(call, "__code__"):
            target = call

    payload = None
    if name != "<lambda>":
        # Try to get exact source of the named callable
        try:
            lines, _ = inspect.getsourcelines(target)
            payload = textwrap.dedent("".join(lines)).strip()
        except (OSError, TypeError):
            payload = None

    if payload is None:
        # Lambdas, or source not available: fall back to bytecode
        code = getattr(target, "__code__", None)
        if code is not None:
            payload = code.co_code
        else:
            # No source and no bytecode (e.g. builtins): identify by name
            module = getattr(target, "__module__", None)
            qualname = getattr(target, "__qualname__", name)
            payload = f"{module}.{qualname}"

    # Normalize annotations
    raw_annotations = getattr(target, "__annotations__", None) or {}
    annotations = {
        k: getattr(v, "__name__", str(v)) for k, v in raw_annotations.items()
    }

    # Extras to distinguish functions with same code but different metadata
    extras = {
        "name": name,
        "defaults": getattr(target, "__defaults__", None),
        "annotations": annotations,
    }
    if bound is not None:
        # Only added for partials so that other digests stay unchanged.
        extras["partial"] = bound

    # Compute SHA256
    h = hashlib.sha256()
    h.update(payload.encode() if isinstance(payload, str) else payload)
    h.update(str(extras).encode())
    return h.hexdigest()
|
datachain/job.py
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
import json
|
|
2
1
|
import uuid
|
|
3
2
|
from dataclasses import dataclass
|
|
4
3
|
from datetime import datetime
|
|
5
|
-
from typing import Any,
|
|
4
|
+
from typing import Any, TypeVar
|
|
5
|
+
|
|
6
|
+
from datachain import json
|
|
6
7
|
|
|
7
8
|
J = TypeVar("J", bound="Job")
|
|
8
9
|
|
|
@@ -18,27 +19,29 @@ class Job:
|
|
|
18
19
|
workers: int
|
|
19
20
|
params: dict[str, str]
|
|
20
21
|
metrics: dict[str, Any]
|
|
21
|
-
finished_at:
|
|
22
|
-
python_version:
|
|
22
|
+
finished_at: datetime | None = None
|
|
23
|
+
python_version: str | None = None
|
|
23
24
|
error_message: str = ""
|
|
24
25
|
error_stack: str = ""
|
|
26
|
+
parent_job_id: str | None = None
|
|
25
27
|
|
|
26
28
|
@classmethod
|
|
27
29
|
def parse(
|
|
28
30
|
cls,
|
|
29
|
-
id:
|
|
31
|
+
id: str | uuid.UUID,
|
|
30
32
|
name: str,
|
|
31
33
|
status: int,
|
|
32
34
|
created_at: datetime,
|
|
33
|
-
finished_at:
|
|
35
|
+
finished_at: datetime | None,
|
|
34
36
|
query: str,
|
|
35
37
|
query_type: int,
|
|
36
38
|
workers: int,
|
|
37
|
-
python_version:
|
|
39
|
+
python_version: str | None,
|
|
38
40
|
error_message: str,
|
|
39
41
|
error_stack: str,
|
|
40
42
|
params: str,
|
|
41
43
|
metrics: str,
|
|
44
|
+
parent_job_id: str | None,
|
|
42
45
|
) -> "Job":
|
|
43
46
|
return cls(
|
|
44
47
|
str(id),
|
|
@@ -54,4 +57,5 @@ class Job:
|
|
|
54
57
|
python_version,
|
|
55
58
|
error_message,
|
|
56
59
|
error_stack,
|
|
60
|
+
str(parent_job_id) if parent_job_id else None,
|
|
57
61
|
)
|