datachain-0.7.1-py3-none-any.whl → datachain-0.7.3-py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/__init__.py +0 -2
- datachain/catalog/catalog.py +12 -9
- datachain/cli.py +109 -9
- datachain/client/fsspec.py +9 -9
- datachain/data_storage/metastore.py +63 -11
- datachain/data_storage/schema.py +2 -2
- datachain/data_storage/sqlite.py +5 -4
- datachain/data_storage/warehouse.py +18 -18
- datachain/dataset.py +142 -14
- datachain/func/__init__.py +49 -0
- datachain/{lib/func → func}/aggregate.py +13 -11
- datachain/func/array.py +176 -0
- datachain/func/base.py +23 -0
- datachain/func/conditional.py +81 -0
- datachain/func/func.py +384 -0
- datachain/func/path.py +110 -0
- datachain/func/random.py +23 -0
- datachain/func/string.py +154 -0
- datachain/func/window.py +49 -0
- datachain/lib/arrow.py +24 -12
- datachain/lib/data_model.py +25 -9
- datachain/lib/dataset_info.py +9 -5
- datachain/lib/dc.py +94 -56
- datachain/lib/hf.py +1 -1
- datachain/lib/signal_schema.py +1 -1
- datachain/lib/utils.py +1 -0
- datachain/lib/webdataset_laion.py +5 -5
- datachain/model/bbox.py +2 -2
- datachain/model/pose.py +5 -5
- datachain/model/segment.py +2 -2
- datachain/nodes_fetcher.py +2 -2
- datachain/query/dataset.py +57 -34
- datachain/remote/studio.py +40 -8
- datachain/sql/__init__.py +0 -2
- datachain/sql/functions/__init__.py +0 -26
- datachain/sql/selectable.py +11 -5
- datachain/sql/sqlite/base.py +11 -2
- datachain/studio.py +29 -0
- {datachain-0.7.1.dist-info → datachain-0.7.3.dist-info}/METADATA +2 -2
- {datachain-0.7.1.dist-info → datachain-0.7.3.dist-info}/RECORD +44 -37
- datachain/lib/func/__init__.py +0 -32
- datachain/lib/func/func.py +0 -152
- {datachain-0.7.1.dist-info → datachain-0.7.3.dist-info}/LICENSE +0 -0
- {datachain-0.7.1.dist-info → datachain-0.7.3.dist-info}/WHEEL +0 -0
- {datachain-0.7.1.dist-info → datachain-0.7.3.dist-info}/entry_points.txt +0 -0
- {datachain-0.7.1.dist-info → datachain-0.7.3.dist-info}/top_level.txt +0 -0
datachain/lib/func/func.py
DELETED
|
@@ -1,152 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
from typing import TYPE_CHECKING, Callable, Optional
|
|
3
|
-
|
|
4
|
-
from sqlalchemy import desc
|
|
5
|
-
|
|
6
|
-
from datachain.lib.convert.python_to_sql import python_to_sql
|
|
7
|
-
from datachain.lib.utils import DataChainColumnError, DataChainParamsError
|
|
8
|
-
from datachain.query.schema import Column, ColumnMeta
|
|
9
|
-
|
|
10
|
-
if TYPE_CHECKING:
|
|
11
|
-
from datachain import DataType
|
|
12
|
-
from datachain.lib.signal_schema import SignalSchema
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
@dataclass
|
|
16
|
-
class Window:
|
|
17
|
-
"""Represents a window specification for SQL window functions."""
|
|
18
|
-
|
|
19
|
-
partition_by: str
|
|
20
|
-
order_by: str
|
|
21
|
-
desc: bool = False
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
def window(partition_by: str, order_by: str, desc: bool = False) -> Window:
|
|
25
|
-
"""
|
|
26
|
-
Defines a window specification for SQL window functions.
|
|
27
|
-
|
|
28
|
-
The `window` function specifies how to partition and order the result set
|
|
29
|
-
for the associated window function. It is used to define the scope of the rows
|
|
30
|
-
that the window function will operate on.
|
|
31
|
-
|
|
32
|
-
Args:
|
|
33
|
-
partition_by (str): The column name by which to partition the result set.
|
|
34
|
-
Rows with the same value in the partition column
|
|
35
|
-
will be grouped together for the window function.
|
|
36
|
-
order_by (str): The column name by which to order the rows
|
|
37
|
-
within each partition. This determines the sequence in which
|
|
38
|
-
the window function is applied.
|
|
39
|
-
desc (bool, optional): If True, the rows will be ordered in descending order.
|
|
40
|
-
Defaults to False, which orders the rows
|
|
41
|
-
in ascending order.
|
|
42
|
-
|
|
43
|
-
Returns:
|
|
44
|
-
Window: A Window object representing the window specification.
|
|
45
|
-
|
|
46
|
-
Example:
|
|
47
|
-
```py
|
|
48
|
-
window = func.window(partition_by="signal.category", order_by="created_at")
|
|
49
|
-
dc.mutate(
|
|
50
|
-
row_number=func.row_number().over(window),
|
|
51
|
-
)
|
|
52
|
-
```
|
|
53
|
-
"""
|
|
54
|
-
return Window(
|
|
55
|
-
ColumnMeta.to_db_name(partition_by),
|
|
56
|
-
ColumnMeta.to_db_name(order_by),
|
|
57
|
-
desc,
|
|
58
|
-
)
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
class Func:
|
|
62
|
-
"""Represents a function to be applied to a column in a SQL query."""
|
|
63
|
-
|
|
64
|
-
def __init__(
|
|
65
|
-
self,
|
|
66
|
-
name: str,
|
|
67
|
-
inner: Callable,
|
|
68
|
-
col: Optional[str] = None,
|
|
69
|
-
result_type: Optional["DataType"] = None,
|
|
70
|
-
is_array: bool = False,
|
|
71
|
-
is_window: bool = False,
|
|
72
|
-
window: Optional[Window] = None,
|
|
73
|
-
) -> None:
|
|
74
|
-
self.name = name
|
|
75
|
-
self.inner = inner
|
|
76
|
-
self.col = col
|
|
77
|
-
self.result_type = result_type
|
|
78
|
-
self.is_array = is_array
|
|
79
|
-
self.is_window = is_window
|
|
80
|
-
self.window = window
|
|
81
|
-
|
|
82
|
-
def __str__(self) -> str:
|
|
83
|
-
return self.name + "()"
|
|
84
|
-
|
|
85
|
-
def over(self, window: Window) -> "Func":
|
|
86
|
-
if not self.is_window:
|
|
87
|
-
raise DataChainParamsError(f"{self} doesn't support window (over())")
|
|
88
|
-
|
|
89
|
-
return Func(
|
|
90
|
-
"over",
|
|
91
|
-
self.inner,
|
|
92
|
-
self.col,
|
|
93
|
-
self.result_type,
|
|
94
|
-
self.is_array,
|
|
95
|
-
self.is_window,
|
|
96
|
-
window,
|
|
97
|
-
)
|
|
98
|
-
|
|
99
|
-
@property
|
|
100
|
-
def db_col(self) -> Optional[str]:
|
|
101
|
-
return ColumnMeta.to_db_name(self.col) if self.col else None
|
|
102
|
-
|
|
103
|
-
def db_col_type(self, signals_schema: "SignalSchema") -> Optional["DataType"]:
|
|
104
|
-
if not self.db_col:
|
|
105
|
-
return None
|
|
106
|
-
col_type: type = signals_schema.get_column_type(self.db_col)
|
|
107
|
-
return list[col_type] if self.is_array else col_type # type: ignore[valid-type]
|
|
108
|
-
|
|
109
|
-
def get_result_type(self, signals_schema: "SignalSchema") -> "DataType":
|
|
110
|
-
if self.result_type:
|
|
111
|
-
return self.result_type
|
|
112
|
-
|
|
113
|
-
if col_type := self.db_col_type(signals_schema):
|
|
114
|
-
return col_type
|
|
115
|
-
|
|
116
|
-
raise DataChainColumnError(
|
|
117
|
-
str(self),
|
|
118
|
-
"Column name is required to infer result type",
|
|
119
|
-
)
|
|
120
|
-
|
|
121
|
-
def get_column(
|
|
122
|
-
self, signals_schema: "SignalSchema", label: Optional[str] = None
|
|
123
|
-
) -> Column:
|
|
124
|
-
col_type = self.get_result_type(signals_schema)
|
|
125
|
-
sql_type = python_to_sql(col_type)
|
|
126
|
-
|
|
127
|
-
if self.col:
|
|
128
|
-
col = Column(self.db_col, sql_type)
|
|
129
|
-
func_col = self.inner(col)
|
|
130
|
-
else:
|
|
131
|
-
func_col = self.inner()
|
|
132
|
-
|
|
133
|
-
if self.is_window:
|
|
134
|
-
if not self.window:
|
|
135
|
-
raise DataChainParamsError(
|
|
136
|
-
f"Window function {self} requires over() clause with a window spec",
|
|
137
|
-
)
|
|
138
|
-
func_col = func_col.over(
|
|
139
|
-
partition_by=self.window.partition_by,
|
|
140
|
-
order_by=(
|
|
141
|
-
desc(self.window.order_by)
|
|
142
|
-
if self.window.desc
|
|
143
|
-
else self.window.order_by
|
|
144
|
-
),
|
|
145
|
-
)
|
|
146
|
-
|
|
147
|
-
func_col.type = sql_type
|
|
148
|
-
|
|
149
|
-
if label:
|
|
150
|
-
func_col = func_col.label(label)
|
|
151
|
-
|
|
152
|
-
return func_col
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|