datachain 0.7.1__py3-none-any.whl → 0.7.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (46)
  1. datachain/__init__.py +0 -2
  2. datachain/catalog/catalog.py +12 -9
  3. datachain/cli.py +109 -9
  4. datachain/client/fsspec.py +9 -9
  5. datachain/data_storage/metastore.py +63 -11
  6. datachain/data_storage/schema.py +2 -2
  7. datachain/data_storage/sqlite.py +5 -4
  8. datachain/data_storage/warehouse.py +18 -18
  9. datachain/dataset.py +142 -14
  10. datachain/func/__init__.py +49 -0
  11. datachain/{lib/func → func}/aggregate.py +13 -11
  12. datachain/func/array.py +176 -0
  13. datachain/func/base.py +23 -0
  14. datachain/func/conditional.py +81 -0
  15. datachain/func/func.py +384 -0
  16. datachain/func/path.py +110 -0
  17. datachain/func/random.py +23 -0
  18. datachain/func/string.py +154 -0
  19. datachain/func/window.py +49 -0
  20. datachain/lib/arrow.py +24 -12
  21. datachain/lib/data_model.py +25 -9
  22. datachain/lib/dataset_info.py +9 -5
  23. datachain/lib/dc.py +94 -56
  24. datachain/lib/hf.py +1 -1
  25. datachain/lib/signal_schema.py +1 -1
  26. datachain/lib/utils.py +1 -0
  27. datachain/lib/webdataset_laion.py +5 -5
  28. datachain/model/bbox.py +2 -2
  29. datachain/model/pose.py +5 -5
  30. datachain/model/segment.py +2 -2
  31. datachain/nodes_fetcher.py +2 -2
  32. datachain/query/dataset.py +57 -34
  33. datachain/remote/studio.py +40 -8
  34. datachain/sql/__init__.py +0 -2
  35. datachain/sql/functions/__init__.py +0 -26
  36. datachain/sql/selectable.py +11 -5
  37. datachain/sql/sqlite/base.py +11 -2
  38. datachain/studio.py +29 -0
  39. {datachain-0.7.1.dist-info → datachain-0.7.3.dist-info}/METADATA +2 -2
  40. {datachain-0.7.1.dist-info → datachain-0.7.3.dist-info}/RECORD +44 -37
  41. datachain/lib/func/__init__.py +0 -32
  42. datachain/lib/func/func.py +0 -152
  43. {datachain-0.7.1.dist-info → datachain-0.7.3.dist-info}/LICENSE +0 -0
  44. {datachain-0.7.1.dist-info → datachain-0.7.3.dist-info}/WHEEL +0 -0
  45. {datachain-0.7.1.dist-info → datachain-0.7.3.dist-info}/entry_points.txt +0 -0
  46. {datachain-0.7.1.dist-info → datachain-0.7.3.dist-info}/top_level.txt +0 -0
datachain/lib/func/func.py (file removed, +0 -152)
@@ -1,152 +0,0 @@
1
- from dataclasses import dataclass
2
- from typing import TYPE_CHECKING, Callable, Optional
3
-
4
- from sqlalchemy import desc
5
-
6
- from datachain.lib.convert.python_to_sql import python_to_sql
7
- from datachain.lib.utils import DataChainColumnError, DataChainParamsError
8
- from datachain.query.schema import Column, ColumnMeta
9
-
10
- if TYPE_CHECKING:
11
- from datachain import DataType
12
- from datachain.lib.signal_schema import SignalSchema
13
-
14
-
15
- @dataclass
16
- class Window:
17
- """Represents a window specification for SQL window functions."""
18
-
19
- partition_by: str
20
- order_by: str
21
- desc: bool = False
22
-
23
-
24
- def window(partition_by: str, order_by: str, desc: bool = False) -> Window:
25
- """
26
- Defines a window specification for SQL window functions.
27
-
28
- The `window` function specifies how to partition and order the result set
29
- for the associated window function. It is used to define the scope of the rows
30
- that the window function will operate on.
31
-
32
- Args:
33
- partition_by (str): The column name by which to partition the result set.
34
- Rows with the same value in the partition column
35
- will be grouped together for the window function.
36
- order_by (str): The column name by which to order the rows
37
- within each partition. This determines the sequence in which
38
- the window function is applied.
39
- desc (bool, optional): If True, the rows will be ordered in descending order.
40
- Defaults to False, which orders the rows
41
- in ascending order.
42
-
43
- Returns:
44
- Window: A Window object representing the window specification.
45
-
46
- Example:
47
- ```py
48
- window = func.window(partition_by="signal.category", order_by="created_at")
49
- dc.mutate(
50
- row_number=func.row_number().over(window),
51
- )
52
- ```
53
- """
54
- return Window(
55
- ColumnMeta.to_db_name(partition_by),
56
- ColumnMeta.to_db_name(order_by),
57
- desc,
58
- )
59
-
60
-
61
- class Func:
62
- """Represents a function to be applied to a column in a SQL query."""
63
-
64
- def __init__(
65
- self,
66
- name: str,
67
- inner: Callable,
68
- col: Optional[str] = None,
69
- result_type: Optional["DataType"] = None,
70
- is_array: bool = False,
71
- is_window: bool = False,
72
- window: Optional[Window] = None,
73
- ) -> None:
74
- self.name = name
75
- self.inner = inner
76
- self.col = col
77
- self.result_type = result_type
78
- self.is_array = is_array
79
- self.is_window = is_window
80
- self.window = window
81
-
82
- def __str__(self) -> str:
83
- return self.name + "()"
84
-
85
- def over(self, window: Window) -> "Func":
86
- if not self.is_window:
87
- raise DataChainParamsError(f"{self} doesn't support window (over())")
88
-
89
- return Func(
90
- "over",
91
- self.inner,
92
- self.col,
93
- self.result_type,
94
- self.is_array,
95
- self.is_window,
96
- window,
97
- )
98
-
99
- @property
100
- def db_col(self) -> Optional[str]:
101
- return ColumnMeta.to_db_name(self.col) if self.col else None
102
-
103
- def db_col_type(self, signals_schema: "SignalSchema") -> Optional["DataType"]:
104
- if not self.db_col:
105
- return None
106
- col_type: type = signals_schema.get_column_type(self.db_col)
107
- return list[col_type] if self.is_array else col_type # type: ignore[valid-type]
108
-
109
- def get_result_type(self, signals_schema: "SignalSchema") -> "DataType":
110
- if self.result_type:
111
- return self.result_type
112
-
113
- if col_type := self.db_col_type(signals_schema):
114
- return col_type
115
-
116
- raise DataChainColumnError(
117
- str(self),
118
- "Column name is required to infer result type",
119
- )
120
-
121
- def get_column(
122
- self, signals_schema: "SignalSchema", label: Optional[str] = None
123
- ) -> Column:
124
- col_type = self.get_result_type(signals_schema)
125
- sql_type = python_to_sql(col_type)
126
-
127
- if self.col:
128
- col = Column(self.db_col, sql_type)
129
- func_col = self.inner(col)
130
- else:
131
- func_col = self.inner()
132
-
133
- if self.is_window:
134
- if not self.window:
135
- raise DataChainParamsError(
136
- f"Window function {self} requires over() clause with a window spec",
137
- )
138
- func_col = func_col.over(
139
- partition_by=self.window.partition_by,
140
- order_by=(
141
- desc(self.window.order_by)
142
- if self.window.desc
143
- else self.window.order_by
144
- ),
145
- )
146
-
147
- func_col.type = sql_type
148
-
149
- if label:
150
- func_col = func_col.label(label)
151
-
152
- return func_col