maxframe 0.1.0b4__cp311-cp311-win_amd64.whl → 1.0.0__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of maxframe might be problematic. Click here for more details.

Files changed (214)
  1. maxframe/__init__.py +1 -0
  2. maxframe/_utils.cp311-win_amd64.pyd +0 -0
  3. maxframe/codegen.py +56 -5
  4. maxframe/config/config.py +78 -10
  5. maxframe/config/validators.py +42 -11
  6. maxframe/conftest.py +58 -14
  7. maxframe/core/__init__.py +2 -16
  8. maxframe/core/entity/__init__.py +1 -12
  9. maxframe/core/entity/executable.py +1 -1
  10. maxframe/core/entity/objects.py +46 -45
  11. maxframe/core/entity/output_types.py +0 -3
  12. maxframe/core/entity/tests/test_objects.py +43 -0
  13. maxframe/core/entity/tileables.py +5 -78
  14. maxframe/core/graph/__init__.py +2 -2
  15. maxframe/core/graph/builder/__init__.py +0 -1
  16. maxframe/core/graph/builder/base.py +5 -4
  17. maxframe/core/graph/builder/tileable.py +4 -4
  18. maxframe/core/graph/builder/utils.py +4 -8
  19. maxframe/core/graph/core.cp311-win_amd64.pyd +0 -0
  20. maxframe/core/graph/core.pyx +4 -4
  21. maxframe/core/graph/entity.py +9 -33
  22. maxframe/core/operator/__init__.py +2 -9
  23. maxframe/core/operator/base.py +3 -5
  24. maxframe/core/operator/objects.py +0 -9
  25. maxframe/core/operator/utils.py +55 -0
  26. maxframe/dataframe/__init__.py +2 -1
  27. maxframe/dataframe/arithmetic/around.py +5 -17
  28. maxframe/dataframe/arithmetic/core.py +15 -7
  29. maxframe/dataframe/arithmetic/docstring.py +7 -33
  30. maxframe/dataframe/arithmetic/equal.py +4 -2
  31. maxframe/dataframe/arithmetic/greater.py +4 -2
  32. maxframe/dataframe/arithmetic/greater_equal.py +4 -2
  33. maxframe/dataframe/arithmetic/less.py +2 -2
  34. maxframe/dataframe/arithmetic/less_equal.py +4 -2
  35. maxframe/dataframe/arithmetic/not_equal.py +4 -2
  36. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +39 -16
  37. maxframe/dataframe/core.py +58 -12
  38. maxframe/dataframe/datasource/date_range.py +2 -2
  39. maxframe/dataframe/datasource/read_odps_query.py +120 -24
  40. maxframe/dataframe/datasource/read_odps_table.py +9 -4
  41. maxframe/dataframe/datasource/tests/test_datasource.py +103 -8
  42. maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
  43. maxframe/dataframe/datastore/to_odps.py +28 -0
  44. maxframe/dataframe/extensions/__init__.py +5 -0
  45. maxframe/dataframe/extensions/flatjson.py +131 -0
  46. maxframe/dataframe/extensions/flatmap.py +317 -0
  47. maxframe/dataframe/extensions/reshuffle.py +1 -1
  48. maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
  49. maxframe/dataframe/groupby/core.py +1 -1
  50. maxframe/dataframe/groupby/cum.py +0 -1
  51. maxframe/dataframe/groupby/fill.py +4 -1
  52. maxframe/dataframe/groupby/getitem.py +6 -0
  53. maxframe/dataframe/groupby/tests/test_groupby.py +5 -1
  54. maxframe/dataframe/groupby/transform.py +5 -1
  55. maxframe/dataframe/indexing/align.py +1 -1
  56. maxframe/dataframe/indexing/loc.py +6 -4
  57. maxframe/dataframe/indexing/rename.py +5 -28
  58. maxframe/dataframe/indexing/sample.py +0 -1
  59. maxframe/dataframe/indexing/set_index.py +68 -1
  60. maxframe/dataframe/initializer.py +11 -1
  61. maxframe/dataframe/merge/__init__.py +9 -1
  62. maxframe/dataframe/merge/concat.py +41 -31
  63. maxframe/dataframe/merge/merge.py +237 -3
  64. maxframe/dataframe/merge/tests/test_merge.py +126 -1
  65. maxframe/dataframe/misc/__init__.py +4 -0
  66. maxframe/dataframe/misc/apply.py +6 -11
  67. maxframe/dataframe/misc/case_when.py +141 -0
  68. maxframe/dataframe/misc/describe.py +2 -2
  69. maxframe/dataframe/misc/drop_duplicates.py +8 -8
  70. maxframe/dataframe/misc/eval.py +4 -0
  71. maxframe/dataframe/misc/memory_usage.py +2 -2
  72. maxframe/dataframe/misc/pct_change.py +1 -83
  73. maxframe/dataframe/misc/pivot_table.py +262 -0
  74. maxframe/dataframe/misc/tests/test_misc.py +93 -1
  75. maxframe/dataframe/misc/transform.py +1 -30
  76. maxframe/dataframe/misc/value_counts.py +4 -17
  77. maxframe/dataframe/missing/dropna.py +1 -1
  78. maxframe/dataframe/missing/fillna.py +5 -5
  79. maxframe/dataframe/operators.py +1 -17
  80. maxframe/dataframe/plotting/core.py +2 -2
  81. maxframe/dataframe/reduction/core.py +4 -3
  82. maxframe/dataframe/reduction/tests/test_reduction.py +2 -4
  83. maxframe/dataframe/sort/sort_values.py +1 -11
  84. maxframe/dataframe/statistics/corr.py +3 -3
  85. maxframe/dataframe/statistics/quantile.py +13 -19
  86. maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
  87. maxframe/dataframe/tests/test_initializer.py +33 -2
  88. maxframe/dataframe/utils.py +33 -11
  89. maxframe/dataframe/window/expanding.py +5 -3
  90. maxframe/dataframe/window/tests/test_expanding.py +2 -2
  91. maxframe/errors.py +13 -0
  92. maxframe/extension.py +12 -0
  93. maxframe/io/__init__.py +13 -0
  94. maxframe/io/objects/__init__.py +24 -0
  95. maxframe/io/objects/core.py +140 -0
  96. maxframe/io/objects/tensor.py +76 -0
  97. maxframe/io/objects/tests/__init__.py +13 -0
  98. maxframe/io/objects/tests/test_object_io.py +97 -0
  99. maxframe/{odpsio → io/odpsio}/__init__.py +3 -1
  100. maxframe/{odpsio → io/odpsio}/arrow.py +43 -12
  101. maxframe/{odpsio → io/odpsio}/schema.py +38 -16
  102. maxframe/io/odpsio/tableio.py +719 -0
  103. maxframe/io/odpsio/tests/__init__.py +13 -0
  104. maxframe/{odpsio → io/odpsio}/tests/test_schema.py +75 -33
  105. maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +50 -23
  106. maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
  107. maxframe/io/odpsio/volumeio.py +63 -0
  108. maxframe/learn/contrib/__init__.py +3 -1
  109. maxframe/learn/contrib/graph/__init__.py +15 -0
  110. maxframe/learn/contrib/graph/connected_components.py +215 -0
  111. maxframe/learn/contrib/graph/tests/__init__.py +13 -0
  112. maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
  113. maxframe/learn/contrib/llm/__init__.py +16 -0
  114. maxframe/learn/contrib/llm/core.py +54 -0
  115. maxframe/learn/contrib/llm/models/__init__.py +14 -0
  116. maxframe/learn/contrib/llm/models/dashscope.py +73 -0
  117. maxframe/learn/contrib/llm/multi_modal.py +42 -0
  118. maxframe/learn/contrib/llm/text.py +42 -0
  119. maxframe/learn/contrib/utils.py +52 -0
  120. maxframe/learn/contrib/xgboost/__init__.py +26 -0
  121. maxframe/learn/contrib/xgboost/classifier.py +110 -0
  122. maxframe/learn/contrib/xgboost/core.py +241 -0
  123. maxframe/learn/contrib/xgboost/dmatrix.py +147 -0
  124. maxframe/learn/contrib/xgboost/predict.py +121 -0
  125. maxframe/learn/contrib/xgboost/regressor.py +71 -0
  126. maxframe/learn/contrib/xgboost/tests/__init__.py +13 -0
  127. maxframe/learn/contrib/xgboost/tests/test_core.py +43 -0
  128. maxframe/learn/contrib/xgboost/train.py +132 -0
  129. maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
  130. maxframe/learn/utils/__init__.py +15 -0
  131. maxframe/learn/utils/core.py +29 -0
  132. maxframe/lib/mmh3.cp311-win_amd64.pyd +0 -0
  133. maxframe/lib/mmh3.pyi +43 -0
  134. maxframe/lib/sparse/tests/test_sparse.py +15 -15
  135. maxframe/lib/wrapped_pickle.py +2 -1
  136. maxframe/opcodes.py +11 -0
  137. maxframe/protocol.py +154 -27
  138. maxframe/remote/core.py +4 -8
  139. maxframe/serialization/__init__.py +1 -0
  140. maxframe/serialization/core.cp311-win_amd64.pyd +0 -0
  141. maxframe/serialization/core.pxd +3 -0
  142. maxframe/serialization/core.pyi +64 -0
  143. maxframe/serialization/core.pyx +67 -26
  144. maxframe/serialization/exception.py +1 -1
  145. maxframe/serialization/pandas.py +52 -17
  146. maxframe/serialization/serializables/core.py +180 -15
  147. maxframe/serialization/serializables/field_type.py +4 -1
  148. maxframe/serialization/serializables/tests/test_serializable.py +54 -5
  149. maxframe/serialization/tests/test_serial.py +2 -1
  150. maxframe/session.py +37 -2
  151. maxframe/tensor/__init__.py +81 -2
  152. maxframe/tensor/arithmetic/isclose.py +1 -0
  153. maxframe/tensor/arithmetic/tests/test_arithmetic.py +22 -18
  154. maxframe/tensor/core.py +5 -136
  155. maxframe/tensor/datasource/array.py +7 -2
  156. maxframe/tensor/datasource/full.py +1 -1
  157. maxframe/tensor/datasource/scalar.py +1 -1
  158. maxframe/tensor/datasource/tests/test_datasource.py +1 -1
  159. maxframe/tensor/indexing/flatnonzero.py +1 -1
  160. maxframe/tensor/indexing/getitem.py +2 -0
  161. maxframe/tensor/merge/__init__.py +2 -0
  162. maxframe/tensor/merge/concatenate.py +101 -0
  163. maxframe/tensor/merge/tests/test_merge.py +30 -1
  164. maxframe/tensor/merge/vstack.py +74 -0
  165. maxframe/tensor/{base → misc}/__init__.py +4 -0
  166. maxframe/tensor/misc/atleast_1d.py +72 -0
  167. maxframe/tensor/misc/atleast_2d.py +70 -0
  168. maxframe/tensor/misc/atleast_3d.py +85 -0
  169. maxframe/tensor/misc/tests/__init__.py +13 -0
  170. maxframe/tensor/{base → misc}/transpose.py +22 -18
  171. maxframe/tensor/misc/unique.py +205 -0
  172. maxframe/tensor/operators.py +1 -7
  173. maxframe/tensor/random/core.py +1 -1
  174. maxframe/tensor/reduction/count_nonzero.py +2 -1
  175. maxframe/tensor/reduction/mean.py +1 -0
  176. maxframe/tensor/reduction/nanmean.py +1 -0
  177. maxframe/tensor/reduction/nanvar.py +2 -0
  178. maxframe/tensor/reduction/tests/test_reduction.py +12 -1
  179. maxframe/tensor/reduction/var.py +2 -0
  180. maxframe/tensor/statistics/quantile.py +2 -2
  181. maxframe/tensor/utils.py +2 -22
  182. maxframe/tests/test_protocol.py +34 -0
  183. maxframe/tests/test_utils.py +0 -12
  184. maxframe/tests/utils.py +17 -2
  185. maxframe/typing_.py +4 -1
  186. maxframe/udf.py +62 -3
  187. maxframe/utils.py +112 -86
  188. {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/METADATA +25 -25
  189. {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/RECORD +208 -167
  190. {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/WHEEL +1 -1
  191. maxframe_client/__init__.py +0 -1
  192. maxframe_client/clients/framedriver.py +4 -1
  193. maxframe_client/fetcher.py +123 -54
  194. maxframe_client/session/consts.py +3 -0
  195. maxframe_client/session/graph.py +8 -2
  196. maxframe_client/session/odps.py +223 -40
  197. maxframe_client/session/task.py +108 -80
  198. maxframe_client/tests/test_fetcher.py +21 -3
  199. maxframe_client/tests/test_session.py +136 -8
  200. maxframe/core/entity/chunks.py +0 -68
  201. maxframe/core/entity/fuse.py +0 -73
  202. maxframe/core/graph/builder/chunk.py +0 -430
  203. maxframe/odpsio/tableio.py +0 -300
  204. maxframe/odpsio/volumeio.py +0 -95
  205. maxframe_client/clients/spe.py +0 -104
  206. /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
  207. /maxframe/{tensor/base → dataframe/datastore}/tests/__init__.py +0 -0
  208. /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
  209. /maxframe/tensor/{base → misc}/astype.py +0 -0
  210. /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
  211. /maxframe/tensor/{base → misc}/ravel.py +0 -0
  212. /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
  213. /maxframe/tensor/{base → misc}/where.py +0 -0
  214. {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,215 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import numpy as np
16
+ import pandas as pd
17
+
18
+ from maxframe import opcodes
19
+
20
+ from ....core import OutputType
21
+ from ....dataframe.operators import DataFrameOperator, DataFrameOperatorMixin
22
+ from ....dataframe.utils import make_dtypes, parse_index
23
+ from ....serialization.serializables import Int32Field, StringField
24
+
25
+
26
+ class DataFrameConnectedComponentsOperator(DataFrameOperator, DataFrameOperatorMixin):
27
+ _op_type_ = opcodes.CONNECTED_COMPONENTS
28
+
29
+ vertex_col1 = StringField("vertex_col1", default=None)
30
+ vertex_col2 = StringField("vertex_col2", default=None)
31
+ max_iter = Int32Field("max_iter", default=6)
32
+
33
+ def __call__(self, df):
34
+ node_id_dtype = df.dtypes[self.vertex_col1]
35
+ dtypes = make_dtypes({"id": node_id_dtype, "component": node_id_dtype})
36
+ # this will return a dataframe and a bool flag
37
+ new_dataframe_tileable_kw = {
38
+ "shape": (np.nan, 2),
39
+ "index_value": parse_index(pd.RangeIndex(0)),
40
+ "columns_value": parse_index(dtypes.index, store_data=True),
41
+ "dtypes": dtypes,
42
+ }
43
+ new_scalar_tileable_kw = {"dtype": np.dtype(np.bool_), "shape": ()}
44
+ return self.new_tileables(
45
+ [df],
46
+ kws=[new_dataframe_tileable_kw, new_scalar_tileable_kw],
47
+ )
48
+
49
+ @property
50
+ def output_limit(self):
51
+ return 2
52
+
53
+
54
+ def connected_components(
55
+ dataframe, vertex_col1: str, vertex_col2: str, max_iter: int = 6
56
+ ):
57
+ """
58
+ The connected components algorithm labels each node as belonging to a specific connected component with the ID of
59
+ its lowest-numbered vertex.
60
+
61
+ Parameters
62
+ ----------
63
+ dataframe : DataFrame
64
+ A DataFrame containing the edges of the graph.
65
+
66
+ vertex_col1 : str
67
+ The name of the column in `dataframe` that contains one of the two vertices of each edge. The column values must be
68
+ integers.
69
+
70
+ vertex_col2 : str
71
+ The name of the column in `dataframe` that contains the other vertex of each edge. The column values must be
72
+ integers.
73
+
74
+ max_iter : int
75
+ The algorithm uses large-star and small-star transformations to find all connected components. `max_iter`
76
+ controls the maximum number of iteration rounds to run; the result may not have converged when this limit is reached. Must be between 1 and 50. Default is 6.
77
+
78
+
79
+ Returns
80
+ -------
81
+ DataFrame, bool scalar
82
+ A DataFrame holding all connected component edges in two columns, `id` and `component`, where `component` is
83
+ the lowest-numbered vertex of the connected component, followed by a bool scalar telling whether the result converged.
84
+
85
+ Notes
86
+ -----
87
+ After execution, the bool scalar returned alongside the DataFrame indicates whether `connected_components`
88
+ converged within `max_iter` rounds. `True` means the DataFrame already contains all edges of the connected components.
89
+ If it is `False`, you can run `connected_components` again on the result to reach the converged state.
90
+
91
+ Examples
92
+ --------
93
+ >>> import numpy as np
94
+ >>> import maxframe.dataframe as md
95
+ >>> from maxframe.learn.contrib.graph import connected_components
96
+ >>> df = md.DataFrame({'x': [4, 1], 'y': [0, 4]})
97
+ >>> df.execute()
98
+ x y
99
+ 0 4 0
99
+ 1 1 4
101
+
102
+ Get connected components with 1 round iteration.
103
+
104
+ >>> components, converged = connected_components(df, "x", "y", 1)
105
+ >>> session.execute(components, converged)
106
+ >>> components
107
+ id component
108
+ 0 1 0
109
+ 1 4 0
110
+
111
+ >>> converged
112
+ True
113
+
114
+ Sometimes, a single iteration may not be sufficient to propagate the connectivity of all edges.
115
+ By default, `connected_components` performs 6 iterations of calculations.
116
+ If you are unsure whether the connected components have converged, you can check the bool flag returned
117
+ together with the output DataFrame after execution.
118
+
119
+ >>> df = md.DataFrame({'x': [4, 1, 7, 5, 8, 11, 11], 'y': [0, 4, 4, 7, 7, 9, 13]})
120
+ >>> df.execute()
121
+ x y
122
+ 0 4 0
123
+ 1 1 4
124
+ 2 7 4
125
+ 3 5 7
126
+ 4 8 7
127
+ 5 11 9
128
+ 6 11 13
129
+
130
+ >>> components, converged = connected_components(df, "x", "y", 1)
131
+ >>> session.execute(components, converged)
132
+ >>> components
133
+ id component
134
+ 0 4 0
135
+ 1 7 0
136
+ 2 8 4
137
+ 3 13 9
138
+ 4 1 0
139
+ 5 5 0
140
+ 6 11 9
141
+
142
+ If the returned flag is True, convergence has been achieved; here it is False, so more rounds are needed.
143
+
144
+ >>> converged
145
+ False
146
+
147
+ You can determine whether to continue iterating or to use a larger number of iterations
148
+ (but not too large, which would result in wasted computational overhead).
149
+
150
+ >>> components, converged = connected_components(components, "id", "component", 1)
151
+ >>> session.execute(components, converged)
152
+ >>> components
153
+ id component
154
+ 0 4 0
155
+ 1 7 0
156
+ 2 13 9
157
+ 3 1 0
158
+ 4 5 0
159
+ 5 11 9
160
+ 6 8 0
161
+
162
+ >>> components, converged = connected_components(df, "x", "y")
163
+ >>> session.execute(components, converged)
164
+ >>> components
165
+ id component
166
+ 0 4 0
167
+ 1 7 0
168
+ 2 13 9
169
+ 3 1 0
170
+ 4 5 0
171
+ 5 11 9
172
+ 6 8 0
173
+ """
174
+
175
+ # Check if vertex columns are provided
176
+ if not vertex_col1 or not vertex_col2:
177
+ raise ValueError("Both vertex_col1 and vertex_col2 must be provided.")
178
+
179
+ # Check if max_iter is provided and within the valid range
180
+ if max_iter is None:
181
+ raise ValueError("max_iter must be provided.")
182
+ if not (1 <= max_iter <= 50):
183
+ raise ValueError("max_iter must be an integer between 1 and 50.")
184
+
185
+ # Verify that the vertex columns exist in the dataframe
186
+ missing_cols = [
187
+ col for col in (vertex_col1, vertex_col2) if col not in dataframe.dtypes
188
+ ]
189
+ if missing_cols:
190
+ raise ValueError(
191
+ f"The following required columns {missing_cols} are not in {list(dataframe.dtypes.index)}"
192
+ )
193
+
194
+ # Ensure that the vertex columns are of integer type
195
+ # TODO support string dtype
196
+ incorrect_dtypes = [
197
+ col
198
+ for col in (vertex_col1, vertex_col2)
199
+ if dataframe[col].dtype != np.dtype("int")
200
+ ]
201
+ if incorrect_dtypes:
202
+ dtypes_str = ", ".join(str(dataframe[col].dtype) for col in incorrect_dtypes)
203
+ raise ValueError(
204
+ f"Columns {incorrect_dtypes} should be of integer type, but found {dtypes_str}."
205
+ )
206
+
207
+ op = DataFrameConnectedComponentsOperator(
208
+ vertex_col1=vertex_col1,
209
+ vertex_col2=vertex_col2,
210
+ _output_types=[OutputType.dataframe, OutputType.scalar],
211
+ max_iter=max_iter,
212
+ )
213
+ return op(
214
+ dataframe,
215
+ )
@@ -0,0 +1,13 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
@@ -0,0 +1,53 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import numpy as np
16
+ import pytest
17
+
18
+ from ..... import dataframe as md
19
+ from .....dataframe.core import DataFrameData
20
+ from .....tensor.core import TensorData
21
+ from .. import connected_components
22
+
23
+
24
+ @pytest.fixture
25
+ def df1():
26
+ return md.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]})
27
+
28
+
29
+ @pytest.fixture
30
+ def df2():
31
+ return md.DataFrame(
32
+ [[1, "2"], [1, "2"]],
33
+ columns=["a", "b"],
34
+ )
35
+
36
+
37
+ def test_connected_components(df1, df2):
38
+ edges, flag = connected_components(df1, "a", "b")
39
+ assert edges.op.max_iter == 6
40
+ assert edges.shape == (np.nan, 2)
41
+ assert isinstance(edges.data, DataFrameData)
42
+ assert isinstance(flag.data, TensorData)
43
+ assert flag.shape == ()
44
+ assert "id" in edges.dtypes and "component" in edges.dtypes
45
+
46
+ with pytest.raises(ValueError):
47
+ connected_components(df1, "a", "x")
48
+
49
+ with pytest.raises(ValueError):
50
+ connected_components(df1, "a", "b", 0)
51
+
52
+ with pytest.raises(ValueError):
53
+ connected_components(df2, "a", "b")
@@ -0,0 +1,16 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from . import models, multi_modal, text
15
+
16
+ del models
@@ -0,0 +1,54 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import Any, Dict
15
+
16
+ import numpy as np
17
+ import pandas as pd
18
+
19
+ from ....core.entity.output_types import OutputType
20
+ from ....core.operator.base import Operator
21
+ from ....core.operator.core import TileableOperatorMixin
22
+ from ....dataframe.utils import parse_index
23
+ from ....serialization.serializables.core import Serializable
24
+ from ....serialization.serializables.field import AnyField, DictField, StringField
25
+
26
+
27
+ class LLM(Serializable):
28
+ name = StringField("name", default=None)
29
+
30
+ def validate_params(self, params: Dict[str, Any]):
31
+ pass
32
+
33
+
34
+ class LLMOperator(Operator, TileableOperatorMixin):
35
+ model = AnyField("model", default=None)
36
+ prompt_template = AnyField("prompt_template", default=None)
37
+ params = DictField("params", default=None)
38
+
39
+ def __init__(self, output_types=None, **kw):
40
+ if output_types is None:
41
+ output_types = [OutputType.dataframe]
42
+ super().__init__(_output_types=output_types, **kw)
43
+
44
+ def __call__(self, data):
45
+ col_names = ["response", "success"]
46
+ columns = parse_index(pd.Index(col_names), store_data=True)
47
+ out_dtypes = pd.Series([np.dtype("O"), np.dtype("bool")], index=col_names)
48
+ return self.new_tileable(
49
+ inputs=[data],
50
+ dtypes=out_dtypes,
51
+ shape=(data.shape[0], len(col_names)),
52
+ index_value=data.index_value,
53
+ columns_value=columns,
54
+ )
@@ -0,0 +1,14 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from .dashscope import DashScopeMultiModalLLM, DashScopeTextLLM
@@ -0,0 +1,73 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import Any, Dict
15
+
16
+ from ..... import opcodes
17
+ from .....serialization.serializables.core import Serializable
18
+ from .....serialization.serializables.field import StringField
19
+ from ..core import LLMOperator
20
+ from ..multi_modal import MultiModalLLM
21
+ from ..text import TextLLM
22
+
23
+
24
+ class DashScopeLLMMixin(Serializable):
25
+ __slots__ = ()
26
+
27
+ _not_supported_params = {"stream", "incremental_output"}
28
+
29
+ def validate_params(self, params: Dict[str, Any]):
30
+ for k in params.keys():
31
+ if k in self._not_supported_params:
32
+ raise ValueError(f"{k} is not supported")
33
+
34
+
35
+ class DashScopeTextLLM(TextLLM, DashScopeLLMMixin):
36
+ api_key_resource = StringField("api_key_resource", default=None)
37
+
38
+ def generate(
39
+ self,
40
+ data,
41
+ prompt_template: Dict[str, Any],
42
+ params: Dict[str, Any] = None,
43
+ ):
44
+ return DashScopeTextGenerationOperator(
45
+ model=self,
46
+ prompt_template=prompt_template,
47
+ params=params,
48
+ )(data)
49
+
50
+
51
+ class DashScopeMultiModalLLM(MultiModalLLM, DashScopeLLMMixin):
52
+ api_key_resource = StringField("api_key_resource", default=None)
53
+
54
+ def generate(
55
+ self,
56
+ data,
57
+ prompt_template: Dict[str, Any],
58
+ params: Dict[str, Any] = None,
59
+ ):
60
+ # TODO add precheck here
61
+ return DashScopeMultiModalGenerationOperator(
62
+ model=self,
63
+ prompt_template=prompt_template,
64
+ params=params,
65
+ )(data)
66
+
67
+
68
+ class DashScopeTextGenerationOperator(LLMOperator):
69
+ _op_type_ = opcodes.DASHSCOPE_TEXT_GENERATION
70
+
71
+
72
+ class DashScopeMultiModalGenerationOperator(LLMOperator):
73
+ _op_type_ = opcodes.DASHSCOPE_MULTI_MODAL_GENERATION
@@ -0,0 +1,42 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import Any, Dict
15
+
16
+ from ....dataframe.core import DATAFRAME_TYPE, SERIES_TYPE
17
+ from .core import LLM
18
+
19
+
20
+ class MultiModalLLM(LLM):
21
+ def generate(
22
+ self,
23
+ data,
24
+ prompt_template: Dict[str, Any],
25
+ params: Dict[str, Any] = None,
26
+ ):
27
+ raise NotImplementedError
28
+
29
+
30
+ def generate(
31
+ data,
32
+ model: MultiModalLLM,
33
+ prompt_template: Dict[str, Any],
34
+ params: Dict[str, Any] = None,
35
+ ):
36
+ if not isinstance(data, DATAFRAME_TYPE) and not isinstance(data, SERIES_TYPE):
37
+ raise ValueError("data must be a maxframe dataframe or series object")
38
+ if not isinstance(model, MultiModalLLM):
39
+ raise ValueError("model must be a MultiModalLLM object")
40
+ params = params if params is not None else dict()
41
+ model.validate_params(params)
42
+ return model.generate(data, prompt_template, params)
@@ -0,0 +1,42 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import Any, Dict
15
+
16
+ from ....dataframe.core import DATAFRAME_TYPE, SERIES_TYPE
17
+ from .core import LLM
18
+
19
+
20
+ class TextLLM(LLM):
21
+ def generate(
22
+ self,
23
+ data,
24
+ prompt_template: Dict[str, Any],
25
+ params: Dict[str, Any] = None,
26
+ ):
27
+ raise NotImplementedError
28
+
29
+
30
+ def generate(
31
+ data,
32
+ model: TextLLM,
33
+ prompt_template: Dict[str, Any],
34
+ params: Dict[str, Any] = None,
35
+ ):
36
+ if not isinstance(data, DATAFRAME_TYPE) and not isinstance(data, SERIES_TYPE):
37
+ raise ValueError("data must be a maxframe dataframe or series object")
38
+ if not isinstance(model, TextLLM):
39
+ raise ValueError("model must be a TextLLM object")
40
+ params = params if params is not None else dict()
41
+ model.validate_params(params)
42
+ return model.generate(data, prompt_template, params)
@@ -0,0 +1,52 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import sys
15
+
16
+
17
+ def make_import_error_func(package_name):
18
+ def _func(*_, **__): # pragma: no cover
19
+ raise ImportError(
20
+ f"Cannot import {package_name}, please reinstall that package."
21
+ )
22
+
23
+ return _func
24
+
25
+
26
+ def config_mod_getattr(mod_dict, globals_):
27
+ def __getattr__(name):
28
+ import importlib
29
+
30
+ if name in mod_dict:
31
+ mod_name, cls_name = mod_dict[name].rsplit(".", 1)
32
+ mod = importlib.import_module(mod_name, globals_["__name__"])
33
+ cls = globals_[name] = getattr(mod, cls_name)
34
+ return cls
35
+ else: # pragma: no cover
36
+ raise AttributeError(name)
37
+
38
+ if sys.version_info[:2] < (3, 7):
39
+ for _mod in mod_dict.keys():
40
+ __getattr__(_mod)
41
+
42
+ def __dir__():
43
+ return sorted([n for n in globals_ if not n.startswith("_")] + list(mod_dict))
44
+
45
+ globals_.update(
46
+ {
47
+ "__getattr__": __getattr__,
48
+ "__dir__": __dir__,
49
+ "__all__": list(__dir__()),
50
+ "__warningregistry__": dict(),
51
+ }
52
+ )
@@ -0,0 +1,26 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from ..utils import config_mod_getattr as _config_mod_getattr
16
+ from .dmatrix import DMatrix
17
+ from .predict import predict
18
+ from .train import train
19
+
20
+ _config_mod_getattr(
21
+ {
22
+ "XGBClassifier": ".classifier.XGBClassifier",
23
+ "XGBRegressor": ".regressor.XGBRegressor",
24
+ },
25
+ globals(),
26
+ )