maxframe 0.1.0b4__cp37-cp37m-win_amd64.whl → 1.0.0__cp37-cp37m-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/__init__.py +1 -0
- maxframe/_utils.cp37-win_amd64.pyd +0 -0
- maxframe/codegen.py +56 -5
- maxframe/config/config.py +78 -10
- maxframe/config/validators.py +42 -11
- maxframe/conftest.py +58 -14
- maxframe/core/__init__.py +2 -16
- maxframe/core/entity/__init__.py +1 -12
- maxframe/core/entity/executable.py +1 -1
- maxframe/core/entity/objects.py +46 -45
- maxframe/core/entity/output_types.py +0 -3
- maxframe/core/entity/tests/test_objects.py +43 -0
- maxframe/core/entity/tileables.py +5 -78
- maxframe/core/graph/__init__.py +2 -2
- maxframe/core/graph/builder/__init__.py +0 -1
- maxframe/core/graph/builder/base.py +5 -4
- maxframe/core/graph/builder/tileable.py +4 -4
- maxframe/core/graph/builder/utils.py +4 -8
- maxframe/core/graph/core.cp37-win_amd64.pyd +0 -0
- maxframe/core/graph/core.pyx +4 -4
- maxframe/core/graph/entity.py +9 -33
- maxframe/core/operator/__init__.py +2 -9
- maxframe/core/operator/base.py +3 -5
- maxframe/core/operator/objects.py +0 -9
- maxframe/core/operator/utils.py +55 -0
- maxframe/dataframe/__init__.py +2 -1
- maxframe/dataframe/arithmetic/around.py +5 -17
- maxframe/dataframe/arithmetic/core.py +15 -7
- maxframe/dataframe/arithmetic/docstring.py +7 -33
- maxframe/dataframe/arithmetic/equal.py +4 -2
- maxframe/dataframe/arithmetic/greater.py +4 -2
- maxframe/dataframe/arithmetic/greater_equal.py +4 -2
- maxframe/dataframe/arithmetic/less.py +2 -2
- maxframe/dataframe/arithmetic/less_equal.py +4 -2
- maxframe/dataframe/arithmetic/not_equal.py +4 -2
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +39 -16
- maxframe/dataframe/core.py +58 -12
- maxframe/dataframe/datasource/date_range.py +2 -2
- maxframe/dataframe/datasource/read_odps_query.py +120 -24
- maxframe/dataframe/datasource/read_odps_table.py +9 -4
- maxframe/dataframe/datasource/tests/test_datasource.py +103 -8
- maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
- maxframe/dataframe/datastore/to_odps.py +28 -0
- maxframe/dataframe/extensions/__init__.py +5 -0
- maxframe/dataframe/extensions/flatjson.py +131 -0
- maxframe/dataframe/extensions/flatmap.py +317 -0
- maxframe/dataframe/extensions/reshuffle.py +1 -1
- maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
- maxframe/dataframe/groupby/core.py +1 -1
- maxframe/dataframe/groupby/cum.py +0 -1
- maxframe/dataframe/groupby/fill.py +4 -1
- maxframe/dataframe/groupby/getitem.py +6 -0
- maxframe/dataframe/groupby/tests/test_groupby.py +5 -1
- maxframe/dataframe/groupby/transform.py +5 -1
- maxframe/dataframe/indexing/align.py +1 -1
- maxframe/dataframe/indexing/loc.py +6 -4
- maxframe/dataframe/indexing/rename.py +5 -28
- maxframe/dataframe/indexing/sample.py +0 -1
- maxframe/dataframe/indexing/set_index.py +68 -1
- maxframe/dataframe/initializer.py +11 -1
- maxframe/dataframe/merge/__init__.py +9 -1
- maxframe/dataframe/merge/concat.py +41 -31
- maxframe/dataframe/merge/merge.py +237 -3
- maxframe/dataframe/merge/tests/test_merge.py +126 -1
- maxframe/dataframe/misc/__init__.py +4 -0
- maxframe/dataframe/misc/apply.py +6 -11
- maxframe/dataframe/misc/case_when.py +141 -0
- maxframe/dataframe/misc/describe.py +2 -2
- maxframe/dataframe/misc/drop_duplicates.py +8 -8
- maxframe/dataframe/misc/eval.py +4 -0
- maxframe/dataframe/misc/memory_usage.py +2 -2
- maxframe/dataframe/misc/pct_change.py +1 -83
- maxframe/dataframe/misc/pivot_table.py +262 -0
- maxframe/dataframe/misc/tests/test_misc.py +93 -1
- maxframe/dataframe/misc/transform.py +1 -30
- maxframe/dataframe/misc/value_counts.py +4 -17
- maxframe/dataframe/missing/dropna.py +1 -1
- maxframe/dataframe/missing/fillna.py +5 -5
- maxframe/dataframe/operators.py +1 -17
- maxframe/dataframe/plotting/core.py +2 -2
- maxframe/dataframe/reduction/core.py +4 -3
- maxframe/dataframe/reduction/tests/test_reduction.py +2 -4
- maxframe/dataframe/sort/sort_values.py +1 -11
- maxframe/dataframe/statistics/corr.py +3 -3
- maxframe/dataframe/statistics/quantile.py +13 -19
- maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
- maxframe/dataframe/tests/test_initializer.py +33 -2
- maxframe/dataframe/utils.py +33 -11
- maxframe/dataframe/window/expanding.py +5 -3
- maxframe/dataframe/window/tests/test_expanding.py +2 -2
- maxframe/errors.py +13 -0
- maxframe/extension.py +12 -0
- maxframe/io/__init__.py +13 -0
- maxframe/io/objects/__init__.py +24 -0
- maxframe/io/objects/core.py +140 -0
- maxframe/io/objects/tensor.py +76 -0
- maxframe/io/objects/tests/__init__.py +13 -0
- maxframe/io/objects/tests/test_object_io.py +97 -0
- maxframe/{odpsio → io/odpsio}/__init__.py +3 -1
- maxframe/{odpsio → io/odpsio}/arrow.py +43 -12
- maxframe/{odpsio → io/odpsio}/schema.py +38 -16
- maxframe/io/odpsio/tableio.py +719 -0
- maxframe/io/odpsio/tests/__init__.py +13 -0
- maxframe/{odpsio → io/odpsio}/tests/test_schema.py +75 -33
- maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +50 -23
- maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
- maxframe/io/odpsio/volumeio.py +63 -0
- maxframe/learn/contrib/__init__.py +3 -1
- maxframe/learn/contrib/graph/__init__.py +15 -0
- maxframe/learn/contrib/graph/connected_components.py +215 -0
- maxframe/learn/contrib/graph/tests/__init__.py +13 -0
- maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
- maxframe/learn/contrib/llm/__init__.py +16 -0
- maxframe/learn/contrib/llm/core.py +54 -0
- maxframe/learn/contrib/llm/models/__init__.py +14 -0
- maxframe/learn/contrib/llm/models/dashscope.py +73 -0
- maxframe/learn/contrib/llm/multi_modal.py +42 -0
- maxframe/learn/contrib/llm/text.py +42 -0
- maxframe/learn/contrib/utils.py +52 -0
- maxframe/learn/contrib/xgboost/__init__.py +26 -0
- maxframe/learn/contrib/xgboost/classifier.py +110 -0
- maxframe/learn/contrib/xgboost/core.py +241 -0
- maxframe/learn/contrib/xgboost/dmatrix.py +147 -0
- maxframe/learn/contrib/xgboost/predict.py +121 -0
- maxframe/learn/contrib/xgboost/regressor.py +71 -0
- maxframe/learn/contrib/xgboost/tests/__init__.py +13 -0
- maxframe/learn/contrib/xgboost/tests/test_core.py +43 -0
- maxframe/learn/contrib/xgboost/train.py +132 -0
- maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
- maxframe/learn/utils/__init__.py +15 -0
- maxframe/learn/utils/core.py +29 -0
- maxframe/lib/mmh3.cp37-win_amd64.pyd +0 -0
- maxframe/lib/mmh3.pyi +43 -0
- maxframe/lib/sparse/tests/test_sparse.py +15 -15
- maxframe/lib/wrapped_pickle.py +2 -1
- maxframe/opcodes.py +11 -0
- maxframe/protocol.py +154 -27
- maxframe/remote/core.py +4 -8
- maxframe/serialization/__init__.py +1 -0
- maxframe/serialization/core.cp37-win_amd64.pyd +0 -0
- maxframe/serialization/core.pxd +3 -0
- maxframe/serialization/core.pyi +64 -0
- maxframe/serialization/core.pyx +67 -26
- maxframe/serialization/exception.py +1 -1
- maxframe/serialization/pandas.py +52 -17
- maxframe/serialization/serializables/core.py +180 -15
- maxframe/serialization/serializables/field_type.py +4 -1
- maxframe/serialization/serializables/tests/test_serializable.py +54 -5
- maxframe/serialization/tests/test_serial.py +2 -1
- maxframe/session.py +37 -2
- maxframe/tensor/__init__.py +81 -2
- maxframe/tensor/arithmetic/isclose.py +1 -0
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +22 -18
- maxframe/tensor/core.py +5 -136
- maxframe/tensor/datasource/array.py +7 -2
- maxframe/tensor/datasource/full.py +1 -1
- maxframe/tensor/datasource/scalar.py +1 -1
- maxframe/tensor/datasource/tests/test_datasource.py +1 -1
- maxframe/tensor/indexing/flatnonzero.py +1 -1
- maxframe/tensor/indexing/getitem.py +2 -0
- maxframe/tensor/merge/__init__.py +2 -0
- maxframe/tensor/merge/concatenate.py +101 -0
- maxframe/tensor/merge/tests/test_merge.py +30 -1
- maxframe/tensor/merge/vstack.py +74 -0
- maxframe/tensor/{base → misc}/__init__.py +4 -0
- maxframe/tensor/misc/atleast_1d.py +72 -0
- maxframe/tensor/misc/atleast_2d.py +70 -0
- maxframe/tensor/misc/atleast_3d.py +85 -0
- maxframe/tensor/misc/tests/__init__.py +13 -0
- maxframe/tensor/{base → misc}/transpose.py +22 -18
- maxframe/tensor/misc/unique.py +205 -0
- maxframe/tensor/operators.py +1 -7
- maxframe/tensor/random/core.py +1 -1
- maxframe/tensor/reduction/count_nonzero.py +2 -1
- maxframe/tensor/reduction/mean.py +1 -0
- maxframe/tensor/reduction/nanmean.py +1 -0
- maxframe/tensor/reduction/nanvar.py +2 -0
- maxframe/tensor/reduction/tests/test_reduction.py +12 -1
- maxframe/tensor/reduction/var.py +2 -0
- maxframe/tensor/statistics/quantile.py +2 -2
- maxframe/tensor/utils.py +2 -22
- maxframe/tests/test_protocol.py +34 -0
- maxframe/tests/test_utils.py +0 -12
- maxframe/tests/utils.py +17 -2
- maxframe/typing_.py +4 -1
- maxframe/udf.py +62 -3
- maxframe/utils.py +112 -86
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/METADATA +4 -4
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/RECORD +208 -167
- maxframe_client/__init__.py +0 -1
- maxframe_client/clients/framedriver.py +4 -1
- maxframe_client/fetcher.py +123 -54
- maxframe_client/session/consts.py +3 -0
- maxframe_client/session/graph.py +8 -2
- maxframe_client/session/odps.py +223 -40
- maxframe_client/session/task.py +108 -80
- maxframe_client/tests/test_fetcher.py +21 -3
- maxframe_client/tests/test_session.py +136 -8
- maxframe/core/entity/chunks.py +0 -68
- maxframe/core/entity/fuse.py +0 -73
- maxframe/core/graph/builder/chunk.py +0 -430
- maxframe/odpsio/tableio.py +0 -300
- maxframe/odpsio/volumeio.py +0 -95
- maxframe_client/clients/spe.py +0 -104
- /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
- /maxframe/{tensor/base → dataframe/datastore}/tests/__init__.py +0 -0
- /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
- /maxframe/tensor/{base → misc}/astype.py +0 -0
- /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
- /maxframe/tensor/{base → misc}/ravel.py +0 -0
- /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
- /maxframe/tensor/{base → misc}/where.py +0 -0
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/WHEEL +0 -0
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import numpy as np
|
|
16
|
+
import pandas as pd
|
|
17
|
+
|
|
18
|
+
from maxframe import opcodes
|
|
19
|
+
|
|
20
|
+
from ....core import OutputType
|
|
21
|
+
from ....dataframe.operators import DataFrameOperator, DataFrameOperatorMixin
|
|
22
|
+
from ....dataframe.utils import make_dtypes, parse_index
|
|
23
|
+
from ....serialization.serializables import Int32Field, StringField
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class DataFrameConnectedComponentsOperator(DataFrameOperator, DataFrameOperatorMixin):
    """
    Operator describing a connected-components computation on an edge DataFrame.

    Calling the operator yields two outputs: a two-column DataFrame
    (``id``, ``component``) and a zero-dimensional boolean.
    """

    _op_type_ = opcodes.CONNECTED_COMPONENTS

    # names of the two columns holding the end points of every edge
    vertex_col1 = StringField("vertex_col1", default=None)
    vertex_col2 = StringField("vertex_col2", default=None)
    # iteration budget, 6 unless overridden
    max_iter = Int32Field("max_iter", default=6)

    def __call__(self, df):
        """Create the (DataFrame, scalar) output pair for the edge frame ``df``."""
        # both output columns take the dtype of the first vertex column
        vertex_dtype = df.dtypes[self.vertex_col1]
        out_dtypes = make_dtypes({"id": vertex_dtype, "component": vertex_dtype})

        # first output: a DataFrame whose row count is not known up front
        frame_kw = dict(
            shape=(np.nan, 2),
            index_value=parse_index(pd.RangeIndex(0)),
            columns_value=parse_index(out_dtypes.index, store_data=True),
            dtypes=out_dtypes,
        )
        # second output: a boolean scalar
        flag_kw = dict(dtype=np.dtype(np.bool_), shape=())

        return self.new_tileables([df], kws=[frame_kw, flag_kw])

    @property
    def output_limit(self):
        """Number of outputs this operator declares."""
        return 2
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def connected_components(
    dataframe, vertex_col1: str, vertex_col2: str, max_iter: int = 6
):
    """
    The connected components algorithm labels each node as belonging to a specific connected component with the ID of
    its lowest-numbered vertex.

    Parameters
    ----------
    dataframe : DataFrame
        A DataFrame containing the edges of the graph.

    vertex_col1 : str
        The name of the column in `dataframe` that contains one of the edge vertices. The column must have an
        integer dtype.

    vertex_col2 : str
        The name of the column in `dataframe` that contains the other edge vertex. The column must have an
        integer dtype.

    max_iter : int
        The algorithm uses large and small star transformations to find all connected components. `max_iter`
        controls the maximum number of iteration rounds performed before all edges are found. Must be an integer
        between 1 and 50. Default is 6.

    Returns
    -------
    components : DataFrame
        A DataFrame containing all connected component edges in two columns, `id` and `component`. `component` is
        the lowest-numbered vertex in the connected component.
    converged : Tensor
        A boolean scalar telling whether the computation converged within `max_iter` rounds.

    Raises
    ------
    ValueError
        If a vertex column name is empty, `max_iter` is missing, not an integer or outside ``[1, 50]``, a vertex
        column does not exist in `dataframe`, or a vertex column is not of integer dtype.

    Notes
    -----
    After execution, the second return value `converged` indicates whether `connected_components` already
    converged in `max_iter` rounds. `True` means the dataframe already contains all edges of the connected components.
    If `False` you can run `connected_components` more times to reach the converged state.

    Examples
    --------
    >>> import numpy as np
    >>> import maxframe.dataframe as md
    >>> from maxframe.learn.contrib.graph import connected_components
    >>> df = md.DataFrame({'x': [4, 1], 'y': [0, 4]})
    >>> df.execute()
       x  y
    0  4  0
    1  1  4

    Get connected components with 1 round iteration.

    >>> components, converged = connected_components(df, "x", "y", 1)
    >>> session.execute(components, converged)
    >>> components
       id  component
    0   1          0
    1   4          0

    >>> converged
    True

    Sometimes, a single iteration may not be sufficient to propagate the connectivity of all edges.
    By default, `connected_components` performs 6 iterations of calculations.
    If you are unsure whether the connected components have converged, you can check the returned
    `converged` value after execution.

    >>> df = md.DataFrame({'x': [4, 1, 7, 5, 8, 11, 11], 'y': [0, 4, 4, 7, 7, 9, 13]})
    >>> df.execute()
        x   y
    0   4   0
    1   1   4
    2   7   4
    3   5   7
    4   8   7
    5  11   9
    6  11  13

    >>> components, converged = connected_components(df, "x", "y", 1)
    >>> session.execute(components, converged)
    >>> components
       id  component
    0   4          0
    1   7          0
    2   8          4
    3  13          9
    4   1          0
    5   5          0
    6  11          9

    If `converged` is True, it means convergence has been achieved.

    >>> converged
    False

    You can determine whether to continue iterating or to use a larger number of iterations
    (but not too large, which would result in wasted computational overhead).

    >>> components, converged = connected_components(components, "id", "component", 1)
    >>> session.execute(components, converged)
    >>> components
       id  component
    0   4          0
    1   7          0
    2  13          9
    3   1          0
    4   5          0
    5  11          9
    6   8          0

    >>> components, converged = connected_components(df, "x", "y")
    >>> session.execute(components, converged)
    >>> components
       id  component
    0   4          0
    1   7          0
    2  13          9
    3   1          0
    4   5          0
    5  11          9
    6   8          0
    """

    # Check if vertex columns are provided
    if not vertex_col1 or not vertex_col2:
        raise ValueError("Both vertex_col1 and vertex_col2 must be provided.")

    # Check if max_iter is provided and within the valid range
    if max_iter is None:
        raise ValueError("max_iter must be provided.")
    # Reject non-integers up front so that e.g. 2.5 or "3" fail with the same
    # ValueError instead of slipping through or raising a TypeError.
    if not isinstance(max_iter, (int, np.integer)) or not (1 <= max_iter <= 50):
        raise ValueError("max_iter must be an integer between 1 and 50.")

    # Verify that the vertex columns exist in the dataframe
    col_dtypes = dataframe.dtypes
    missing_cols = [
        col for col in (vertex_col1, vertex_col2) if col not in col_dtypes
    ]
    if missing_cols:
        raise ValueError(
            f"The following required columns {missing_cols} are not in {list(col_dtypes.index)}"
        )

    # Ensure that the vertex columns are of integer type.
    # np.dtype("int") is platform dependent (32-bit on Windows with older
    # NumPy), so comparing against it rejects int64 columns there; test the
    # dtype family instead. Reading from `dtypes` also avoids building a
    # column-selection object just to inspect its dtype.
    # TODO support string dtype
    incorrect_dtypes = [
        col
        for col in (vertex_col1, vertex_col2)
        if not pd.api.types.is_integer_dtype(col_dtypes[col])
    ]
    if incorrect_dtypes:
        dtypes_str = ", ".join(str(col_dtypes[col]) for col in incorrect_dtypes)
        raise ValueError(
            f"Columns {incorrect_dtypes} should be of integer type, but found {dtypes_str}."
        )

    op = DataFrameConnectedComponentsOperator(
        vertex_col1=vertex_col1,
        vertex_col2=vertex_col2,
        _output_types=[OutputType.dataframe, OutputType.scalar],
        max_iter=max_iter,
    )
    return op(
        dataframe,
    )
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import numpy as np
|
|
16
|
+
import pytest
|
|
17
|
+
|
|
18
|
+
from ..... import dataframe as md
|
|
19
|
+
from .....dataframe.core import DataFrameData
|
|
20
|
+
from .....tensor.core import TensorData
|
|
21
|
+
from .. import connected_components
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@pytest.fixture
def df1():
    """Three integer columns ``a``, ``b`` and ``c``, each holding 1, 2, 3."""
    column_data = {name: [1, 2, 3] for name in ("a", "b", "c")}
    return md.DataFrame(column_data)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@pytest.fixture
def df2():
    """Two-row frame whose column ``a`` holds integers and column ``b`` strings."""
    rows = [[1, "2"], [1, "2"]]
    return md.DataFrame(rows, columns=["a", "b"])
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def test_connected_components(df1, df2):
    """Check the output metadata of `connected_components` and its argument validation."""
    edges, flag = connected_components(df1, "a", "b")
    assert edges.op.max_iter == 6
    # NaN never compares equal to itself; comparing whole tuples only passes
    # through the identity shortcut, so check the unknown row count explicitly.
    assert len(edges.shape) == 2
    assert np.isnan(edges.shape[0])
    assert edges.shape[1] == 2
    assert isinstance(edges.data, DataFrameData)
    assert isinstance(flag.data, TensorData)
    assert flag.shape == ()
    assert "id" in edges.dtypes and "component" in edges.dtypes

    # unknown vertex column
    with pytest.raises(ValueError):
        connected_components(df1, "a", "x")

    # empty vertex column name
    with pytest.raises(ValueError):
        connected_components(df1, "", "b")

    # max_iter below and above the accepted range, and missing
    with pytest.raises(ValueError):
        connected_components(df1, "a", "b", 0)

    with pytest.raises(ValueError):
        connected_components(df1, "a", "b", 51)

    with pytest.raises(ValueError):
        connected_components(df1, "a", "b", None)

    # non-integer vertex column
    with pytest.raises(ValueError):
        connected_components(df2, "a", "b")
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
# `models` is imported and then removed from the package namespace again;
# presumably the import is only wanted for its side effects -- TODO confirm.
from . import models
from . import multi_modal, text

del models
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
from typing import Any, Dict
|
|
15
|
+
|
|
16
|
+
import numpy as np
|
|
17
|
+
import pandas as pd
|
|
18
|
+
|
|
19
|
+
from ....core.entity.output_types import OutputType
|
|
20
|
+
from ....core.operator.base import Operator
|
|
21
|
+
from ....core.operator.core import TileableOperatorMixin
|
|
22
|
+
from ....dataframe.utils import parse_index
|
|
23
|
+
from ....serialization.serializables.core import Serializable
|
|
24
|
+
from ....serialization.serializables.field import AnyField, DictField, StringField
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class LLM(Serializable):
    """Serializable base description of a language model."""

    # model name, unset by default
    name = StringField("name", default=None)

    def validate_params(self, params: Dict[str, Any]):
        """
        Hook for checking generation parameters.

        The base implementation accepts every ``params`` mapping and does nothing.
        """
        return None
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class LLMOperator(Operator, TileableOperatorMixin):
    """
    Base operator for LLM generation over a DataFrame or Series-like input.

    The output is a two-column frame: ``response`` (object dtype) and
    ``success`` (bool dtype), indexed like the input.
    """

    model = AnyField("model", default=None)
    prompt_template = AnyField("prompt_template", default=None)
    params = DictField("params", default=None)

    def __init__(self, output_types=None, **kw):
        """Create the operator; the output type defaults to a single DataFrame."""
        resolved_types = [OutputType.dataframe] if output_types is None else output_types
        super().__init__(_output_types=resolved_types, **kw)

    def __call__(self, data):
        """Build the output tileable for ``data``, keeping its row count and index."""
        col_names = ["response", "success"]
        result_dtypes = pd.Series(
            [np.dtype("O"), np.dtype("bool")], index=col_names
        )
        result_columns = parse_index(pd.Index(col_names), store_data=True)
        n_rows = data.shape[0]
        return self.new_tileable(
            inputs=[data],
            dtypes=result_dtypes,
            shape=(n_rows, len(col_names)),
            index_value=data.index_value,
            columns_value=result_columns,
        )
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
from .dashscope import DashScopeMultiModalLLM, DashScopeTextLLM
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
from typing import Any, Dict
|
|
15
|
+
|
|
16
|
+
from ..... import opcodes
|
|
17
|
+
from .....serialization.serializables.core import Serializable
|
|
18
|
+
from .....serialization.serializables.field import StringField
|
|
19
|
+
from ..core import LLMOperator
|
|
20
|
+
from ..multi_modal import MultiModalLLM
|
|
21
|
+
from ..text import TextLLM
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class DashScopeLLMMixin(Serializable):
    """
    Mixin holding the parameter check shared by the DashScope models.

    NOTE: a class that lists another base defining ``validate_params`` before
    this mixin resolves to that base's method first; such a class has to call
    ``DashScopeLLMMixin.validate_params`` explicitly for the check to run.
    """

    __slots__ = ()

    # generation parameters that are refused
    _not_supported_params = {"stream", "incremental_output"}

    def validate_params(self, params: Dict[str, Any]):
        """Raise ``ValueError`` for the first key of ``params`` that is not supported."""
        rejected = self._not_supported_params
        for key in params.keys():
            if key not in rejected:
                continue
            raise ValueError(f"{key} is not supported")
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class DashScopeTextLLM(TextLLM, DashScopeLLMMixin):
    """Text generation model backed by DashScope."""

    # presumably the name of the resource holding the DashScope API key -- TODO confirm
    api_key_resource = StringField("api_key_resource", default=None)

    def validate_params(self, params: Dict[str, Any]):
        """
        Reject generation parameters DashScope models do not support.

        Raises
        ------
        ValueError
            If ``params`` contains an unsupported key.
        """
        # TextLLM (through LLM) precedes DashScopeLLMMixin in the MRO, so
        # without this override LLM's no-op validate_params would be picked
        # and the unsupported-parameter check would never run.
        DashScopeLLMMixin.validate_params(self, params)

    def generate(
        self,
        data,
        prompt_template: Dict[str, Any],
        params: Dict[str, Any] = None,
    ):
        """
        Build the text generation operator for ``data`` and apply it.

        Parameters
        ----------
        data
            Input passed to the operator.
        prompt_template : dict
            Prompt template forwarded to the operator.
        params : dict, optional
            Generation parameters forwarded to the operator.

        Returns
        -------
        The tileable produced by ``DashScopeTextGenerationOperator``.
        """
        return DashScopeTextGenerationOperator(
            model=self,
            prompt_template=prompt_template,
            params=params,
        )(data)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class DashScopeMultiModalLLM(MultiModalLLM, DashScopeLLMMixin):
    """Multi-modal generation model backed by DashScope."""

    # presumably the name of the resource holding the DashScope API key -- TODO confirm
    api_key_resource = StringField("api_key_resource", default=None)

    def validate_params(self, params: Dict[str, Any]):
        """
        Reject generation parameters DashScope models do not support.

        Raises
        ------
        ValueError
            If ``params`` contains an unsupported key.
        """
        # MultiModalLLM (through LLM) precedes DashScopeLLMMixin in the MRO, so
        # without this override LLM's no-op validate_params would be picked
        # and the unsupported-parameter check would never run.
        DashScopeLLMMixin.validate_params(self, params)

    def generate(
        self,
        data,
        prompt_template: Dict[str, Any],
        params: Dict[str, Any] = None,
    ):
        """
        Build the multi-modal generation operator for ``data`` and apply it.

        Parameters
        ----------
        data
            Input passed to the operator.
        prompt_template : dict
            Prompt template forwarded to the operator.
        params : dict, optional
            Generation parameters forwarded to the operator.

        Returns
        -------
        The tileable produced by ``DashScopeMultiModalGenerationOperator``.
        """
        # TODO add precheck here
        return DashScopeMultiModalGenerationOperator(
            model=self,
            prompt_template=prompt_template,
            params=params,
        )(data)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class DashScopeTextGenerationOperator(LLMOperator):
    """LLM operator registered under the DashScope text generation opcode."""

    _op_type_ = opcodes.DASHSCOPE_TEXT_GENERATION
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class DashScopeMultiModalGenerationOperator(LLMOperator):
    """LLM operator registered under the DashScope multi-modal generation opcode."""

    _op_type_ = opcodes.DASHSCOPE_MULTI_MODAL_GENERATION
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
from typing import Any, Dict
|
|
15
|
+
|
|
16
|
+
from ....dataframe.core import DATAFRAME_TYPE, SERIES_TYPE
|
|
17
|
+
from .core import LLM
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class MultiModalLLM(LLM):
    """Base class for multi-modal models; subclasses provide ``generate``."""

    def generate(
        self,
        data,
        prompt_template: Dict[str, Any],
        params: Dict[str, Any] = None,
    ):
        """Produce generations for ``data``; not implemented on the base class."""
        raise NotImplementedError
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def generate(
    data,
    model: MultiModalLLM,
    prompt_template: Dict[str, Any],
    params: Dict[str, Any] = None,
):
    """
    Run multi-modal generation over ``data`` with ``model``.

    Parameters
    ----------
    data
        A maxframe DataFrame or Series object.
    model : MultiModalLLM
        Model used for generation.
    prompt_template : dict
        Prompt template handed to the model.
    params : dict, optional
        Generation parameters; an empty dict is used when omitted.

    Returns
    -------
    Whatever ``model.generate`` returns.

    Raises
    ------
    ValueError
        If ``data`` or ``model`` has the wrong type.
    """
    is_frame_like = isinstance(data, DATAFRAME_TYPE) or isinstance(data, SERIES_TYPE)
    if not is_frame_like:
        raise ValueError("data must be a maxframe dataframe or series object")
    if not isinstance(model, MultiModalLLM):
        raise ValueError("model must be a MultiModalLLM object")

    if params is None:
        params = dict()
    # let the model refuse parameters before any operator is built
    model.validate_params(params)
    return model.generate(data, prompt_template, params)
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
from typing import Any, Dict
|
|
15
|
+
|
|
16
|
+
from ....dataframe.core import DATAFRAME_TYPE, SERIES_TYPE
|
|
17
|
+
from .core import LLM
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class TextLLM(LLM):
    """Base class for text models; subclasses provide ``generate``."""

    def generate(
        self,
        data,
        prompt_template: Dict[str, Any],
        params: Dict[str, Any] = None,
    ):
        """Produce generations for ``data``; not implemented on the base class."""
        raise NotImplementedError
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def generate(
    data,
    model: TextLLM,
    prompt_template: Dict[str, Any],
    params: Dict[str, Any] = None,
):
    """
    Run text generation over ``data`` with ``model``.

    Parameters
    ----------
    data
        A maxframe DataFrame or Series object.
    model : TextLLM
        Model used for generation.
    prompt_template : dict
        Prompt template handed to the model.
    params : dict, optional
        Generation parameters; an empty dict is used when omitted.

    Returns
    -------
    Whatever ``model.generate`` returns.

    Raises
    ------
    ValueError
        If ``data`` or ``model`` has the wrong type.
    """
    is_frame_like = isinstance(data, DATAFRAME_TYPE) or isinstance(data, SERIES_TYPE)
    if not is_frame_like:
        raise ValueError("data must be a maxframe dataframe or series object")
    if not isinstance(model, TextLLM):
        raise ValueError("model must be a TextLLM object")

    if params is None:
        params = dict()
    # let the model refuse parameters before any operator is built
    model.validate_params(params)
    return model.generate(data, prompt_template, params)
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
import sys
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def make_import_error_func(package_name):
    """
    Return a callable that accepts any arguments and always raises
    ``ImportError`` naming ``package_name``.
    """

    def _func(*_args, **_kwargs):  # pragma: no cover
        message = f"Cannot import {package_name}, please reinstall that package."
        raise ImportError(message)

    return _func
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def config_mod_getattr(mod_dict, globals_):
    """
    Install lazy attribute loading on a module namespace.

    Parameters
    ----------
    mod_dict : dict
        Maps an exported name to a dotted ``"<module>.<attribute>"`` path. The
        module part is imported with ``importlib.import_module`` using
        ``globals_["__name__"]`` as the anchor package, so it may be relative.
    globals_ : dict
        The namespace (normally ``globals()`` of a package ``__init__``) that
        receives ``__getattr__``, ``__dir__``, ``__all__`` and a fresh
        ``__warningregistry__``.
    """

    def __getattr__(name):
        import importlib

        if name in mod_dict:
            mod_name, cls_name = mod_dict[name].rsplit(".", 1)
            mod = importlib.import_module(mod_name, globals_["__name__"])
            # cache the resolved object so later lookups bypass __getattr__
            cls = globals_[name] = getattr(mod, cls_name)
            return cls
        else:  # pragma: no cover
            raise AttributeError(name)

    if sys.version_info[:2] < (3, 7):
        # module-level __getattr__ is not honoured before Python 3.7:
        # resolve everything eagerly instead
        for _mod in mod_dict:
            __getattr__(_mod)

    def __dir__():
        # A lazily loaded name ends up in both `globals_` and `mod_dict`;
        # merge through a set so it is listed only once.
        public_names = {n for n in globals_ if not n.startswith("_")}
        return sorted(public_names.union(mod_dict))

    globals_.update(
        {
            "__getattr__": __getattr__,
            "__dir__": __dir__,
            "__all__": list(__dir__()),
            "__warningregistry__": dict(),
        }
    )
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from ..utils import config_mod_getattr as _config_mod_getattr
|
|
16
|
+
from .dmatrix import DMatrix
|
|
17
|
+
from .predict import predict
|
|
18
|
+
from .train import train
|
|
19
|
+
|
|
20
|
+
# Register the estimator classes for lazy resolution: each name is imported
# from its submodule on first attribute access rather than at package import.
_config_mod_getattr(
    dict(
        XGBClassifier=".classifier.XGBClassifier",
        XGBRegressor=".regressor.XGBRegressor",
    ),
    globals(),
)
|