maxframe 1.0.0rc1__cp311-cp311-win_amd64.whl → 1.0.0rc2__cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/_utils.cp311-win_amd64.pyd +0 -0
- maxframe/codegen.py +0 -4
- maxframe/config/config.py +34 -2
- maxframe/config/validators.py +1 -0
- maxframe/conftest.py +2 -0
- maxframe/core/entity/objects.py +1 -1
- maxframe/core/graph/core.cp311-win_amd64.pyd +0 -0
- maxframe/dataframe/__init__.py +1 -1
- maxframe/dataframe/arithmetic/around.py +5 -17
- maxframe/dataframe/arithmetic/core.py +15 -7
- maxframe/dataframe/arithmetic/docstring.py +5 -55
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +22 -0
- maxframe/dataframe/core.py +5 -5
- maxframe/dataframe/datasource/date_range.py +2 -2
- maxframe/dataframe/datasource/read_odps_query.py +6 -0
- maxframe/dataframe/datasource/read_odps_table.py +2 -1
- maxframe/dataframe/datasource/tests/test_datasource.py +14 -0
- maxframe/dataframe/groupby/cum.py +0 -1
- maxframe/dataframe/groupby/tests/test_groupby.py +4 -0
- maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
- maxframe/dataframe/indexing/rename.py +3 -37
- maxframe/dataframe/indexing/sample.py +0 -1
- maxframe/dataframe/indexing/set_index.py +68 -1
- maxframe/dataframe/merge/merge.py +236 -2
- maxframe/dataframe/merge/tests/test_merge.py +123 -0
- maxframe/dataframe/misc/apply.py +3 -10
- maxframe/dataframe/misc/case_when.py +1 -1
- maxframe/dataframe/misc/describe.py +2 -2
- maxframe/dataframe/misc/drop_duplicates.py +4 -25
- maxframe/dataframe/misc/eval.py +4 -0
- maxframe/dataframe/misc/pct_change.py +1 -83
- maxframe/dataframe/misc/transform.py +1 -30
- maxframe/dataframe/misc/value_counts.py +4 -17
- maxframe/dataframe/missing/dropna.py +1 -1
- maxframe/dataframe/missing/fillna.py +5 -5
- maxframe/dataframe/sort/sort_values.py +1 -11
- maxframe/dataframe/statistics/quantile.py +5 -17
- maxframe/dataframe/utils.py +4 -7
- maxframe/learn/contrib/xgboost/dmatrix.py +2 -2
- maxframe/learn/contrib/xgboost/predict.py +2 -2
- maxframe/learn/contrib/xgboost/train.py +2 -2
- maxframe/lib/mmh3.cp311-win_amd64.pyd +0 -0
- maxframe/odpsio/__init__.py +1 -1
- maxframe/odpsio/arrow.py +8 -4
- maxframe/odpsio/schema.py +10 -7
- maxframe/odpsio/tableio.py +388 -14
- maxframe/odpsio/tests/test_schema.py +16 -15
- maxframe/odpsio/tests/test_tableio.py +48 -21
- maxframe/protocol.py +40 -2
- maxframe/serialization/core.cp311-win_amd64.pyd +0 -0
- maxframe/serialization/serializables/core.py +48 -9
- maxframe/tensor/__init__.py +59 -0
- maxframe/tensor/base/unique.py +2 -2
- maxframe/tensor/statistics/quantile.py +2 -2
- maxframe/tests/utils.py +11 -2
- maxframe/utils.py +17 -9
- {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc2.dist-info}/METADATA +74 -1
- {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc2.dist-info}/RECORD +64 -64
- {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc2.dist-info}/WHEEL +1 -1
- maxframe_client/fetcher.py +38 -27
- maxframe_client/session/odps.py +5 -5
- maxframe_client/tests/test_fetcher.py +21 -3
- maxframe_client/tests/test_session.py +13 -2
- {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc2.dist-info}/top_level.txt +0 -0
|
@@ -18,6 +18,7 @@ from ..utils import validate_axis
|
|
|
18
18
|
def pct_change(
|
|
19
19
|
df_or_series, periods=1, fill_method="pad", limit=None, freq=None, **kwargs
|
|
20
20
|
):
|
|
21
|
+
# FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/32
|
|
21
22
|
"""
|
|
22
23
|
Percentage change between the current and a prior element.
|
|
23
24
|
|
|
@@ -50,89 +51,6 @@ def pct_change(
|
|
|
50
51
|
DataFrame.diff : Compute the difference of two elements in a DataFrame.
|
|
51
52
|
Series.shift : Shift the index by some number of periods.
|
|
52
53
|
DataFrame.shift : Shift the index by some number of periods.
|
|
53
|
-
|
|
54
|
-
Examples
|
|
55
|
-
--------
|
|
56
|
-
**Series**
|
|
57
|
-
|
|
58
|
-
>>> import maxframe.dataframe as md
|
|
59
|
-
|
|
60
|
-
>>> s = md.Series([90, 91, 85])
|
|
61
|
-
>>> s.execute()
|
|
62
|
-
0 90
|
|
63
|
-
1 91
|
|
64
|
-
2 85
|
|
65
|
-
dtype: int64
|
|
66
|
-
|
|
67
|
-
>>> s.pct_change().execute()
|
|
68
|
-
0 NaN
|
|
69
|
-
1 0.011111
|
|
70
|
-
2 -0.065934
|
|
71
|
-
dtype: float64
|
|
72
|
-
|
|
73
|
-
>>> s.pct_change(periods=2).execute()
|
|
74
|
-
0 NaN
|
|
75
|
-
1 NaN
|
|
76
|
-
2 -0.055556
|
|
77
|
-
dtype: float64
|
|
78
|
-
|
|
79
|
-
See the percentage change in a Series where filling NAs with last
|
|
80
|
-
valid observation forward to next valid.
|
|
81
|
-
|
|
82
|
-
>>> s = md.Series([90, 91, None, 85])
|
|
83
|
-
>>> s.execute()
|
|
84
|
-
0 90.0
|
|
85
|
-
1 91.0
|
|
86
|
-
2 NaN
|
|
87
|
-
3 85.0
|
|
88
|
-
dtype: float64
|
|
89
|
-
|
|
90
|
-
>>> s.pct_change(fill_method='ffill').execute()
|
|
91
|
-
0 NaN
|
|
92
|
-
1 0.011111
|
|
93
|
-
2 0.000000
|
|
94
|
-
3 -0.065934
|
|
95
|
-
dtype: float64
|
|
96
|
-
|
|
97
|
-
**DataFrame**
|
|
98
|
-
|
|
99
|
-
Percentage change in French franc, Deutsche Mark, and Italian lira from
|
|
100
|
-
1980-01-01 to 1980-03-01.
|
|
101
|
-
|
|
102
|
-
>>> df = md.DataFrame({
|
|
103
|
-
... 'FR': [4.0405, 4.0963, 4.3149],
|
|
104
|
-
... 'GR': [1.7246, 1.7482, 1.8519],
|
|
105
|
-
... 'IT': [804.74, 810.01, 860.13]},
|
|
106
|
-
... index=['1980-01-01', '1980-02-01', '1980-03-01'])
|
|
107
|
-
>>> df.execute()
|
|
108
|
-
FR GR IT
|
|
109
|
-
1980-01-01 4.0405 1.7246 804.74
|
|
110
|
-
1980-02-01 4.0963 1.7482 810.01
|
|
111
|
-
1980-03-01 4.3149 1.8519 860.13
|
|
112
|
-
|
|
113
|
-
>>> df.pct_change().execute()
|
|
114
|
-
FR GR IT
|
|
115
|
-
1980-01-01 NaN NaN NaN
|
|
116
|
-
1980-02-01 0.013810 0.013684 0.006549
|
|
117
|
-
1980-03-01 0.053365 0.059318 0.061876
|
|
118
|
-
|
|
119
|
-
Percentage of change in GOOG and APPL stock volume. Shows computing
|
|
120
|
-
the percentage change between columns.
|
|
121
|
-
|
|
122
|
-
>>> df = md.DataFrame({
|
|
123
|
-
... '2016': [1769950, 30586265],
|
|
124
|
-
... '2015': [1500923, 40912316],
|
|
125
|
-
... '2014': [1371819, 41403351]},
|
|
126
|
-
... index=['GOOG', 'APPL'])
|
|
127
|
-
>>> df.execute()
|
|
128
|
-
2016 2015 2014
|
|
129
|
-
GOOG 1769950 1500923 1371819
|
|
130
|
-
APPL 30586265 40912316 41403351
|
|
131
|
-
|
|
132
|
-
>>> df.pct_change(axis='columns').execute()
|
|
133
|
-
2016 2015 2014
|
|
134
|
-
GOOG NaN -0.151997 -0.086016
|
|
135
|
-
APPL NaN 0.337604 0.012002
|
|
136
54
|
"""
|
|
137
55
|
|
|
138
56
|
axis = validate_axis(kwargs.pop("axis", 0))
|
|
@@ -228,21 +228,6 @@ def df_transform(df, func, axis=0, *args, dtypes=None, skip_infer=False, **kwarg
|
|
|
228
228
|
0 1 2
|
|
229
229
|
1 2 3
|
|
230
230
|
2 3 4
|
|
231
|
-
|
|
232
|
-
Even though the resulting DataFrame must have the same length as the
|
|
233
|
-
input DataFrame, it is possible to provide several input functions:
|
|
234
|
-
|
|
235
|
-
>>> s = md.Series(range(3))
|
|
236
|
-
>>> s.execute()
|
|
237
|
-
0 0
|
|
238
|
-
1 1
|
|
239
|
-
2 2
|
|
240
|
-
dtype: int64
|
|
241
|
-
>>> s.transform([mt.sqrt, mt.exp]).execute()
|
|
242
|
-
sqrt exp
|
|
243
|
-
0 0.000000 1.000000
|
|
244
|
-
1 1.000000 2.718282
|
|
245
|
-
2 1.414214 7.389056
|
|
246
231
|
"""
|
|
247
232
|
op = TransformOperator(
|
|
248
233
|
func=func,
|
|
@@ -265,6 +250,7 @@ def series_transform(
|
|
|
265
250
|
dtype=None,
|
|
266
251
|
**kwargs
|
|
267
252
|
):
|
|
253
|
+
# FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/10
|
|
268
254
|
"""
|
|
269
255
|
Call ``func`` on self producing a Series with transformed values.
|
|
270
256
|
|
|
@@ -332,21 +318,6 @@ def series_transform(
|
|
|
332
318
|
0 1 2
|
|
333
319
|
1 2 3
|
|
334
320
|
2 3 4
|
|
335
|
-
|
|
336
|
-
Even though the resulting Series must have the same length as the
|
|
337
|
-
input Series, it is possible to provide several input functions:
|
|
338
|
-
|
|
339
|
-
>>> s = md.Series(range(3))
|
|
340
|
-
>>> s.execute()
|
|
341
|
-
0 0
|
|
342
|
-
1 1
|
|
343
|
-
2 2
|
|
344
|
-
dtype: int64
|
|
345
|
-
>>> s.transform([mt.sqrt, mt.exp]).execute()
|
|
346
|
-
sqrt exp
|
|
347
|
-
0 0.000000 1.000000
|
|
348
|
-
1 1.000000 2.718282
|
|
349
|
-
2 1.414214 7.389056
|
|
350
321
|
"""
|
|
351
322
|
op = TransformOperator(
|
|
352
323
|
func=func,
|
|
@@ -85,6 +85,7 @@ def value_counts(
|
|
|
85
85
|
dropna=True,
|
|
86
86
|
method="auto",
|
|
87
87
|
):
|
|
88
|
+
# FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/33
|
|
88
89
|
"""
|
|
89
90
|
Return a Series containing counts of unique values.
|
|
90
91
|
|
|
@@ -125,9 +126,8 @@ def value_counts(
|
|
|
125
126
|
Examples
|
|
126
127
|
--------
|
|
127
128
|
>>> import maxframe.dataframe as md
|
|
128
|
-
>>> import maxframe.tensor as mt
|
|
129
|
-
|
|
130
|
-
>>> s = md.Series([3, 1, 2, 3, 4, mt.nan])
|
|
129
|
+
>>> import numpy as np
|
|
130
|
+
>>> s = md.Series([3, 1, 2, 3, 4, np.nan])
|
|
131
131
|
>>> s.value_counts().execute()
|
|
132
132
|
3.0 2
|
|
133
133
|
4.0 1
|
|
@@ -138,7 +138,7 @@ def value_counts(
|
|
|
138
138
|
With `normalize` set to `True`, returns the relative frequency by
|
|
139
139
|
dividing all values by the sum of values.
|
|
140
140
|
|
|
141
|
-
>>> s = md.Series([3, 1, 2, 3, 4, mt.nan])
|
|
141
|
+
>>> s = md.Series([3, 1, 2, 3, 4, np.nan])
|
|
142
142
|
>>> s.value_counts(normalize=True).execute()
|
|
143
143
|
3.0 0.4
|
|
144
144
|
4.0 0.2
|
|
@@ -146,19 +146,6 @@ def value_counts(
|
|
|
146
146
|
1.0 0.2
|
|
147
147
|
dtype: float64
|
|
148
148
|
|
|
149
|
-
**bins**
|
|
150
|
-
|
|
151
|
-
Bins can be useful for going from a continuous variable to a
|
|
152
|
-
categorical variable; instead of counting unique
|
|
153
|
-
apparitions of values, divide the index in the specified
|
|
154
|
-
number of half-open bins.
|
|
155
|
-
|
|
156
|
-
>>> s.value_counts(bins=3).execute()
|
|
157
|
-
(2.0, 3.0] 2
|
|
158
|
-
(0.996, 2.0] 2
|
|
159
|
-
(3.0, 4.0] 1
|
|
160
|
-
dtype: int64
|
|
161
|
-
|
|
162
149
|
**dropna**
|
|
163
150
|
|
|
164
151
|
With `dropna` set to `False` we can also see NaN index values.
|
|
@@ -234,7 +234,7 @@ def series_dropna(series, axis=0, inplace=False, how=None):
|
|
|
234
234
|
Empty strings are not considered NA values. ``None`` is considered an
|
|
235
235
|
NA value.
|
|
236
236
|
|
|
237
|
-
>>> ser = md.Series([np.NaN, 2, md.NaT, '', None, 'I stay'])
|
|
237
|
+
>>> ser = md.Series([np.NaN, '2', md.NaT, '', None, 'I stay'])
|
|
238
238
|
>>> ser.execute()
|
|
239
239
|
0 NaN
|
|
240
240
|
1 2
|
|
@@ -132,11 +132,11 @@ def fillna(
|
|
|
132
132
|
--------
|
|
133
133
|
>>> import maxframe.tensor as mt
|
|
134
134
|
>>> import maxframe.dataframe as md
|
|
135
|
-
>>> df = md.DataFrame([[
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
135
|
+
>>> df = md.DataFrame([[np.nan, 2, np.nan, 0],
|
|
136
|
+
[3, 4, np.nan, 1],
|
|
137
|
+
[np.nan, np.nan, np.nan, 5],
|
|
138
|
+
[np.nan, 3, np.nan, 4]],
|
|
139
|
+
columns=list('ABCD'))
|
|
140
140
|
>>> df.execute()
|
|
141
141
|
A B C D
|
|
142
142
|
0 NaN 2.0 NaN 0
|
|
@@ -67,6 +67,7 @@ def dataframe_sort_values(
|
|
|
67
67
|
parallel_kind="PSRS",
|
|
68
68
|
psrs_kinds=None,
|
|
69
69
|
):
|
|
70
|
+
# FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/15
|
|
70
71
|
"""
|
|
71
72
|
Sort by the values along either axis.
|
|
72
73
|
|
|
@@ -152,17 +153,6 @@ def dataframe_sort_values(
|
|
|
152
153
|
0 A 2 0
|
|
153
154
|
1 A 1 1
|
|
154
155
|
3 NaN 8 4
|
|
155
|
-
|
|
156
|
-
Putting NAs first
|
|
157
|
-
|
|
158
|
-
>>> df.sort_values(by='col1', ascending=False, na_position='first').execute()
|
|
159
|
-
col1 col2 col3
|
|
160
|
-
3 NaN 8 4
|
|
161
|
-
4 D 7 2
|
|
162
|
-
5 C 4 3
|
|
163
|
-
2 B 9 9
|
|
164
|
-
0 A 2 0
|
|
165
|
-
1 A 1 1
|
|
166
156
|
"""
|
|
167
157
|
|
|
168
158
|
if na_position not in ["last", "first"]: # pragma: no cover
|
|
@@ -14,8 +14,9 @@
|
|
|
14
14
|
|
|
15
15
|
import numpy as np
|
|
16
16
|
import pandas as pd
|
|
17
|
+
from pandas.core.dtypes.cast import find_common_type
|
|
17
18
|
|
|
18
|
-
from ... import opcodes
|
|
19
|
+
from ... import opcodes
|
|
19
20
|
from ...core import ENTITY_TYPE
|
|
20
21
|
from ...serialization.serializables import (
|
|
21
22
|
AnyField,
|
|
@@ -32,11 +33,11 @@ from ...tensor.datasource import tensor as astensor
|
|
|
32
33
|
from ...tensor.statistics.quantile import quantile as tensor_quantile
|
|
33
34
|
from ..core import DATAFRAME_TYPE
|
|
34
35
|
from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
35
|
-
from ..utils import build_empty_df, find_common_type, parse_index, validate_axis
|
|
36
|
+
from ..utils import build_empty_df, parse_index, validate_axis
|
|
36
37
|
|
|
37
38
|
|
|
38
39
|
class DataFrameQuantile(DataFrameOperator, DataFrameOperatorMixin):
|
|
39
|
-
_op_type_ =
|
|
40
|
+
_op_type_ = opcodes.QUANTILE
|
|
40
41
|
|
|
41
42
|
input = KeyField("input", default=None)
|
|
42
43
|
q = AnyField("q", default=None)
|
|
@@ -259,6 +260,7 @@ def quantile_series(series, q=0.5, interpolation="linear"):
|
|
|
259
260
|
|
|
260
261
|
|
|
261
262
|
def quantile_dataframe(df, q=0.5, axis=0, numeric_only=True, interpolation="linear"):
|
|
263
|
+
# FIXME: Timedelta is not supported. Data invalid: ODPS-0010000:InvalidArgument:duration[ns] is not equal to string
|
|
262
264
|
"""
|
|
263
265
|
Return values at the given quantile over requested axis.
|
|
264
266
|
|
|
@@ -309,20 +311,6 @@ def quantile_dataframe(df, q=0.5, axis=0, numeric_only=True, interpolation="line
|
|
|
309
311
|
a b
|
|
310
312
|
0.1 1.3 3.7
|
|
311
313
|
0.5 2.5 55.0
|
|
312
|
-
|
|
313
|
-
Specifying `numeric_only=False` will also compute the quantile of
|
|
314
|
-
datetime and timedelta data.
|
|
315
|
-
|
|
316
|
-
>>> df = md.DataFrame({'A': [1, 2],
|
|
317
|
-
... 'B': [md.Timestamp('2010'),
|
|
318
|
-
... md.Timestamp('2011')],
|
|
319
|
-
... 'C': [md.Timedelta('1 days'),
|
|
320
|
-
... md.Timedelta('2 days')]})
|
|
321
|
-
>>> df.quantile(0.5, numeric_only=False).execute()
|
|
322
|
-
A 1.5
|
|
323
|
-
B 2010-07-02 12:00:00
|
|
324
|
-
C 1 days 12:00:00
|
|
325
|
-
Name: 0.5, dtype: object
|
|
326
314
|
"""
|
|
327
315
|
if isinstance(q, ENTITY_TYPE):
|
|
328
316
|
q = astensor(q)
|
maxframe/dataframe/utils.py
CHANGED
|
@@ -26,7 +26,6 @@ import numpy as np
|
|
|
26
26
|
import pandas as pd
|
|
27
27
|
from pandas.api.extensions import ExtensionDtype
|
|
28
28
|
from pandas.api.types import is_string_dtype
|
|
29
|
-
from pandas.core.dtypes.cast import find_common_type
|
|
30
29
|
from pandas.core.dtypes.inference import is_dict_like, is_list_like
|
|
31
30
|
|
|
32
31
|
from ..core import Entity, ExecutableTuple
|
|
@@ -477,11 +476,11 @@ def build_df(df_obj, fill_value=1, size=1, ensure_string=False):
|
|
|
477
476
|
else:
|
|
478
477
|
fill_values = fill_value
|
|
479
478
|
|
|
480
|
-
from .core import SERIES_TYPE
|
|
479
|
+
from .core import INDEX_TYPE, SERIES_TYPE
|
|
481
480
|
|
|
482
481
|
dtypes = (
|
|
483
482
|
pd.Series([df_obj.dtype], index=[df_obj.name])
|
|
484
|
-
if isinstance(df_obj, SERIES_TYPE)
|
|
483
|
+
if isinstance(df_obj, (INDEX_TYPE, SERIES_TYPE))
|
|
485
484
|
else df_obj.dtypes
|
|
486
485
|
)
|
|
487
486
|
for size, fill_value in zip(sizes, fill_values):
|
|
@@ -593,7 +592,7 @@ def build_series(
|
|
|
593
592
|
return ret_series
|
|
594
593
|
|
|
595
594
|
|
|
596
|
-
def infer_index_value(left_index_value, right_index_value):
|
|
595
|
+
def infer_index_value(left_index_value, right_index_value, level=None):
|
|
597
596
|
from .core import IndexValue
|
|
598
597
|
|
|
599
598
|
if isinstance(left_index_value.value, IndexValue.RangeIndex) and isinstance(
|
|
@@ -616,9 +615,7 @@ def infer_index_value(left_index_value, right_index_value):
|
|
|
616
615
|
|
|
617
616
|
left_index = left_index_value.to_pandas()
|
|
618
617
|
right_index = right_index_value.to_pandas()
|
|
619
|
-
out_index = pd.Index(
|
|
620
|
-
[], dtype=find_common_type([left_index.dtype, right_index.dtype])
|
|
621
|
-
)
|
|
618
|
+
out_index = left_index.join(right_index, level=level)[:0]
|
|
622
619
|
return parse_index(out_index, left_index_value, right_index_value)
|
|
623
620
|
|
|
624
621
|
|
|
@@ -13,7 +13,7 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
15
|
|
|
16
|
-
from .... import opcodes
|
|
16
|
+
from .... import opcodes
|
|
17
17
|
from ....core.entity.output_types import get_output_types
|
|
18
18
|
from ....core.operator.base import Operator
|
|
19
19
|
from ....core.operator.core import TileableOperatorMixin
|
|
@@ -27,7 +27,7 @@ from ...utils import convert_to_tensor_or_dataframe
|
|
|
27
27
|
|
|
28
28
|
|
|
29
29
|
class ToDMatrix(Operator, TileableOperatorMixin):
|
|
30
|
-
_op_type_ =
|
|
30
|
+
_op_type_ = opcodes.TO_DMATRIX
|
|
31
31
|
|
|
32
32
|
data = KeyField("data", default=None)
|
|
33
33
|
label = KeyField("label", default=None)
|
|
@@ -17,7 +17,7 @@ import pickle
|
|
|
17
17
|
import numpy as np
|
|
18
18
|
import pandas as pd
|
|
19
19
|
|
|
20
|
-
from .... import opcodes
|
|
20
|
+
from .... import opcodes
|
|
21
21
|
from ....core.entity.output_types import OutputType
|
|
22
22
|
from ....core.operator.base import Operator
|
|
23
23
|
from ....core.operator.core import TileableOperatorMixin
|
|
@@ -28,7 +28,7 @@ from .dmatrix import check_data
|
|
|
28
28
|
|
|
29
29
|
|
|
30
30
|
class XGBPredict(Operator, TileableOperatorMixin):
|
|
31
|
-
_op_type_ =
|
|
31
|
+
_op_type_ = opcodes.XGBOOST_PREDICT
|
|
32
32
|
output_dtype = np.dtype(np.float32)
|
|
33
33
|
|
|
34
34
|
data = KeyField("data", default=None)
|
|
@@ -15,7 +15,7 @@
|
|
|
15
15
|
import logging
|
|
16
16
|
from collections import OrderedDict
|
|
17
17
|
|
|
18
|
-
from .... import opcodes
|
|
18
|
+
from .... import opcodes
|
|
19
19
|
from ....core import OutputType
|
|
20
20
|
from ....core.operator.base import Operator
|
|
21
21
|
from ....core.operator.core import TileableOperatorMixin
|
|
@@ -41,7 +41,7 @@ def _on_serialize_evals(evals_val):
|
|
|
41
41
|
|
|
42
42
|
|
|
43
43
|
class XGBTrain(Operator, TileableOperatorMixin):
|
|
44
|
-
_op_type_ =
|
|
44
|
+
_op_type_ = opcodes.XGBOOST_TRAIN
|
|
45
45
|
|
|
46
46
|
params = DictField("params", key_type=FieldTypes.string, default=None)
|
|
47
47
|
dtrain = KeyField("dtrain", default=None)
|
|
Binary file
|
maxframe/odpsio/__init__.py
CHANGED
maxframe/odpsio/arrow.py
CHANGED
|
@@ -45,9 +45,13 @@ def _rebuild_dataframe(
|
|
|
45
45
|
|
|
46
46
|
def _rebuild_index(df: pd.DataFrame, table_meta: DataFrameTableMeta) -> pd.Index:
|
|
47
47
|
if df.shape[1] > 1:
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
48
|
+
idx = pd.MultiIndex.from_frame(df)
|
|
49
|
+
idx.names = table_meta.pd_index_level_names
|
|
50
|
+
else:
|
|
51
|
+
# make sure names are updated properly even when they are None
|
|
52
|
+
idx = pd.Index(df.iloc[:, 0])
|
|
53
|
+
idx.name = table_meta.pd_index_level_names[0]
|
|
54
|
+
return idx
|
|
51
55
|
|
|
52
56
|
|
|
53
57
|
def arrow_to_pandas(
|
|
@@ -75,7 +79,7 @@ def pandas_to_arrow(
|
|
|
75
79
|
df.columns = pd.Index(table_meta.table_column_names)
|
|
76
80
|
if not ignore_index:
|
|
77
81
|
df = df.rename_axis(table_meta.table_index_column_names).reset_index()
|
|
78
|
-
elif ignore_index:
|
|
82
|
+
elif ignore_index and table_meta.type != OutputType.index:
|
|
79
83
|
df = pd.DataFrame([], columns=[])
|
|
80
84
|
elif table_meta.type == OutputType.index:
|
|
81
85
|
names = [f"_idx_{idx}" for idx in range(len(df.names))]
|
maxframe/odpsio/schema.py
CHANGED
|
@@ -126,10 +126,15 @@ def odps_type_to_arrow_type(
|
|
|
126
126
|
]
|
|
127
127
|
col_type = pa.struct(fields)
|
|
128
128
|
elif isinstance(odps_type, odps_types.Decimal):
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
129
|
+
if odps_type.name == "decimal":
|
|
130
|
+
# legacy decimal data without precision or scale
|
|
131
|
+
# precision data from internal compat mode
|
|
132
|
+
col_type = pa.decimal128(38, 18)
|
|
133
|
+
else:
|
|
134
|
+
col_type = pa.decimal128(
|
|
135
|
+
odps_type.precision or odps_types.Decimal._max_precision,
|
|
136
|
+
odps_type.scale or odps_types.Decimal._max_scale,
|
|
137
|
+
)
|
|
133
138
|
elif isinstance(odps_type, (odps_types.Varchar, odps_types.Char)):
|
|
134
139
|
col_type = pa.string()
|
|
135
140
|
else:
|
|
@@ -289,8 +294,6 @@ def build_dataframe_table_meta(
|
|
|
289
294
|
else: # pragma: no cover
|
|
290
295
|
raise TypeError(f"Cannot accept type {type(df_obj)}")
|
|
291
296
|
|
|
292
|
-
assert not ignore_index or obj_type in (OutputType.dataframe, OutputType.series)
|
|
293
|
-
|
|
294
297
|
if obj_type == OutputType.scalar:
|
|
295
298
|
pd_dtypes = pd.Series([])
|
|
296
299
|
column_index_names = []
|
|
@@ -346,7 +349,7 @@ def build_dataframe_table_meta(
|
|
|
346
349
|
else:
|
|
347
350
|
index_dtypes = pd.Series([pd_index_val.dtype], index=pd_index_val.names)
|
|
348
351
|
|
|
349
|
-
if ignore_index:
|
|
352
|
+
if ignore_index and obj_type != OutputType.index:
|
|
350
353
|
table_index_column_names = []
|
|
351
354
|
pd_index_dtypes = pd.Series([], index=[])
|
|
352
355
|
else:
|