maxframe 2.0.0b2__cp37-cp37m-win32.whl → 2.3.0rc1__cp37-cp37m-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/__init__.py +1 -0
- maxframe/_utils.cp37-win32.pyd +0 -0
- maxframe/_utils.pyx +14 -1
- maxframe/codegen/core.py +9 -8
- maxframe/codegen/spe/core.py +1 -1
- maxframe/codegen/spe/dataframe/__init__.py +1 -0
- maxframe/codegen/spe/dataframe/accessors/base.py +18 -0
- maxframe/codegen/spe/dataframe/accessors/dict_.py +25 -130
- maxframe/codegen/spe/dataframe/accessors/list_.py +12 -48
- maxframe/codegen/spe/dataframe/accessors/struct_.py +28 -0
- maxframe/codegen/spe/dataframe/arithmetic.py +7 -2
- maxframe/codegen/spe/dataframe/groupby.py +88 -0
- maxframe/codegen/spe/dataframe/indexing.py +99 -4
- maxframe/codegen/spe/dataframe/merge.py +38 -1
- maxframe/codegen/spe/dataframe/misc.py +11 -33
- maxframe/codegen/spe/dataframe/reduction.py +32 -9
- maxframe/codegen/spe/dataframe/reshape.py +46 -0
- maxframe/codegen/spe/dataframe/sort.py +39 -18
- maxframe/codegen/spe/dataframe/tests/accessors/test_dict.py +9 -15
- maxframe/codegen/spe/dataframe/tests/accessors/test_list.py +4 -7
- maxframe/codegen/spe/dataframe/tests/accessors/test_struct.py +75 -0
- maxframe/codegen/spe/dataframe/tests/indexing/test_iloc.py +20 -1
- maxframe/codegen/spe/dataframe/tests/indexing/test_loc.py +35 -0
- maxframe/codegen/spe/dataframe/tests/misc/test_misc.py +0 -32
- maxframe/codegen/spe/dataframe/tests/test_groupby.py +81 -18
- maxframe/codegen/spe/dataframe/tests/test_merge.py +27 -1
- maxframe/codegen/spe/dataframe/tests/test_reduction.py +13 -0
- maxframe/codegen/spe/dataframe/tests/test_reshape.py +79 -0
- maxframe/codegen/spe/dataframe/tests/test_sort.py +20 -0
- maxframe/codegen/spe/dataframe/tseries.py +9 -0
- maxframe/codegen/spe/learn/contrib/lightgbm.py +4 -3
- maxframe/codegen/spe/learn/contrib/tests/test_xgboost.py +2 -1
- maxframe/codegen/spe/learn/metrics/__init__.py +1 -1
- maxframe/codegen/spe/learn/metrics/_ranking.py +76 -0
- maxframe/codegen/spe/learn/metrics/pairwise.py +51 -0
- maxframe/codegen/spe/learn/metrics/tests/test_pairwise.py +36 -0
- maxframe/codegen/spe/learn/metrics/tests/test_ranking.py +59 -0
- maxframe/codegen/spe/tensor/__init__.py +3 -0
- maxframe/codegen/spe/tensor/datasource.py +1 -0
- maxframe/codegen/spe/tensor/fft.py +74 -0
- maxframe/codegen/spe/tensor/linalg.py +29 -2
- maxframe/codegen/spe/tensor/misc.py +79 -25
- maxframe/codegen/spe/tensor/spatial.py +45 -0
- maxframe/codegen/spe/tensor/statistics.py +44 -0
- maxframe/codegen/spe/tensor/tests/test_fft.py +64 -0
- maxframe/codegen/spe/tensor/tests/test_linalg.py +15 -1
- maxframe/codegen/spe/tensor/tests/test_misc.py +52 -2
- maxframe/codegen/spe/tensor/tests/test_spatial.py +33 -0
- maxframe/codegen/spe/tensor/tests/test_statistics.py +15 -1
- maxframe/codegen/spe/tests/test_spe_codegen.py +6 -12
- maxframe/codegen/spe/utils.py +2 -0
- maxframe/config/config.py +73 -9
- maxframe/config/tests/test_validators.py +13 -1
- maxframe/config/validators.py +49 -0
- maxframe/conftest.py +54 -17
- maxframe/core/accessor.py +2 -2
- maxframe/core/base.py +2 -1
- maxframe/core/entity/core.py +5 -0
- maxframe/core/entity/tileables.py +3 -1
- maxframe/core/graph/core.cp37-win32.pyd +0 -0
- maxframe/core/graph/entity.py +8 -3
- maxframe/core/mode.py +6 -1
- maxframe/core/operator/base.py +9 -2
- maxframe/core/operator/core.py +10 -2
- maxframe/core/operator/utils.py +13 -0
- maxframe/dataframe/__init__.py +12 -5
- maxframe/dataframe/accessors/__init__.py +1 -1
- maxframe/dataframe/accessors/compat.py +45 -0
- maxframe/dataframe/accessors/datetime_/__init__.py +4 -1
- maxframe/dataframe/accessors/dict_/contains.py +7 -16
- maxframe/dataframe/accessors/dict_/core.py +48 -0
- maxframe/dataframe/accessors/dict_/getitem.py +17 -21
- maxframe/dataframe/accessors/dict_/length.py +7 -16
- maxframe/dataframe/accessors/dict_/remove.py +6 -18
- maxframe/dataframe/accessors/dict_/setitem.py +8 -18
- maxframe/dataframe/accessors/dict_/tests/test_dict_accessor.py +62 -22
- maxframe/dataframe/accessors/list_/__init__.py +2 -2
- maxframe/dataframe/accessors/list_/core.py +48 -0
- maxframe/dataframe/accessors/list_/getitem.py +12 -19
- maxframe/dataframe/accessors/list_/length.py +7 -16
- maxframe/dataframe/accessors/list_/tests/test_list_accessor.py +11 -9
- maxframe/dataframe/accessors/string_/__init__.py +4 -1
- maxframe/dataframe/accessors/struct_/__init__.py +37 -0
- maxframe/dataframe/accessors/struct_/accessor.py +39 -0
- maxframe/dataframe/accessors/struct_/core.py +43 -0
- maxframe/dataframe/accessors/struct_/dtypes.py +53 -0
- maxframe/dataframe/accessors/struct_/field.py +123 -0
- maxframe/dataframe/accessors/struct_/tests/__init__.py +13 -0
- maxframe/dataframe/accessors/struct_/tests/test_struct_accessor.py +91 -0
- maxframe/dataframe/arithmetic/__init__.py +18 -4
- maxframe/dataframe/arithmetic/between.py +106 -0
- maxframe/dataframe/arithmetic/dot.py +237 -0
- maxframe/dataframe/arithmetic/maximum.py +33 -0
- maxframe/dataframe/arithmetic/minimum.py +33 -0
- maxframe/dataframe/arithmetic/{around.py → round.py} +11 -7
- maxframe/dataframe/core.py +161 -224
- maxframe/dataframe/datasource/__init__.py +18 -0
- maxframe/dataframe/datasource/core.py +6 -0
- maxframe/dataframe/datasource/direct.py +57 -0
- maxframe/dataframe/datasource/from_dict.py +124 -0
- maxframe/dataframe/datasource/from_index.py +1 -1
- maxframe/dataframe/datasource/from_records.py +77 -0
- maxframe/dataframe/datasource/from_tensor.py +109 -41
- maxframe/dataframe/datasource/read_csv.py +21 -14
- maxframe/dataframe/datasource/read_odps_query.py +29 -6
- maxframe/dataframe/datasource/read_odps_table.py +32 -10
- maxframe/dataframe/datasource/read_parquet.py +38 -39
- maxframe/dataframe/datasource/tests/test_datasource.py +37 -0
- maxframe/dataframe/datastore/__init__.py +11 -1
- maxframe/dataframe/datastore/direct.py +268 -0
- maxframe/dataframe/datastore/to_csv.py +29 -41
- maxframe/dataframe/datastore/to_odps.py +36 -4
- maxframe/dataframe/extensions/__init__.py +20 -4
- maxframe/dataframe/extensions/apply_chunk.py +32 -6
- maxframe/dataframe/extensions/cartesian_chunk.py +153 -0
- maxframe/dataframe/extensions/collect_kv.py +126 -0
- maxframe/dataframe/extensions/extract_kv.py +177 -0
- maxframe/dataframe/extensions/flatjson.py +2 -1
- maxframe/dataframe/extensions/map_reduce.py +263 -0
- maxframe/dataframe/extensions/rebalance.py +62 -0
- maxframe/dataframe/extensions/tests/test_apply_chunk.py +9 -2
- maxframe/dataframe/extensions/tests/test_extensions.py +54 -0
- maxframe/dataframe/extensions/tests/test_map_reduce.py +135 -0
- maxframe/dataframe/groupby/__init__.py +17 -2
- maxframe/dataframe/groupby/aggregation.py +86 -49
- maxframe/dataframe/groupby/apply.py +1 -1
- maxframe/dataframe/groupby/apply_chunk.py +19 -5
- maxframe/dataframe/groupby/core.py +116 -16
- maxframe/dataframe/groupby/cum.py +4 -25
- maxframe/dataframe/groupby/expanding.py +264 -0
- maxframe/dataframe/groupby/fill.py +1 -1
- maxframe/dataframe/groupby/getitem.py +12 -5
- maxframe/dataframe/groupby/head.py +11 -1
- maxframe/dataframe/groupby/rank.py +136 -0
- maxframe/dataframe/groupby/rolling.py +206 -0
- maxframe/dataframe/groupby/shift.py +114 -0
- maxframe/dataframe/groupby/tests/test_groupby.py +0 -5
- maxframe/dataframe/indexing/__init__.py +22 -2
- maxframe/dataframe/indexing/droplevel.py +195 -0
- maxframe/dataframe/indexing/filter.py +169 -0
- maxframe/dataframe/indexing/get_level_values.py +76 -0
- maxframe/dataframe/indexing/iat.py +45 -0
- maxframe/dataframe/indexing/iloc.py +152 -12
- maxframe/dataframe/indexing/insert.py +46 -18
- maxframe/dataframe/indexing/loc.py +287 -7
- maxframe/dataframe/indexing/reindex.py +14 -5
- maxframe/dataframe/indexing/rename.py +6 -0
- maxframe/dataframe/indexing/rename_axis.py +2 -2
- maxframe/dataframe/indexing/reorder_levels.py +143 -0
- maxframe/dataframe/indexing/reset_index.py +33 -6
- maxframe/dataframe/indexing/sample.py +8 -0
- maxframe/dataframe/indexing/setitem.py +3 -3
- maxframe/dataframe/indexing/swaplevel.py +185 -0
- maxframe/dataframe/indexing/take.py +99 -0
- maxframe/dataframe/indexing/truncate.py +140 -0
- maxframe/dataframe/indexing/where.py +0 -11
- maxframe/dataframe/indexing/xs.py +148 -0
- maxframe/dataframe/merge/__init__.py +15 -1
- maxframe/dataframe/merge/append.py +97 -98
- maxframe/dataframe/merge/combine.py +244 -0
- maxframe/dataframe/merge/combine_first.py +120 -0
- maxframe/dataframe/merge/compare.py +387 -0
- maxframe/dataframe/merge/concat.py +183 -0
- maxframe/dataframe/merge/update.py +271 -0
- maxframe/dataframe/misc/__init__.py +28 -11
- maxframe/dataframe/misc/_duplicate.py +10 -4
- maxframe/dataframe/misc/apply.py +1 -1
- maxframe/dataframe/misc/check_unique.py +82 -0
- maxframe/dataframe/misc/clip.py +145 -0
- maxframe/dataframe/misc/describe.py +175 -9
- maxframe/dataframe/misc/drop.py +31 -0
- maxframe/dataframe/misc/drop_duplicates.py +2 -2
- maxframe/dataframe/misc/duplicated.py +2 -2
- maxframe/dataframe/misc/get_dummies.py +5 -1
- maxframe/dataframe/misc/infer_dtypes.py +251 -0
- maxframe/dataframe/misc/isin.py +2 -2
- maxframe/dataframe/misc/map.py +125 -18
- maxframe/dataframe/misc/repeat.py +159 -0
- maxframe/dataframe/misc/tests/test_misc.py +48 -3
- maxframe/dataframe/misc/to_numeric.py +3 -0
- maxframe/dataframe/misc/transform.py +12 -5
- maxframe/dataframe/misc/transpose.py +13 -1
- maxframe/dataframe/misc/valid_index.py +115 -0
- maxframe/dataframe/misc/value_counts.py +38 -4
- maxframe/dataframe/missing/checkna.py +14 -6
- maxframe/dataframe/missing/dropna.py +5 -0
- maxframe/dataframe/missing/fillna.py +1 -1
- maxframe/dataframe/missing/replace.py +7 -4
- maxframe/dataframe/reduction/__init__.py +35 -16
- maxframe/dataframe/reduction/aggregation.py +43 -14
- maxframe/dataframe/reduction/all.py +2 -2
- maxframe/dataframe/reduction/any.py +2 -2
- maxframe/dataframe/reduction/argmax.py +103 -0
- maxframe/dataframe/reduction/argmin.py +103 -0
- maxframe/dataframe/reduction/core.py +80 -24
- maxframe/dataframe/reduction/count.py +13 -9
- maxframe/dataframe/reduction/cov.py +166 -0
- maxframe/dataframe/reduction/cummax.py +2 -2
- maxframe/dataframe/reduction/cummin.py +2 -2
- maxframe/dataframe/reduction/cumprod.py +2 -2
- maxframe/dataframe/reduction/cumsum.py +2 -2
- maxframe/dataframe/reduction/custom_reduction.py +2 -2
- maxframe/dataframe/reduction/idxmax.py +185 -0
- maxframe/dataframe/reduction/idxmin.py +185 -0
- maxframe/dataframe/reduction/kurtosis.py +37 -30
- maxframe/dataframe/reduction/max.py +2 -2
- maxframe/dataframe/reduction/mean.py +9 -7
- maxframe/dataframe/reduction/median.py +2 -2
- maxframe/dataframe/reduction/min.py +2 -2
- maxframe/dataframe/reduction/mode.py +144 -0
- maxframe/dataframe/reduction/nunique.py +19 -11
- maxframe/dataframe/reduction/prod.py +18 -13
- maxframe/dataframe/reduction/reduction_size.py +2 -2
- maxframe/dataframe/reduction/sem.py +13 -9
- maxframe/dataframe/reduction/skew.py +31 -27
- maxframe/dataframe/reduction/str_concat.py +10 -7
- maxframe/dataframe/reduction/sum.py +18 -14
- maxframe/dataframe/reduction/tests/test_reduction.py +12 -0
- maxframe/dataframe/reduction/unique.py +20 -3
- maxframe/dataframe/reduction/var.py +16 -12
- maxframe/dataframe/reshape/__init__.py +38 -0
- maxframe/dataframe/{misc → reshape}/pivot.py +1 -0
- maxframe/dataframe/{misc → reshape}/pivot_table.py +1 -0
- maxframe/dataframe/reshape/unstack.py +114 -0
- maxframe/dataframe/sort/__init__.py +16 -1
- maxframe/dataframe/sort/argsort.py +68 -0
- maxframe/dataframe/sort/core.py +2 -1
- maxframe/dataframe/sort/nlargest.py +238 -0
- maxframe/dataframe/sort/nsmallest.py +228 -0
- maxframe/dataframe/sort/rank.py +147 -0
- maxframe/dataframe/statistics/__init__.py +3 -3
- maxframe/dataframe/statistics/corr.py +1 -0
- maxframe/dataframe/statistics/quantile.py +2 -2
- maxframe/dataframe/tests/test_typing.py +104 -0
- maxframe/dataframe/tests/test_utils.py +66 -2
- maxframe/dataframe/tseries/__init__.py +19 -0
- maxframe/dataframe/tseries/at_time.py +61 -0
- maxframe/dataframe/tseries/between_time.py +122 -0
- maxframe/dataframe/typing_.py +185 -0
- maxframe/dataframe/utils.py +125 -52
- maxframe/dataframe/window/aggregation.py +8 -4
- maxframe/dataframe/window/core.py +14 -1
- maxframe/dataframe/window/ewm.py +1 -3
- maxframe/dataframe/window/expanding.py +37 -35
- maxframe/dataframe/window/rolling.py +49 -39
- maxframe/dataframe/window/tests/test_expanding.py +1 -7
- maxframe/dataframe/window/tests/test_rolling.py +1 -1
- maxframe/env.py +7 -4
- maxframe/errors.py +2 -2
- maxframe/io/odpsio/schema.py +9 -3
- maxframe/io/odpsio/tableio.py +7 -2
- maxframe/io/odpsio/tests/test_schema.py +198 -83
- maxframe/learn/__init__.py +10 -2
- maxframe/learn/cluster/__init__.py +15 -0
- maxframe/learn/cluster/_kmeans.py +782 -0
- maxframe/learn/contrib/llm/core.py +18 -7
- maxframe/learn/contrib/llm/deploy/__init__.py +13 -0
- maxframe/learn/contrib/llm/deploy/config.py +221 -0
- maxframe/learn/contrib/llm/deploy/core.py +247 -0
- maxframe/learn/contrib/llm/deploy/framework.py +35 -0
- maxframe/learn/contrib/llm/deploy/loader.py +360 -0
- maxframe/learn/contrib/llm/deploy/tests/__init__.py +13 -0
- maxframe/learn/contrib/llm/deploy/tests/test_register_models.py +359 -0
- maxframe/learn/contrib/llm/models/__init__.py +1 -0
- maxframe/learn/contrib/llm/models/dashscope.py +12 -6
- maxframe/learn/contrib/llm/models/managed.py +76 -11
- maxframe/learn/contrib/llm/models/openai.py +72 -0
- maxframe/learn/contrib/llm/tests/__init__.py +13 -0
- maxframe/learn/contrib/llm/tests/test_core.py +34 -0
- maxframe/learn/contrib/llm/tests/test_openai.py +187 -0
- maxframe/learn/contrib/llm/tests/test_text_gen.py +155 -0
- maxframe/learn/contrib/llm/text.py +348 -42
- maxframe/learn/contrib/models.py +4 -1
- maxframe/learn/contrib/xgboost/classifier.py +2 -0
- maxframe/learn/contrib/xgboost/core.py +113 -4
- maxframe/learn/contrib/xgboost/predict.py +4 -2
- maxframe/learn/contrib/xgboost/regressor.py +5 -0
- maxframe/learn/contrib/xgboost/train.py +7 -2
- maxframe/learn/core.py +66 -0
- maxframe/learn/linear_model/_base.py +58 -1
- maxframe/learn/linear_model/_lin_reg.py +1 -1
- maxframe/learn/metrics/__init__.py +6 -0
- maxframe/learn/metrics/_classification.py +145 -0
- maxframe/learn/metrics/_ranking.py +477 -0
- maxframe/learn/metrics/_scorer.py +60 -0
- maxframe/learn/metrics/pairwise/__init__.py +21 -0
- maxframe/learn/metrics/pairwise/core.py +77 -0
- maxframe/learn/metrics/pairwise/cosine.py +115 -0
- maxframe/learn/metrics/pairwise/euclidean.py +176 -0
- maxframe/learn/metrics/pairwise/haversine.py +96 -0
- maxframe/learn/metrics/pairwise/manhattan.py +80 -0
- maxframe/learn/metrics/pairwise/pairwise.py +127 -0
- maxframe/learn/metrics/pairwise/pairwise_distances_topk.py +121 -0
- maxframe/learn/metrics/pairwise/rbf_kernel.py +51 -0
- maxframe/learn/metrics/tests/__init__.py +13 -0
- maxframe/learn/metrics/tests/test_scorer.py +26 -0
- maxframe/learn/preprocessing/_data/min_max_scaler.py +34 -23
- maxframe/learn/preprocessing/_data/standard_scaler.py +34 -25
- maxframe/learn/utils/__init__.py +2 -1
- maxframe/learn/utils/checks.py +1 -2
- maxframe/learn/utils/core.py +59 -0
- maxframe/learn/utils/extmath.py +79 -9
- maxframe/learn/utils/odpsio.py +262 -0
- maxframe/learn/utils/validation.py +2 -2
- maxframe/lib/compat.py +40 -0
- maxframe/lib/dtypes_extension/__init__.py +16 -1
- maxframe/lib/dtypes_extension/_fake_arrow_dtype.py +604 -0
- maxframe/lib/dtypes_extension/blob.py +304 -0
- maxframe/lib/dtypes_extension/dtypes.py +40 -0
- maxframe/lib/dtypes_extension/tests/test_blob.py +88 -0
- maxframe/lib/dtypes_extension/tests/test_dtypes.py +16 -1
- maxframe/lib/dtypes_extension/tests/test_fake_arrow_dtype.py +75 -0
- maxframe/lib/filesystem/_oss_lib/common.py +124 -50
- maxframe/lib/filesystem/_oss_lib/glob.py +1 -1
- maxframe/lib/filesystem/_oss_lib/handle.py +21 -25
- maxframe/lib/filesystem/base.py +1 -1
- maxframe/lib/filesystem/core.py +1 -1
- maxframe/lib/filesystem/oss.py +115 -46
- maxframe/lib/filesystem/tests/test_oss.py +74 -36
- maxframe/lib/mmh3.cp37-win32.pyd +0 -0
- maxframe/lib/wrapped_pickle.py +10 -0
- maxframe/opcodes.py +41 -15
- maxframe/protocol.py +12 -0
- maxframe/remote/core.py +4 -0
- maxframe/serialization/__init__.py +11 -2
- maxframe/serialization/arrow.py +38 -13
- maxframe/serialization/blob.py +32 -0
- maxframe/serialization/core.cp37-win32.pyd +0 -0
- maxframe/serialization/core.pyx +39 -1
- maxframe/serialization/exception.py +2 -4
- maxframe/serialization/numpy.py +11 -0
- maxframe/serialization/pandas.py +46 -9
- maxframe/serialization/serializables/core.py +2 -2
- maxframe/serialization/tests/test_serial.py +31 -4
- maxframe/tensor/__init__.py +38 -8
- maxframe/tensor/arithmetic/__init__.py +19 -10
- maxframe/tensor/arithmetic/core.py +2 -2
- maxframe/tensor/arithmetic/iscomplexobj.py +53 -0
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +6 -9
- maxframe/tensor/core.py +6 -2
- maxframe/tensor/datasource/tests/test_datasource.py +2 -1
- maxframe/tensor/extensions/__init__.py +2 -0
- maxframe/tensor/extensions/apply_chunk.py +3 -3
- maxframe/tensor/extensions/rebalance.py +65 -0
- maxframe/tensor/fft/__init__.py +32 -0
- maxframe/tensor/fft/core.py +168 -0
- maxframe/tensor/fft/fft.py +112 -0
- maxframe/tensor/fft/fft2.py +118 -0
- maxframe/tensor/fft/fftfreq.py +80 -0
- maxframe/tensor/fft/fftn.py +123 -0
- maxframe/tensor/fft/fftshift.py +79 -0
- maxframe/tensor/fft/hfft.py +112 -0
- maxframe/tensor/fft/ifft.py +114 -0
- maxframe/tensor/fft/ifft2.py +115 -0
- maxframe/tensor/fft/ifftn.py +123 -0
- maxframe/tensor/fft/ifftshift.py +73 -0
- maxframe/tensor/fft/ihfft.py +93 -0
- maxframe/tensor/fft/irfft.py +118 -0
- maxframe/tensor/fft/irfft2.py +62 -0
- maxframe/tensor/fft/irfftn.py +114 -0
- maxframe/tensor/fft/rfft.py +116 -0
- maxframe/tensor/fft/rfft2.py +63 -0
- maxframe/tensor/fft/rfftfreq.py +87 -0
- maxframe/tensor/fft/rfftn.py +113 -0
- maxframe/tensor/indexing/fill_diagonal.py +1 -7
- maxframe/tensor/linalg/__init__.py +7 -0
- maxframe/tensor/linalg/_einsumfunc.py +1025 -0
- maxframe/tensor/linalg/cholesky.py +117 -0
- maxframe/tensor/linalg/einsum.py +339 -0
- maxframe/tensor/linalg/lstsq.py +100 -0
- maxframe/tensor/linalg/matrix_norm.py +75 -0
- maxframe/tensor/linalg/norm.py +249 -0
- maxframe/tensor/linalg/solve.py +72 -0
- maxframe/tensor/linalg/solve_triangular.py +2 -2
- maxframe/tensor/linalg/vector_norm.py +113 -0
- maxframe/tensor/misc/__init__.py +24 -1
- maxframe/tensor/misc/argwhere.py +72 -0
- maxframe/tensor/misc/array_split.py +46 -0
- maxframe/tensor/misc/broadcast_arrays.py +57 -0
- maxframe/tensor/misc/copyto.py +130 -0
- maxframe/tensor/misc/delete.py +104 -0
- maxframe/tensor/misc/dsplit.py +68 -0
- maxframe/tensor/misc/ediff1d.py +74 -0
- maxframe/tensor/misc/expand_dims.py +85 -0
- maxframe/tensor/misc/flip.py +90 -0
- maxframe/tensor/misc/fliplr.py +64 -0
- maxframe/tensor/misc/flipud.py +68 -0
- maxframe/tensor/misc/hsplit.py +85 -0
- maxframe/tensor/misc/insert.py +139 -0
- maxframe/tensor/misc/moveaxis.py +83 -0
- maxframe/tensor/misc/result_type.py +88 -0
- maxframe/tensor/misc/roll.py +124 -0
- maxframe/tensor/misc/rollaxis.py +77 -0
- maxframe/tensor/misc/shape.py +89 -0
- maxframe/tensor/misc/split.py +190 -0
- maxframe/tensor/misc/tile.py +109 -0
- maxframe/tensor/misc/vsplit.py +74 -0
- maxframe/tensor/reduction/array_equal.py +2 -1
- maxframe/tensor/sort/__init__.py +2 -0
- maxframe/tensor/sort/argpartition.py +98 -0
- maxframe/tensor/sort/partition.py +228 -0
- maxframe/tensor/spatial/__init__.py +15 -0
- maxframe/tensor/spatial/distance/__init__.py +17 -0
- maxframe/tensor/spatial/distance/cdist.py +421 -0
- maxframe/tensor/spatial/distance/pdist.py +398 -0
- maxframe/tensor/spatial/distance/squareform.py +153 -0
- maxframe/tensor/special/__init__.py +159 -21
- maxframe/tensor/special/airy.py +55 -0
- maxframe/tensor/special/bessel.py +199 -0
- maxframe/tensor/special/core.py +65 -4
- maxframe/tensor/special/ellip_func_integrals.py +155 -0
- maxframe/tensor/special/ellip_harm.py +55 -0
- maxframe/tensor/special/err_fresnel.py +223 -0
- maxframe/tensor/special/gamma_funcs.py +303 -0
- maxframe/tensor/special/hypergeometric_funcs.py +69 -0
- maxframe/tensor/special/info_theory.py +189 -0
- maxframe/tensor/special/misc.py +21 -0
- maxframe/tensor/statistics/__init__.py +6 -0
- maxframe/tensor/statistics/corrcoef.py +77 -0
- maxframe/tensor/statistics/cov.py +222 -0
- maxframe/tensor/statistics/digitize.py +126 -0
- maxframe/tensor/statistics/histogram.py +520 -0
- maxframe/tensor/statistics/median.py +85 -0
- maxframe/tensor/statistics/ptp.py +89 -0
- maxframe/tensor/utils.py +3 -3
- maxframe/tests/test_udf.py +61 -0
- maxframe/tests/test_utils.py +51 -6
- maxframe/tests/utils.py +0 -2
- maxframe/typing_.py +2 -0
- maxframe/udf.py +130 -9
- maxframe/utils.py +254 -27
- {maxframe-2.0.0b2.dist-info → maxframe-2.3.0rc1.dist-info}/METADATA +3 -3
- {maxframe-2.0.0b2.dist-info → maxframe-2.3.0rc1.dist-info}/RECORD +442 -264
- maxframe_client/fetcher.py +35 -4
- maxframe_client/session/odps.py +7 -2
- maxframe_client/session/task.py +8 -1
- maxframe_client/tests/test_fetcher.py +76 -3
- maxframe_client/tests/test_session.py +28 -1
- maxframe/dataframe/arrays.py +0 -864
- /maxframe/dataframe/{misc → reshape}/melt.py +0 -0
- /maxframe/dataframe/{misc → reshape}/stack.py +0 -0
- {maxframe-2.0.0b2.dist-info → maxframe-2.3.0rc1.dist-info}/WHEEL +0 -0
- {maxframe-2.0.0b2.dist-info → maxframe-2.3.0rc1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# Copyright 1999-2025 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import functools
|
|
16
|
+
from collections import defaultdict
|
|
17
|
+
|
|
18
|
+
import pandas as pd
|
|
19
|
+
import pytest
|
|
20
|
+
|
|
21
|
+
from .... import dataframe as md
|
|
22
|
+
from ...groupby.apply_chunk import GroupByApplyChunk
|
|
23
|
+
from .. import DataFrameApplyChunk
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@pytest.fixture
|
|
27
|
+
def df1():
|
|
28
|
+
return md.DataFrame(
|
|
29
|
+
{
|
|
30
|
+
"name": ["name key", "name", "key", "name", "key name"],
|
|
31
|
+
"id": [4, 2, 4, 3, 3],
|
|
32
|
+
"fid": [5.3, 3.5, 4.2, 2.2, 4.1],
|
|
33
|
+
}
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def test_map_reduce_with_map_only(df1):
|
|
38
|
+
func = functools.partial(lambda x: x)
|
|
39
|
+
|
|
40
|
+
with pytest.raises(ValueError):
|
|
41
|
+
df1.mf.map_reduce(func, reducer_dtypes={"col": "string"})
|
|
42
|
+
|
|
43
|
+
mapped = df1.mf.map_reduce(func)
|
|
44
|
+
assert isinstance(mapped.op, DataFrameApplyChunk)
|
|
45
|
+
assert mapped.op.func is func
|
|
46
|
+
|
|
47
|
+
map_combined = df1.mf.map_reduce(
|
|
48
|
+
func, combiner=func, mapper_dtypes=df1.dtypes, mapper_index=df1.index
|
|
49
|
+
)
|
|
50
|
+
assert isinstance(map_combined.op, DataFrameApplyChunk)
|
|
51
|
+
assert map_combined.op.func.__name__ == "CombinedMapper"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def test_mapper_with_combiner(df1):
|
|
55
|
+
class BaseFunc:
|
|
56
|
+
def __init__(self):
|
|
57
|
+
self._word_to_count = defaultdict(lambda: 0)
|
|
58
|
+
|
|
59
|
+
def _collect_df(self):
|
|
60
|
+
word_to_count = self._word_to_count.copy()
|
|
61
|
+
self._word_to_count.clear()
|
|
62
|
+
return pd.DataFrame(
|
|
63
|
+
[list(tp) for tp in word_to_count.items()],
|
|
64
|
+
columns=["word", "count"],
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
def close(self):
|
|
68
|
+
print(f"Close {type(self)}")
|
|
69
|
+
|
|
70
|
+
class MapperCls(BaseFunc):
|
|
71
|
+
def __call__(self, batch, end=False):
|
|
72
|
+
for words in batch["name"]:
|
|
73
|
+
for w in words.split():
|
|
74
|
+
self._word_to_count[w] += 1
|
|
75
|
+
if end:
|
|
76
|
+
return self._collect_df()
|
|
77
|
+
|
|
78
|
+
class CombinerCls(BaseFunc):
|
|
79
|
+
def __call__(self, batch, end=False):
|
|
80
|
+
for _, row in batch.iterrows():
|
|
81
|
+
self._word_to_count[row["word"]] = row["count"]
|
|
82
|
+
if end:
|
|
83
|
+
return self._collect_df()
|
|
84
|
+
|
|
85
|
+
map_combined = df1.mf.map_reduce(
|
|
86
|
+
MapperCls,
|
|
87
|
+
combiner=CombinerCls,
|
|
88
|
+
group_cols="word",
|
|
89
|
+
mapper_dtypes={"word": "str", "count": "int"},
|
|
90
|
+
mapper_index=df1.index,
|
|
91
|
+
)
|
|
92
|
+
|
|
93
|
+
raw = df1.op.data
|
|
94
|
+
combiner = map_combined.op.func()
|
|
95
|
+
ret1 = combiner(raw.iloc[:3], end=True)
|
|
96
|
+
ret2 = combiner(raw.iloc[3:], end=True)
|
|
97
|
+
close_ret = combiner.close()
|
|
98
|
+
expected1 = pd.DataFrame([["key", 2], ["name", 2]], columns=["word", "count"])
|
|
99
|
+
expected2 = pd.DataFrame([["key", 1], ["name", 2]], columns=["word", "count"])
|
|
100
|
+
assert close_ret is None
|
|
101
|
+
pd.testing.assert_frame_equal(ret1.reset_index(drop=True), expected1)
|
|
102
|
+
pd.testing.assert_frame_equal(ret2.reset_index(drop=True), expected2)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def test_map_reduce_with_reduce_only(df1):
|
|
106
|
+
func = functools.partial(lambda x: x)
|
|
107
|
+
|
|
108
|
+
with pytest.raises(ValueError):
|
|
109
|
+
df1.mf.map_reduce(reducer=func, mapper_dtypes={"col": "string"})
|
|
110
|
+
|
|
111
|
+
reduced = df1.mf.map_reduce(reducer=func, group_cols="name")
|
|
112
|
+
assert isinstance(reduced.op, GroupByApplyChunk)
|
|
113
|
+
assert reduced.op.func is func
|
|
114
|
+
assert reduced.op.groupby_params["by"] == ["name"]
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def test_map_reduce_with_both_funcs(df1):
|
|
118
|
+
map_func = functools.partial(lambda x: x)
|
|
119
|
+
|
|
120
|
+
class ReducerCls:
|
|
121
|
+
def __call__(self, batch):
|
|
122
|
+
return batch
|
|
123
|
+
|
|
124
|
+
reduced = df1.mf.map_reduce(
|
|
125
|
+
mapper=map_func,
|
|
126
|
+
reducer=ReducerCls,
|
|
127
|
+
group_cols="name",
|
|
128
|
+
reducer_dtypes=df1.dtypes,
|
|
129
|
+
reducer_index=df1.index,
|
|
130
|
+
)
|
|
131
|
+
assert isinstance(reduced.op, GroupByApplyChunk)
|
|
132
|
+
assert reduced.op.func is ReducerCls
|
|
133
|
+
assert reduced.op.groupby_params["by"] == ["name"]
|
|
134
|
+
assert isinstance(reduced.inputs[0].op, DataFrameApplyChunk)
|
|
135
|
+
assert reduced.inputs[0].op.func is map_func
|
|
@@ -14,7 +14,9 @@
|
|
|
14
14
|
|
|
15
15
|
# noinspection PyUnresolvedReferences
|
|
16
16
|
from ..core import DataFrameGroupBy, GroupBy, SeriesGroupBy
|
|
17
|
-
from .core import
|
|
17
|
+
from .core import _make_named_agg_compat
|
|
18
|
+
from .expanding import ExpandingGroupby
|
|
19
|
+
from .rolling import RollingGroupby
|
|
18
20
|
|
|
19
21
|
|
|
20
22
|
def _install():
|
|
@@ -24,12 +26,15 @@ def _install():
|
|
|
24
26
|
from .apply import groupby_apply
|
|
25
27
|
from .apply_chunk import df_groupby_apply_chunk
|
|
26
28
|
from .core import groupby
|
|
27
|
-
from .
|
|
29
|
+
from .expanding import cumcount, cummax, cummin, cumprod, cumsum, expanding
|
|
28
30
|
from .extensions import DataFrameGroupByMaxFrameAccessor
|
|
29
31
|
from .fill import bfill, ffill, fillna
|
|
30
32
|
from .getitem import df_groupby_getitem
|
|
31
33
|
from .head import head
|
|
34
|
+
from .rank import rank
|
|
35
|
+
from .rolling import rolling
|
|
32
36
|
from .sample import groupby_sample
|
|
37
|
+
from .shift import shift
|
|
33
38
|
from .transform import groupby_transform
|
|
34
39
|
|
|
35
40
|
for cls in DATAFRAME_TYPE:
|
|
@@ -69,6 +74,12 @@ def _install():
|
|
|
69
74
|
setattr(cls, "cumprod", cumprod)
|
|
70
75
|
setattr(cls, "cumsum", cumsum)
|
|
71
76
|
|
|
77
|
+
setattr(cls, "expanding", expanding)
|
|
78
|
+
setattr(cls, "rolling", rolling)
|
|
79
|
+
|
|
80
|
+
setattr(cls, "shift", shift)
|
|
81
|
+
setattr(cls, "rank", rank)
|
|
82
|
+
|
|
72
83
|
setattr(cls, "head", head)
|
|
73
84
|
|
|
74
85
|
setattr(cls, "sample", groupby_sample)
|
|
@@ -88,3 +99,7 @@ def _install():
|
|
|
88
99
|
|
|
89
100
|
_install()
|
|
90
101
|
del _install
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
__getattr__ = _make_named_agg_compat
|
|
105
|
+
del _make_named_agg_compat
|
|
@@ -20,16 +20,21 @@ import numpy as np
|
|
|
20
20
|
import pandas as pd
|
|
21
21
|
|
|
22
22
|
from ... import opcodes
|
|
23
|
-
from ...
|
|
23
|
+
from ...config import options
|
|
24
|
+
from ...core import ENTITY_TYPE, EntityData, OutputType, enter_mode
|
|
25
|
+
from ...serialization import PickleContainer
|
|
24
26
|
from ...serialization.serializables import (
|
|
25
27
|
AnyField,
|
|
28
|
+
BoolField,
|
|
26
29
|
DictField,
|
|
30
|
+
Int8Field,
|
|
27
31
|
Int32Field,
|
|
28
32
|
Int64Field,
|
|
29
33
|
ListField,
|
|
30
34
|
StringField,
|
|
31
35
|
)
|
|
32
|
-
from ...
|
|
36
|
+
from ...udf import BuiltinFunction
|
|
37
|
+
from ...utils import find_objects, get_pd_option, lazy_import, pd_release_version
|
|
33
38
|
from ..core import GROUPBY_TYPE
|
|
34
39
|
from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
35
40
|
from ..reduction.aggregation import (
|
|
@@ -46,19 +51,7 @@ logger = logging.getLogger(__name__)
|
|
|
46
51
|
CV_THRESHOLD = 0.2
|
|
47
52
|
MEAN_RATIO_THRESHOLD = 2 / 3
|
|
48
53
|
_support_get_group_without_as_index = pd_release_version[:2] > (1, 0)
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
class SizeRecorder:
|
|
52
|
-
def __init__(self):
|
|
53
|
-
self._raw_records = []
|
|
54
|
-
self._agg_records = []
|
|
55
|
-
|
|
56
|
-
def record(self, raw_record: int, agg_record: int):
|
|
57
|
-
self._raw_records.append(raw_record)
|
|
58
|
-
self._agg_records.append(agg_record)
|
|
59
|
-
|
|
60
|
-
def get(self):
|
|
61
|
-
return self._raw_records, self._agg_records
|
|
54
|
+
_support_multi_index_as_index = pd_release_version[:2] > (2, 0)
|
|
62
55
|
|
|
63
56
|
|
|
64
57
|
_agg_functions = {
|
|
@@ -86,24 +79,28 @@ _series_col_name = "col_name"
|
|
|
86
79
|
|
|
87
80
|
def _patch_groupby_kurt():
|
|
88
81
|
try:
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
82
|
+
try:
|
|
83
|
+
from pandas.api.typing import DataFrameGroupBy, SeriesGroupBy
|
|
84
|
+
except ImportError:
|
|
85
|
+
from pandas.core.groupby import DataFrameGroupBy, SeriesGroupBy
|
|
86
|
+
|
|
87
|
+
if hasattr(DataFrameGroupBy, "kurt"): # pragma: no branch
|
|
88
|
+
return
|
|
89
|
+
|
|
90
|
+
def _kurt_by_frame(a, *args, **kwargs):
|
|
91
|
+
data = a.to_frame().kurt(*args, **kwargs).iloc[0]
|
|
92
|
+
if is_cudf(data): # pragma: no cover
|
|
93
|
+
data = data.copy()
|
|
94
|
+
return data
|
|
95
|
+
|
|
96
|
+
def _group_kurt(x, *args, **kwargs):
|
|
97
|
+
if kwargs.get("numeric_only") is not None:
|
|
98
|
+
return x.agg(functools.partial(_kurt_by_frame, *args, **kwargs))
|
|
99
|
+
else:
|
|
100
|
+
return x.agg(functools.partial(pd.Series.kurt, *args, **kwargs))
|
|
101
|
+
|
|
102
|
+
DataFrameGroupBy.kurt = DataFrameGroupBy.kurtosis = _group_kurt
|
|
103
|
+
SeriesGroupBy.kurt = SeriesGroupBy.kurtosis = _group_kurt
|
|
107
104
|
except (AttributeError, ImportError): # pragma: no cover
|
|
108
105
|
pass
|
|
109
106
|
|
|
@@ -119,7 +116,10 @@ def build_mock_agg_result(
|
|
|
119
116
|
**raw_func_kw,
|
|
120
117
|
):
|
|
121
118
|
try:
|
|
122
|
-
|
|
119
|
+
with enter_mode(mock=True):
|
|
120
|
+
agg_result = groupby.op.build_mock_groupby().aggregate(
|
|
121
|
+
raw_func, **raw_func_kw
|
|
122
|
+
)
|
|
123
123
|
except ValueError:
|
|
124
124
|
if (
|
|
125
125
|
groupby_params.get("as_index") or _support_get_group_without_as_index
|
|
@@ -137,23 +137,43 @@ def build_mock_agg_result(
|
|
|
137
137
|
class DataFrameGroupByAgg(DataFrameOperator, DataFrameOperatorMixin):
|
|
138
138
|
_op_type_ = opcodes.GROUPBY_AGG
|
|
139
139
|
|
|
140
|
-
raw_func = AnyField("raw_func")
|
|
141
|
-
raw_func_kw = DictField("raw_func_kw")
|
|
142
|
-
func = AnyField("func")
|
|
140
|
+
raw_func = AnyField("raw_func", default=None)
|
|
141
|
+
raw_func_kw = DictField("raw_func_kw", default=None)
|
|
142
|
+
func = AnyField("func", default=None)
|
|
143
143
|
func_rename = ListField("func_rename", default=None)
|
|
144
144
|
|
|
145
|
-
raw_groupby_params = DictField("raw_groupby_params")
|
|
146
|
-
groupby_params = DictField("groupby_params")
|
|
145
|
+
raw_groupby_params = DictField("raw_groupby_params", default=None)
|
|
146
|
+
groupby_params = DictField("groupby_params", default=None)
|
|
147
147
|
|
|
148
|
-
method = StringField("method")
|
|
148
|
+
method = StringField("method", default=None)
|
|
149
149
|
|
|
150
150
|
# for chunk
|
|
151
|
-
chunk_store_limit = Int64Field("chunk_store_limit")
|
|
152
|
-
pre_funcs = ListField("pre_funcs")
|
|
153
|
-
agg_funcs = ListField("agg_funcs")
|
|
154
|
-
post_funcs = ListField("post_funcs")
|
|
155
|
-
index_levels = Int32Field("index_levels")
|
|
156
|
-
size_recorder_name = StringField("size_recorder_name")
|
|
151
|
+
chunk_store_limit = Int64Field("chunk_store_limit", default=None)
|
|
152
|
+
pre_funcs = ListField("pre_funcs", default=None)
|
|
153
|
+
agg_funcs = ListField("agg_funcs", default=None)
|
|
154
|
+
post_funcs = ListField("post_funcs", default=None)
|
|
155
|
+
index_levels = Int32Field("index_levels", default=None)
|
|
156
|
+
size_recorder_name = StringField("size_recorder_name", default=None)
|
|
157
|
+
combine_size = Int32Field("combine_size", default=None)
|
|
158
|
+
|
|
159
|
+
use_inf_as_na = BoolField("use_inf_as_na", default=None)
|
|
160
|
+
input_ndim = Int8Field("input_ndim", default=1)
|
|
161
|
+
append_level = BoolField("append_level", default=False)
|
|
162
|
+
|
|
163
|
+
def has_custom_code(self) -> bool:
|
|
164
|
+
callable_bys = find_objects(
|
|
165
|
+
self.groupby_params.get("by"), types=PickleContainer, checker=callable
|
|
166
|
+
)
|
|
167
|
+
if callable_bys and any(
|
|
168
|
+
not isinstance(fun, BuiltinFunction) for fun in callable_bys
|
|
169
|
+
):
|
|
170
|
+
return True
|
|
171
|
+
|
|
172
|
+
return any(
|
|
173
|
+
fun.custom_reduction
|
|
174
|
+
and not isinstance(fun.custom_reduction, BuiltinFunction)
|
|
175
|
+
for fun in self.agg_funcs or ()
|
|
176
|
+
)
|
|
157
177
|
|
|
158
178
|
@classmethod
|
|
159
179
|
def _set_inputs(cls, op: "DataFrameGroupByAgg", inputs: List[EntityData]):
|
|
@@ -193,7 +213,9 @@ class DataFrameGroupByAgg(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
193
213
|
|
|
194
214
|
def _fix_as_index(self, result_index: pd.Index):
|
|
195
215
|
# make sure if as_index=False takes effect
|
|
196
|
-
if isinstance(
|
|
216
|
+
if not _support_multi_index_as_index and isinstance(
|
|
217
|
+
result_index, pd.MultiIndex
|
|
218
|
+
):
|
|
197
219
|
# if MultiIndex, as_index=False definitely takes no effect
|
|
198
220
|
self.groupby_params["as_index"] = True
|
|
199
221
|
elif result_index.name is not None:
|
|
@@ -217,12 +239,17 @@ class DataFrameGroupByAgg(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
217
239
|
agg_df.index, groupby.key, groupby.index_value.key
|
|
218
240
|
)
|
|
219
241
|
|
|
242
|
+
self.input_ndim = 2
|
|
243
|
+
|
|
220
244
|
# make sure if as_index=False takes effect
|
|
221
245
|
self._fix_as_index(agg_df.index)
|
|
222
246
|
|
|
223
247
|
# determine num of indices to group in intermediate steps
|
|
224
248
|
self.index_levels = self._get_index_levels(groupby, agg_df.index)
|
|
225
249
|
|
|
250
|
+
# if True, name of agg funcs will be appended as the last level
|
|
251
|
+
self.append_level = agg_df.dtypes.index.nlevels > input_df.dtypes.index.nlevels
|
|
252
|
+
|
|
226
253
|
inputs = self._get_inputs([input_df])
|
|
227
254
|
return self.new_dataframe(
|
|
228
255
|
inputs,
|
|
@@ -247,6 +274,8 @@ class DataFrameGroupByAgg(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
247
274
|
|
|
248
275
|
inputs = self._get_inputs([in_series])
|
|
249
276
|
|
|
277
|
+
self.input_ndim = 1
|
|
278
|
+
|
|
250
279
|
# determine num of indices to group in intermediate steps
|
|
251
280
|
self.index_levels = self._get_index_levels(groupby, agg_result.index)
|
|
252
281
|
|
|
@@ -351,9 +380,10 @@ def agg(groupby, func=None, method="auto", *args, **kwargs):
|
|
|
351
380
|
1 1 2 0.590715
|
|
352
381
|
2 3 4 0.704907
|
|
353
382
|
|
|
354
|
-
To control the output names with different aggregations per column,
|
|
383
|
+
To control the output names with different aggregations per column,
|
|
384
|
+
MaxFrame supports “named aggregation”
|
|
355
385
|
|
|
356
|
-
>>> from maxframe.dataframe
|
|
386
|
+
>>> from maxframe.dataframe import NamedAgg
|
|
357
387
|
>>> df.groupby("A").agg(
|
|
358
388
|
... b_min=NamedAgg(column="B", aggfunc="min"),
|
|
359
389
|
... c_sum=NamedAgg(column="C", aggfunc="sum")).execute()
|
|
@@ -376,6 +406,10 @@ def agg(groupby, func=None, method="auto", *args, **kwargs):
|
|
|
376
406
|
f"Method {method} is not available, please specify 'tree' or 'shuffle"
|
|
377
407
|
)
|
|
378
408
|
|
|
409
|
+
combine_size = (
|
|
410
|
+
kwargs.pop("combine_size", None) or options.dpe.reduction.combine_size
|
|
411
|
+
)
|
|
412
|
+
|
|
379
413
|
if not is_funcs_aggregate(func, ndim=groupby.ndim):
|
|
380
414
|
# pass index to transform, otherwise it will lose name info for index
|
|
381
415
|
agg_result = build_mock_agg_result(
|
|
@@ -400,5 +434,8 @@ def agg(groupby, func=None, method="auto", *args, **kwargs):
|
|
|
400
434
|
method=method,
|
|
401
435
|
raw_groupby_params=groupby.op.groupby_params,
|
|
402
436
|
groupby_params=groupby.op.groupby_params,
|
|
437
|
+
combine_size=combine_size,
|
|
438
|
+
chunk_store_limit=options.chunk_store_limit,
|
|
439
|
+
use_inf_as_na=get_pd_option("mode.use_inf_as_na", False),
|
|
403
440
|
)
|
|
404
441
|
return agg_op(groupby)
|
|
@@ -60,7 +60,7 @@ class GroupByApply(
|
|
|
60
60
|
maybe_agg = BoolField("maybe_agg", default=None)
|
|
61
61
|
|
|
62
62
|
logic_key = StringField("logic_key", default=None)
|
|
63
|
-
|
|
63
|
+
func_ref = AnyField("func_ref", default=None)
|
|
64
64
|
need_clean_up_func = BoolField("need_clean_up_func", default=False)
|
|
65
65
|
|
|
66
66
|
def __init__(self, output_types=None, **kw):
|
|
@@ -22,15 +22,18 @@ from ...core import OutputType
|
|
|
22
22
|
from ...lib.version import parse as parse_version
|
|
23
23
|
from ...serialization.serializables import (
|
|
24
24
|
DictField,
|
|
25
|
+
FieldTypes,
|
|
25
26
|
FunctionField,
|
|
26
27
|
Int32Field,
|
|
28
|
+
ListField,
|
|
27
29
|
TupleField,
|
|
28
30
|
)
|
|
29
31
|
from ...udf import BuiltinFunction, MarkedFunction
|
|
30
|
-
from ...utils import copy_if_possible
|
|
32
|
+
from ...utils import copy_if_possible, make_dtype, make_dtypes
|
|
31
33
|
from ..core import (
|
|
32
34
|
DATAFRAME_GROUPBY_TYPE,
|
|
33
35
|
GROUPBY_TYPE,
|
|
36
|
+
INDEX_TYPE,
|
|
34
37
|
DataFrameGroupBy,
|
|
35
38
|
IndexValue,
|
|
36
39
|
SeriesGroupBy,
|
|
@@ -42,8 +45,6 @@ from ..utils import (
|
|
|
42
45
|
copy_func_scheduling_hints,
|
|
43
46
|
infer_dataframe_return_value,
|
|
44
47
|
make_column_list,
|
|
45
|
-
make_dtype,
|
|
46
|
-
make_dtypes,
|
|
47
48
|
parse_index,
|
|
48
49
|
validate_output_types,
|
|
49
50
|
)
|
|
@@ -61,6 +62,8 @@ class GroupByApplyChunk(DataFrameOperatorMixin, DataFrameOperator):
|
|
|
61
62
|
kwargs = DictField("kwargs", default=None)
|
|
62
63
|
|
|
63
64
|
groupby_params = DictField("groupby_params", default=None)
|
|
65
|
+
order_cols = ListField("order_cols", default=None)
|
|
66
|
+
ascending = ListField("ascending", FieldTypes.bool, default_factory=lambda: [True])
|
|
64
67
|
|
|
65
68
|
def __init__(self, output_type=None, **kw):
|
|
66
69
|
if output_type:
|
|
@@ -240,14 +243,14 @@ class GroupByApplyChunk(DataFrameOperatorMixin, DataFrameOperator):
|
|
|
240
243
|
if self.output_types:
|
|
241
244
|
inferred_meta.output_type = self.output_types[0]
|
|
242
245
|
inferred_meta.dtypes = dtypes if dtypes is not None else inferred_meta.dtypes
|
|
246
|
+
if isinstance(index, INDEX_TYPE):
|
|
247
|
+
index = index.index_value
|
|
243
248
|
if index is not None:
|
|
244
249
|
inferred_meta.index_value = (
|
|
245
250
|
parse_index(index)
|
|
246
251
|
if index is not input_groupby.index_value
|
|
247
252
|
else input_groupby.index_value
|
|
248
253
|
)
|
|
249
|
-
else:
|
|
250
|
-
inferred_meta.index_value = inferred_meta.index_value
|
|
251
254
|
inferred_meta.elementwise = elementwise or inferred_meta.elementwise
|
|
252
255
|
return inferred_meta
|
|
253
256
|
|
|
@@ -272,6 +275,8 @@ def df_groupby_apply_chunk(
|
|
|
272
275
|
output_type=None,
|
|
273
276
|
index=None,
|
|
274
277
|
skip_infer=False,
|
|
278
|
+
order_cols=None,
|
|
279
|
+
ascending=True,
|
|
275
280
|
args=(),
|
|
276
281
|
**kwargs,
|
|
277
282
|
):
|
|
@@ -373,6 +378,13 @@ def df_groupby_apply_chunk(
|
|
|
373
378
|
if skip_infer and output_type is None:
|
|
374
379
|
output_type = OutputType.df_or_series
|
|
375
380
|
|
|
381
|
+
if order_cols and not isinstance(order_cols, list):
|
|
382
|
+
order_cols = [order_cols]
|
|
383
|
+
if not isinstance(ascending, list):
|
|
384
|
+
ascending = [ascending]
|
|
385
|
+
elif len(order_cols) != len(ascending):
|
|
386
|
+
raise ValueError("order_cols and ascending must have same length")
|
|
387
|
+
|
|
376
388
|
# bind args and kwargs
|
|
377
389
|
op = GroupByApplyChunk(
|
|
378
390
|
func=func,
|
|
@@ -380,6 +392,8 @@ def df_groupby_apply_chunk(
|
|
|
380
392
|
output_type=output_type,
|
|
381
393
|
args=args,
|
|
382
394
|
kwargs=kwargs,
|
|
395
|
+
order_cols=order_cols,
|
|
396
|
+
ascending=ascending,
|
|
383
397
|
groupby_params=dataframe_groupby.op.groupby_params,
|
|
384
398
|
)
|
|
385
399
|
|
|
@@ -12,30 +12,37 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
-
|
|
16
|
-
|
|
15
|
+
import os
|
|
16
|
+
import warnings
|
|
17
|
+
from typing import Any, Dict, List
|
|
17
18
|
|
|
18
19
|
import pandas as pd
|
|
19
20
|
|
|
20
21
|
from ... import opcodes
|
|
21
22
|
from ...core import ENTITY_TYPE, Entity, EntityData, OutputType
|
|
22
23
|
from ...core.operator import MapReduceOperator
|
|
23
|
-
from ...
|
|
24
|
-
from ...
|
|
25
|
-
from
|
|
24
|
+
from ...env import MAXFRAME_INSIDE_TASK
|
|
25
|
+
from ...serialization import PickleContainer
|
|
26
|
+
from ...serialization.serializables import AnyField, BoolField, DictField, Int32Field
|
|
27
|
+
from ...udf import BuiltinFunction
|
|
28
|
+
from ...utils import find_objects, lazy_import, no_default
|
|
29
|
+
from ..core import GROUPBY_TYPE, SERIES_TYPE
|
|
26
30
|
from ..initializer import Series as asseries
|
|
27
|
-
from ..operators import DataFrameOperatorMixin
|
|
28
|
-
from ..utils import
|
|
31
|
+
from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
32
|
+
from ..utils import (
|
|
33
|
+
build_df,
|
|
34
|
+
build_series,
|
|
35
|
+
call_groupby_with_params,
|
|
36
|
+
make_column_list,
|
|
37
|
+
parse_index,
|
|
38
|
+
)
|
|
29
39
|
|
|
30
40
|
cudf = lazy_import("cudf")
|
|
31
41
|
|
|
32
42
|
|
|
33
|
-
NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"])
|
|
34
|
-
|
|
35
|
-
|
|
36
43
|
class DataFrameGroupByOp(MapReduceOperator, DataFrameOperatorMixin):
|
|
37
44
|
_op_type_ = opcodes.GROUPBY
|
|
38
|
-
_legacy_name = "DataFrameGroupByOperator"
|
|
45
|
+
_legacy_name = "DataFrameGroupByOperator" # since v2.0.0
|
|
39
46
|
|
|
40
47
|
by = AnyField(
|
|
41
48
|
"by",
|
|
@@ -61,6 +68,12 @@ class DataFrameGroupByOp(MapReduceOperator, DataFrameOperatorMixin):
|
|
|
61
68
|
output_types = [OutputType.series_groupby]
|
|
62
69
|
self.output_types = output_types
|
|
63
70
|
|
|
71
|
+
def has_custom_code(self) -> bool:
|
|
72
|
+
callable_bys = find_objects(self.by, types=PickleContainer, checker=callable)
|
|
73
|
+
if not callable_bys:
|
|
74
|
+
return False
|
|
75
|
+
return any(not isinstance(fun, BuiltinFunction) for fun in callable_bys)
|
|
76
|
+
|
|
64
77
|
@property
|
|
65
78
|
def is_dataframe_obj(self):
|
|
66
79
|
return self.output_types[0] in (
|
|
@@ -93,8 +106,8 @@ class DataFrameGroupByOp(MapReduceOperator, DataFrameOperatorMixin):
|
|
|
93
106
|
ensure_string=True,
|
|
94
107
|
)
|
|
95
108
|
|
|
96
|
-
new_kw = self.groupby_params
|
|
97
|
-
new_kw.update(kwargs)
|
|
109
|
+
new_kw = self.groupby_params.copy()
|
|
110
|
+
new_kw.update({k: v for k, v in kwargs.items()})
|
|
98
111
|
if isinstance(new_kw["by"], list):
|
|
99
112
|
new_by = []
|
|
100
113
|
for v in new_kw["by"]:
|
|
@@ -110,7 +123,7 @@ class DataFrameGroupByOp(MapReduceOperator, DataFrameOperatorMixin):
|
|
|
110
123
|
else:
|
|
111
124
|
new_by.append(v)
|
|
112
125
|
new_kw["by"] = new_by
|
|
113
|
-
return mock_obj
|
|
126
|
+
return call_groupby_with_params(mock_obj, new_kw)
|
|
114
127
|
|
|
115
128
|
@classmethod
|
|
116
129
|
def _set_inputs(cls, op: "DataFrameGroupByOp", inputs: List[EntityData]):
|
|
@@ -118,8 +131,8 @@ class DataFrameGroupByOp(MapReduceOperator, DataFrameOperatorMixin):
|
|
|
118
131
|
inputs_iter = iter(op._inputs[1:])
|
|
119
132
|
if len(inputs) > 1:
|
|
120
133
|
by = []
|
|
121
|
-
for k in op.by:
|
|
122
|
-
if isinstance(k,
|
|
134
|
+
for k in op.by or ():
|
|
135
|
+
if isinstance(k, ENTITY_TYPE):
|
|
123
136
|
by.append(next(inputs_iter))
|
|
124
137
|
else:
|
|
125
138
|
by.append(k)
|
|
@@ -240,3 +253,90 @@ def groupby(df, by=None, level=None, as_index=True, sort=True, group_keys=True):
|
|
|
240
253
|
output_types=output_types,
|
|
241
254
|
)
|
|
242
255
|
return op(df)
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
class BaseGroupByWindowOp(DataFrameOperatorMixin, DataFrameOperator):
|
|
259
|
+
_op_module_ = "dataframe.groupby"
|
|
260
|
+
|
|
261
|
+
groupby_params = DictField("groupby_params", default=None)
|
|
262
|
+
window_params = DictField("window_params", default=None)
|
|
263
|
+
|
|
264
|
+
def __init__(self, output_types=None, **kw):
|
|
265
|
+
super().__init__(_output_types=output_types, **kw)
|
|
266
|
+
|
|
267
|
+
def _calc_mock_result_df(self, mock_groupby):
|
|
268
|
+
raise NotImplementedError
|
|
269
|
+
|
|
270
|
+
def get_sort_cols_to_asc(self) -> Dict[Any, bool]:
|
|
271
|
+
order_cols = self.window_params.get("order_cols") or []
|
|
272
|
+
asc_list = self.window_params.get("ascending") or [True]
|
|
273
|
+
if len(asc_list) < len(order_cols):
|
|
274
|
+
asc_list = [asc_list[0]] * len(order_cols)
|
|
275
|
+
return dict(zip(order_cols, asc_list))
|
|
276
|
+
|
|
277
|
+
def _calc_out_dtypes(self, in_groupby):
|
|
278
|
+
in_obj = in_groupby
|
|
279
|
+
groupby_params = in_groupby.op.groupby_params
|
|
280
|
+
while isinstance(in_obj, GROUPBY_TYPE):
|
|
281
|
+
in_obj = in_obj.inputs[0]
|
|
282
|
+
|
|
283
|
+
if in_groupby.ndim == 1:
|
|
284
|
+
selection = None
|
|
285
|
+
else:
|
|
286
|
+
by_cols = (
|
|
287
|
+
make_column_list(groupby_params.get("by"), in_groupby.dtypes) or []
|
|
288
|
+
)
|
|
289
|
+
selection = groupby_params.get("selection")
|
|
290
|
+
if not selection:
|
|
291
|
+
selection = [c for c in in_obj.dtypes.index if c not in by_cols]
|
|
292
|
+
|
|
293
|
+
mock_groupby = in_groupby.op.build_mock_groupby(
|
|
294
|
+
group_keys=False, selection=selection
|
|
295
|
+
)
|
|
296
|
+
|
|
297
|
+
result_df = self._calc_mock_result_df(mock_groupby)
|
|
298
|
+
|
|
299
|
+
if isinstance(result_df, pd.DataFrame):
|
|
300
|
+
self.output_types = [OutputType.dataframe]
|
|
301
|
+
return result_df.dtypes
|
|
302
|
+
else:
|
|
303
|
+
self.output_types = [OutputType.series]
|
|
304
|
+
return result_df.name, result_df.dtype
|
|
305
|
+
|
|
306
|
+
def __call__(self, groupby):
|
|
307
|
+
in_df = groupby
|
|
308
|
+
while in_df.op.output_types[0] not in (OutputType.dataframe, OutputType.series):
|
|
309
|
+
in_df = in_df.inputs[0]
|
|
310
|
+
|
|
311
|
+
out_dtypes = self._calc_out_dtypes(groupby)
|
|
312
|
+
|
|
313
|
+
kw = in_df.params.copy()
|
|
314
|
+
if self.output_types[0] == OutputType.dataframe:
|
|
315
|
+
kw.update(
|
|
316
|
+
dict(
|
|
317
|
+
columns_value=parse_index(out_dtypes.index, store_data=True),
|
|
318
|
+
dtypes=out_dtypes,
|
|
319
|
+
shape=(groupby.shape[0], len(out_dtypes)),
|
|
320
|
+
)
|
|
321
|
+
)
|
|
322
|
+
else:
|
|
323
|
+
name, dtype = out_dtypes
|
|
324
|
+
kw.update(dtype=dtype, name=name, shape=(groupby.shape[0],))
|
|
325
|
+
return self.new_tileable([in_df], **kw)
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
def _make_named_agg_compat(name): # pragma: no cover
|
|
329
|
+
# to make imports compatible
|
|
330
|
+
from ..reduction import NamedAgg
|
|
331
|
+
|
|
332
|
+
if name == "NamedAgg":
|
|
333
|
+
if MAXFRAME_INSIDE_TASK not in os.environ:
|
|
334
|
+
warnings.warn(
|
|
335
|
+
"Please import NamedAgg from maxframe.dataframe",
|
|
336
|
+
DeprecationWarning,
|
|
337
|
+
)
|
|
338
|
+
return NamedAgg
|
|
339
|
+
raise AttributeError(f"module {__name__} has no attribute {name}")
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
__getattr__ = _make_named_agg_compat
|