maxframe 2.0.0b1__cp37-cp37m-win_amd64.whl → 2.2.0__cp37-cp37m-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/__init__.py +1 -0
- maxframe/_utils.cp37-win_amd64.pyd +0 -0
- maxframe/_utils.pyx +14 -1
- maxframe/codegen/core.py +6 -6
- maxframe/codegen/spe/core.py +1 -1
- maxframe/codegen/spe/dataframe/__init__.py +1 -0
- maxframe/codegen/spe/dataframe/accessors/base.py +18 -0
- maxframe/codegen/spe/dataframe/accessors/dict_.py +25 -130
- maxframe/codegen/spe/dataframe/accessors/list_.py +12 -48
- maxframe/codegen/spe/dataframe/accessors/struct_.py +28 -0
- maxframe/codegen/spe/dataframe/arithmetic.py +7 -2
- maxframe/codegen/spe/dataframe/groupby.py +88 -0
- maxframe/codegen/spe/dataframe/indexing.py +99 -4
- maxframe/codegen/spe/dataframe/merge.py +34 -1
- maxframe/codegen/spe/dataframe/misc.py +9 -33
- maxframe/codegen/spe/dataframe/reduction.py +14 -9
- maxframe/codegen/spe/dataframe/reshape.py +46 -0
- maxframe/codegen/spe/dataframe/sort.py +30 -17
- maxframe/codegen/spe/dataframe/tests/accessors/test_dict.py +9 -15
- maxframe/codegen/spe/dataframe/tests/accessors/test_list.py +4 -7
- maxframe/codegen/spe/dataframe/tests/accessors/test_struct.py +75 -0
- maxframe/codegen/spe/dataframe/tests/indexing/test_iloc.py +20 -1
- maxframe/codegen/spe/dataframe/tests/indexing/test_loc.py +35 -0
- maxframe/codegen/spe/dataframe/tests/misc/test_misc.py +0 -32
- maxframe/codegen/spe/dataframe/tests/test_groupby.py +81 -18
- maxframe/codegen/spe/dataframe/tests/test_merge.py +27 -1
- maxframe/codegen/spe/dataframe/tests/test_reshape.py +79 -0
- maxframe/codegen/spe/dataframe/tests/test_sort.py +20 -0
- maxframe/codegen/spe/learn/contrib/tests/test_xgboost.py +2 -1
- maxframe/codegen/spe/learn/metrics/__init__.py +1 -1
- maxframe/codegen/spe/learn/metrics/_ranking.py +76 -0
- maxframe/codegen/spe/learn/metrics/pairwise.py +51 -0
- maxframe/codegen/spe/learn/metrics/tests/test_pairwise.py +36 -0
- maxframe/codegen/spe/learn/metrics/tests/test_ranking.py +59 -0
- maxframe/codegen/spe/tensor/__init__.py +3 -0
- maxframe/codegen/spe/tensor/fft.py +74 -0
- maxframe/codegen/spe/tensor/linalg.py +29 -2
- maxframe/codegen/spe/tensor/misc.py +79 -25
- maxframe/codegen/spe/tensor/spatial.py +45 -0
- maxframe/codegen/spe/tensor/statistics.py +44 -0
- maxframe/codegen/spe/tensor/tests/test_fft.py +64 -0
- maxframe/codegen/spe/tensor/tests/test_linalg.py +15 -1
- maxframe/codegen/spe/tensor/tests/test_misc.py +52 -2
- maxframe/codegen/spe/tensor/tests/test_spatial.py +33 -0
- maxframe/codegen/spe/tensor/tests/test_statistics.py +15 -1
- maxframe/codegen/spe/tests/test_spe_codegen.py +6 -12
- maxframe/codegen/spe/utils.py +2 -0
- maxframe/config/config.py +70 -9
- maxframe/config/tests/test_validators.py +13 -1
- maxframe/config/validators.py +49 -0
- maxframe/conftest.py +44 -17
- maxframe/core/accessor.py +2 -2
- maxframe/core/entity/core.py +5 -0
- maxframe/core/entity/tileables.py +1 -1
- maxframe/core/graph/core.cp37-win_amd64.pyd +0 -0
- maxframe/core/graph/entity.py +1 -2
- maxframe/core/operator/base.py +9 -2
- maxframe/core/operator/core.py +10 -2
- maxframe/core/operator/utils.py +13 -0
- maxframe/dataframe/__init__.py +10 -3
- maxframe/dataframe/accessors/__init__.py +1 -1
- maxframe/dataframe/accessors/compat.py +45 -0
- maxframe/dataframe/accessors/datetime_/__init__.py +4 -1
- maxframe/dataframe/accessors/dict_/contains.py +7 -16
- maxframe/dataframe/accessors/dict_/core.py +48 -0
- maxframe/dataframe/accessors/dict_/getitem.py +17 -21
- maxframe/dataframe/accessors/dict_/length.py +7 -16
- maxframe/dataframe/accessors/dict_/remove.py +6 -18
- maxframe/dataframe/accessors/dict_/setitem.py +8 -18
- maxframe/dataframe/accessors/dict_/tests/test_dict_accessor.py +62 -22
- maxframe/dataframe/accessors/list_/__init__.py +2 -2
- maxframe/dataframe/accessors/list_/core.py +48 -0
- maxframe/dataframe/accessors/list_/getitem.py +12 -19
- maxframe/dataframe/accessors/list_/length.py +7 -16
- maxframe/dataframe/accessors/list_/tests/test_list_accessor.py +11 -9
- maxframe/dataframe/accessors/string_/__init__.py +4 -1
- maxframe/dataframe/accessors/struct_/__init__.py +37 -0
- maxframe/dataframe/accessors/struct_/accessor.py +39 -0
- maxframe/dataframe/accessors/struct_/core.py +43 -0
- maxframe/dataframe/accessors/struct_/dtypes.py +53 -0
- maxframe/dataframe/accessors/struct_/field.py +123 -0
- maxframe/dataframe/accessors/struct_/tests/__init__.py +13 -0
- maxframe/dataframe/accessors/struct_/tests/test_struct_accessor.py +91 -0
- maxframe/dataframe/arithmetic/__init__.py +14 -4
- maxframe/dataframe/arithmetic/between.py +106 -0
- maxframe/dataframe/arithmetic/dot.py +237 -0
- maxframe/dataframe/arithmetic/{around.py → round.py} +11 -7
- maxframe/dataframe/core.py +63 -118
- maxframe/dataframe/datasource/__init__.py +18 -0
- maxframe/dataframe/datasource/from_dict.py +124 -0
- maxframe/dataframe/datasource/from_index.py +1 -1
- maxframe/dataframe/datasource/from_records.py +77 -0
- maxframe/dataframe/datasource/from_tensor.py +109 -41
- maxframe/dataframe/datasource/read_csv.py +2 -3
- maxframe/dataframe/datasource/read_odps_query.py +76 -16
- maxframe/dataframe/datasource/tests/test_datasource.py +84 -1
- maxframe/dataframe/datastore/__init__.py +5 -1
- maxframe/dataframe/datastore/to_csv.py +29 -41
- maxframe/dataframe/datastore/to_odps.py +30 -4
- maxframe/dataframe/extensions/__init__.py +20 -4
- maxframe/dataframe/extensions/apply_chunk.py +32 -6
- maxframe/dataframe/extensions/cartesian_chunk.py +153 -0
- maxframe/dataframe/extensions/collect_kv.py +126 -0
- maxframe/dataframe/extensions/extract_kv.py +177 -0
- maxframe/dataframe/extensions/map_reduce.py +263 -0
- maxframe/dataframe/extensions/rebalance.py +62 -0
- maxframe/dataframe/extensions/tests/test_apply_chunk.py +9 -2
- maxframe/dataframe/extensions/tests/test_extensions.py +54 -0
- maxframe/dataframe/extensions/tests/test_map_reduce.py +135 -0
- maxframe/dataframe/groupby/__init__.py +12 -1
- maxframe/dataframe/groupby/aggregation.py +78 -45
- maxframe/dataframe/groupby/apply.py +1 -1
- maxframe/dataframe/groupby/apply_chunk.py +18 -2
- maxframe/dataframe/groupby/core.py +96 -12
- maxframe/dataframe/groupby/cum.py +4 -25
- maxframe/dataframe/groupby/expanding.py +264 -0
- maxframe/dataframe/groupby/fill.py +1 -1
- maxframe/dataframe/groupby/getitem.py +12 -5
- maxframe/dataframe/groupby/head.py +11 -1
- maxframe/dataframe/groupby/rank.py +136 -0
- maxframe/dataframe/groupby/rolling.py +206 -0
- maxframe/dataframe/groupby/shift.py +114 -0
- maxframe/dataframe/groupby/tests/test_groupby.py +0 -5
- maxframe/dataframe/indexing/__init__.py +20 -1
- maxframe/dataframe/indexing/droplevel.py +195 -0
- maxframe/dataframe/indexing/filter.py +169 -0
- maxframe/dataframe/indexing/get_level_values.py +76 -0
- maxframe/dataframe/indexing/iat.py +45 -0
- maxframe/dataframe/indexing/iloc.py +152 -12
- maxframe/dataframe/indexing/insert.py +1 -1
- maxframe/dataframe/indexing/loc.py +287 -7
- maxframe/dataframe/indexing/reindex.py +14 -5
- maxframe/dataframe/indexing/rename.py +6 -0
- maxframe/dataframe/indexing/rename_axis.py +2 -2
- maxframe/dataframe/indexing/reorder_levels.py +143 -0
- maxframe/dataframe/indexing/reset_index.py +33 -6
- maxframe/dataframe/indexing/sample.py +8 -0
- maxframe/dataframe/indexing/setitem.py +3 -3
- maxframe/dataframe/indexing/swaplevel.py +185 -0
- maxframe/dataframe/indexing/take.py +99 -0
- maxframe/dataframe/indexing/truncate.py +140 -0
- maxframe/dataframe/indexing/where.py +0 -11
- maxframe/dataframe/indexing/xs.py +148 -0
- maxframe/dataframe/merge/__init__.py +12 -1
- maxframe/dataframe/merge/append.py +97 -98
- maxframe/dataframe/merge/combine_first.py +120 -0
- maxframe/dataframe/merge/compare.py +387 -0
- maxframe/dataframe/merge/concat.py +183 -0
- maxframe/dataframe/merge/update.py +271 -0
- maxframe/dataframe/misc/__init__.py +16 -10
- maxframe/dataframe/misc/_duplicate.py +10 -4
- maxframe/dataframe/misc/apply.py +1 -1
- maxframe/dataframe/misc/check_unique.py +51 -0
- maxframe/dataframe/misc/clip.py +145 -0
- maxframe/dataframe/misc/describe.py +175 -9
- maxframe/dataframe/misc/drop_duplicates.py +2 -2
- maxframe/dataframe/misc/duplicated.py +2 -2
- maxframe/dataframe/misc/get_dummies.py +5 -1
- maxframe/dataframe/misc/isin.py +2 -2
- maxframe/dataframe/misc/map.py +94 -0
- maxframe/dataframe/misc/tests/test_misc.py +13 -2
- maxframe/dataframe/misc/to_numeric.py +3 -0
- maxframe/dataframe/misc/transform.py +12 -5
- maxframe/dataframe/misc/transpose.py +13 -1
- maxframe/dataframe/misc/valid_index.py +115 -0
- maxframe/dataframe/misc/value_counts.py +38 -4
- maxframe/dataframe/missing/checkna.py +13 -6
- maxframe/dataframe/missing/dropna.py +5 -0
- maxframe/dataframe/missing/fillna.py +1 -1
- maxframe/dataframe/missing/replace.py +7 -4
- maxframe/dataframe/reduction/__init__.py +29 -15
- maxframe/dataframe/reduction/aggregation.py +38 -9
- maxframe/dataframe/reduction/all.py +2 -2
- maxframe/dataframe/reduction/any.py +2 -2
- maxframe/dataframe/reduction/argmax.py +100 -0
- maxframe/dataframe/reduction/argmin.py +100 -0
- maxframe/dataframe/reduction/core.py +65 -18
- maxframe/dataframe/reduction/count.py +13 -9
- maxframe/dataframe/reduction/cov.py +166 -0
- maxframe/dataframe/reduction/cummax.py +2 -2
- maxframe/dataframe/reduction/cummin.py +2 -2
- maxframe/dataframe/reduction/cumprod.py +2 -2
- maxframe/dataframe/reduction/cumsum.py +2 -2
- maxframe/dataframe/reduction/custom_reduction.py +2 -2
- maxframe/dataframe/reduction/idxmax.py +185 -0
- maxframe/dataframe/reduction/idxmin.py +185 -0
- maxframe/dataframe/reduction/kurtosis.py +37 -30
- maxframe/dataframe/reduction/max.py +2 -2
- maxframe/dataframe/reduction/mean.py +9 -7
- maxframe/dataframe/reduction/median.py +2 -2
- maxframe/dataframe/reduction/min.py +2 -2
- maxframe/dataframe/reduction/nunique.py +9 -8
- maxframe/dataframe/reduction/prod.py +18 -13
- maxframe/dataframe/reduction/reduction_size.py +2 -2
- maxframe/dataframe/reduction/sem.py +13 -9
- maxframe/dataframe/reduction/skew.py +31 -27
- maxframe/dataframe/reduction/str_concat.py +10 -7
- maxframe/dataframe/reduction/sum.py +18 -14
- maxframe/dataframe/reduction/unique.py +20 -3
- maxframe/dataframe/reduction/var.py +16 -12
- maxframe/dataframe/reshape/__init__.py +38 -0
- maxframe/dataframe/{misc → reshape}/pivot.py +1 -0
- maxframe/dataframe/{misc → reshape}/pivot_table.py +1 -0
- maxframe/dataframe/reshape/unstack.py +114 -0
- maxframe/dataframe/sort/__init__.py +8 -0
- maxframe/dataframe/sort/argsort.py +62 -0
- maxframe/dataframe/sort/core.py +1 -0
- maxframe/dataframe/sort/nlargest.py +238 -0
- maxframe/dataframe/sort/nsmallest.py +228 -0
- maxframe/dataframe/statistics/__init__.py +3 -3
- maxframe/dataframe/statistics/corr.py +1 -0
- maxframe/dataframe/statistics/quantile.py +2 -2
- maxframe/dataframe/tests/test_typing.py +104 -0
- maxframe/dataframe/tests/test_utils.py +66 -2
- maxframe/dataframe/typing_.py +185 -0
- maxframe/dataframe/utils.py +95 -26
- maxframe/dataframe/window/aggregation.py +8 -4
- maxframe/dataframe/window/core.py +14 -1
- maxframe/dataframe/window/ewm.py +1 -3
- maxframe/dataframe/window/expanding.py +37 -35
- maxframe/dataframe/window/rolling.py +49 -39
- maxframe/dataframe/window/tests/test_expanding.py +1 -7
- maxframe/dataframe/window/tests/test_rolling.py +1 -1
- maxframe/env.py +7 -4
- maxframe/errors.py +2 -2
- maxframe/io/objects/tests/test_object_io.py +4 -2
- maxframe/io/odpsio/schema.py +9 -3
- maxframe/io/odpsio/tableio.py +7 -2
- maxframe/io/odpsio/tests/test_schema.py +198 -83
- maxframe/io/odpsio/tests/test_volumeio.py +4 -15
- maxframe/io/odpsio/volumeio.py +23 -8
- maxframe/learn/__init__.py +10 -2
- maxframe/learn/cluster/__init__.py +15 -0
- maxframe/learn/cluster/_kmeans.py +782 -0
- maxframe/learn/contrib/llm/core.py +2 -0
- maxframe/learn/contrib/xgboost/core.py +87 -1
- maxframe/learn/contrib/xgboost/train.py +5 -2
- maxframe/learn/core.py +66 -0
- maxframe/learn/linear_model/_base.py +58 -1
- maxframe/learn/linear_model/_lin_reg.py +1 -1
- maxframe/learn/metrics/__init__.py +6 -0
- maxframe/learn/metrics/_classification.py +145 -0
- maxframe/learn/metrics/_ranking.py +477 -0
- maxframe/learn/metrics/_scorer.py +60 -0
- maxframe/learn/metrics/pairwise/__init__.py +21 -0
- maxframe/learn/metrics/pairwise/core.py +77 -0
- maxframe/learn/metrics/pairwise/cosine.py +115 -0
- maxframe/learn/metrics/pairwise/euclidean.py +176 -0
- maxframe/learn/metrics/pairwise/haversine.py +96 -0
- maxframe/learn/metrics/pairwise/manhattan.py +80 -0
- maxframe/learn/metrics/pairwise/pairwise.py +127 -0
- maxframe/learn/metrics/pairwise/pairwise_distances_topk.py +121 -0
- maxframe/learn/metrics/pairwise/rbf_kernel.py +51 -0
- maxframe/learn/metrics/tests/__init__.py +13 -0
- maxframe/learn/metrics/tests/test_scorer.py +26 -0
- maxframe/learn/utils/__init__.py +1 -1
- maxframe/learn/utils/checks.py +1 -2
- maxframe/learn/utils/core.py +59 -0
- maxframe/learn/utils/extmath.py +37 -0
- maxframe/learn/utils/odpsio.py +193 -0
- maxframe/learn/utils/validation.py +2 -2
- maxframe/lib/compat.py +40 -0
- maxframe/lib/dtypes_extension/__init__.py +16 -1
- maxframe/lib/dtypes_extension/_fake_arrow_dtype.py +604 -0
- maxframe/lib/dtypes_extension/blob.py +304 -0
- maxframe/lib/dtypes_extension/dtypes.py +40 -0
- maxframe/lib/dtypes_extension/tests/test_blob.py +88 -0
- maxframe/lib/dtypes_extension/tests/test_dtypes.py +16 -1
- maxframe/lib/dtypes_extension/tests/test_fake_arrow_dtype.py +75 -0
- maxframe/lib/filesystem/_oss_lib/common.py +122 -50
- maxframe/lib/filesystem/_oss_lib/glob.py +1 -1
- maxframe/lib/filesystem/_oss_lib/handle.py +21 -25
- maxframe/lib/filesystem/base.py +1 -1
- maxframe/lib/filesystem/core.py +1 -1
- maxframe/lib/filesystem/oss.py +115 -46
- maxframe/lib/filesystem/tests/test_oss.py +74 -36
- maxframe/lib/mmh3.cp37-win_amd64.pyd +0 -0
- maxframe/lib/wrapped_pickle.py +10 -0
- maxframe/opcodes.py +33 -15
- maxframe/protocol.py +12 -0
- maxframe/serialization/__init__.py +11 -2
- maxframe/serialization/arrow.py +38 -13
- maxframe/serialization/blob.py +32 -0
- maxframe/serialization/core.cp37-win_amd64.pyd +0 -0
- maxframe/serialization/core.pyx +39 -1
- maxframe/serialization/exception.py +2 -4
- maxframe/serialization/numpy.py +11 -0
- maxframe/serialization/pandas.py +46 -9
- maxframe/serialization/serializables/core.py +2 -2
- maxframe/serialization/tests/test_serial.py +29 -2
- maxframe/tensor/__init__.py +38 -8
- maxframe/tensor/arithmetic/__init__.py +19 -10
- maxframe/tensor/arithmetic/iscomplexobj.py +53 -0
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +6 -0
- maxframe/tensor/core.py +3 -2
- maxframe/tensor/datasource/tests/test_datasource.py +2 -1
- maxframe/tensor/extensions/__init__.py +2 -0
- maxframe/tensor/extensions/apply_chunk.py +3 -3
- maxframe/tensor/extensions/rebalance.py +65 -0
- maxframe/tensor/fft/__init__.py +32 -0
- maxframe/tensor/fft/core.py +168 -0
- maxframe/tensor/fft/fft.py +112 -0
- maxframe/tensor/fft/fft2.py +118 -0
- maxframe/tensor/fft/fftfreq.py +80 -0
- maxframe/tensor/fft/fftn.py +123 -0
- maxframe/tensor/fft/fftshift.py +79 -0
- maxframe/tensor/fft/hfft.py +112 -0
- maxframe/tensor/fft/ifft.py +114 -0
- maxframe/tensor/fft/ifft2.py +115 -0
- maxframe/tensor/fft/ifftn.py +123 -0
- maxframe/tensor/fft/ifftshift.py +73 -0
- maxframe/tensor/fft/ihfft.py +93 -0
- maxframe/tensor/fft/irfft.py +118 -0
- maxframe/tensor/fft/irfft2.py +62 -0
- maxframe/tensor/fft/irfftn.py +114 -0
- maxframe/tensor/fft/rfft.py +116 -0
- maxframe/tensor/fft/rfft2.py +63 -0
- maxframe/tensor/fft/rfftfreq.py +87 -0
- maxframe/tensor/fft/rfftn.py +113 -0
- maxframe/tensor/indexing/fill_diagonal.py +1 -7
- maxframe/tensor/linalg/__init__.py +7 -0
- maxframe/tensor/linalg/_einsumfunc.py +1025 -0
- maxframe/tensor/linalg/cholesky.py +117 -0
- maxframe/tensor/linalg/einsum.py +339 -0
- maxframe/tensor/linalg/lstsq.py +100 -0
- maxframe/tensor/linalg/matrix_norm.py +75 -0
- maxframe/tensor/linalg/norm.py +249 -0
- maxframe/tensor/linalg/solve.py +72 -0
- maxframe/tensor/linalg/solve_triangular.py +2 -2
- maxframe/tensor/linalg/vector_norm.py +113 -0
- maxframe/tensor/misc/__init__.py +24 -1
- maxframe/tensor/misc/argwhere.py +72 -0
- maxframe/tensor/misc/array_split.py +46 -0
- maxframe/tensor/misc/broadcast_arrays.py +57 -0
- maxframe/tensor/misc/copyto.py +130 -0
- maxframe/tensor/misc/delete.py +104 -0
- maxframe/tensor/misc/dsplit.py +68 -0
- maxframe/tensor/misc/ediff1d.py +74 -0
- maxframe/tensor/misc/expand_dims.py +85 -0
- maxframe/tensor/misc/flip.py +90 -0
- maxframe/tensor/misc/fliplr.py +64 -0
- maxframe/tensor/misc/flipud.py +68 -0
- maxframe/tensor/misc/hsplit.py +85 -0
- maxframe/tensor/misc/insert.py +139 -0
- maxframe/tensor/misc/moveaxis.py +83 -0
- maxframe/tensor/misc/result_type.py +88 -0
- maxframe/tensor/misc/roll.py +124 -0
- maxframe/tensor/misc/rollaxis.py +77 -0
- maxframe/tensor/misc/shape.py +89 -0
- maxframe/tensor/misc/split.py +190 -0
- maxframe/tensor/misc/tile.py +109 -0
- maxframe/tensor/misc/vsplit.py +74 -0
- maxframe/tensor/reduction/array_equal.py +2 -1
- maxframe/tensor/sort/__init__.py +2 -0
- maxframe/tensor/sort/argpartition.py +98 -0
- maxframe/tensor/sort/partition.py +228 -0
- maxframe/tensor/spatial/__init__.py +15 -0
- maxframe/tensor/spatial/distance/__init__.py +17 -0
- maxframe/tensor/spatial/distance/cdist.py +421 -0
- maxframe/tensor/spatial/distance/pdist.py +398 -0
- maxframe/tensor/spatial/distance/squareform.py +153 -0
- maxframe/tensor/special/__init__.py +159 -21
- maxframe/tensor/special/airy.py +55 -0
- maxframe/tensor/special/bessel.py +199 -0
- maxframe/tensor/special/core.py +65 -4
- maxframe/tensor/special/ellip_func_integrals.py +155 -0
- maxframe/tensor/special/ellip_harm.py +55 -0
- maxframe/tensor/special/err_fresnel.py +223 -0
- maxframe/tensor/special/gamma_funcs.py +303 -0
- maxframe/tensor/special/hypergeometric_funcs.py +69 -0
- maxframe/tensor/special/info_theory.py +189 -0
- maxframe/tensor/special/misc.py +21 -0
- maxframe/tensor/statistics/__init__.py +6 -0
- maxframe/tensor/statistics/corrcoef.py +77 -0
- maxframe/tensor/statistics/cov.py +222 -0
- maxframe/tensor/statistics/digitize.py +126 -0
- maxframe/tensor/statistics/histogram.py +520 -0
- maxframe/tensor/statistics/median.py +85 -0
- maxframe/tensor/statistics/ptp.py +89 -0
- maxframe/tensor/utils.py +3 -3
- maxframe/tests/test_utils.py +43 -1
- maxframe/tests/utils.py +3 -13
- maxframe/typing_.py +2 -0
- maxframe/udf.py +27 -2
- maxframe/utils.py +193 -19
- {maxframe-2.0.0b1.dist-info → maxframe-2.2.0.dist-info}/METADATA +3 -2
- {maxframe-2.0.0b1.dist-info → maxframe-2.2.0.dist-info}/RECORD +395 -240
- maxframe_client/fetcher.py +35 -4
- maxframe_client/session/odps.py +7 -2
- maxframe_client/tests/test_fetcher.py +76 -3
- maxframe_client/tests/test_session.py +4 -1
- /maxframe/dataframe/{misc → reshape}/melt.py +0 -0
- /maxframe/dataframe/{misc → reshape}/stack.py +0 -0
- {maxframe-2.0.0b1.dist-info → maxframe-2.2.0.dist-info}/WHEEL +0 -0
- {maxframe-2.0.0b1.dist-info → maxframe-2.2.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# Copyright 1999-2025 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import functools
|
|
16
|
+
from collections import defaultdict
|
|
17
|
+
|
|
18
|
+
import pandas as pd
|
|
19
|
+
import pytest
|
|
20
|
+
|
|
21
|
+
from .... import dataframe as md
|
|
22
|
+
from ...groupby.apply_chunk import GroupByApplyChunk
|
|
23
|
+
from .. import DataFrameApplyChunk
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@pytest.fixture
|
|
27
|
+
def df1():
|
|
28
|
+
return md.DataFrame(
|
|
29
|
+
{
|
|
30
|
+
"name": ["name key", "name", "key", "name", "key name"],
|
|
31
|
+
"id": [4, 2, 4, 3, 3],
|
|
32
|
+
"fid": [5.3, 3.5, 4.2, 2.2, 4.1],
|
|
33
|
+
}
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def test_map_reduce_with_map_only(df1):
|
|
38
|
+
func = functools.partial(lambda x: x)
|
|
39
|
+
|
|
40
|
+
with pytest.raises(ValueError):
|
|
41
|
+
df1.mf.map_reduce(func, reducer_dtypes={"col": "string"})
|
|
42
|
+
|
|
43
|
+
mapped = df1.mf.map_reduce(func)
|
|
44
|
+
assert isinstance(mapped.op, DataFrameApplyChunk)
|
|
45
|
+
assert mapped.op.func is func
|
|
46
|
+
|
|
47
|
+
map_combined = df1.mf.map_reduce(
|
|
48
|
+
func, combiner=func, mapper_dtypes=df1.dtypes, mapper_index=df1.index
|
|
49
|
+
)
|
|
50
|
+
assert isinstance(map_combined.op, DataFrameApplyChunk)
|
|
51
|
+
assert map_combined.op.func.__name__ == "CombinedMapper"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def test_mapper_with_combiner(df1):
|
|
55
|
+
class BaseFunc:
|
|
56
|
+
def __init__(self):
|
|
57
|
+
self._word_to_count = defaultdict(lambda: 0)
|
|
58
|
+
|
|
59
|
+
def _collect_df(self):
|
|
60
|
+
word_to_count = self._word_to_count.copy()
|
|
61
|
+
self._word_to_count.clear()
|
|
62
|
+
return pd.DataFrame(
|
|
63
|
+
[list(tp) for tp in word_to_count.items()],
|
|
64
|
+
columns=["word", "count"],
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
def close(self):
|
|
68
|
+
print(f"Close {type(self)}")
|
|
69
|
+
|
|
70
|
+
class MapperCls(BaseFunc):
|
|
71
|
+
def __call__(self, batch, end=False):
|
|
72
|
+
for words in batch["name"]:
|
|
73
|
+
for w in words.split():
|
|
74
|
+
self._word_to_count[w] += 1
|
|
75
|
+
if end:
|
|
76
|
+
return self._collect_df()
|
|
77
|
+
|
|
78
|
+
class CombinerCls(BaseFunc):
|
|
79
|
+
def __call__(self, batch, end=False):
|
|
80
|
+
for _, row in batch.iterrows():
|
|
81
|
+
self._word_to_count[row["word"]] = row["count"]
|
|
82
|
+
if end:
|
|
83
|
+
return self._collect_df()
|
|
84
|
+
|
|
85
|
+
map_combined = df1.mf.map_reduce(
|
|
86
|
+
MapperCls,
|
|
87
|
+
combiner=CombinerCls,
|
|
88
|
+
group_cols="word",
|
|
89
|
+
mapper_dtypes={"word": "str", "count": "int"},
|
|
90
|
+
mapper_index=df1.index,
|
|
91
|
+
)
|
|
92
|
+
|
|
93
|
+
raw = df1.op.data
|
|
94
|
+
combiner = map_combined.op.func()
|
|
95
|
+
ret1 = combiner(raw.iloc[:3], end=True)
|
|
96
|
+
ret2 = combiner(raw.iloc[3:], end=True)
|
|
97
|
+
close_ret = combiner.close()
|
|
98
|
+
expected1 = pd.DataFrame([["key", 2], ["name", 2]], columns=["word", "count"])
|
|
99
|
+
expected2 = pd.DataFrame([["key", 1], ["name", 2]], columns=["word", "count"])
|
|
100
|
+
assert close_ret is None
|
|
101
|
+
pd.testing.assert_frame_equal(ret1.reset_index(drop=True), expected1)
|
|
102
|
+
pd.testing.assert_frame_equal(ret2.reset_index(drop=True), expected2)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def test_map_reduce_with_reduce_only(df1):
|
|
106
|
+
func = functools.partial(lambda x: x)
|
|
107
|
+
|
|
108
|
+
with pytest.raises(ValueError):
|
|
109
|
+
df1.mf.map_reduce(reducer=func, mapper_dtypes={"col": "string"})
|
|
110
|
+
|
|
111
|
+
reduced = df1.mf.map_reduce(reducer=func, group_cols="name")
|
|
112
|
+
assert isinstance(reduced.op, GroupByApplyChunk)
|
|
113
|
+
assert reduced.op.func is func
|
|
114
|
+
assert reduced.op.groupby_params["by"] == ["name"]
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def test_map_reduce_with_both_funcs(df1):
|
|
118
|
+
map_func = functools.partial(lambda x: x)
|
|
119
|
+
|
|
120
|
+
class ReducerCls:
|
|
121
|
+
def __call__(self, batch):
|
|
122
|
+
return batch
|
|
123
|
+
|
|
124
|
+
reduced = df1.mf.map_reduce(
|
|
125
|
+
mapper=map_func,
|
|
126
|
+
reducer=ReducerCls,
|
|
127
|
+
group_cols="name",
|
|
128
|
+
reducer_dtypes=df1.dtypes,
|
|
129
|
+
reducer_index=df1.index,
|
|
130
|
+
)
|
|
131
|
+
assert isinstance(reduced.op, GroupByApplyChunk)
|
|
132
|
+
assert reduced.op.func is ReducerCls
|
|
133
|
+
assert reduced.op.groupby_params["by"] == ["name"]
|
|
134
|
+
assert isinstance(reduced.inputs[0].op, DataFrameApplyChunk)
|
|
135
|
+
assert reduced.inputs[0].op.func is map_func
|
|
@@ -15,6 +15,8 @@
|
|
|
15
15
|
# noinspection PyUnresolvedReferences
|
|
16
16
|
from ..core import DataFrameGroupBy, GroupBy, SeriesGroupBy
|
|
17
17
|
from .core import NamedAgg
|
|
18
|
+
from .expanding import ExpandingGroupby
|
|
19
|
+
from .rolling import RollingGroupby
|
|
18
20
|
|
|
19
21
|
|
|
20
22
|
def _install():
|
|
@@ -24,12 +26,15 @@ def _install():
|
|
|
24
26
|
from .apply import groupby_apply
|
|
25
27
|
from .apply_chunk import df_groupby_apply_chunk
|
|
26
28
|
from .core import groupby
|
|
27
|
-
from .
|
|
29
|
+
from .expanding import cumcount, cummax, cummin, cumprod, cumsum, expanding
|
|
28
30
|
from .extensions import DataFrameGroupByMaxFrameAccessor
|
|
29
31
|
from .fill import bfill, ffill, fillna
|
|
30
32
|
from .getitem import df_groupby_getitem
|
|
31
33
|
from .head import head
|
|
34
|
+
from .rank import rank
|
|
35
|
+
from .rolling import rolling
|
|
32
36
|
from .sample import groupby_sample
|
|
37
|
+
from .shift import shift
|
|
33
38
|
from .transform import groupby_transform
|
|
34
39
|
|
|
35
40
|
for cls in DATAFRAME_TYPE:
|
|
@@ -69,6 +74,12 @@ def _install():
|
|
|
69
74
|
setattr(cls, "cumprod", cumprod)
|
|
70
75
|
setattr(cls, "cumsum", cumsum)
|
|
71
76
|
|
|
77
|
+
setattr(cls, "expanding", expanding)
|
|
78
|
+
setattr(cls, "rolling", rolling)
|
|
79
|
+
|
|
80
|
+
setattr(cls, "shift", shift)
|
|
81
|
+
setattr(cls, "rank", rank)
|
|
82
|
+
|
|
72
83
|
setattr(cls, "head", head)
|
|
73
84
|
|
|
74
85
|
setattr(cls, "sample", groupby_sample)
|
|
@@ -20,16 +20,21 @@ import numpy as np
|
|
|
20
20
|
import pandas as pd
|
|
21
21
|
|
|
22
22
|
from ... import opcodes
|
|
23
|
+
from ...config import options
|
|
23
24
|
from ...core import ENTITY_TYPE, EntityData, OutputType
|
|
25
|
+
from ...serialization import PickleContainer
|
|
24
26
|
from ...serialization.serializables import (
|
|
25
27
|
AnyField,
|
|
28
|
+
BoolField,
|
|
26
29
|
DictField,
|
|
30
|
+
Int8Field,
|
|
27
31
|
Int32Field,
|
|
28
32
|
Int64Field,
|
|
29
33
|
ListField,
|
|
30
34
|
StringField,
|
|
31
35
|
)
|
|
32
|
-
from ...
|
|
36
|
+
from ...udf import BuiltinFunction
|
|
37
|
+
from ...utils import find_objects, lazy_import, pd_release_version
|
|
33
38
|
from ..core import GROUPBY_TYPE
|
|
34
39
|
from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
35
40
|
from ..reduction.aggregation import (
|
|
@@ -46,19 +51,7 @@ logger = logging.getLogger(__name__)
|
|
|
46
51
|
CV_THRESHOLD = 0.2
|
|
47
52
|
MEAN_RATIO_THRESHOLD = 2 / 3
|
|
48
53
|
_support_get_group_without_as_index = pd_release_version[:2] > (1, 0)
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
class SizeRecorder:
|
|
52
|
-
def __init__(self):
|
|
53
|
-
self._raw_records = []
|
|
54
|
-
self._agg_records = []
|
|
55
|
-
|
|
56
|
-
def record(self, raw_record: int, agg_record: int):
|
|
57
|
-
self._raw_records.append(raw_record)
|
|
58
|
-
self._agg_records.append(agg_record)
|
|
59
|
-
|
|
60
|
-
def get(self):
|
|
61
|
-
return self._raw_records, self._agg_records
|
|
54
|
+
_support_multi_index_as_index = pd_release_version[:2] > (2, 0)
|
|
62
55
|
|
|
63
56
|
|
|
64
57
|
_agg_functions = {
|
|
@@ -86,24 +79,28 @@ _series_col_name = "col_name"
|
|
|
86
79
|
|
|
87
80
|
def _patch_groupby_kurt():
|
|
88
81
|
try:
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
82
|
+
try:
|
|
83
|
+
from pandas.api.typing import DataFrameGroupBy, SeriesGroupBy
|
|
84
|
+
except ImportError:
|
|
85
|
+
from pandas.core.groupby import DataFrameGroupBy, SeriesGroupBy
|
|
86
|
+
|
|
87
|
+
if hasattr(DataFrameGroupBy, "kurt"): # pragma: no branch
|
|
88
|
+
return
|
|
89
|
+
|
|
90
|
+
def _kurt_by_frame(a, *args, **kwargs):
|
|
91
|
+
data = a.to_frame().kurt(*args, **kwargs).iloc[0]
|
|
92
|
+
if is_cudf(data): # pragma: no cover
|
|
93
|
+
data = data.copy()
|
|
94
|
+
return data
|
|
95
|
+
|
|
96
|
+
def _group_kurt(x, *args, **kwargs):
|
|
97
|
+
if kwargs.get("numeric_only") is not None:
|
|
98
|
+
return x.agg(functools.partial(_kurt_by_frame, *args, **kwargs))
|
|
99
|
+
else:
|
|
100
|
+
return x.agg(functools.partial(pd.Series.kurt, *args, **kwargs))
|
|
101
|
+
|
|
102
|
+
DataFrameGroupBy.kurt = DataFrameGroupBy.kurtosis = _group_kurt
|
|
103
|
+
SeriesGroupBy.kurt = SeriesGroupBy.kurtosis = _group_kurt
|
|
107
104
|
except (AttributeError, ImportError): # pragma: no cover
|
|
108
105
|
pass
|
|
109
106
|
|
|
@@ -137,23 +134,43 @@ def build_mock_agg_result(
|
|
|
137
134
|
class DataFrameGroupByAgg(DataFrameOperator, DataFrameOperatorMixin):
|
|
138
135
|
_op_type_ = opcodes.GROUPBY_AGG
|
|
139
136
|
|
|
140
|
-
raw_func = AnyField("raw_func")
|
|
141
|
-
raw_func_kw = DictField("raw_func_kw")
|
|
142
|
-
func = AnyField("func")
|
|
137
|
+
raw_func = AnyField("raw_func", default=None)
|
|
138
|
+
raw_func_kw = DictField("raw_func_kw", default=None)
|
|
139
|
+
func = AnyField("func", default=None)
|
|
143
140
|
func_rename = ListField("func_rename", default=None)
|
|
144
141
|
|
|
145
|
-
raw_groupby_params = DictField("raw_groupby_params")
|
|
146
|
-
groupby_params = DictField("groupby_params")
|
|
142
|
+
raw_groupby_params = DictField("raw_groupby_params", default=None)
|
|
143
|
+
groupby_params = DictField("groupby_params", default=None)
|
|
147
144
|
|
|
148
|
-
method = StringField("method")
|
|
145
|
+
method = StringField("method", default=None)
|
|
149
146
|
|
|
150
147
|
# for chunk
|
|
151
|
-
chunk_store_limit = Int64Field("chunk_store_limit")
|
|
152
|
-
pre_funcs = ListField("pre_funcs")
|
|
153
|
-
agg_funcs = ListField("agg_funcs")
|
|
154
|
-
post_funcs = ListField("post_funcs")
|
|
155
|
-
index_levels = Int32Field("index_levels")
|
|
156
|
-
size_recorder_name = StringField("size_recorder_name")
|
|
148
|
+
chunk_store_limit = Int64Field("chunk_store_limit", default=None)
|
|
149
|
+
pre_funcs = ListField("pre_funcs", default=None)
|
|
150
|
+
agg_funcs = ListField("agg_funcs", default=None)
|
|
151
|
+
post_funcs = ListField("post_funcs", default=None)
|
|
152
|
+
index_levels = Int32Field("index_levels", default=None)
|
|
153
|
+
size_recorder_name = StringField("size_recorder_name", default=None)
|
|
154
|
+
combine_size = Int32Field("combine_size", default=None)
|
|
155
|
+
|
|
156
|
+
use_inf_as_na = BoolField("use_inf_as_na", default=None)
|
|
157
|
+
input_ndim = Int8Field("input_ndim", default=1)
|
|
158
|
+
append_level = BoolField("append_level", default=False)
|
|
159
|
+
|
|
160
|
+
def has_custom_code(self) -> bool:
|
|
161
|
+
callable_bys = find_objects(
|
|
162
|
+
self.groupby_params.get("by"), types=PickleContainer, checker=callable
|
|
163
|
+
)
|
|
164
|
+
if callable_bys and any(
|
|
165
|
+
not isinstance(fun, BuiltinFunction) for fun in callable_bys
|
|
166
|
+
):
|
|
167
|
+
return True
|
|
168
|
+
|
|
169
|
+
return any(
|
|
170
|
+
fun.custom_reduction
|
|
171
|
+
and not isinstance(fun.custom_reduction, BuiltinFunction)
|
|
172
|
+
for fun in self.agg_funcs or ()
|
|
173
|
+
)
|
|
157
174
|
|
|
158
175
|
@classmethod
|
|
159
176
|
def _set_inputs(cls, op: "DataFrameGroupByAgg", inputs: List[EntityData]):
|
|
@@ -193,7 +210,9 @@ class DataFrameGroupByAgg(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
193
210
|
|
|
194
211
|
def _fix_as_index(self, result_index: pd.Index):
|
|
195
212
|
# make sure if as_index=False takes effect
|
|
196
|
-
if isinstance(
|
|
213
|
+
if not _support_multi_index_as_index and isinstance(
|
|
214
|
+
result_index, pd.MultiIndex
|
|
215
|
+
):
|
|
197
216
|
# if MultiIndex, as_index=False definitely takes no effect
|
|
198
217
|
self.groupby_params["as_index"] = True
|
|
199
218
|
elif result_index.name is not None:
|
|
@@ -217,12 +236,17 @@ class DataFrameGroupByAgg(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
217
236
|
agg_df.index, groupby.key, groupby.index_value.key
|
|
218
237
|
)
|
|
219
238
|
|
|
239
|
+
self.input_ndim = 2
|
|
240
|
+
|
|
220
241
|
# make sure if as_index=False takes effect
|
|
221
242
|
self._fix_as_index(agg_df.index)
|
|
222
243
|
|
|
223
244
|
# determine num of indices to group in intermediate steps
|
|
224
245
|
self.index_levels = self._get_index_levels(groupby, agg_df.index)
|
|
225
246
|
|
|
247
|
+
# if True, name of agg funcs will be appended as the last level
|
|
248
|
+
self.append_level = agg_df.dtypes.index.nlevels > input_df.dtypes.index.nlevels
|
|
249
|
+
|
|
226
250
|
inputs = self._get_inputs([input_df])
|
|
227
251
|
return self.new_dataframe(
|
|
228
252
|
inputs,
|
|
@@ -247,6 +271,8 @@ class DataFrameGroupByAgg(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
247
271
|
|
|
248
272
|
inputs = self._get_inputs([in_series])
|
|
249
273
|
|
|
274
|
+
self.input_ndim = 1
|
|
275
|
+
|
|
250
276
|
# determine num of indices to group in intermediate steps
|
|
251
277
|
self.index_levels = self._get_index_levels(groupby, agg_result.index)
|
|
252
278
|
|
|
@@ -376,6 +402,10 @@ def agg(groupby, func=None, method="auto", *args, **kwargs):
|
|
|
376
402
|
f"Method {method} is not available, please specify 'tree' or 'shuffle"
|
|
377
403
|
)
|
|
378
404
|
|
|
405
|
+
combine_size = (
|
|
406
|
+
kwargs.pop("combine_size", None) or options.dpe.reduction.combine_size
|
|
407
|
+
)
|
|
408
|
+
|
|
379
409
|
if not is_funcs_aggregate(func, ndim=groupby.ndim):
|
|
380
410
|
# pass index to transform, otherwise it will lose name info for index
|
|
381
411
|
agg_result = build_mock_agg_result(
|
|
@@ -400,5 +430,8 @@ def agg(groupby, func=None, method="auto", *args, **kwargs):
|
|
|
400
430
|
method=method,
|
|
401
431
|
raw_groupby_params=groupby.op.groupby_params,
|
|
402
432
|
groupby_params=groupby.op.groupby_params,
|
|
433
|
+
combine_size=combine_size,
|
|
434
|
+
chunk_store_limit=options.chunk_store_limit,
|
|
435
|
+
use_inf_as_na=pd.get_option("mode.use_inf_as_na"),
|
|
403
436
|
)
|
|
404
437
|
return agg_op(groupby)
|
|
@@ -60,7 +60,7 @@ class GroupByApply(
|
|
|
60
60
|
maybe_agg = BoolField("maybe_agg", default=None)
|
|
61
61
|
|
|
62
62
|
logic_key = StringField("logic_key", default=None)
|
|
63
|
-
|
|
63
|
+
func_ref = AnyField("func_ref", default=None)
|
|
64
64
|
need_clean_up_func = BoolField("need_clean_up_func", default=False)
|
|
65
65
|
|
|
66
66
|
def __init__(self, output_types=None, **kw):
|
|
@@ -22,8 +22,10 @@ from ...core import OutputType
|
|
|
22
22
|
from ...lib.version import parse as parse_version
|
|
23
23
|
from ...serialization.serializables import (
|
|
24
24
|
DictField,
|
|
25
|
+
FieldTypes,
|
|
25
26
|
FunctionField,
|
|
26
27
|
Int32Field,
|
|
28
|
+
ListField,
|
|
27
29
|
TupleField,
|
|
28
30
|
)
|
|
29
31
|
from ...udf import BuiltinFunction, MarkedFunction
|
|
@@ -31,6 +33,7 @@ from ...utils import copy_if_possible
|
|
|
31
33
|
from ..core import (
|
|
32
34
|
DATAFRAME_GROUPBY_TYPE,
|
|
33
35
|
GROUPBY_TYPE,
|
|
36
|
+
INDEX_TYPE,
|
|
34
37
|
DataFrameGroupBy,
|
|
35
38
|
IndexValue,
|
|
36
39
|
SeriesGroupBy,
|
|
@@ -61,6 +64,8 @@ class GroupByApplyChunk(DataFrameOperatorMixin, DataFrameOperator):
|
|
|
61
64
|
kwargs = DictField("kwargs", default=None)
|
|
62
65
|
|
|
63
66
|
groupby_params = DictField("groupby_params", default=None)
|
|
67
|
+
order_cols = ListField("order_cols", default=None)
|
|
68
|
+
ascending = ListField("ascending", FieldTypes.bool, default_factory=lambda: [True])
|
|
64
69
|
|
|
65
70
|
def __init__(self, output_type=None, **kw):
|
|
66
71
|
if output_type:
|
|
@@ -240,14 +245,14 @@ class GroupByApplyChunk(DataFrameOperatorMixin, DataFrameOperator):
|
|
|
240
245
|
if self.output_types:
|
|
241
246
|
inferred_meta.output_type = self.output_types[0]
|
|
242
247
|
inferred_meta.dtypes = dtypes if dtypes is not None else inferred_meta.dtypes
|
|
248
|
+
if isinstance(index, INDEX_TYPE):
|
|
249
|
+
index = index.index_value
|
|
243
250
|
if index is not None:
|
|
244
251
|
inferred_meta.index_value = (
|
|
245
252
|
parse_index(index)
|
|
246
253
|
if index is not input_groupby.index_value
|
|
247
254
|
else input_groupby.index_value
|
|
248
255
|
)
|
|
249
|
-
else:
|
|
250
|
-
inferred_meta.index_value = inferred_meta.index_value
|
|
251
256
|
inferred_meta.elementwise = elementwise or inferred_meta.elementwise
|
|
252
257
|
return inferred_meta
|
|
253
258
|
|
|
@@ -272,6 +277,8 @@ def df_groupby_apply_chunk(
|
|
|
272
277
|
output_type=None,
|
|
273
278
|
index=None,
|
|
274
279
|
skip_infer=False,
|
|
280
|
+
order_cols=None,
|
|
281
|
+
ascending=True,
|
|
275
282
|
args=(),
|
|
276
283
|
**kwargs,
|
|
277
284
|
):
|
|
@@ -373,6 +380,13 @@ def df_groupby_apply_chunk(
|
|
|
373
380
|
if skip_infer and output_type is None:
|
|
374
381
|
output_type = OutputType.df_or_series
|
|
375
382
|
|
|
383
|
+
if order_cols and not isinstance(order_cols, list):
|
|
384
|
+
order_cols = [order_cols]
|
|
385
|
+
if not isinstance(ascending, list):
|
|
386
|
+
ascending = [ascending]
|
|
387
|
+
elif len(order_cols) != len(ascending):
|
|
388
|
+
raise ValueError("order_cols and ascending must have same length")
|
|
389
|
+
|
|
376
390
|
# bind args and kwargs
|
|
377
391
|
op = GroupByApplyChunk(
|
|
378
392
|
func=func,
|
|
@@ -380,6 +394,8 @@ def df_groupby_apply_chunk(
|
|
|
380
394
|
output_type=output_type,
|
|
381
395
|
args=args,
|
|
382
396
|
kwargs=kwargs,
|
|
397
|
+
order_cols=order_cols,
|
|
398
|
+
ascending=ascending,
|
|
383
399
|
groupby_params=dataframe_groupby.op.groupby_params,
|
|
384
400
|
)
|
|
385
401
|
|
|
@@ -13,19 +13,27 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
15
|
from collections import namedtuple
|
|
16
|
-
from typing import List
|
|
16
|
+
from typing import Any, Dict, List
|
|
17
17
|
|
|
18
18
|
import pandas as pd
|
|
19
19
|
|
|
20
20
|
from ... import opcodes
|
|
21
21
|
from ...core import ENTITY_TYPE, Entity, EntityData, OutputType
|
|
22
22
|
from ...core.operator import MapReduceOperator
|
|
23
|
-
from ...serialization
|
|
24
|
-
from ...
|
|
25
|
-
from
|
|
23
|
+
from ...serialization import PickleContainer
|
|
24
|
+
from ...serialization.serializables import AnyField, BoolField, DictField, Int32Field
|
|
25
|
+
from ...udf import BuiltinFunction
|
|
26
|
+
from ...utils import find_objects, lazy_import, no_default
|
|
27
|
+
from ..core import GROUPBY_TYPE, SERIES_TYPE
|
|
26
28
|
from ..initializer import Series as asseries
|
|
27
|
-
from ..operators import DataFrameOperatorMixin
|
|
28
|
-
from ..utils import
|
|
29
|
+
from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
30
|
+
from ..utils import (
|
|
31
|
+
build_df,
|
|
32
|
+
build_series,
|
|
33
|
+
call_groupby_with_params,
|
|
34
|
+
make_column_list,
|
|
35
|
+
parse_index,
|
|
36
|
+
)
|
|
29
37
|
|
|
30
38
|
cudf = lazy_import("cudf")
|
|
31
39
|
|
|
@@ -35,7 +43,7 @@ NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"])
|
|
|
35
43
|
|
|
36
44
|
class DataFrameGroupByOp(MapReduceOperator, DataFrameOperatorMixin):
|
|
37
45
|
_op_type_ = opcodes.GROUPBY
|
|
38
|
-
_legacy_name = "DataFrameGroupByOperator"
|
|
46
|
+
_legacy_name = "DataFrameGroupByOperator" # since v2.0.0
|
|
39
47
|
|
|
40
48
|
by = AnyField(
|
|
41
49
|
"by",
|
|
@@ -61,6 +69,12 @@ class DataFrameGroupByOp(MapReduceOperator, DataFrameOperatorMixin):
|
|
|
61
69
|
output_types = [OutputType.series_groupby]
|
|
62
70
|
self.output_types = output_types
|
|
63
71
|
|
|
72
|
+
def has_custom_code(self) -> bool:
|
|
73
|
+
callable_bys = find_objects(self.by, types=PickleContainer, checker=callable)
|
|
74
|
+
if not callable_bys:
|
|
75
|
+
return False
|
|
76
|
+
return any(not isinstance(fun, BuiltinFunction) for fun in callable_bys)
|
|
77
|
+
|
|
64
78
|
@property
|
|
65
79
|
def is_dataframe_obj(self):
|
|
66
80
|
return self.output_types[0] in (
|
|
@@ -93,8 +107,8 @@ class DataFrameGroupByOp(MapReduceOperator, DataFrameOperatorMixin):
|
|
|
93
107
|
ensure_string=True,
|
|
94
108
|
)
|
|
95
109
|
|
|
96
|
-
new_kw = self.groupby_params
|
|
97
|
-
new_kw.update(kwargs)
|
|
110
|
+
new_kw = self.groupby_params.copy()
|
|
111
|
+
new_kw.update({k: v for k, v in kwargs.items()})
|
|
98
112
|
if isinstance(new_kw["by"], list):
|
|
99
113
|
new_by = []
|
|
100
114
|
for v in new_kw["by"]:
|
|
@@ -110,7 +124,7 @@ class DataFrameGroupByOp(MapReduceOperator, DataFrameOperatorMixin):
|
|
|
110
124
|
else:
|
|
111
125
|
new_by.append(v)
|
|
112
126
|
new_kw["by"] = new_by
|
|
113
|
-
return mock_obj
|
|
127
|
+
return call_groupby_with_params(mock_obj, new_kw)
|
|
114
128
|
|
|
115
129
|
@classmethod
|
|
116
130
|
def _set_inputs(cls, op: "DataFrameGroupByOp", inputs: List[EntityData]):
|
|
@@ -118,8 +132,8 @@ class DataFrameGroupByOp(MapReduceOperator, DataFrameOperatorMixin):
|
|
|
118
132
|
inputs_iter = iter(op._inputs[1:])
|
|
119
133
|
if len(inputs) > 1:
|
|
120
134
|
by = []
|
|
121
|
-
for k in op.by:
|
|
122
|
-
if isinstance(k,
|
|
135
|
+
for k in op.by or ():
|
|
136
|
+
if isinstance(k, ENTITY_TYPE):
|
|
123
137
|
by.append(next(inputs_iter))
|
|
124
138
|
else:
|
|
125
139
|
by.append(k)
|
|
@@ -240,3 +254,73 @@ def groupby(df, by=None, level=None, as_index=True, sort=True, group_keys=True):
|
|
|
240
254
|
output_types=output_types,
|
|
241
255
|
)
|
|
242
256
|
return op(df)
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
class BaseGroupByWindowOp(DataFrameOperatorMixin, DataFrameOperator):
|
|
260
|
+
_op_module_ = "dataframe.groupby"
|
|
261
|
+
|
|
262
|
+
groupby_params = DictField("groupby_params", default=None)
|
|
263
|
+
window_params = DictField("window_params", default=None)
|
|
264
|
+
|
|
265
|
+
def __init__(self, output_types=None, **kw):
|
|
266
|
+
super().__init__(_output_types=output_types, **kw)
|
|
267
|
+
|
|
268
|
+
def _calc_mock_result_df(self, mock_groupby):
|
|
269
|
+
raise NotImplementedError
|
|
270
|
+
|
|
271
|
+
def get_sort_cols_to_asc(self) -> Dict[Any, bool]:
|
|
272
|
+
order_cols = self.window_params.get("order_cols") or []
|
|
273
|
+
asc_list = self.window_params.get("ascending") or [True]
|
|
274
|
+
if len(asc_list) < len(order_cols):
|
|
275
|
+
asc_list = [asc_list[0]] * len(order_cols)
|
|
276
|
+
return dict(zip(order_cols, asc_list))
|
|
277
|
+
|
|
278
|
+
def _calc_out_dtypes(self, in_groupby):
|
|
279
|
+
in_obj = in_groupby
|
|
280
|
+
groupby_params = in_groupby.op.groupby_params
|
|
281
|
+
while isinstance(in_obj, GROUPBY_TYPE):
|
|
282
|
+
in_obj = in_obj.inputs[0]
|
|
283
|
+
|
|
284
|
+
if in_groupby.ndim == 1:
|
|
285
|
+
selection = None
|
|
286
|
+
else:
|
|
287
|
+
by_cols = (
|
|
288
|
+
make_column_list(groupby_params.get("by"), in_groupby.dtypes) or []
|
|
289
|
+
)
|
|
290
|
+
selection = groupby_params.get("selection")
|
|
291
|
+
if not selection:
|
|
292
|
+
selection = [c for c in in_obj.dtypes.index if c not in by_cols]
|
|
293
|
+
|
|
294
|
+
mock_groupby = in_groupby.op.build_mock_groupby(
|
|
295
|
+
group_keys=False, selection=selection
|
|
296
|
+
)
|
|
297
|
+
|
|
298
|
+
result_df = self._calc_mock_result_df(mock_groupby)
|
|
299
|
+
|
|
300
|
+
if isinstance(result_df, pd.DataFrame):
|
|
301
|
+
self.output_types = [OutputType.dataframe]
|
|
302
|
+
return result_df.dtypes
|
|
303
|
+
else:
|
|
304
|
+
self.output_types = [OutputType.series]
|
|
305
|
+
return result_df.name, result_df.dtype
|
|
306
|
+
|
|
307
|
+
def __call__(self, groupby):
|
|
308
|
+
in_df = groupby
|
|
309
|
+
while in_df.op.output_types[0] not in (OutputType.dataframe, OutputType.series):
|
|
310
|
+
in_df = in_df.inputs[0]
|
|
311
|
+
|
|
312
|
+
out_dtypes = self._calc_out_dtypes(groupby)
|
|
313
|
+
|
|
314
|
+
kw = in_df.params.copy()
|
|
315
|
+
if self.output_types[0] == OutputType.dataframe:
|
|
316
|
+
kw.update(
|
|
317
|
+
dict(
|
|
318
|
+
columns_value=parse_index(out_dtypes.index, store_data=True),
|
|
319
|
+
dtypes=out_dtypes,
|
|
320
|
+
shape=(groupby.shape[0], len(out_dtypes)),
|
|
321
|
+
)
|
|
322
|
+
)
|
|
323
|
+
else:
|
|
324
|
+
name, dtype = out_dtypes
|
|
325
|
+
kw.update(dtype=dtype, name=name, shape=(groupby.shape[0],))
|
|
326
|
+
return self.new_tileable([in_df], **kw)
|
|
@@ -25,6 +25,10 @@ cudf = lazy_import("cudf")
|
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
class GroupByCumReductionOperator(DataFrameOperatorMixin, DataFrameOperator):
|
|
28
|
+
"""
|
|
29
|
+
NOTE: this operator has been deprecated and merged with GroupByExpandingAgg.
|
|
30
|
+
"""
|
|
31
|
+
|
|
28
32
|
_op_module_ = "dataframe.groupby"
|
|
29
33
|
|
|
30
34
|
axis = AnyField("axis", default=None)
|
|
@@ -96,28 +100,3 @@ class GroupByCumprod(GroupByCumReductionOperator):
|
|
|
96
100
|
class GroupByCumcount(GroupByCumReductionOperator):
|
|
97
101
|
_op_type_ = opcodes.CUMCOUNT
|
|
98
102
|
_func_name = "cumcount"
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
def cumcount(groupby, ascending: bool = True):
|
|
102
|
-
op = GroupByCumcount(ascending=ascending)
|
|
103
|
-
return op(groupby)
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
def cummin(groupby, axis=0):
|
|
107
|
-
op = GroupByCummin(axis=axis)
|
|
108
|
-
return op(groupby)
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
def cummax(groupby, axis=0):
|
|
112
|
-
op = GroupByCummax(axis=axis)
|
|
113
|
-
return op(groupby)
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
def cumprod(groupby, axis=0):
|
|
117
|
-
op = GroupByCumprod(axis=axis)
|
|
118
|
-
return op(groupby)
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
def cumsum(groupby, axis=0):
|
|
122
|
-
op = GroupByCumsum(axis=axis)
|
|
123
|
-
return op(groupby)
|