maxframe 2.0.0b2__cp37-cp37m-win32.whl → 2.3.0rc1__cp37-cp37m-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/__init__.py +1 -0
- maxframe/_utils.cp37-win32.pyd +0 -0
- maxframe/_utils.pyx +14 -1
- maxframe/codegen/core.py +9 -8
- maxframe/codegen/spe/core.py +1 -1
- maxframe/codegen/spe/dataframe/__init__.py +1 -0
- maxframe/codegen/spe/dataframe/accessors/base.py +18 -0
- maxframe/codegen/spe/dataframe/accessors/dict_.py +25 -130
- maxframe/codegen/spe/dataframe/accessors/list_.py +12 -48
- maxframe/codegen/spe/dataframe/accessors/struct_.py +28 -0
- maxframe/codegen/spe/dataframe/arithmetic.py +7 -2
- maxframe/codegen/spe/dataframe/groupby.py +88 -0
- maxframe/codegen/spe/dataframe/indexing.py +99 -4
- maxframe/codegen/spe/dataframe/merge.py +38 -1
- maxframe/codegen/spe/dataframe/misc.py +11 -33
- maxframe/codegen/spe/dataframe/reduction.py +32 -9
- maxframe/codegen/spe/dataframe/reshape.py +46 -0
- maxframe/codegen/spe/dataframe/sort.py +39 -18
- maxframe/codegen/spe/dataframe/tests/accessors/test_dict.py +9 -15
- maxframe/codegen/spe/dataframe/tests/accessors/test_list.py +4 -7
- maxframe/codegen/spe/dataframe/tests/accessors/test_struct.py +75 -0
- maxframe/codegen/spe/dataframe/tests/indexing/test_iloc.py +20 -1
- maxframe/codegen/spe/dataframe/tests/indexing/test_loc.py +35 -0
- maxframe/codegen/spe/dataframe/tests/misc/test_misc.py +0 -32
- maxframe/codegen/spe/dataframe/tests/test_groupby.py +81 -18
- maxframe/codegen/spe/dataframe/tests/test_merge.py +27 -1
- maxframe/codegen/spe/dataframe/tests/test_reduction.py +13 -0
- maxframe/codegen/spe/dataframe/tests/test_reshape.py +79 -0
- maxframe/codegen/spe/dataframe/tests/test_sort.py +20 -0
- maxframe/codegen/spe/dataframe/tseries.py +9 -0
- maxframe/codegen/spe/learn/contrib/lightgbm.py +4 -3
- maxframe/codegen/spe/learn/contrib/tests/test_xgboost.py +2 -1
- maxframe/codegen/spe/learn/metrics/__init__.py +1 -1
- maxframe/codegen/spe/learn/metrics/_ranking.py +76 -0
- maxframe/codegen/spe/learn/metrics/pairwise.py +51 -0
- maxframe/codegen/spe/learn/metrics/tests/test_pairwise.py +36 -0
- maxframe/codegen/spe/learn/metrics/tests/test_ranking.py +59 -0
- maxframe/codegen/spe/tensor/__init__.py +3 -0
- maxframe/codegen/spe/tensor/datasource.py +1 -0
- maxframe/codegen/spe/tensor/fft.py +74 -0
- maxframe/codegen/spe/tensor/linalg.py +29 -2
- maxframe/codegen/spe/tensor/misc.py +79 -25
- maxframe/codegen/spe/tensor/spatial.py +45 -0
- maxframe/codegen/spe/tensor/statistics.py +44 -0
- maxframe/codegen/spe/tensor/tests/test_fft.py +64 -0
- maxframe/codegen/spe/tensor/tests/test_linalg.py +15 -1
- maxframe/codegen/spe/tensor/tests/test_misc.py +52 -2
- maxframe/codegen/spe/tensor/tests/test_spatial.py +33 -0
- maxframe/codegen/spe/tensor/tests/test_statistics.py +15 -1
- maxframe/codegen/spe/tests/test_spe_codegen.py +6 -12
- maxframe/codegen/spe/utils.py +2 -0
- maxframe/config/config.py +73 -9
- maxframe/config/tests/test_validators.py +13 -1
- maxframe/config/validators.py +49 -0
- maxframe/conftest.py +54 -17
- maxframe/core/accessor.py +2 -2
- maxframe/core/base.py +2 -1
- maxframe/core/entity/core.py +5 -0
- maxframe/core/entity/tileables.py +3 -1
- maxframe/core/graph/core.cp37-win32.pyd +0 -0
- maxframe/core/graph/entity.py +8 -3
- maxframe/core/mode.py +6 -1
- maxframe/core/operator/base.py +9 -2
- maxframe/core/operator/core.py +10 -2
- maxframe/core/operator/utils.py +13 -0
- maxframe/dataframe/__init__.py +12 -5
- maxframe/dataframe/accessors/__init__.py +1 -1
- maxframe/dataframe/accessors/compat.py +45 -0
- maxframe/dataframe/accessors/datetime_/__init__.py +4 -1
- maxframe/dataframe/accessors/dict_/contains.py +7 -16
- maxframe/dataframe/accessors/dict_/core.py +48 -0
- maxframe/dataframe/accessors/dict_/getitem.py +17 -21
- maxframe/dataframe/accessors/dict_/length.py +7 -16
- maxframe/dataframe/accessors/dict_/remove.py +6 -18
- maxframe/dataframe/accessors/dict_/setitem.py +8 -18
- maxframe/dataframe/accessors/dict_/tests/test_dict_accessor.py +62 -22
- maxframe/dataframe/accessors/list_/__init__.py +2 -2
- maxframe/dataframe/accessors/list_/core.py +48 -0
- maxframe/dataframe/accessors/list_/getitem.py +12 -19
- maxframe/dataframe/accessors/list_/length.py +7 -16
- maxframe/dataframe/accessors/list_/tests/test_list_accessor.py +11 -9
- maxframe/dataframe/accessors/string_/__init__.py +4 -1
- maxframe/dataframe/accessors/struct_/__init__.py +37 -0
- maxframe/dataframe/accessors/struct_/accessor.py +39 -0
- maxframe/dataframe/accessors/struct_/core.py +43 -0
- maxframe/dataframe/accessors/struct_/dtypes.py +53 -0
- maxframe/dataframe/accessors/struct_/field.py +123 -0
- maxframe/dataframe/accessors/struct_/tests/__init__.py +13 -0
- maxframe/dataframe/accessors/struct_/tests/test_struct_accessor.py +91 -0
- maxframe/dataframe/arithmetic/__init__.py +18 -4
- maxframe/dataframe/arithmetic/between.py +106 -0
- maxframe/dataframe/arithmetic/dot.py +237 -0
- maxframe/dataframe/arithmetic/maximum.py +33 -0
- maxframe/dataframe/arithmetic/minimum.py +33 -0
- maxframe/dataframe/arithmetic/{around.py → round.py} +11 -7
- maxframe/dataframe/core.py +161 -224
- maxframe/dataframe/datasource/__init__.py +18 -0
- maxframe/dataframe/datasource/core.py +6 -0
- maxframe/dataframe/datasource/direct.py +57 -0
- maxframe/dataframe/datasource/from_dict.py +124 -0
- maxframe/dataframe/datasource/from_index.py +1 -1
- maxframe/dataframe/datasource/from_records.py +77 -0
- maxframe/dataframe/datasource/from_tensor.py +109 -41
- maxframe/dataframe/datasource/read_csv.py +21 -14
- maxframe/dataframe/datasource/read_odps_query.py +29 -6
- maxframe/dataframe/datasource/read_odps_table.py +32 -10
- maxframe/dataframe/datasource/read_parquet.py +38 -39
- maxframe/dataframe/datasource/tests/test_datasource.py +37 -0
- maxframe/dataframe/datastore/__init__.py +11 -1
- maxframe/dataframe/datastore/direct.py +268 -0
- maxframe/dataframe/datastore/to_csv.py +29 -41
- maxframe/dataframe/datastore/to_odps.py +36 -4
- maxframe/dataframe/extensions/__init__.py +20 -4
- maxframe/dataframe/extensions/apply_chunk.py +32 -6
- maxframe/dataframe/extensions/cartesian_chunk.py +153 -0
- maxframe/dataframe/extensions/collect_kv.py +126 -0
- maxframe/dataframe/extensions/extract_kv.py +177 -0
- maxframe/dataframe/extensions/flatjson.py +2 -1
- maxframe/dataframe/extensions/map_reduce.py +263 -0
- maxframe/dataframe/extensions/rebalance.py +62 -0
- maxframe/dataframe/extensions/tests/test_apply_chunk.py +9 -2
- maxframe/dataframe/extensions/tests/test_extensions.py +54 -0
- maxframe/dataframe/extensions/tests/test_map_reduce.py +135 -0
- maxframe/dataframe/groupby/__init__.py +17 -2
- maxframe/dataframe/groupby/aggregation.py +86 -49
- maxframe/dataframe/groupby/apply.py +1 -1
- maxframe/dataframe/groupby/apply_chunk.py +19 -5
- maxframe/dataframe/groupby/core.py +116 -16
- maxframe/dataframe/groupby/cum.py +4 -25
- maxframe/dataframe/groupby/expanding.py +264 -0
- maxframe/dataframe/groupby/fill.py +1 -1
- maxframe/dataframe/groupby/getitem.py +12 -5
- maxframe/dataframe/groupby/head.py +11 -1
- maxframe/dataframe/groupby/rank.py +136 -0
- maxframe/dataframe/groupby/rolling.py +206 -0
- maxframe/dataframe/groupby/shift.py +114 -0
- maxframe/dataframe/groupby/tests/test_groupby.py +0 -5
- maxframe/dataframe/indexing/__init__.py +22 -2
- maxframe/dataframe/indexing/droplevel.py +195 -0
- maxframe/dataframe/indexing/filter.py +169 -0
- maxframe/dataframe/indexing/get_level_values.py +76 -0
- maxframe/dataframe/indexing/iat.py +45 -0
- maxframe/dataframe/indexing/iloc.py +152 -12
- maxframe/dataframe/indexing/insert.py +46 -18
- maxframe/dataframe/indexing/loc.py +287 -7
- maxframe/dataframe/indexing/reindex.py +14 -5
- maxframe/dataframe/indexing/rename.py +6 -0
- maxframe/dataframe/indexing/rename_axis.py +2 -2
- maxframe/dataframe/indexing/reorder_levels.py +143 -0
- maxframe/dataframe/indexing/reset_index.py +33 -6
- maxframe/dataframe/indexing/sample.py +8 -0
- maxframe/dataframe/indexing/setitem.py +3 -3
- maxframe/dataframe/indexing/swaplevel.py +185 -0
- maxframe/dataframe/indexing/take.py +99 -0
- maxframe/dataframe/indexing/truncate.py +140 -0
- maxframe/dataframe/indexing/where.py +0 -11
- maxframe/dataframe/indexing/xs.py +148 -0
- maxframe/dataframe/merge/__init__.py +15 -1
- maxframe/dataframe/merge/append.py +97 -98
- maxframe/dataframe/merge/combine.py +244 -0
- maxframe/dataframe/merge/combine_first.py +120 -0
- maxframe/dataframe/merge/compare.py +387 -0
- maxframe/dataframe/merge/concat.py +183 -0
- maxframe/dataframe/merge/update.py +271 -0
- maxframe/dataframe/misc/__init__.py +28 -11
- maxframe/dataframe/misc/_duplicate.py +10 -4
- maxframe/dataframe/misc/apply.py +1 -1
- maxframe/dataframe/misc/check_unique.py +82 -0
- maxframe/dataframe/misc/clip.py +145 -0
- maxframe/dataframe/misc/describe.py +175 -9
- maxframe/dataframe/misc/drop.py +31 -0
- maxframe/dataframe/misc/drop_duplicates.py +2 -2
- maxframe/dataframe/misc/duplicated.py +2 -2
- maxframe/dataframe/misc/get_dummies.py +5 -1
- maxframe/dataframe/misc/infer_dtypes.py +251 -0
- maxframe/dataframe/misc/isin.py +2 -2
- maxframe/dataframe/misc/map.py +125 -18
- maxframe/dataframe/misc/repeat.py +159 -0
- maxframe/dataframe/misc/tests/test_misc.py +48 -3
- maxframe/dataframe/misc/to_numeric.py +3 -0
- maxframe/dataframe/misc/transform.py +12 -5
- maxframe/dataframe/misc/transpose.py +13 -1
- maxframe/dataframe/misc/valid_index.py +115 -0
- maxframe/dataframe/misc/value_counts.py +38 -4
- maxframe/dataframe/missing/checkna.py +14 -6
- maxframe/dataframe/missing/dropna.py +5 -0
- maxframe/dataframe/missing/fillna.py +1 -1
- maxframe/dataframe/missing/replace.py +7 -4
- maxframe/dataframe/reduction/__init__.py +35 -16
- maxframe/dataframe/reduction/aggregation.py +43 -14
- maxframe/dataframe/reduction/all.py +2 -2
- maxframe/dataframe/reduction/any.py +2 -2
- maxframe/dataframe/reduction/argmax.py +103 -0
- maxframe/dataframe/reduction/argmin.py +103 -0
- maxframe/dataframe/reduction/core.py +80 -24
- maxframe/dataframe/reduction/count.py +13 -9
- maxframe/dataframe/reduction/cov.py +166 -0
- maxframe/dataframe/reduction/cummax.py +2 -2
- maxframe/dataframe/reduction/cummin.py +2 -2
- maxframe/dataframe/reduction/cumprod.py +2 -2
- maxframe/dataframe/reduction/cumsum.py +2 -2
- maxframe/dataframe/reduction/custom_reduction.py +2 -2
- maxframe/dataframe/reduction/idxmax.py +185 -0
- maxframe/dataframe/reduction/idxmin.py +185 -0
- maxframe/dataframe/reduction/kurtosis.py +37 -30
- maxframe/dataframe/reduction/max.py +2 -2
- maxframe/dataframe/reduction/mean.py +9 -7
- maxframe/dataframe/reduction/median.py +2 -2
- maxframe/dataframe/reduction/min.py +2 -2
- maxframe/dataframe/reduction/mode.py +144 -0
- maxframe/dataframe/reduction/nunique.py +19 -11
- maxframe/dataframe/reduction/prod.py +18 -13
- maxframe/dataframe/reduction/reduction_size.py +2 -2
- maxframe/dataframe/reduction/sem.py +13 -9
- maxframe/dataframe/reduction/skew.py +31 -27
- maxframe/dataframe/reduction/str_concat.py +10 -7
- maxframe/dataframe/reduction/sum.py +18 -14
- maxframe/dataframe/reduction/tests/test_reduction.py +12 -0
- maxframe/dataframe/reduction/unique.py +20 -3
- maxframe/dataframe/reduction/var.py +16 -12
- maxframe/dataframe/reshape/__init__.py +38 -0
- maxframe/dataframe/{misc → reshape}/pivot.py +1 -0
- maxframe/dataframe/{misc → reshape}/pivot_table.py +1 -0
- maxframe/dataframe/reshape/unstack.py +114 -0
- maxframe/dataframe/sort/__init__.py +16 -1
- maxframe/dataframe/sort/argsort.py +68 -0
- maxframe/dataframe/sort/core.py +2 -1
- maxframe/dataframe/sort/nlargest.py +238 -0
- maxframe/dataframe/sort/nsmallest.py +228 -0
- maxframe/dataframe/sort/rank.py +147 -0
- maxframe/dataframe/statistics/__init__.py +3 -3
- maxframe/dataframe/statistics/corr.py +1 -0
- maxframe/dataframe/statistics/quantile.py +2 -2
- maxframe/dataframe/tests/test_typing.py +104 -0
- maxframe/dataframe/tests/test_utils.py +66 -2
- maxframe/dataframe/tseries/__init__.py +19 -0
- maxframe/dataframe/tseries/at_time.py +61 -0
- maxframe/dataframe/tseries/between_time.py +122 -0
- maxframe/dataframe/typing_.py +185 -0
- maxframe/dataframe/utils.py +125 -52
- maxframe/dataframe/window/aggregation.py +8 -4
- maxframe/dataframe/window/core.py +14 -1
- maxframe/dataframe/window/ewm.py +1 -3
- maxframe/dataframe/window/expanding.py +37 -35
- maxframe/dataframe/window/rolling.py +49 -39
- maxframe/dataframe/window/tests/test_expanding.py +1 -7
- maxframe/dataframe/window/tests/test_rolling.py +1 -1
- maxframe/env.py +7 -4
- maxframe/errors.py +2 -2
- maxframe/io/odpsio/schema.py +9 -3
- maxframe/io/odpsio/tableio.py +7 -2
- maxframe/io/odpsio/tests/test_schema.py +198 -83
- maxframe/learn/__init__.py +10 -2
- maxframe/learn/cluster/__init__.py +15 -0
- maxframe/learn/cluster/_kmeans.py +782 -0
- maxframe/learn/contrib/llm/core.py +18 -7
- maxframe/learn/contrib/llm/deploy/__init__.py +13 -0
- maxframe/learn/contrib/llm/deploy/config.py +221 -0
- maxframe/learn/contrib/llm/deploy/core.py +247 -0
- maxframe/learn/contrib/llm/deploy/framework.py +35 -0
- maxframe/learn/contrib/llm/deploy/loader.py +360 -0
- maxframe/learn/contrib/llm/deploy/tests/__init__.py +13 -0
- maxframe/learn/contrib/llm/deploy/tests/test_register_models.py +359 -0
- maxframe/learn/contrib/llm/models/__init__.py +1 -0
- maxframe/learn/contrib/llm/models/dashscope.py +12 -6
- maxframe/learn/contrib/llm/models/managed.py +76 -11
- maxframe/learn/contrib/llm/models/openai.py +72 -0
- maxframe/learn/contrib/llm/tests/__init__.py +13 -0
- maxframe/learn/contrib/llm/tests/test_core.py +34 -0
- maxframe/learn/contrib/llm/tests/test_openai.py +187 -0
- maxframe/learn/contrib/llm/tests/test_text_gen.py +155 -0
- maxframe/learn/contrib/llm/text.py +348 -42
- maxframe/learn/contrib/models.py +4 -1
- maxframe/learn/contrib/xgboost/classifier.py +2 -0
- maxframe/learn/contrib/xgboost/core.py +113 -4
- maxframe/learn/contrib/xgboost/predict.py +4 -2
- maxframe/learn/contrib/xgboost/regressor.py +5 -0
- maxframe/learn/contrib/xgboost/train.py +7 -2
- maxframe/learn/core.py +66 -0
- maxframe/learn/linear_model/_base.py +58 -1
- maxframe/learn/linear_model/_lin_reg.py +1 -1
- maxframe/learn/metrics/__init__.py +6 -0
- maxframe/learn/metrics/_classification.py +145 -0
- maxframe/learn/metrics/_ranking.py +477 -0
- maxframe/learn/metrics/_scorer.py +60 -0
- maxframe/learn/metrics/pairwise/__init__.py +21 -0
- maxframe/learn/metrics/pairwise/core.py +77 -0
- maxframe/learn/metrics/pairwise/cosine.py +115 -0
- maxframe/learn/metrics/pairwise/euclidean.py +176 -0
- maxframe/learn/metrics/pairwise/haversine.py +96 -0
- maxframe/learn/metrics/pairwise/manhattan.py +80 -0
- maxframe/learn/metrics/pairwise/pairwise.py +127 -0
- maxframe/learn/metrics/pairwise/pairwise_distances_topk.py +121 -0
- maxframe/learn/metrics/pairwise/rbf_kernel.py +51 -0
- maxframe/learn/metrics/tests/__init__.py +13 -0
- maxframe/learn/metrics/tests/test_scorer.py +26 -0
- maxframe/learn/preprocessing/_data/min_max_scaler.py +34 -23
- maxframe/learn/preprocessing/_data/standard_scaler.py +34 -25
- maxframe/learn/utils/__init__.py +2 -1
- maxframe/learn/utils/checks.py +1 -2
- maxframe/learn/utils/core.py +59 -0
- maxframe/learn/utils/extmath.py +79 -9
- maxframe/learn/utils/odpsio.py +262 -0
- maxframe/learn/utils/validation.py +2 -2
- maxframe/lib/compat.py +40 -0
- maxframe/lib/dtypes_extension/__init__.py +16 -1
- maxframe/lib/dtypes_extension/_fake_arrow_dtype.py +604 -0
- maxframe/lib/dtypes_extension/blob.py +304 -0
- maxframe/lib/dtypes_extension/dtypes.py +40 -0
- maxframe/lib/dtypes_extension/tests/test_blob.py +88 -0
- maxframe/lib/dtypes_extension/tests/test_dtypes.py +16 -1
- maxframe/lib/dtypes_extension/tests/test_fake_arrow_dtype.py +75 -0
- maxframe/lib/filesystem/_oss_lib/common.py +124 -50
- maxframe/lib/filesystem/_oss_lib/glob.py +1 -1
- maxframe/lib/filesystem/_oss_lib/handle.py +21 -25
- maxframe/lib/filesystem/base.py +1 -1
- maxframe/lib/filesystem/core.py +1 -1
- maxframe/lib/filesystem/oss.py +115 -46
- maxframe/lib/filesystem/tests/test_oss.py +74 -36
- maxframe/lib/mmh3.cp37-win32.pyd +0 -0
- maxframe/lib/wrapped_pickle.py +10 -0
- maxframe/opcodes.py +41 -15
- maxframe/protocol.py +12 -0
- maxframe/remote/core.py +4 -0
- maxframe/serialization/__init__.py +11 -2
- maxframe/serialization/arrow.py +38 -13
- maxframe/serialization/blob.py +32 -0
- maxframe/serialization/core.cp37-win32.pyd +0 -0
- maxframe/serialization/core.pyx +39 -1
- maxframe/serialization/exception.py +2 -4
- maxframe/serialization/numpy.py +11 -0
- maxframe/serialization/pandas.py +46 -9
- maxframe/serialization/serializables/core.py +2 -2
- maxframe/serialization/tests/test_serial.py +31 -4
- maxframe/tensor/__init__.py +38 -8
- maxframe/tensor/arithmetic/__init__.py +19 -10
- maxframe/tensor/arithmetic/core.py +2 -2
- maxframe/tensor/arithmetic/iscomplexobj.py +53 -0
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +6 -9
- maxframe/tensor/core.py +6 -2
- maxframe/tensor/datasource/tests/test_datasource.py +2 -1
- maxframe/tensor/extensions/__init__.py +2 -0
- maxframe/tensor/extensions/apply_chunk.py +3 -3
- maxframe/tensor/extensions/rebalance.py +65 -0
- maxframe/tensor/fft/__init__.py +32 -0
- maxframe/tensor/fft/core.py +168 -0
- maxframe/tensor/fft/fft.py +112 -0
- maxframe/tensor/fft/fft2.py +118 -0
- maxframe/tensor/fft/fftfreq.py +80 -0
- maxframe/tensor/fft/fftn.py +123 -0
- maxframe/tensor/fft/fftshift.py +79 -0
- maxframe/tensor/fft/hfft.py +112 -0
- maxframe/tensor/fft/ifft.py +114 -0
- maxframe/tensor/fft/ifft2.py +115 -0
- maxframe/tensor/fft/ifftn.py +123 -0
- maxframe/tensor/fft/ifftshift.py +73 -0
- maxframe/tensor/fft/ihfft.py +93 -0
- maxframe/tensor/fft/irfft.py +118 -0
- maxframe/tensor/fft/irfft2.py +62 -0
- maxframe/tensor/fft/irfftn.py +114 -0
- maxframe/tensor/fft/rfft.py +116 -0
- maxframe/tensor/fft/rfft2.py +63 -0
- maxframe/tensor/fft/rfftfreq.py +87 -0
- maxframe/tensor/fft/rfftn.py +113 -0
- maxframe/tensor/indexing/fill_diagonal.py +1 -7
- maxframe/tensor/linalg/__init__.py +7 -0
- maxframe/tensor/linalg/_einsumfunc.py +1025 -0
- maxframe/tensor/linalg/cholesky.py +117 -0
- maxframe/tensor/linalg/einsum.py +339 -0
- maxframe/tensor/linalg/lstsq.py +100 -0
- maxframe/tensor/linalg/matrix_norm.py +75 -0
- maxframe/tensor/linalg/norm.py +249 -0
- maxframe/tensor/linalg/solve.py +72 -0
- maxframe/tensor/linalg/solve_triangular.py +2 -2
- maxframe/tensor/linalg/vector_norm.py +113 -0
- maxframe/tensor/misc/__init__.py +24 -1
- maxframe/tensor/misc/argwhere.py +72 -0
- maxframe/tensor/misc/array_split.py +46 -0
- maxframe/tensor/misc/broadcast_arrays.py +57 -0
- maxframe/tensor/misc/copyto.py +130 -0
- maxframe/tensor/misc/delete.py +104 -0
- maxframe/tensor/misc/dsplit.py +68 -0
- maxframe/tensor/misc/ediff1d.py +74 -0
- maxframe/tensor/misc/expand_dims.py +85 -0
- maxframe/tensor/misc/flip.py +90 -0
- maxframe/tensor/misc/fliplr.py +64 -0
- maxframe/tensor/misc/flipud.py +68 -0
- maxframe/tensor/misc/hsplit.py +85 -0
- maxframe/tensor/misc/insert.py +139 -0
- maxframe/tensor/misc/moveaxis.py +83 -0
- maxframe/tensor/misc/result_type.py +88 -0
- maxframe/tensor/misc/roll.py +124 -0
- maxframe/tensor/misc/rollaxis.py +77 -0
- maxframe/tensor/misc/shape.py +89 -0
- maxframe/tensor/misc/split.py +190 -0
- maxframe/tensor/misc/tile.py +109 -0
- maxframe/tensor/misc/vsplit.py +74 -0
- maxframe/tensor/reduction/array_equal.py +2 -1
- maxframe/tensor/sort/__init__.py +2 -0
- maxframe/tensor/sort/argpartition.py +98 -0
- maxframe/tensor/sort/partition.py +228 -0
- maxframe/tensor/spatial/__init__.py +15 -0
- maxframe/tensor/spatial/distance/__init__.py +17 -0
- maxframe/tensor/spatial/distance/cdist.py +421 -0
- maxframe/tensor/spatial/distance/pdist.py +398 -0
- maxframe/tensor/spatial/distance/squareform.py +153 -0
- maxframe/tensor/special/__init__.py +159 -21
- maxframe/tensor/special/airy.py +55 -0
- maxframe/tensor/special/bessel.py +199 -0
- maxframe/tensor/special/core.py +65 -4
- maxframe/tensor/special/ellip_func_integrals.py +155 -0
- maxframe/tensor/special/ellip_harm.py +55 -0
- maxframe/tensor/special/err_fresnel.py +223 -0
- maxframe/tensor/special/gamma_funcs.py +303 -0
- maxframe/tensor/special/hypergeometric_funcs.py +69 -0
- maxframe/tensor/special/info_theory.py +189 -0
- maxframe/tensor/special/misc.py +21 -0
- maxframe/tensor/statistics/__init__.py +6 -0
- maxframe/tensor/statistics/corrcoef.py +77 -0
- maxframe/tensor/statistics/cov.py +222 -0
- maxframe/tensor/statistics/digitize.py +126 -0
- maxframe/tensor/statistics/histogram.py +520 -0
- maxframe/tensor/statistics/median.py +85 -0
- maxframe/tensor/statistics/ptp.py +89 -0
- maxframe/tensor/utils.py +3 -3
- maxframe/tests/test_udf.py +61 -0
- maxframe/tests/test_utils.py +51 -6
- maxframe/tests/utils.py +0 -2
- maxframe/typing_.py +2 -0
- maxframe/udf.py +130 -9
- maxframe/utils.py +254 -27
- {maxframe-2.0.0b2.dist-info → maxframe-2.3.0rc1.dist-info}/METADATA +3 -3
- {maxframe-2.0.0b2.dist-info → maxframe-2.3.0rc1.dist-info}/RECORD +442 -264
- maxframe_client/fetcher.py +35 -4
- maxframe_client/session/odps.py +7 -2
- maxframe_client/session/task.py +8 -1
- maxframe_client/tests/test_fetcher.py +76 -3
- maxframe_client/tests/test_session.py +28 -1
- maxframe/dataframe/arrays.py +0 -864
- /maxframe/dataframe/{misc → reshape}/melt.py +0 -0
- /maxframe/dataframe/{misc → reshape}/stack.py +0 -0
- {maxframe-2.0.0b2.dist-info → maxframe-2.3.0rc1.dist-info}/WHEEL +0 -0
- {maxframe-2.0.0b2.dist-info → maxframe-2.3.0rc1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
# Copyright 1999-2025 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from typing import List
|
|
16
|
+
|
|
17
|
+
import numpy as np
|
|
18
|
+
import pandas as pd
|
|
19
|
+
|
|
20
|
+
from ... import opcodes
|
|
21
|
+
from ...core import EntityData, OutputType
|
|
22
|
+
from ...serialization.serializables import AnyField, KeyField, StringField
|
|
23
|
+
from ...utils import make_dtype, no_default
|
|
24
|
+
from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
25
|
+
from ..utils import make_column_list
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class DataFrameExtractKv(DataFrameOperator, DataFrameOperatorMixin):
    """
    Operator that extracts key-value encoded string columns of a DataFrame
    into standalone columns.

    When called, the operator first builds a helper DataFrame holding the
    distinct, sorted output column names (``"<column>_<key>"``) found in the
    data, and registers it as an extra input next to the source DataFrame.
    """

    _op_type_ = opcodes.EXTRACT_KV

    # key-value columns to extract
    columns = AnyField("columns", default=None)
    # delimiter between a key and its value
    kv_delim = StringField("kv_delim", default="=")
    # delimiter between key-value pairs
    item_delim = StringField("item_delim", default=",")
    # dtype of the generated value columns
    dtype = AnyField("dtype", default=None)
    # value used for missing key-value pairs
    fill_value = AnyField("fill_value", default=None)
    # "raise" makes malformed pairs raise ValueError while collecting keys
    errors = StringField("errors", default="raise")
    # intermediate agg data
    agg_results = KeyField("agg_results", default=None)

    def __init__(self, kv_delim="=", item_delim=",", **kw):
        super().__init__(kv_delim=kv_delim, item_delim=item_delim, **kw)
        self.output_types = [OutputType.dataframe]

    @classmethod
    def _set_inputs(cls, op: "DataFrameExtractKv", inputs: List[EntityData]):
        """Rebind ``agg_results`` to the last input once inputs are replaced."""
        super()._set_inputs(op, inputs)
        if op.agg_results is None:
            return
        # __call__ always appends the key table as the final input
        op.agg_results = inputs[-1]

    def __call__(self, df):
        """
        Build the output DataFrame for ``df``.

        The column count of the result is unknown at this point (it depends
        on the keys present in the data), hence the ``np.nan`` in the shape
        and the absent dtypes / columns_value.
        """
        # captured by the closure below instead of passing the operator itself
        on_error = self.errors

        def get_keys(row, cols, kv_delim, item_delim):
            # Yield one "<column>_<key>" name for every well-formed pair.
            for col in cols:
                cell = row[col]
                items = cell.split(item_delim) if cell is not None else []
                for item in items:
                    parts = item.split(kv_delim, 1)
                    if len(parts) == 2:
                        yield f"{col}_{parts[0]}"
                    elif on_error == "raise":
                        raise ValueError(f"Malformed data {item} in column '{col}'.")

        key_frame = df.mf.flatmap(
            get_keys,
            dtypes=pd.Series([str], index=["keys_cols"]),
            cols=self.columns,
            kv_delim=self.kv_delim,
            item_delim=self.item_delim,
        )
        # distinct names in sorted order
        self.agg_results = key_frame.drop_duplicates().sort_values(by="keys_cols")

        return self.new_dataframe(
            [df, self.agg_results],
            shape=(df.shape[0], np.nan),
            dtypes=None,
            index_value=df.index_value,
            columns_value=None,
        )
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def extract_kv(
    data,
    columns=None,
    kv_delim="=",
    item_delim=",",
    dtype="float",
    fill_value=None,
    errors="raise",
):
    """
    Extract values in key-value represented columns into standalone columns.
    New column names will be the name of the key-value column followed by
    an underscore and the key.

    Parameters
    ----------
    data : DataFrame
        The DataFrame holding the key-value columns.
    columns : list, default None
        The key-value columns to be extracted. When absent, all columns of
        ``data`` are used.
    kv_delim : str, default '='
        Delimiter between key and value.
    item_delim : str, default ','
        Delimiter between key-value pairs.
    dtype : str
        Type of value columns to generate.
    fill_value : object, default None
        Default value for missing key-value pairs.
    errors : {'ignore', 'raise'}, default 'raise'
        * If 'raise', then invalid parsing will raise an exception.
        * If 'ignore', then invalid parsing will return the input.

    Returns
    -------
    DataFrame
        extracted data frame

    Raises
    ------
    ValueError
        If a specified column does not exist in ``data`` or is not of
        string type.

    See Also
    --------
    DataFrame.mf.collect_kv

    Examples
    --------
    >>> import numpy as np
    >>> import maxframe.dataframe as md

    >>> df = md.DataFrame({"name": ["name1", "name2", "name3", "name4", "name5"],
    ...                    "kv": ["k1=1.0,k2=3.0,k5=10.0",
    ...                           "k2=3.0,k3=5.1",
    ...                           "k1=7.1,k7=8.2",
    ...                           "k2=1.2,k3=1.5",
    ...                           "k2=1.0,k9=1.1"]})
    >>> df.execute()
        name                     kv
    0  name1  k1=1.0,k2=3.0,k5=10.0
    1  name2          k2=3.0,k3=5.1
    2  name3          k1=7.1,k7=8.2
    3  name4          k2=1.2,k3=1.5
    4  name5          k2=1.0,k9=1.1

    The field names to be expanded are specified by columns
    kv_delim is to delimit the key and value and '=' is default
    item_delim is to delimit the Key-Value pairs, ',' is default
    The output field name is the original field name connect with the key by "_"
    fill_value is used to fill missing values, None is default

    >>> df.mf.extract_kv(columns=['kv'], kv_delim='=', item_delim=',').execute()
        name  kv_k1  kv_k2  kv_k3  kv_k5  kv_k7  kv_k9
    0  name1    1.0    3.0    NaN   10.0    NaN    NaN
    1  name2    NaN    3.0    5.1    NaN    NaN    NaN
    2  name3    7.1    NaN    NaN    NaN    8.2    NaN
    3  name4    NaN    1.2    1.5    NaN    NaN    NaN
    4  name5    NaN    1.0    NaN    NaN    NaN    1.1
    """
    if columns is None:
        columns = data.dtypes.index.tolist()
    columns_list = make_column_list(columns, data.dtypes)

    # first requested column that is absent from the frame, if any
    non_exist_key = next(
        (c for c in columns_list if c not in data.dtypes.index), no_default
    )
    if non_exist_key is not no_default:
        raise ValueError(f"Column {non_exist_key} specified is not a valid column.")

    for col in columns_list:
        if str(data.dtypes[col]) not in ("object", "string"):
            raise ValueError(f"Column '{col}' must be of string type.")

    op = DataFrameExtractKv(
        # Pass the normalized, validated list rather than the raw argument:
        # the operator iterates over ``columns`` item by item, so a bare
        # string would otherwise be walked character by character.
        columns=columns_list,
        kv_delim=kv_delim,
        item_delim=item_delim,
        dtype=make_dtype(dtype),
        fill_value=fill_value,
        errors=errors,
    )
    return op(data)
|
|
@@ -39,12 +39,13 @@ class SeriesFlatJSONOperator(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
39
39
|
name=name,
|
|
40
40
|
dtype=make_dtype(dtype),
|
|
41
41
|
)
|
|
42
|
+
dtypes = make_dtypes(dtypes)
|
|
42
43
|
return self.new_dataframe(
|
|
43
44
|
[series],
|
|
44
45
|
shape=(series.shape[0], len(dtypes)),
|
|
45
46
|
index_value=series.index_value,
|
|
46
47
|
columns_value=parse_index(dtypes.index, store_data=True),
|
|
47
|
-
dtypes=
|
|
48
|
+
dtypes=dtypes,
|
|
48
49
|
)
|
|
49
50
|
|
|
50
51
|
|
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
# Copyright 1999-2025 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import inspect
|
|
16
|
+
from typing import Any, Callable, List, Optional, Union
|
|
17
|
+
|
|
18
|
+
import pandas as pd
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _has_end_arg(func) -> bool:
|
|
22
|
+
f_args = inspect.getfullargspec(func)
|
|
23
|
+
return "end" in f_args.args or "end" in f_args.kwonlyargs
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _gen_combined_mapper(
|
|
27
|
+
mapper: Callable,
|
|
28
|
+
combiner: Callable,
|
|
29
|
+
group_cols: List[Any],
|
|
30
|
+
order_cols: List[Any],
|
|
31
|
+
ascending: Union[bool, List[bool]] = True,
|
|
32
|
+
):
|
|
33
|
+
class CombinedMapper:
|
|
34
|
+
def __init__(self):
|
|
35
|
+
if isinstance(mapper, type):
|
|
36
|
+
self.f = mapper()
|
|
37
|
+
else:
|
|
38
|
+
self.f = mapper
|
|
39
|
+
|
|
40
|
+
if isinstance(combiner, type):
|
|
41
|
+
self.combiner = combiner()
|
|
42
|
+
else:
|
|
43
|
+
self.combiner = combiner
|
|
44
|
+
|
|
45
|
+
def _combine_mapper_result(self, mapper_result, end=False):
|
|
46
|
+
if mapper_result is None:
|
|
47
|
+
return None
|
|
48
|
+
res = mapper_result
|
|
49
|
+
if order_cols:
|
|
50
|
+
res = mapper_result.sort_values(order_cols, ascending=ascending)
|
|
51
|
+
|
|
52
|
+
kw = {"end": end} if _has_end_arg(self.combiner) else {}
|
|
53
|
+
gcols = group_cols or list(res.columns)
|
|
54
|
+
return res.groupby(gcols, group_keys=False)[list(res.columns)].apply(
|
|
55
|
+
self.combiner, **kw
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
def __call__(self, batch, end=False):
|
|
59
|
+
kw = {"end": end} if _has_end_arg(self.f) else {}
|
|
60
|
+
f_ret = self.f(batch, **kw)
|
|
61
|
+
return self._combine_mapper_result(f_ret, end=end)
|
|
62
|
+
|
|
63
|
+
def close(self) -> None:
|
|
64
|
+
if hasattr(self.f, "close"):
|
|
65
|
+
self.f.close()
|
|
66
|
+
if hasattr(self.combiner, "close"):
|
|
67
|
+
self.combiner.close()
|
|
68
|
+
|
|
69
|
+
return CombinedMapper
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def map_reduce(
    df,
    mapper: Optional[Callable] = None,
    reducer: Optional[Callable] = None,
    group_cols: Optional[List[Any]] = None,
    *,
    order_cols: List[Any] = None,
    ascending: Union[bool, List[bool]] = True,
    combiner: Callable = None,
    batch_rows: Optional[int] = 1024,
    mapper_dtypes: pd.Series = None,
    mapper_index: pd.Index = None,
    mapper_batch_rows: Optional[int] = None,
    reducer_dtypes: pd.Series = None,
    reducer_index: pd.Index = None,
    reducer_batch_rows: Optional[int] = None,
    ignore_index: bool = False,
):
    """
    Map-reduce API over certain DataFrames. This function is roughly
    a shortcut for

    .. code-block:: python

        df.mf.apply_chunk(mapper).groupby(group_cols).mf.apply_chunk(reducer)

    Parameters
    ----------
    df : DataFrame
        Input DataFrame to run mappers and reducers on.
    mapper : function or type
        Mapper function or class. If absent, the input DataFrame is
        passed to the reduce stage unchanged.
    reducer : function or type
        Reducer function or class. If absent, the result of the map
        stage is returned.
    group_cols : str or list[str]
        The keys to group after mapper. If absent, all columns in the mapped
        DataFrame will be used.
    order_cols : str or list[str]
        The columns to sort after groupby.
    ascending : bool or list[bool] or None
        Whether columns should be in ascending order or not, only effective when
        `order_cols` are specified. If a list of booleans are passed, orders of
        every column in `order_cols` are specified.
    combiner : function or class
        Combiner function or class. Should accept and return the same schema
        of mapper outputs. Can only be specified together with a mapper.
    batch_rows : int or None
        Rows in batches for mappers and reducers. Ignored if `mapper_batch_rows`
        specified for mappers or `reducer_batch_rows` specified for reducers.
        1024 by default.
    mapper_dtypes : pd.Series or dict or None
        Output dtypes of mapper stage.
    mapper_index : pd.Index or None
        Index of DataFrame returned by mappers.
    mapper_batch_rows : int or None
        Rows in batches for mappers. If specified, `batch_rows` will be ignored
        for mappers.
    reducer_dtypes : pd.Series or dict or None
        Output dtypes of reducer stage.
    reducer_index : pd.Index or None
        Index of DataFrame returned by reducers.
    reducer_batch_rows : int or None
        Rows in batches for reducers. If specified, `batch_rows` will be ignored
        for reducers.
    ignore_index : bool
        If true, indexes generated at mapper or reducer functions will be ignored.

    Returns
    -------
    output: DataFrame
        Result DataFrame after map and reduce.

    Raises
    ------
    ValueError
        If `mapper_dtypes` or `mapper_index` is specified without a mapper,
        if `reducer_dtypes` or `reducer_index` is specified without a reducer,
        or if a combiner is specified without a mapper.

    Examples
    --------

    We first define a DataFrame with a column of several words.

    >>> from collections import defaultdict
    >>> import pandas as pd
    >>> import maxframe.dataframe as md
    >>> from maxframe.udf import with_running_options
    >>> df = md.DataFrame(
    >>>     {
    >>>         "name": ["name key", "name", "key", "name", "key name"],
    >>>         "id": [4, 2, 4, 3, 3],
    >>>         "fid": [5.3, 3.5, 4.2, 2.2, 4.1],
    >>>     }
    >>> )

    Then we write a mapper function which accepts batches in the DataFrame
    and returns counts of words in every row.

    >>> def mapper(batch):
    >>>     word_to_count = defaultdict(lambda: 0)
    >>>     for words in batch["name"]:
    >>>         for w in words.split():
    >>>             word_to_count[w] += 1
    >>>     return pd.DataFrame(
    >>>         [list(tp) for tp in word_to_count.items()], columns=["word", "count"]
    >>>     )

    After that we write a reducer function which aggregates records with
    the same word. Running options such as CPU specifications can be supplied
    as well.

    >>> @with_running_options(cpu=2)
    >>> class TestReducer:
    >>>     def __init__(self):
    >>>         self._word_to_count = defaultdict(lambda: 0)
    >>>
    >>>     def __call__(self, batch, end=False):
    >>>         word = None
    >>>         for _, row in batch.iterrows():
    >>>             word = row.iloc[0]
    >>>             self._word_to_count[row.iloc[0]] += row.iloc[1]
    >>>         if end:
    >>>             return pd.DataFrame(
    >>>                 [[word, self._word_to_count[word]]], columns=["word", "count"]
    >>>             )
    >>>
    >>>     def close(self):
    >>>         # you can do several cleanups here
    >>>         print("close")

    Finally we can call `map_reduce` with mappers and reducers specified above.

    >>> res = df.mf.map_reduce(
    >>>     mapper,
    >>>     TestReducer,
    >>>     group_cols=["word"],
    >>>     mapper_dtypes={"word": "str", "count": "int"},
    >>>     mapper_index=pd.Index([0]),
    >>>     reducer_dtypes={"word": "str", "count": "int"},
    >>>     reducer_index=pd.Index([0]),
    >>>     ignore_index=True,
    >>> )
    >>> res.execute().fetch()
       word  count
    0   key      3
    1  name      4

    See Also
    --------
    DataFrame.mf.apply_chunk, DataFrame.groupby.mf.apply_chunk
    """
    # stage-specific batch sizes take precedence over the shared one
    mapper_batch_rows = mapper_batch_rows or batch_rows
    reducer_batch_rows = reducer_batch_rows or batch_rows

    def check_arg(arg_type, locals_):
        # reject `<stage>_dtypes` / `<stage>_index` when the stage itself is absent
        if locals_.get(arg_type) is not None:
            return
        for suffix in ("dtypes", "index"):
            arg_name = f"{arg_type}_{suffix}"
            if locals_.get(arg_name) is not None:
                raise ValueError(f"Cannot specify {arg_name} when {arg_type} is None")

    if mapper is None:
        check_arg("mapper", locals())
        if combiner is not None:
            raise ValueError("Combiner cannot be set when mapper is None")
        mapped = df
    else:
        if combiner is not None:
            # the combiner runs inside the map stage, right after the mapper;
            # it falls back to all mapper output columns when group_cols is empty
            mapper = _gen_combined_mapper(
                mapper, combiner, group_cols, order_cols, ascending=ascending
            )
        mapped = df.mf.apply_chunk(
            mapper,
            batch_rows=mapper_batch_rows,
            dtypes=mapper_dtypes,
            output_type="dataframe",
            index=mapper_index,
        )

    if reducer is None:
        check_arg("reducer", locals())
        res = mapped
    else:
        # Default group keys are the columns of the *mapped* DataFrame (which
        # is the input itself when no mapper is given), not those of the
        # input: mappers are free to change the schema.
        mapped_cols = list(mapped.dtypes.index)
        res = mapped.groupby(group_cols or mapped_cols, group_keys=False)[
            mapped_cols
        ].mf.apply_chunk(
            reducer,
            batch_rows=reducer_batch_rows,
            dtypes=reducer_dtypes,
            output_type="dataframe",
            index=reducer_index,
            order_cols=order_cols,
            ascending=ascending,
        )

    if ignore_index:
        return res.reset_index(drop=True)
    return res
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# Copyright 1999-2025 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from ... import opcodes
|
|
16
|
+
from ...serialization.serializables import Float64Field, Int64Field
|
|
17
|
+
from ...tensor.extensions.rebalance import RebalanceMixin
|
|
18
|
+
from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
19
|
+
from ..utils import validate_axis
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class DataFrameRebalance(RebalanceMixin, DataFrameOperatorMixin, DataFrameOperator):
    """Operator carrying the arguments of a rebalance on a DataFrame or Series.

    Instances are built by :func:`rebalance` with ``axis`` and exactly one of
    ``factor`` / ``num_partitions`` set.
    """

    _op_type_ = opcodes.REBALANCE

    # NOTE(review): field declaration order may be significant for
    # serialization -- confirm before reordering.
    # axis to rebalance along
    axis = Int64Field("axis")
    # multiplier on the number of input chunks; None when not specified
    factor = Float64Field("factor", default=None)
    # upper bound on the number of chunks after rebalance
    num_partitions = Int64Field("num_partitions")

    def __init__(self, output_types=None, **kw):
        # ``output_types`` is forwarded to the base class as ``_output_types``
        super().__init__(_output_types=output_types, **kw)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def rebalance(df_or_series, axis=0, factor=None, num_partitions=None):
    """
    Make data more balanced across entire cluster.

    Exactly one of `factor` and `num_partitions` must be given.

    Parameters
    ----------
    df_or_series : DataFrame or Series
        Data to rebalance.
    axis : int
        The axis to rebalance.
    factor : float
        Specified so that number of chunks after balance is
        total number of input chunks * factor.
    num_partitions : int
        Specified so the number of chunks are at most
        num_partitions.

    Returns
    -------
    Series or DataFrame
        Result of DataFrame or Series after rebalanced.

    Raises
    ------
    ValueError
        If neither or both of `factor` and `num_partitions` are specified.
    """
    axis = validate_axis(axis, df_or_series)

    factor_given = factor is not None
    partitions_given = num_partitions is not None
    if not (factor_given or partitions_given):
        raise ValueError("Need to specify num_partitions or factor")
    if factor_given and partitions_given:
        raise ValueError(
            "num_partitions and factor cannot be specified at the same time"
        )

    rebalance_op = DataFrameRebalance(
        axis=axis, factor=factor, num_partitions=num_partitions
    )
    return rebalance_op(df_or_series)
|
|
@@ -91,15 +91,22 @@ def test_apply_chunk_infer_dtypes_and_index(df1, df2, df3):
|
|
|
91
91
|
assert result.index_value is df2.index_value
|
|
92
92
|
assert result.dtypes.equals(df2.dtypes)
|
|
93
93
|
|
|
94
|
+
def process(data, param, k) -> pd.DataFrame[df2.dtypes]:
|
|
95
|
+
return data * param * k
|
|
96
|
+
|
|
97
|
+
result = df2.mf.apply_chunk(process, batch_rows=3, args=(4,), k=1)
|
|
98
|
+
assert result.index_value is df2.index_value
|
|
99
|
+
assert result.dtypes.equals(df2.dtypes)
|
|
100
|
+
|
|
94
101
|
# mark functions
|
|
95
102
|
from ....udf import with_python_requirements, with_resources
|
|
96
103
|
|
|
97
104
|
@with_resources("empty.txt")
|
|
98
105
|
@with_python_requirements("numpy")
|
|
99
|
-
def process(data, k):
|
|
106
|
+
def process(data, k) -> pd.DataFrame[df1.dtypes]:
|
|
100
107
|
return data
|
|
101
108
|
|
|
102
|
-
result = df1.mf.apply_chunk(process, batch_rows=3,
|
|
109
|
+
result = df1.mf.apply_chunk(process, batch_rows=3, k=1)
|
|
103
110
|
assert result.index_value is df1.index_value
|
|
104
111
|
assert result.dtypes.equals(df1.dtypes)
|
|
105
112
|
assert isinstance(result.op.func, MarkedFunction)
|
|
@@ -60,6 +60,40 @@ def df3():
|
|
|
60
60
|
)
|
|
61
61
|
|
|
62
62
|
|
|
63
|
+
@pytest.fixture
def df4():
    """DataFrame with two name columns, a number column and two columns of
    delimited key-value strings (``kv`` and ``vk``)."""
    kv_strings = [
        "k1=1.1,k2=3.1,k3=1.0",
        "k1=7.1,k4=8.2",
        "k5=1.2,k7=1.5",
        "k3=1.1,k9=1",
    ]
    vk_strings = ["v1=1.1,v2=1.2", "v3=1.1,v4=1.2", "v5=1.1,v6=1.2", "v7=1.1,v8=1.2"]
    raw = {
        "name1": ["a", "b", "c", "d"],
        "name2": ["a", "b", "c", "d"],
        "num": [1, 2, 3, 4],
        "kv": kv_strings,
        "vk": vk_strings,
    }
    return DataFrame(raw)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
@pytest.fixture
def df5():
    """DataFrame with two name columns and six sparse float columns
    (``k1``, ``k2``, ``k3``, ``k5``, ``k7``, ``k9``)."""
    names = ["name1", "name2", "name3", "name4", "name5"]
    raw = {
        "name1": list(names),
        "name2": list(names),
        "k1": [1.0, None, 7.1, None, None],
        "k2": [3.0, 3.0, None, 1.2, 1.0],
        "k3": [None, 5.1, None, 1.5, None],
        "k5": [10.0, None, None, None, None],
        "k7": [None, None, 8.2, None, None],
        "k9": [None, None, None, None, 1.1],
    }
    return DataFrame(raw)
|
|
95
|
+
|
|
96
|
+
|
|
63
97
|
def test_flatmap(df1, df2, df3):
|
|
64
98
|
def f(x, keys):
|
|
65
99
|
if x["a"] in keys:
|
|
@@ -142,3 +176,23 @@ def test_flatjson():
|
|
|
142
176
|
)
|
|
143
177
|
with pytest.raises(ValueError):
|
|
144
178
|
s1.mf.flatjson(["$.a"])
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def test_extract_kv(df4):
    """extract_kv keeps the row count and index of its input, leaves the
    column count unknown, and rejects unsuitable columns."""
    extract_kv_df = df4.mf.extract_kv(
        columns=["kv", "vk"], kv_delim=",", item_delim="="
    )
    # NaN never compares equal to itself, so comparing against a tuple
    # holding np.nan only passes when both sides hold the very same object.
    # Check the unknown column count explicitly instead.
    assert len(extract_kv_df.shape) == 2
    assert extract_kv_df.shape[0] == 4
    assert np.isnan(extract_kv_df.shape[1])
    assert extract_kv_df.index_value.key == df4.index_value.key
    # "name" is not a column of df4
    with pytest.raises(ValueError):
        df4.mf.extract_kv(columns=["name"])
    # "num" holds integers rather than key-value strings
    with pytest.raises(ValueError):
        df4.mf.extract_kv(columns=["num"])
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def test_collect_kv(df5):
    """collect_kv over the six value columns yields a (5, 3) result that
    keeps the input index, and rejects a column df5 does not have."""
    value_cols = ["k1", "k2", "k3", "k5", "k7", "k9"]
    collected = df5.mf.collect_kv(columns=value_cols)
    assert collected.shape == (5, 3)
    assert collected.index_value.key == df5.index_value.key

    # "num" is not a column of df5
    with pytest.raises(ValueError):
        df5.mf.collect_kv(columns=["num"])
|