maxframe 2.0.0b2__cp39-cp39-macosx_10_9_universal2.whl → 2.2.0__cp39-cp39-macosx_10_9_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/__init__.py +1 -0
- maxframe/_utils.cpython-39-darwin.so +0 -0
- maxframe/_utils.pyx +14 -1
- maxframe/codegen/core.py +6 -6
- maxframe/codegen/spe/core.py +1 -1
- maxframe/codegen/spe/dataframe/__init__.py +1 -0
- maxframe/codegen/spe/dataframe/accessors/base.py +18 -0
- maxframe/codegen/spe/dataframe/accessors/dict_.py +25 -130
- maxframe/codegen/spe/dataframe/accessors/list_.py +12 -48
- maxframe/codegen/spe/dataframe/accessors/struct_.py +28 -0
- maxframe/codegen/spe/dataframe/arithmetic.py +7 -2
- maxframe/codegen/spe/dataframe/groupby.py +88 -0
- maxframe/codegen/spe/dataframe/indexing.py +99 -4
- maxframe/codegen/spe/dataframe/merge.py +34 -1
- maxframe/codegen/spe/dataframe/misc.py +9 -33
- maxframe/codegen/spe/dataframe/reduction.py +14 -9
- maxframe/codegen/spe/dataframe/reshape.py +46 -0
- maxframe/codegen/spe/dataframe/sort.py +30 -17
- maxframe/codegen/spe/dataframe/tests/accessors/test_dict.py +9 -15
- maxframe/codegen/spe/dataframe/tests/accessors/test_list.py +4 -7
- maxframe/codegen/spe/dataframe/tests/accessors/test_struct.py +75 -0
- maxframe/codegen/spe/dataframe/tests/indexing/test_iloc.py +20 -1
- maxframe/codegen/spe/dataframe/tests/indexing/test_loc.py +35 -0
- maxframe/codegen/spe/dataframe/tests/misc/test_misc.py +0 -32
- maxframe/codegen/spe/dataframe/tests/test_groupby.py +81 -18
- maxframe/codegen/spe/dataframe/tests/test_merge.py +27 -1
- maxframe/codegen/spe/dataframe/tests/test_reshape.py +79 -0
- maxframe/codegen/spe/dataframe/tests/test_sort.py +20 -0
- maxframe/codegen/spe/learn/contrib/tests/test_xgboost.py +2 -1
- maxframe/codegen/spe/learn/metrics/__init__.py +1 -1
- maxframe/codegen/spe/learn/metrics/_ranking.py +76 -0
- maxframe/codegen/spe/learn/metrics/pairwise.py +51 -0
- maxframe/codegen/spe/learn/metrics/tests/test_pairwise.py +36 -0
- maxframe/codegen/spe/learn/metrics/tests/test_ranking.py +59 -0
- maxframe/codegen/spe/tensor/__init__.py +3 -0
- maxframe/codegen/spe/tensor/fft.py +74 -0
- maxframe/codegen/spe/tensor/linalg.py +29 -2
- maxframe/codegen/spe/tensor/misc.py +79 -25
- maxframe/codegen/spe/tensor/spatial.py +45 -0
- maxframe/codegen/spe/tensor/statistics.py +44 -0
- maxframe/codegen/spe/tensor/tests/test_fft.py +64 -0
- maxframe/codegen/spe/tensor/tests/test_linalg.py +15 -1
- maxframe/codegen/spe/tensor/tests/test_misc.py +52 -2
- maxframe/codegen/spe/tensor/tests/test_spatial.py +33 -0
- maxframe/codegen/spe/tensor/tests/test_statistics.py +15 -1
- maxframe/codegen/spe/tests/test_spe_codegen.py +6 -12
- maxframe/codegen/spe/utils.py +2 -0
- maxframe/config/config.py +70 -9
- maxframe/config/tests/test_validators.py +13 -1
- maxframe/config/validators.py +49 -0
- maxframe/conftest.py +44 -17
- maxframe/core/accessor.py +2 -2
- maxframe/core/entity/core.py +5 -0
- maxframe/core/entity/tileables.py +1 -1
- maxframe/core/graph/core.cpython-39-darwin.so +0 -0
- maxframe/core/graph/entity.py +1 -2
- maxframe/core/operator/base.py +9 -2
- maxframe/core/operator/core.py +10 -2
- maxframe/core/operator/utils.py +13 -0
- maxframe/dataframe/__init__.py +10 -3
- maxframe/dataframe/accessors/__init__.py +1 -1
- maxframe/dataframe/accessors/compat.py +45 -0
- maxframe/dataframe/accessors/datetime_/__init__.py +4 -1
- maxframe/dataframe/accessors/dict_/contains.py +7 -16
- maxframe/dataframe/accessors/dict_/core.py +48 -0
- maxframe/dataframe/accessors/dict_/getitem.py +17 -21
- maxframe/dataframe/accessors/dict_/length.py +7 -16
- maxframe/dataframe/accessors/dict_/remove.py +6 -18
- maxframe/dataframe/accessors/dict_/setitem.py +8 -18
- maxframe/dataframe/accessors/dict_/tests/test_dict_accessor.py +62 -22
- maxframe/dataframe/accessors/list_/__init__.py +2 -2
- maxframe/dataframe/accessors/list_/core.py +48 -0
- maxframe/dataframe/accessors/list_/getitem.py +12 -19
- maxframe/dataframe/accessors/list_/length.py +7 -16
- maxframe/dataframe/accessors/list_/tests/test_list_accessor.py +11 -9
- maxframe/dataframe/accessors/string_/__init__.py +4 -1
- maxframe/dataframe/accessors/struct_/__init__.py +37 -0
- maxframe/dataframe/accessors/struct_/accessor.py +39 -0
- maxframe/dataframe/accessors/struct_/core.py +43 -0
- maxframe/dataframe/accessors/struct_/dtypes.py +53 -0
- maxframe/dataframe/accessors/struct_/field.py +123 -0
- maxframe/dataframe/accessors/struct_/tests/__init__.py +13 -0
- maxframe/dataframe/accessors/struct_/tests/test_struct_accessor.py +91 -0
- maxframe/dataframe/arithmetic/__init__.py +14 -4
- maxframe/dataframe/arithmetic/between.py +106 -0
- maxframe/dataframe/arithmetic/dot.py +237 -0
- maxframe/dataframe/arithmetic/{around.py → round.py} +11 -7
- maxframe/dataframe/core.py +63 -118
- maxframe/dataframe/datasource/__init__.py +18 -0
- maxframe/dataframe/datasource/from_dict.py +124 -0
- maxframe/dataframe/datasource/from_index.py +1 -1
- maxframe/dataframe/datasource/from_records.py +77 -0
- maxframe/dataframe/datasource/from_tensor.py +109 -41
- maxframe/dataframe/datasource/read_csv.py +2 -3
- maxframe/dataframe/datasource/tests/test_datasource.py +37 -0
- maxframe/dataframe/datastore/__init__.py +5 -1
- maxframe/dataframe/datastore/to_csv.py +29 -41
- maxframe/dataframe/datastore/to_odps.py +30 -4
- maxframe/dataframe/extensions/__init__.py +20 -4
- maxframe/dataframe/extensions/apply_chunk.py +32 -6
- maxframe/dataframe/extensions/cartesian_chunk.py +153 -0
- maxframe/dataframe/extensions/collect_kv.py +126 -0
- maxframe/dataframe/extensions/extract_kv.py +177 -0
- maxframe/dataframe/extensions/map_reduce.py +263 -0
- maxframe/dataframe/extensions/rebalance.py +62 -0
- maxframe/dataframe/extensions/tests/test_apply_chunk.py +9 -2
- maxframe/dataframe/extensions/tests/test_extensions.py +54 -0
- maxframe/dataframe/extensions/tests/test_map_reduce.py +135 -0
- maxframe/dataframe/groupby/__init__.py +12 -1
- maxframe/dataframe/groupby/aggregation.py +78 -45
- maxframe/dataframe/groupby/apply.py +1 -1
- maxframe/dataframe/groupby/apply_chunk.py +18 -2
- maxframe/dataframe/groupby/core.py +96 -12
- maxframe/dataframe/groupby/cum.py +4 -25
- maxframe/dataframe/groupby/expanding.py +264 -0
- maxframe/dataframe/groupby/fill.py +1 -1
- maxframe/dataframe/groupby/getitem.py +12 -5
- maxframe/dataframe/groupby/head.py +11 -1
- maxframe/dataframe/groupby/rank.py +136 -0
- maxframe/dataframe/groupby/rolling.py +206 -0
- maxframe/dataframe/groupby/shift.py +114 -0
- maxframe/dataframe/groupby/tests/test_groupby.py +0 -5
- maxframe/dataframe/indexing/__init__.py +20 -1
- maxframe/dataframe/indexing/droplevel.py +195 -0
- maxframe/dataframe/indexing/filter.py +169 -0
- maxframe/dataframe/indexing/get_level_values.py +76 -0
- maxframe/dataframe/indexing/iat.py +45 -0
- maxframe/dataframe/indexing/iloc.py +152 -12
- maxframe/dataframe/indexing/insert.py +1 -1
- maxframe/dataframe/indexing/loc.py +287 -7
- maxframe/dataframe/indexing/reindex.py +14 -5
- maxframe/dataframe/indexing/rename.py +6 -0
- maxframe/dataframe/indexing/rename_axis.py +2 -2
- maxframe/dataframe/indexing/reorder_levels.py +143 -0
- maxframe/dataframe/indexing/reset_index.py +33 -6
- maxframe/dataframe/indexing/sample.py +8 -0
- maxframe/dataframe/indexing/setitem.py +3 -3
- maxframe/dataframe/indexing/swaplevel.py +185 -0
- maxframe/dataframe/indexing/take.py +99 -0
- maxframe/dataframe/indexing/truncate.py +140 -0
- maxframe/dataframe/indexing/where.py +0 -11
- maxframe/dataframe/indexing/xs.py +148 -0
- maxframe/dataframe/merge/__init__.py +12 -1
- maxframe/dataframe/merge/append.py +97 -98
- maxframe/dataframe/merge/combine_first.py +120 -0
- maxframe/dataframe/merge/compare.py +387 -0
- maxframe/dataframe/merge/concat.py +183 -0
- maxframe/dataframe/merge/update.py +271 -0
- maxframe/dataframe/misc/__init__.py +16 -10
- maxframe/dataframe/misc/_duplicate.py +10 -4
- maxframe/dataframe/misc/apply.py +1 -1
- maxframe/dataframe/misc/check_unique.py +51 -0
- maxframe/dataframe/misc/clip.py +145 -0
- maxframe/dataframe/misc/describe.py +175 -9
- maxframe/dataframe/misc/drop_duplicates.py +2 -2
- maxframe/dataframe/misc/duplicated.py +2 -2
- maxframe/dataframe/misc/get_dummies.py +5 -1
- maxframe/dataframe/misc/isin.py +2 -2
- maxframe/dataframe/misc/map.py +94 -0
- maxframe/dataframe/misc/tests/test_misc.py +13 -2
- maxframe/dataframe/misc/to_numeric.py +3 -0
- maxframe/dataframe/misc/transform.py +12 -5
- maxframe/dataframe/misc/transpose.py +13 -1
- maxframe/dataframe/misc/valid_index.py +115 -0
- maxframe/dataframe/misc/value_counts.py +38 -4
- maxframe/dataframe/missing/checkna.py +13 -6
- maxframe/dataframe/missing/dropna.py +5 -0
- maxframe/dataframe/missing/fillna.py +1 -1
- maxframe/dataframe/missing/replace.py +7 -4
- maxframe/dataframe/reduction/__init__.py +29 -15
- maxframe/dataframe/reduction/aggregation.py +38 -9
- maxframe/dataframe/reduction/all.py +2 -2
- maxframe/dataframe/reduction/any.py +2 -2
- maxframe/dataframe/reduction/argmax.py +100 -0
- maxframe/dataframe/reduction/argmin.py +100 -0
- maxframe/dataframe/reduction/core.py +65 -18
- maxframe/dataframe/reduction/count.py +13 -9
- maxframe/dataframe/reduction/cov.py +166 -0
- maxframe/dataframe/reduction/cummax.py +2 -2
- maxframe/dataframe/reduction/cummin.py +2 -2
- maxframe/dataframe/reduction/cumprod.py +2 -2
- maxframe/dataframe/reduction/cumsum.py +2 -2
- maxframe/dataframe/reduction/custom_reduction.py +2 -2
- maxframe/dataframe/reduction/idxmax.py +185 -0
- maxframe/dataframe/reduction/idxmin.py +185 -0
- maxframe/dataframe/reduction/kurtosis.py +37 -30
- maxframe/dataframe/reduction/max.py +2 -2
- maxframe/dataframe/reduction/mean.py +9 -7
- maxframe/dataframe/reduction/median.py +2 -2
- maxframe/dataframe/reduction/min.py +2 -2
- maxframe/dataframe/reduction/nunique.py +9 -8
- maxframe/dataframe/reduction/prod.py +18 -13
- maxframe/dataframe/reduction/reduction_size.py +2 -2
- maxframe/dataframe/reduction/sem.py +13 -9
- maxframe/dataframe/reduction/skew.py +31 -27
- maxframe/dataframe/reduction/str_concat.py +10 -7
- maxframe/dataframe/reduction/sum.py +18 -14
- maxframe/dataframe/reduction/unique.py +20 -3
- maxframe/dataframe/reduction/var.py +16 -12
- maxframe/dataframe/reshape/__init__.py +38 -0
- maxframe/dataframe/{misc → reshape}/pivot.py +1 -0
- maxframe/dataframe/{misc → reshape}/pivot_table.py +1 -0
- maxframe/dataframe/reshape/unstack.py +114 -0
- maxframe/dataframe/sort/__init__.py +8 -0
- maxframe/dataframe/sort/argsort.py +62 -0
- maxframe/dataframe/sort/core.py +1 -0
- maxframe/dataframe/sort/nlargest.py +238 -0
- maxframe/dataframe/sort/nsmallest.py +228 -0
- maxframe/dataframe/statistics/__init__.py +3 -3
- maxframe/dataframe/statistics/corr.py +1 -0
- maxframe/dataframe/statistics/quantile.py +2 -2
- maxframe/dataframe/tests/test_typing.py +104 -0
- maxframe/dataframe/tests/test_utils.py +66 -2
- maxframe/dataframe/typing_.py +185 -0
- maxframe/dataframe/utils.py +95 -26
- maxframe/dataframe/window/aggregation.py +8 -4
- maxframe/dataframe/window/core.py +14 -1
- maxframe/dataframe/window/ewm.py +1 -3
- maxframe/dataframe/window/expanding.py +37 -35
- maxframe/dataframe/window/rolling.py +49 -39
- maxframe/dataframe/window/tests/test_expanding.py +1 -7
- maxframe/dataframe/window/tests/test_rolling.py +1 -1
- maxframe/env.py +7 -4
- maxframe/errors.py +2 -2
- maxframe/io/odpsio/schema.py +9 -3
- maxframe/io/odpsio/tableio.py +7 -2
- maxframe/io/odpsio/tests/test_schema.py +198 -83
- maxframe/learn/__init__.py +10 -2
- maxframe/learn/cluster/__init__.py +15 -0
- maxframe/learn/cluster/_kmeans.py +782 -0
- maxframe/learn/contrib/llm/core.py +2 -0
- maxframe/learn/contrib/xgboost/core.py +86 -1
- maxframe/learn/contrib/xgboost/train.py +5 -2
- maxframe/learn/core.py +66 -0
- maxframe/learn/linear_model/_base.py +58 -1
- maxframe/learn/linear_model/_lin_reg.py +1 -1
- maxframe/learn/metrics/__init__.py +6 -0
- maxframe/learn/metrics/_classification.py +145 -0
- maxframe/learn/metrics/_ranking.py +477 -0
- maxframe/learn/metrics/_scorer.py +60 -0
- maxframe/learn/metrics/pairwise/__init__.py +21 -0
- maxframe/learn/metrics/pairwise/core.py +77 -0
- maxframe/learn/metrics/pairwise/cosine.py +115 -0
- maxframe/learn/metrics/pairwise/euclidean.py +176 -0
- maxframe/learn/metrics/pairwise/haversine.py +96 -0
- maxframe/learn/metrics/pairwise/manhattan.py +80 -0
- maxframe/learn/metrics/pairwise/pairwise.py +127 -0
- maxframe/learn/metrics/pairwise/pairwise_distances_topk.py +121 -0
- maxframe/learn/metrics/pairwise/rbf_kernel.py +51 -0
- maxframe/learn/metrics/tests/__init__.py +13 -0
- maxframe/learn/metrics/tests/test_scorer.py +26 -0
- maxframe/learn/utils/__init__.py +1 -1
- maxframe/learn/utils/checks.py +1 -2
- maxframe/learn/utils/core.py +59 -0
- maxframe/learn/utils/extmath.py +37 -0
- maxframe/learn/utils/odpsio.py +193 -0
- maxframe/learn/utils/validation.py +2 -2
- maxframe/lib/compat.py +40 -0
- maxframe/lib/dtypes_extension/__init__.py +16 -1
- maxframe/lib/dtypes_extension/_fake_arrow_dtype.py +604 -0
- maxframe/lib/dtypes_extension/blob.py +304 -0
- maxframe/lib/dtypes_extension/dtypes.py +40 -0
- maxframe/lib/dtypes_extension/tests/test_blob.py +88 -0
- maxframe/lib/dtypes_extension/tests/test_dtypes.py +16 -1
- maxframe/lib/dtypes_extension/tests/test_fake_arrow_dtype.py +75 -0
- maxframe/lib/filesystem/_oss_lib/common.py +122 -50
- maxframe/lib/filesystem/_oss_lib/glob.py +1 -1
- maxframe/lib/filesystem/_oss_lib/handle.py +21 -25
- maxframe/lib/filesystem/base.py +1 -1
- maxframe/lib/filesystem/core.py +1 -1
- maxframe/lib/filesystem/oss.py +115 -46
- maxframe/lib/filesystem/tests/test_oss.py +74 -36
- maxframe/lib/mmh3.cpython-39-darwin.so +0 -0
- maxframe/lib/wrapped_pickle.py +10 -0
- maxframe/opcodes.py +33 -15
- maxframe/protocol.py +12 -0
- maxframe/serialization/__init__.py +11 -2
- maxframe/serialization/arrow.py +38 -13
- maxframe/serialization/blob.py +32 -0
- maxframe/serialization/core.cpython-39-darwin.so +0 -0
- maxframe/serialization/core.pyx +39 -1
- maxframe/serialization/exception.py +2 -4
- maxframe/serialization/numpy.py +11 -0
- maxframe/serialization/pandas.py +46 -9
- maxframe/serialization/serializables/core.py +2 -2
- maxframe/serialization/tests/test_serial.py +29 -2
- maxframe/tensor/__init__.py +38 -8
- maxframe/tensor/arithmetic/__init__.py +19 -10
- maxframe/tensor/arithmetic/iscomplexobj.py +53 -0
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +6 -0
- maxframe/tensor/core.py +3 -2
- maxframe/tensor/datasource/tests/test_datasource.py +2 -1
- maxframe/tensor/extensions/__init__.py +2 -0
- maxframe/tensor/extensions/apply_chunk.py +3 -3
- maxframe/tensor/extensions/rebalance.py +65 -0
- maxframe/tensor/fft/__init__.py +32 -0
- maxframe/tensor/fft/core.py +168 -0
- maxframe/tensor/fft/fft.py +112 -0
- maxframe/tensor/fft/fft2.py +118 -0
- maxframe/tensor/fft/fftfreq.py +80 -0
- maxframe/tensor/fft/fftn.py +123 -0
- maxframe/tensor/fft/fftshift.py +79 -0
- maxframe/tensor/fft/hfft.py +112 -0
- maxframe/tensor/fft/ifft.py +114 -0
- maxframe/tensor/fft/ifft2.py +115 -0
- maxframe/tensor/fft/ifftn.py +123 -0
- maxframe/tensor/fft/ifftshift.py +73 -0
- maxframe/tensor/fft/ihfft.py +93 -0
- maxframe/tensor/fft/irfft.py +118 -0
- maxframe/tensor/fft/irfft2.py +62 -0
- maxframe/tensor/fft/irfftn.py +114 -0
- maxframe/tensor/fft/rfft.py +116 -0
- maxframe/tensor/fft/rfft2.py +63 -0
- maxframe/tensor/fft/rfftfreq.py +87 -0
- maxframe/tensor/fft/rfftn.py +113 -0
- maxframe/tensor/indexing/fill_diagonal.py +1 -7
- maxframe/tensor/linalg/__init__.py +7 -0
- maxframe/tensor/linalg/_einsumfunc.py +1025 -0
- maxframe/tensor/linalg/cholesky.py +117 -0
- maxframe/tensor/linalg/einsum.py +339 -0
- maxframe/tensor/linalg/lstsq.py +100 -0
- maxframe/tensor/linalg/matrix_norm.py +75 -0
- maxframe/tensor/linalg/norm.py +249 -0
- maxframe/tensor/linalg/solve.py +72 -0
- maxframe/tensor/linalg/solve_triangular.py +2 -2
- maxframe/tensor/linalg/vector_norm.py +113 -0
- maxframe/tensor/misc/__init__.py +24 -1
- maxframe/tensor/misc/argwhere.py +72 -0
- maxframe/tensor/misc/array_split.py +46 -0
- maxframe/tensor/misc/broadcast_arrays.py +57 -0
- maxframe/tensor/misc/copyto.py +130 -0
- maxframe/tensor/misc/delete.py +104 -0
- maxframe/tensor/misc/dsplit.py +68 -0
- maxframe/tensor/misc/ediff1d.py +74 -0
- maxframe/tensor/misc/expand_dims.py +85 -0
- maxframe/tensor/misc/flip.py +90 -0
- maxframe/tensor/misc/fliplr.py +64 -0
- maxframe/tensor/misc/flipud.py +68 -0
- maxframe/tensor/misc/hsplit.py +85 -0
- maxframe/tensor/misc/insert.py +139 -0
- maxframe/tensor/misc/moveaxis.py +83 -0
- maxframe/tensor/misc/result_type.py +88 -0
- maxframe/tensor/misc/roll.py +124 -0
- maxframe/tensor/misc/rollaxis.py +77 -0
- maxframe/tensor/misc/shape.py +89 -0
- maxframe/tensor/misc/split.py +190 -0
- maxframe/tensor/misc/tile.py +109 -0
- maxframe/tensor/misc/vsplit.py +74 -0
- maxframe/tensor/reduction/array_equal.py +2 -1
- maxframe/tensor/sort/__init__.py +2 -0
- maxframe/tensor/sort/argpartition.py +98 -0
- maxframe/tensor/sort/partition.py +228 -0
- maxframe/tensor/spatial/__init__.py +15 -0
- maxframe/tensor/spatial/distance/__init__.py +17 -0
- maxframe/tensor/spatial/distance/cdist.py +421 -0
- maxframe/tensor/spatial/distance/pdist.py +398 -0
- maxframe/tensor/spatial/distance/squareform.py +153 -0
- maxframe/tensor/special/__init__.py +159 -21
- maxframe/tensor/special/airy.py +55 -0
- maxframe/tensor/special/bessel.py +199 -0
- maxframe/tensor/special/core.py +65 -4
- maxframe/tensor/special/ellip_func_integrals.py +155 -0
- maxframe/tensor/special/ellip_harm.py +55 -0
- maxframe/tensor/special/err_fresnel.py +223 -0
- maxframe/tensor/special/gamma_funcs.py +303 -0
- maxframe/tensor/special/hypergeometric_funcs.py +69 -0
- maxframe/tensor/special/info_theory.py +189 -0
- maxframe/tensor/special/misc.py +21 -0
- maxframe/tensor/statistics/__init__.py +6 -0
- maxframe/tensor/statistics/corrcoef.py +77 -0
- maxframe/tensor/statistics/cov.py +222 -0
- maxframe/tensor/statistics/digitize.py +126 -0
- maxframe/tensor/statistics/histogram.py +520 -0
- maxframe/tensor/statistics/median.py +85 -0
- maxframe/tensor/statistics/ptp.py +89 -0
- maxframe/tensor/utils.py +3 -3
- maxframe/tests/test_utils.py +43 -1
- maxframe/tests/utils.py +0 -2
- maxframe/typing_.py +2 -0
- maxframe/udf.py +27 -2
- maxframe/utils.py +193 -19
- {maxframe-2.0.0b2.dist-info → maxframe-2.2.0.dist-info}/METADATA +3 -2
- {maxframe-2.0.0b2.dist-info → maxframe-2.2.0.dist-info}/RECORD +391 -236
- maxframe_client/fetcher.py +35 -4
- maxframe_client/session/odps.py +7 -2
- maxframe_client/tests/test_fetcher.py +76 -3
- maxframe_client/tests/test_session.py +4 -1
- /maxframe/dataframe/{misc → reshape}/melt.py +0 -0
- /maxframe/dataframe/{misc → reshape}/stack.py +0 -0
- {maxframe-2.0.0b2.dist-info → maxframe-2.2.0.dist-info}/WHEEL +0 -0
- {maxframe-2.0.0b2.dist-info → maxframe-2.2.0.dist-info}/top_level.txt +0 -0
|
@@ -26,9 +26,10 @@ from ...serialization.serializables import (
|
|
|
26
26
|
Int32Field,
|
|
27
27
|
TupleField,
|
|
28
28
|
)
|
|
29
|
+
from ...typing_ import TileableType
|
|
29
30
|
from ...udf import BuiltinFunction, MarkedFunction
|
|
30
31
|
from ...utils import copy_if_possible, make_dtype, make_dtypes
|
|
31
|
-
from ..core import DATAFRAME_TYPE, DataFrame, IndexValue, Series
|
|
32
|
+
from ..core import DATAFRAME_TYPE, INDEX_TYPE, DataFrame, IndexValue, Series
|
|
32
33
|
from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
33
34
|
from ..utils import (
|
|
34
35
|
InferredDataFrameMeta,
|
|
@@ -43,7 +44,7 @@ from ..utils import (
|
|
|
43
44
|
|
|
44
45
|
class DataFrameApplyChunk(DataFrameOperator, DataFrameOperatorMixin):
|
|
45
46
|
_op_type_ = opcodes.APPLY_CHUNK
|
|
46
|
-
_legacy_name = "DataFrameApplyChunkOperator"
|
|
47
|
+
_legacy_name = "DataFrameApplyChunkOperator" # since v2.0.0
|
|
47
48
|
|
|
48
49
|
func = FunctionField("func")
|
|
49
50
|
batch_rows = Int32Field("batch_rows", default=None)
|
|
@@ -60,16 +61,26 @@ class DataFrameApplyChunk(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
60
61
|
def has_custom_code(self) -> bool:
|
|
61
62
|
return not isinstance(self.func, BuiltinFunction)
|
|
62
63
|
|
|
64
|
+
def check_inputs(self, inputs: List[TileableType]):
|
|
65
|
+
# for apply_chunk we allow called on non-deterministic tileables
|
|
66
|
+
pass
|
|
67
|
+
|
|
63
68
|
def _call_dataframe(self, df, dtypes, dtype, name, index_value, element_wise):
|
|
64
69
|
# return dataframe
|
|
65
70
|
if self.output_types[0] == OutputType.dataframe:
|
|
66
71
|
dtypes = make_dtypes(dtypes)
|
|
72
|
+
if dtypes is not None:
|
|
73
|
+
shape = df.shape if element_wise else (np.nan, len(dtypes))
|
|
74
|
+
cols_value = parse_index(dtypes.index, store_data=True)
|
|
75
|
+
else:
|
|
76
|
+
shape = (np.nan, np.nan)
|
|
77
|
+
cols_value = None
|
|
67
78
|
# apply_chunk will use generate new range index for results
|
|
68
79
|
return self.new_dataframe(
|
|
69
80
|
[df],
|
|
70
|
-
shape=
|
|
81
|
+
shape=shape,
|
|
71
82
|
index_value=index_value,
|
|
72
|
-
columns_value=
|
|
83
|
+
columns_value=cols_value,
|
|
73
84
|
dtypes=dtypes,
|
|
74
85
|
)
|
|
75
86
|
|
|
@@ -106,11 +117,17 @@ class DataFrameApplyChunk(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
106
117
|
name: Any = None,
|
|
107
118
|
output_type=None,
|
|
108
119
|
index=None,
|
|
120
|
+
skip_infer=False,
|
|
109
121
|
):
|
|
110
122
|
args = self.args or ()
|
|
111
123
|
kwargs = self.kwargs or {}
|
|
112
124
|
# if not dtypes and not skip_infer:
|
|
113
|
-
|
|
125
|
+
try:
|
|
126
|
+
packed_func = get_packed_func(df_or_series, self.func, *args, **kwargs)
|
|
127
|
+
except:
|
|
128
|
+
if not skip_infer:
|
|
129
|
+
raise
|
|
130
|
+
packed_func = self.func
|
|
114
131
|
|
|
115
132
|
# if skip_infer, directly build a frame
|
|
116
133
|
if self.output_types and self.output_types[0] == OutputType.df_or_series:
|
|
@@ -125,13 +142,15 @@ class DataFrameApplyChunk(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
125
142
|
dtype=dtype,
|
|
126
143
|
name=name,
|
|
127
144
|
index=index,
|
|
145
|
+
skip_infer=skip_infer,
|
|
128
146
|
)
|
|
129
147
|
|
|
130
148
|
if inferred_meta.index_value is None:
|
|
131
149
|
inferred_meta.index_value = parse_index(
|
|
132
150
|
None, (df_or_series.key, df_or_series.index_value.key, self.func)
|
|
133
151
|
)
|
|
134
|
-
|
|
152
|
+
if not skip_infer:
|
|
153
|
+
inferred_meta.check_absence("output_type", "dtypes", "dtype")
|
|
135
154
|
|
|
136
155
|
if isinstance(df_or_series, DATAFRAME_TYPE):
|
|
137
156
|
return self._call_dataframe(
|
|
@@ -163,6 +182,7 @@ class DataFrameApplyChunk(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
163
182
|
name: Any = None,
|
|
164
183
|
index: Union[pd.Index, IndexValue] = None,
|
|
165
184
|
elementwise: bool = None,
|
|
185
|
+
skip_infer: bool = False,
|
|
166
186
|
**kwargs,
|
|
167
187
|
) -> InferredDataFrameMeta:
|
|
168
188
|
inferred_meta = infer_dataframe_return_value(
|
|
@@ -174,7 +194,10 @@ class DataFrameApplyChunk(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
174
194
|
name=name,
|
|
175
195
|
index=index,
|
|
176
196
|
elementwise=elementwise,
|
|
197
|
+
skip_infer=skip_infer,
|
|
177
198
|
)
|
|
199
|
+
if skip_infer:
|
|
200
|
+
return inferred_meta
|
|
178
201
|
|
|
179
202
|
# merge specified and inferred index, dtypes, output_type
|
|
180
203
|
# elementwise used to decide shape
|
|
@@ -186,6 +209,8 @@ class DataFrameApplyChunk(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
186
209
|
if self.output_types:
|
|
187
210
|
inferred_meta.output_type = self.output_types[0]
|
|
188
211
|
inferred_meta.dtypes = dtypes if dtypes is not None else inferred_meta.dtypes
|
|
212
|
+
if isinstance(index, INDEX_TYPE):
|
|
213
|
+
index = index.index_value
|
|
189
214
|
if index is not None:
|
|
190
215
|
inferred_meta.index_value = (
|
|
191
216
|
parse_index(index)
|
|
@@ -458,6 +483,7 @@ def df_apply_chunk(
|
|
|
458
483
|
name=name,
|
|
459
484
|
index=index,
|
|
460
485
|
output_type=output_type,
|
|
486
|
+
skip_infer=skip_infer,
|
|
461
487
|
)
|
|
462
488
|
|
|
463
489
|
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
# Copyright 1999-2025 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from typing import List
|
|
16
|
+
|
|
17
|
+
import numpy as np
|
|
18
|
+
import pandas as pd
|
|
19
|
+
|
|
20
|
+
from ... import opcodes
|
|
21
|
+
from ...core import EntityData, OutputType
|
|
22
|
+
from ...serialization.serializables import (
|
|
23
|
+
DictField,
|
|
24
|
+
FunctionField,
|
|
25
|
+
KeyField,
|
|
26
|
+
TupleField,
|
|
27
|
+
)
|
|
28
|
+
from ...udf import BuiltinFunction
|
|
29
|
+
from ...utils import quiet_stdio
|
|
30
|
+
from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
31
|
+
from ..utils import (
|
|
32
|
+
build_df,
|
|
33
|
+
build_empty_df,
|
|
34
|
+
build_series,
|
|
35
|
+
parse_index,
|
|
36
|
+
validate_output_types,
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class DataFrameCartesianChunk(DataFrameOperator, DataFrameOperatorMixin):
|
|
41
|
+
_op_type_ = opcodes.CARTESIAN_CHUNK
|
|
42
|
+
|
|
43
|
+
left = KeyField("left")
|
|
44
|
+
right = KeyField("right")
|
|
45
|
+
func = FunctionField("func")
|
|
46
|
+
args = TupleField("args")
|
|
47
|
+
kwargs = DictField("kwargs")
|
|
48
|
+
|
|
49
|
+
def __init__(self, output_types=None, **kw):
|
|
50
|
+
super().__init__(_output_types=output_types, **kw)
|
|
51
|
+
if self.memory_scale is None:
|
|
52
|
+
self.memory_scale = 2.0
|
|
53
|
+
|
|
54
|
+
@classmethod
|
|
55
|
+
def _set_inputs(cls, op: "DataFrameCartesianChunk", inputs: List[EntityData]):
|
|
56
|
+
super()._set_inputs(op, inputs)
|
|
57
|
+
op.left, op.right = op.inputs[:2]
|
|
58
|
+
|
|
59
|
+
@staticmethod
|
|
60
|
+
def _build_test_obj(obj):
|
|
61
|
+
return (
|
|
62
|
+
build_df(obj, size=2)
|
|
63
|
+
if obj.ndim == 2
|
|
64
|
+
else build_series(obj, size=2, name=obj.name)
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
def has_custom_code(self) -> bool:
|
|
68
|
+
return not isinstance(self.func, BuiltinFunction)
|
|
69
|
+
|
|
70
|
+
def __call__(self, left, right, index=None, dtypes=None):
|
|
71
|
+
test_left = self._build_test_obj(left)
|
|
72
|
+
test_right = self._build_test_obj(right)
|
|
73
|
+
output_type = self._output_types[0] if self._output_types else None
|
|
74
|
+
|
|
75
|
+
if output_type == OutputType.df_or_series:
|
|
76
|
+
return self.new_df_or_series([left, right])
|
|
77
|
+
|
|
78
|
+
# try run to infer meta
|
|
79
|
+
try:
|
|
80
|
+
with np.errstate(all="ignore"), quiet_stdio():
|
|
81
|
+
obj = self.func(test_left, test_right, *self.args, **self.kwargs)
|
|
82
|
+
except: # noqa: E722 # nosec # pylint: disable=bare-except
|
|
83
|
+
if output_type == OutputType.series:
|
|
84
|
+
obj = pd.Series([], dtype=np.dtype(object))
|
|
85
|
+
elif output_type == OutputType.dataframe and dtypes is not None:
|
|
86
|
+
obj = build_empty_df(dtypes)
|
|
87
|
+
else:
|
|
88
|
+
raise TypeError(
|
|
89
|
+
"Cannot determine `output_type`, "
|
|
90
|
+
"you have to specify it as `dataframe` or `series`, "
|
|
91
|
+
"for dataframe, `dtypes` is required as well "
|
|
92
|
+
"if output_type='dataframe'"
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
if getattr(obj, "ndim", 0) == 1 or output_type == OutputType.series:
|
|
96
|
+
shape = self.kwargs.pop("shape", (np.nan,))
|
|
97
|
+
if index is None:
|
|
98
|
+
index = obj.index
|
|
99
|
+
index_value = parse_index(
|
|
100
|
+
index, left, right, self.func, self.args, self.kwargs
|
|
101
|
+
)
|
|
102
|
+
return self.new_series(
|
|
103
|
+
[left, right],
|
|
104
|
+
dtype=obj.dtype,
|
|
105
|
+
shape=shape,
|
|
106
|
+
index_value=index_value,
|
|
107
|
+
name=obj.name,
|
|
108
|
+
)
|
|
109
|
+
else:
|
|
110
|
+
dtypes = dtypes if dtypes is not None else obj.dtypes
|
|
111
|
+
# dataframe
|
|
112
|
+
shape = (np.nan, len(dtypes))
|
|
113
|
+
columns_value = parse_index(dtypes.index, store_data=True)
|
|
114
|
+
if index is None:
|
|
115
|
+
index = obj.index
|
|
116
|
+
index_value = parse_index(
|
|
117
|
+
index, left, right, self.func, self.args, self.kwargs
|
|
118
|
+
)
|
|
119
|
+
return self.new_dataframe(
|
|
120
|
+
[left, right],
|
|
121
|
+
shape=shape,
|
|
122
|
+
dtypes=dtypes,
|
|
123
|
+
index_value=index_value,
|
|
124
|
+
columns_value=columns_value,
|
|
125
|
+
)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def cartesian_chunk(left, right, func, skip_infer=False, args=(), **kwargs):
|
|
129
|
+
output_type = kwargs.pop("output_type", None)
|
|
130
|
+
output_types = kwargs.pop("output_types", None)
|
|
131
|
+
object_type = kwargs.pop("object_type", None)
|
|
132
|
+
output_types = validate_output_types(
|
|
133
|
+
output_type=output_type, output_types=output_types, object_type=object_type
|
|
134
|
+
)
|
|
135
|
+
output_type = output_types[0] if output_types else None
|
|
136
|
+
if output_type:
|
|
137
|
+
output_types = [output_type]
|
|
138
|
+
elif skip_infer:
|
|
139
|
+
output_types = [OutputType.df_or_series]
|
|
140
|
+
index = kwargs.pop("index", None)
|
|
141
|
+
dtypes = kwargs.pop("dtypes", None)
|
|
142
|
+
memory_scale = kwargs.pop("memory_scale", None)
|
|
143
|
+
|
|
144
|
+
op = DataFrameCartesianChunk(
|
|
145
|
+
left=left,
|
|
146
|
+
right=right,
|
|
147
|
+
func=func,
|
|
148
|
+
args=args,
|
|
149
|
+
kwargs=kwargs,
|
|
150
|
+
output_types=output_types,
|
|
151
|
+
memory_scale=memory_scale,
|
|
152
|
+
)
|
|
153
|
+
return op(left, right, index=index, dtypes=dtypes)
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
# Copyright 1999-2025 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import numpy as np
|
|
16
|
+
import pandas as pd
|
|
17
|
+
|
|
18
|
+
from ... import opcodes
|
|
19
|
+
from ...serialization.serializables import AnyField, StringField
|
|
20
|
+
from ...utils import no_default
|
|
21
|
+
from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
22
|
+
from ..utils import make_column_list
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class DataFrameCollectKv(DataFrameOperator, DataFrameOperatorMixin):
    """Operator that merges selected columns into a single key-value column."""

    _op_type_ = opcodes.COLLECT_KV

    # columns to merge; None selects every column of the input
    columns = AnyField("columns", default=None)
    # delimiter between a key and its value
    kv_delim = StringField("kv_delim", default=None)
    # delimiter between key-value pairs
    item_delim = StringField("item_delim", default=None)
    # name of the generated key-value column
    kv_col = StringField("kv_col", default=None)

    def __call__(self, df):
        """
        Build the output DataFrame metadata for ``df``.

        The merged columns are removed from the input dtypes and one
        object-typed column named ``kv_col`` is appended at the end.
        """
        merged = self.columns
        if merged is None:
            merged = list(df.dtypes.index)
        elif not isinstance(merged, list):
            # anything that is not a list is treated as one column label
            merged = [merged]

        kv_dtype = pd.Series([np.dtype("object")], index=[self.kv_col])
        # labels missing from the input are ignored here
        out_dtypes = pd.concat([df.dtypes.drop(merged, errors="ignore"), kv_dtype])

        # NOTE(review): columns_value receives a raw pandas Index here;
        # confirm new_dataframe accepts it without wrapping.
        return self.new_dataframe(
            [df],
            shape=(df.shape[0], len(out_dtypes)),
            dtypes=out_dtypes,
            index_value=df.index_value,
            columns_value=out_dtypes.index,
        )
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def collect_kv(
    data,
    columns=None,
    kv_delim="=",
    item_delim=",",
    kv_col="kv_col",
):
    """
    Merge values in specified columns into a key-value represented column.

    Parameters
    ----------
    columns : list, default None
        The columns to be merged. When None, all columns are merged.
    kv_delim : str, default '='
        Delimiter between key and value.
    item_delim : str, default ','
        Delimiter between key-value pairs.
    kv_col : str, default 'kv_col'
        Name of the new key-value column

    Returns
    -------
    DataFrame
        converted data frame

    Raises
    ------
    ValueError
        If any of the specified columns does not exist in the data frame.

    See Also
    --------
    DataFrame.mf.extract_kv

    Examples
    --------
    >>> import numpy as np
    >>> import maxframe.dataframe as md

    >>> df = md.DataFrame({"name": ["name1", "name2", "name3", "name4", "name5"],
    ...                    "k1": [1.0, np.nan, 7.1, np.nan, np.nan],
    ...                    "k2": [3.0, 3.0, np.nan, 1.2, 1.0],
    ...                    "k3": [np.nan, 5.1, np.nan, 1.5, np.nan],
    ...                    "k5": [10.0, np.nan, np.nan, np.nan, np.nan],
    ...                    "k7": [np.nan, np.nan, 8.2, np.nan, np.nan],
    ...                    "k9": [np.nan, np.nan, np.nan, np.nan, 1.1]})
    >>> df.execute()
       name   k1   k2   k3    k5   k7   k9
    0  name1  1.0  3.0  NaN  10.0  NaN  NaN
    1  name2  NaN  3.0  5.1   NaN  NaN  NaN
    2  name3  7.1  NaN  NaN   NaN  8.2  NaN
    3  name4  NaN  1.2  1.5   NaN  NaN  NaN
    4  name5  NaN  1.0  NaN   NaN  NaN  1.1

    The field names to be merged are specified by columns
    kv_delim is to delimit the key and value and '=' is default
    item_delim is to delimit the Key-Value pairs, ',' is default
    The new column name is specified by kv_col, 'kv_col' is default

    >>> df.mf.collect_kv(columns=['k1', 'k2', 'k3', 'k5', 'k7', 'k9']).execute()
       name   kv_col
    0  name1  k1=1.0,k2=3.0,k5=10.0
    1  name2  k2=3.0,k3=5.1
    2  name3  k1=7.1,k7=8.2
    3  name4  k2=1.2,k3=1.5
    4  name5  k2=1.0,k9=1.1
    """
    columns_list = make_column_list(columns, data.dtypes) or []
    # find the first requested column that is absent from the input; an
    # empty column list yields the sentinel, so no extra emptiness check
    # is needed before raising
    non_exist_key = next(
        (c for c in columns_list if c not in data.dtypes.index), no_default
    )
    if non_exist_key is not no_default:
        raise ValueError(f"Column {non_exist_key} specified is not a valid column.")
    # the operator receives the caller's original value so that None keeps
    # its "all columns" meaning
    op = DataFrameCollectKv(
        columns=columns,
        kv_delim=kv_delim,
        item_delim=item_delim,
        kv_col=kv_col,
    )
    return op(data)
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
# Copyright 1999-2025 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from typing import List
|
|
16
|
+
|
|
17
|
+
import numpy as np
|
|
18
|
+
import pandas as pd
|
|
19
|
+
|
|
20
|
+
from ... import opcodes
|
|
21
|
+
from ...core import EntityData, OutputType
|
|
22
|
+
from ...serialization.serializables import AnyField, KeyField, StringField
|
|
23
|
+
from ...utils import make_dtype, no_default
|
|
24
|
+
from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
25
|
+
from ..utils import make_column_list
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class DataFrameExtractKv(DataFrameOperator, DataFrameOperatorMixin):
    """Operator that expands key-value string columns into standalone columns."""

    _op_type_ = opcodes.EXTRACT_KV

    # key-value columns to extract
    columns = AnyField("columns", default=None)
    # delimiter between a key and its value
    kv_delim = StringField("kv_delim", default="=")
    # delimiter between key-value pairs
    item_delim = StringField("item_delim", default=",")
    # type of the generated value columns
    dtype = AnyField("dtype", default=None)
    # default value for missing key-value pairs
    fill_value = AnyField("fill_value", default=None)
    # "raise" makes malformed pairs raise ValueError during key discovery
    errors = StringField("errors", default="raise")
    # intermediate agg data
    agg_results = KeyField("agg_results", default=None)

    def __init__(self, kv_delim="=", item_delim=",", **kw):
        super().__init__(kv_delim=kv_delim, item_delim=item_delim, **kw)
        self.output_types = [OutputType.dataframe]

    @classmethod
    def _set_inputs(cls, op: "DataFrameExtractKv", inputs: List[EntityData]):
        super()._set_inputs(op, inputs)
        # __call__ appends agg_results as the last input, so rebind it from
        # that position whenever it is present
        if op.agg_results is not None:
            op.agg_results = inputs[-1]

    def __call__(self, df):
        """
        Build the output DataFrame for ``df``.

        The distinct output column names are computed by a flatmap over the
        rows, then de-duplicated, sorted and attached as a second input.
        The output column count, dtypes and columns are left unknown.
        """
        # number of output columns depends on the keys found in the data
        shape = (df.shape[0], np.nan)
        errors_arg = self.errors

        def get_keys(row, cols, kv_delim, item_delim):
            """Yield ``"<column>_<key>"`` for each well-formed pair in a row."""
            for col in cols:
                cell = row[col]
                # missing cells may be None, NaN or pd.NA; none of them can
                # be split, so they contribute no keys
                if cell is None or pd.isna(cell):
                    continue
                for pair in cell.split(item_delim):
                    result = pair.split(kv_delim, 1)
                    if len(result) == 2:
                        yield f"{col}_{result[0]}"
                    elif errors_arg == "raise":
                        raise ValueError(f"Malformed data {pair} in column '{col}'.")

        all_keys = df.mf.flatmap(
            get_keys,
            dtypes=pd.Series([str], index=["keys_cols"]),
            cols=self.columns,
            kv_delim=self.kv_delim,
            item_delim=self.item_delim,
        )
        self.agg_results = all_keys.drop_duplicates().sort_values(by="keys_cols")
        inputs = [df, self.agg_results]
        return self.new_dataframe(
            inputs,
            shape=shape,
            dtypes=None,
            index_value=df.index_value,
            columns_value=None,
        )
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def extract_kv(
    data,
    columns=None,
    kv_delim="=",
    item_delim=",",
    dtype="float",
    fill_value=None,
    errors="raise",
):
    """
    Extract values in key-value represented columns into standalone columns.
    New column names will be the name of the key-value column followed by
    an underscore and the key.

    Parameters
    ----------
    columns : list, default None
        The key-value columns to be extracted. When None, all columns
        are used.
    kv_delim : str, default '='
        Delimiter between key and value.
    item_delim : str, default ','
        Delimiter between key-value pairs.
    dtype : str
        Type of value columns to generate.
    fill_value : object, default None
        Default value for missing key-value pairs.
    errors : {'ignore', 'raise'}, default 'raise'
        * If 'raise', then invalid parsing will raise an exception.
        * If 'ignore', then invalid parsing will return the input.

    Returns
    -------
    DataFrame
        extracted data frame

    Raises
    ------
    ValueError
        If a specified column does not exist, or is not of string type.

    See Also
    --------
    DataFrame.mf.collect_kv

    Examples
    --------
    >>> import numpy as np
    >>> import maxframe.dataframe as md

    >>> df = md.DataFrame({"name": ["name1", "name2", "name3", "name4", "name5"],
    ...                    "kv": ["k1=1.0,k2=3.0,k5=10.0",
    ...                           "k2=3.0,k3=5.1",
    ...                           "k1=7.1,k7=8.2",
    ...                           "k2=1.2,k3=1.5",
    ...                           "k2=1.0,k9=1.1"]})
    >>> df.execute()
       name   kv
    0  name1  k1=1.0,k2=3.0,k5=10.0
    1  name2  k2=3.0,k3=5.1
    2  name3  k1=7.1,k7=8.2
    3  name4  k2=1.2,k3=1.5
    4  name5  k2=1.0,k9=1.1

    The field names to be expanded are specified by columns
    kv_delim is to delimit the key and value and '=' is default
    item_delim is to delimit the Key-Value pairs, ',' is default
    The output field name is the original field name connect with the key by "_"
    fill_value is used to fill missing values, None is default

    >>> df.mf.extract_kv(columns=['kv'], kv_delim='=', item_delim=',').execute()
       name   kv_k1  kv_k2  kv_k3  kv_k5  kv_k7  kv_k9
    0  name1  1.0    3.0    NaN    10.0   NaN    NaN
    1  name2  NaN    3.0    5.1    NaN    NaN    NaN
    2  name3  7.1    NaN    NaN    NaN    8.2    NaN
    3  name4  NaN    1.2    1.5    NaN    NaN    NaN
    4  name5  NaN    1.0    NaN    NaN    NaN    1.1
    """
    if columns is None:
        columns = data.dtypes.index.tolist()
    columns_list = make_column_list(columns, data.dtypes)
    # find the first requested column that is absent from the input
    non_exist_key = next(
        (c for c in columns_list if c not in data.dtypes.index), no_default
    )
    if non_exist_key is not no_default:
        raise ValueError(f"Column {non_exist_key} specified is not a valid column.")
    for col in columns_list:
        if str(data.dtypes[col]) not in ("object", "string"):
            raise ValueError(f"Column '{col}' must be of string type.")
    # hand the operator the validated column list rather than the raw
    # argument: the operator iterates over its columns, and a bare string
    # would be walked character by character
    op = DataFrameExtractKv(
        columns=list(columns_list),
        kv_delim=kv_delim,
        item_delim=item_delim,
        dtype=make_dtype(dtype),
        fill_value=fill_value,
        errors=errors,
    )
    return op(data)
|