maxframe 2.0.0b2__cp37-cp37m-win32.whl → 2.3.0rc1__cp37-cp37m-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/__init__.py +1 -0
- maxframe/_utils.cp37-win32.pyd +0 -0
- maxframe/_utils.pyx +14 -1
- maxframe/codegen/core.py +9 -8
- maxframe/codegen/spe/core.py +1 -1
- maxframe/codegen/spe/dataframe/__init__.py +1 -0
- maxframe/codegen/spe/dataframe/accessors/base.py +18 -0
- maxframe/codegen/spe/dataframe/accessors/dict_.py +25 -130
- maxframe/codegen/spe/dataframe/accessors/list_.py +12 -48
- maxframe/codegen/spe/dataframe/accessors/struct_.py +28 -0
- maxframe/codegen/spe/dataframe/arithmetic.py +7 -2
- maxframe/codegen/spe/dataframe/groupby.py +88 -0
- maxframe/codegen/spe/dataframe/indexing.py +99 -4
- maxframe/codegen/spe/dataframe/merge.py +38 -1
- maxframe/codegen/spe/dataframe/misc.py +11 -33
- maxframe/codegen/spe/dataframe/reduction.py +32 -9
- maxframe/codegen/spe/dataframe/reshape.py +46 -0
- maxframe/codegen/spe/dataframe/sort.py +39 -18
- maxframe/codegen/spe/dataframe/tests/accessors/test_dict.py +9 -15
- maxframe/codegen/spe/dataframe/tests/accessors/test_list.py +4 -7
- maxframe/codegen/spe/dataframe/tests/accessors/test_struct.py +75 -0
- maxframe/codegen/spe/dataframe/tests/indexing/test_iloc.py +20 -1
- maxframe/codegen/spe/dataframe/tests/indexing/test_loc.py +35 -0
- maxframe/codegen/spe/dataframe/tests/misc/test_misc.py +0 -32
- maxframe/codegen/spe/dataframe/tests/test_groupby.py +81 -18
- maxframe/codegen/spe/dataframe/tests/test_merge.py +27 -1
- maxframe/codegen/spe/dataframe/tests/test_reduction.py +13 -0
- maxframe/codegen/spe/dataframe/tests/test_reshape.py +79 -0
- maxframe/codegen/spe/dataframe/tests/test_sort.py +20 -0
- maxframe/codegen/spe/dataframe/tseries.py +9 -0
- maxframe/codegen/spe/learn/contrib/lightgbm.py +4 -3
- maxframe/codegen/spe/learn/contrib/tests/test_xgboost.py +2 -1
- maxframe/codegen/spe/learn/metrics/__init__.py +1 -1
- maxframe/codegen/spe/learn/metrics/_ranking.py +76 -0
- maxframe/codegen/spe/learn/metrics/pairwise.py +51 -0
- maxframe/codegen/spe/learn/metrics/tests/test_pairwise.py +36 -0
- maxframe/codegen/spe/learn/metrics/tests/test_ranking.py +59 -0
- maxframe/codegen/spe/tensor/__init__.py +3 -0
- maxframe/codegen/spe/tensor/datasource.py +1 -0
- maxframe/codegen/spe/tensor/fft.py +74 -0
- maxframe/codegen/spe/tensor/linalg.py +29 -2
- maxframe/codegen/spe/tensor/misc.py +79 -25
- maxframe/codegen/spe/tensor/spatial.py +45 -0
- maxframe/codegen/spe/tensor/statistics.py +44 -0
- maxframe/codegen/spe/tensor/tests/test_fft.py +64 -0
- maxframe/codegen/spe/tensor/tests/test_linalg.py +15 -1
- maxframe/codegen/spe/tensor/tests/test_misc.py +52 -2
- maxframe/codegen/spe/tensor/tests/test_spatial.py +33 -0
- maxframe/codegen/spe/tensor/tests/test_statistics.py +15 -1
- maxframe/codegen/spe/tests/test_spe_codegen.py +6 -12
- maxframe/codegen/spe/utils.py +2 -0
- maxframe/config/config.py +73 -9
- maxframe/config/tests/test_validators.py +13 -1
- maxframe/config/validators.py +49 -0
- maxframe/conftest.py +54 -17
- maxframe/core/accessor.py +2 -2
- maxframe/core/base.py +2 -1
- maxframe/core/entity/core.py +5 -0
- maxframe/core/entity/tileables.py +3 -1
- maxframe/core/graph/core.cp37-win32.pyd +0 -0
- maxframe/core/graph/entity.py +8 -3
- maxframe/core/mode.py +6 -1
- maxframe/core/operator/base.py +9 -2
- maxframe/core/operator/core.py +10 -2
- maxframe/core/operator/utils.py +13 -0
- maxframe/dataframe/__init__.py +12 -5
- maxframe/dataframe/accessors/__init__.py +1 -1
- maxframe/dataframe/accessors/compat.py +45 -0
- maxframe/dataframe/accessors/datetime_/__init__.py +4 -1
- maxframe/dataframe/accessors/dict_/contains.py +7 -16
- maxframe/dataframe/accessors/dict_/core.py +48 -0
- maxframe/dataframe/accessors/dict_/getitem.py +17 -21
- maxframe/dataframe/accessors/dict_/length.py +7 -16
- maxframe/dataframe/accessors/dict_/remove.py +6 -18
- maxframe/dataframe/accessors/dict_/setitem.py +8 -18
- maxframe/dataframe/accessors/dict_/tests/test_dict_accessor.py +62 -22
- maxframe/dataframe/accessors/list_/__init__.py +2 -2
- maxframe/dataframe/accessors/list_/core.py +48 -0
- maxframe/dataframe/accessors/list_/getitem.py +12 -19
- maxframe/dataframe/accessors/list_/length.py +7 -16
- maxframe/dataframe/accessors/list_/tests/test_list_accessor.py +11 -9
- maxframe/dataframe/accessors/string_/__init__.py +4 -1
- maxframe/dataframe/accessors/struct_/__init__.py +37 -0
- maxframe/dataframe/accessors/struct_/accessor.py +39 -0
- maxframe/dataframe/accessors/struct_/core.py +43 -0
- maxframe/dataframe/accessors/struct_/dtypes.py +53 -0
- maxframe/dataframe/accessors/struct_/field.py +123 -0
- maxframe/dataframe/accessors/struct_/tests/__init__.py +13 -0
- maxframe/dataframe/accessors/struct_/tests/test_struct_accessor.py +91 -0
- maxframe/dataframe/arithmetic/__init__.py +18 -4
- maxframe/dataframe/arithmetic/between.py +106 -0
- maxframe/dataframe/arithmetic/dot.py +237 -0
- maxframe/dataframe/arithmetic/maximum.py +33 -0
- maxframe/dataframe/arithmetic/minimum.py +33 -0
- maxframe/dataframe/arithmetic/{around.py → round.py} +11 -7
- maxframe/dataframe/core.py +161 -224
- maxframe/dataframe/datasource/__init__.py +18 -0
- maxframe/dataframe/datasource/core.py +6 -0
- maxframe/dataframe/datasource/direct.py +57 -0
- maxframe/dataframe/datasource/from_dict.py +124 -0
- maxframe/dataframe/datasource/from_index.py +1 -1
- maxframe/dataframe/datasource/from_records.py +77 -0
- maxframe/dataframe/datasource/from_tensor.py +109 -41
- maxframe/dataframe/datasource/read_csv.py +21 -14
- maxframe/dataframe/datasource/read_odps_query.py +29 -6
- maxframe/dataframe/datasource/read_odps_table.py +32 -10
- maxframe/dataframe/datasource/read_parquet.py +38 -39
- maxframe/dataframe/datasource/tests/test_datasource.py +37 -0
- maxframe/dataframe/datastore/__init__.py +11 -1
- maxframe/dataframe/datastore/direct.py +268 -0
- maxframe/dataframe/datastore/to_csv.py +29 -41
- maxframe/dataframe/datastore/to_odps.py +36 -4
- maxframe/dataframe/extensions/__init__.py +20 -4
- maxframe/dataframe/extensions/apply_chunk.py +32 -6
- maxframe/dataframe/extensions/cartesian_chunk.py +153 -0
- maxframe/dataframe/extensions/collect_kv.py +126 -0
- maxframe/dataframe/extensions/extract_kv.py +177 -0
- maxframe/dataframe/extensions/flatjson.py +2 -1
- maxframe/dataframe/extensions/map_reduce.py +263 -0
- maxframe/dataframe/extensions/rebalance.py +62 -0
- maxframe/dataframe/extensions/tests/test_apply_chunk.py +9 -2
- maxframe/dataframe/extensions/tests/test_extensions.py +54 -0
- maxframe/dataframe/extensions/tests/test_map_reduce.py +135 -0
- maxframe/dataframe/groupby/__init__.py +17 -2
- maxframe/dataframe/groupby/aggregation.py +86 -49
- maxframe/dataframe/groupby/apply.py +1 -1
- maxframe/dataframe/groupby/apply_chunk.py +19 -5
- maxframe/dataframe/groupby/core.py +116 -16
- maxframe/dataframe/groupby/cum.py +4 -25
- maxframe/dataframe/groupby/expanding.py +264 -0
- maxframe/dataframe/groupby/fill.py +1 -1
- maxframe/dataframe/groupby/getitem.py +12 -5
- maxframe/dataframe/groupby/head.py +11 -1
- maxframe/dataframe/groupby/rank.py +136 -0
- maxframe/dataframe/groupby/rolling.py +206 -0
- maxframe/dataframe/groupby/shift.py +114 -0
- maxframe/dataframe/groupby/tests/test_groupby.py +0 -5
- maxframe/dataframe/indexing/__init__.py +22 -2
- maxframe/dataframe/indexing/droplevel.py +195 -0
- maxframe/dataframe/indexing/filter.py +169 -0
- maxframe/dataframe/indexing/get_level_values.py +76 -0
- maxframe/dataframe/indexing/iat.py +45 -0
- maxframe/dataframe/indexing/iloc.py +152 -12
- maxframe/dataframe/indexing/insert.py +46 -18
- maxframe/dataframe/indexing/loc.py +287 -7
- maxframe/dataframe/indexing/reindex.py +14 -5
- maxframe/dataframe/indexing/rename.py +6 -0
- maxframe/dataframe/indexing/rename_axis.py +2 -2
- maxframe/dataframe/indexing/reorder_levels.py +143 -0
- maxframe/dataframe/indexing/reset_index.py +33 -6
- maxframe/dataframe/indexing/sample.py +8 -0
- maxframe/dataframe/indexing/setitem.py +3 -3
- maxframe/dataframe/indexing/swaplevel.py +185 -0
- maxframe/dataframe/indexing/take.py +99 -0
- maxframe/dataframe/indexing/truncate.py +140 -0
- maxframe/dataframe/indexing/where.py +0 -11
- maxframe/dataframe/indexing/xs.py +148 -0
- maxframe/dataframe/merge/__init__.py +15 -1
- maxframe/dataframe/merge/append.py +97 -98
- maxframe/dataframe/merge/combine.py +244 -0
- maxframe/dataframe/merge/combine_first.py +120 -0
- maxframe/dataframe/merge/compare.py +387 -0
- maxframe/dataframe/merge/concat.py +183 -0
- maxframe/dataframe/merge/update.py +271 -0
- maxframe/dataframe/misc/__init__.py +28 -11
- maxframe/dataframe/misc/_duplicate.py +10 -4
- maxframe/dataframe/misc/apply.py +1 -1
- maxframe/dataframe/misc/check_unique.py +82 -0
- maxframe/dataframe/misc/clip.py +145 -0
- maxframe/dataframe/misc/describe.py +175 -9
- maxframe/dataframe/misc/drop.py +31 -0
- maxframe/dataframe/misc/drop_duplicates.py +2 -2
- maxframe/dataframe/misc/duplicated.py +2 -2
- maxframe/dataframe/misc/get_dummies.py +5 -1
- maxframe/dataframe/misc/infer_dtypes.py +251 -0
- maxframe/dataframe/misc/isin.py +2 -2
- maxframe/dataframe/misc/map.py +125 -18
- maxframe/dataframe/misc/repeat.py +159 -0
- maxframe/dataframe/misc/tests/test_misc.py +48 -3
- maxframe/dataframe/misc/to_numeric.py +3 -0
- maxframe/dataframe/misc/transform.py +12 -5
- maxframe/dataframe/misc/transpose.py +13 -1
- maxframe/dataframe/misc/valid_index.py +115 -0
- maxframe/dataframe/misc/value_counts.py +38 -4
- maxframe/dataframe/missing/checkna.py +14 -6
- maxframe/dataframe/missing/dropna.py +5 -0
- maxframe/dataframe/missing/fillna.py +1 -1
- maxframe/dataframe/missing/replace.py +7 -4
- maxframe/dataframe/reduction/__init__.py +35 -16
- maxframe/dataframe/reduction/aggregation.py +43 -14
- maxframe/dataframe/reduction/all.py +2 -2
- maxframe/dataframe/reduction/any.py +2 -2
- maxframe/dataframe/reduction/argmax.py +103 -0
- maxframe/dataframe/reduction/argmin.py +103 -0
- maxframe/dataframe/reduction/core.py +80 -24
- maxframe/dataframe/reduction/count.py +13 -9
- maxframe/dataframe/reduction/cov.py +166 -0
- maxframe/dataframe/reduction/cummax.py +2 -2
- maxframe/dataframe/reduction/cummin.py +2 -2
- maxframe/dataframe/reduction/cumprod.py +2 -2
- maxframe/dataframe/reduction/cumsum.py +2 -2
- maxframe/dataframe/reduction/custom_reduction.py +2 -2
- maxframe/dataframe/reduction/idxmax.py +185 -0
- maxframe/dataframe/reduction/idxmin.py +185 -0
- maxframe/dataframe/reduction/kurtosis.py +37 -30
- maxframe/dataframe/reduction/max.py +2 -2
- maxframe/dataframe/reduction/mean.py +9 -7
- maxframe/dataframe/reduction/median.py +2 -2
- maxframe/dataframe/reduction/min.py +2 -2
- maxframe/dataframe/reduction/mode.py +144 -0
- maxframe/dataframe/reduction/nunique.py +19 -11
- maxframe/dataframe/reduction/prod.py +18 -13
- maxframe/dataframe/reduction/reduction_size.py +2 -2
- maxframe/dataframe/reduction/sem.py +13 -9
- maxframe/dataframe/reduction/skew.py +31 -27
- maxframe/dataframe/reduction/str_concat.py +10 -7
- maxframe/dataframe/reduction/sum.py +18 -14
- maxframe/dataframe/reduction/tests/test_reduction.py +12 -0
- maxframe/dataframe/reduction/unique.py +20 -3
- maxframe/dataframe/reduction/var.py +16 -12
- maxframe/dataframe/reshape/__init__.py +38 -0
- maxframe/dataframe/{misc → reshape}/pivot.py +1 -0
- maxframe/dataframe/{misc → reshape}/pivot_table.py +1 -0
- maxframe/dataframe/reshape/unstack.py +114 -0
- maxframe/dataframe/sort/__init__.py +16 -1
- maxframe/dataframe/sort/argsort.py +68 -0
- maxframe/dataframe/sort/core.py +2 -1
- maxframe/dataframe/sort/nlargest.py +238 -0
- maxframe/dataframe/sort/nsmallest.py +228 -0
- maxframe/dataframe/sort/rank.py +147 -0
- maxframe/dataframe/statistics/__init__.py +3 -3
- maxframe/dataframe/statistics/corr.py +1 -0
- maxframe/dataframe/statistics/quantile.py +2 -2
- maxframe/dataframe/tests/test_typing.py +104 -0
- maxframe/dataframe/tests/test_utils.py +66 -2
- maxframe/dataframe/tseries/__init__.py +19 -0
- maxframe/dataframe/tseries/at_time.py +61 -0
- maxframe/dataframe/tseries/between_time.py +122 -0
- maxframe/dataframe/typing_.py +185 -0
- maxframe/dataframe/utils.py +125 -52
- maxframe/dataframe/window/aggregation.py +8 -4
- maxframe/dataframe/window/core.py +14 -1
- maxframe/dataframe/window/ewm.py +1 -3
- maxframe/dataframe/window/expanding.py +37 -35
- maxframe/dataframe/window/rolling.py +49 -39
- maxframe/dataframe/window/tests/test_expanding.py +1 -7
- maxframe/dataframe/window/tests/test_rolling.py +1 -1
- maxframe/env.py +7 -4
- maxframe/errors.py +2 -2
- maxframe/io/odpsio/schema.py +9 -3
- maxframe/io/odpsio/tableio.py +7 -2
- maxframe/io/odpsio/tests/test_schema.py +198 -83
- maxframe/learn/__init__.py +10 -2
- maxframe/learn/cluster/__init__.py +15 -0
- maxframe/learn/cluster/_kmeans.py +782 -0
- maxframe/learn/contrib/llm/core.py +18 -7
- maxframe/learn/contrib/llm/deploy/__init__.py +13 -0
- maxframe/learn/contrib/llm/deploy/config.py +221 -0
- maxframe/learn/contrib/llm/deploy/core.py +247 -0
- maxframe/learn/contrib/llm/deploy/framework.py +35 -0
- maxframe/learn/contrib/llm/deploy/loader.py +360 -0
- maxframe/learn/contrib/llm/deploy/tests/__init__.py +13 -0
- maxframe/learn/contrib/llm/deploy/tests/test_register_models.py +359 -0
- maxframe/learn/contrib/llm/models/__init__.py +1 -0
- maxframe/learn/contrib/llm/models/dashscope.py +12 -6
- maxframe/learn/contrib/llm/models/managed.py +76 -11
- maxframe/learn/contrib/llm/models/openai.py +72 -0
- maxframe/learn/contrib/llm/tests/__init__.py +13 -0
- maxframe/learn/contrib/llm/tests/test_core.py +34 -0
- maxframe/learn/contrib/llm/tests/test_openai.py +187 -0
- maxframe/learn/contrib/llm/tests/test_text_gen.py +155 -0
- maxframe/learn/contrib/llm/text.py +348 -42
- maxframe/learn/contrib/models.py +4 -1
- maxframe/learn/contrib/xgboost/classifier.py +2 -0
- maxframe/learn/contrib/xgboost/core.py +113 -4
- maxframe/learn/contrib/xgboost/predict.py +4 -2
- maxframe/learn/contrib/xgboost/regressor.py +5 -0
- maxframe/learn/contrib/xgboost/train.py +7 -2
- maxframe/learn/core.py +66 -0
- maxframe/learn/linear_model/_base.py +58 -1
- maxframe/learn/linear_model/_lin_reg.py +1 -1
- maxframe/learn/metrics/__init__.py +6 -0
- maxframe/learn/metrics/_classification.py +145 -0
- maxframe/learn/metrics/_ranking.py +477 -0
- maxframe/learn/metrics/_scorer.py +60 -0
- maxframe/learn/metrics/pairwise/__init__.py +21 -0
- maxframe/learn/metrics/pairwise/core.py +77 -0
- maxframe/learn/metrics/pairwise/cosine.py +115 -0
- maxframe/learn/metrics/pairwise/euclidean.py +176 -0
- maxframe/learn/metrics/pairwise/haversine.py +96 -0
- maxframe/learn/metrics/pairwise/manhattan.py +80 -0
- maxframe/learn/metrics/pairwise/pairwise.py +127 -0
- maxframe/learn/metrics/pairwise/pairwise_distances_topk.py +121 -0
- maxframe/learn/metrics/pairwise/rbf_kernel.py +51 -0
- maxframe/learn/metrics/tests/__init__.py +13 -0
- maxframe/learn/metrics/tests/test_scorer.py +26 -0
- maxframe/learn/preprocessing/_data/min_max_scaler.py +34 -23
- maxframe/learn/preprocessing/_data/standard_scaler.py +34 -25
- maxframe/learn/utils/__init__.py +2 -1
- maxframe/learn/utils/checks.py +1 -2
- maxframe/learn/utils/core.py +59 -0
- maxframe/learn/utils/extmath.py +79 -9
- maxframe/learn/utils/odpsio.py +262 -0
- maxframe/learn/utils/validation.py +2 -2
- maxframe/lib/compat.py +40 -0
- maxframe/lib/dtypes_extension/__init__.py +16 -1
- maxframe/lib/dtypes_extension/_fake_arrow_dtype.py +604 -0
- maxframe/lib/dtypes_extension/blob.py +304 -0
- maxframe/lib/dtypes_extension/dtypes.py +40 -0
- maxframe/lib/dtypes_extension/tests/test_blob.py +88 -0
- maxframe/lib/dtypes_extension/tests/test_dtypes.py +16 -1
- maxframe/lib/dtypes_extension/tests/test_fake_arrow_dtype.py +75 -0
- maxframe/lib/filesystem/_oss_lib/common.py +124 -50
- maxframe/lib/filesystem/_oss_lib/glob.py +1 -1
- maxframe/lib/filesystem/_oss_lib/handle.py +21 -25
- maxframe/lib/filesystem/base.py +1 -1
- maxframe/lib/filesystem/core.py +1 -1
- maxframe/lib/filesystem/oss.py +115 -46
- maxframe/lib/filesystem/tests/test_oss.py +74 -36
- maxframe/lib/mmh3.cp37-win32.pyd +0 -0
- maxframe/lib/wrapped_pickle.py +10 -0
- maxframe/opcodes.py +41 -15
- maxframe/protocol.py +12 -0
- maxframe/remote/core.py +4 -0
- maxframe/serialization/__init__.py +11 -2
- maxframe/serialization/arrow.py +38 -13
- maxframe/serialization/blob.py +32 -0
- maxframe/serialization/core.cp37-win32.pyd +0 -0
- maxframe/serialization/core.pyx +39 -1
- maxframe/serialization/exception.py +2 -4
- maxframe/serialization/numpy.py +11 -0
- maxframe/serialization/pandas.py +46 -9
- maxframe/serialization/serializables/core.py +2 -2
- maxframe/serialization/tests/test_serial.py +31 -4
- maxframe/tensor/__init__.py +38 -8
- maxframe/tensor/arithmetic/__init__.py +19 -10
- maxframe/tensor/arithmetic/core.py +2 -2
- maxframe/tensor/arithmetic/iscomplexobj.py +53 -0
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +6 -9
- maxframe/tensor/core.py +6 -2
- maxframe/tensor/datasource/tests/test_datasource.py +2 -1
- maxframe/tensor/extensions/__init__.py +2 -0
- maxframe/tensor/extensions/apply_chunk.py +3 -3
- maxframe/tensor/extensions/rebalance.py +65 -0
- maxframe/tensor/fft/__init__.py +32 -0
- maxframe/tensor/fft/core.py +168 -0
- maxframe/tensor/fft/fft.py +112 -0
- maxframe/tensor/fft/fft2.py +118 -0
- maxframe/tensor/fft/fftfreq.py +80 -0
- maxframe/tensor/fft/fftn.py +123 -0
- maxframe/tensor/fft/fftshift.py +79 -0
- maxframe/tensor/fft/hfft.py +112 -0
- maxframe/tensor/fft/ifft.py +114 -0
- maxframe/tensor/fft/ifft2.py +115 -0
- maxframe/tensor/fft/ifftn.py +123 -0
- maxframe/tensor/fft/ifftshift.py +73 -0
- maxframe/tensor/fft/ihfft.py +93 -0
- maxframe/tensor/fft/irfft.py +118 -0
- maxframe/tensor/fft/irfft2.py +62 -0
- maxframe/tensor/fft/irfftn.py +114 -0
- maxframe/tensor/fft/rfft.py +116 -0
- maxframe/tensor/fft/rfft2.py +63 -0
- maxframe/tensor/fft/rfftfreq.py +87 -0
- maxframe/tensor/fft/rfftn.py +113 -0
- maxframe/tensor/indexing/fill_diagonal.py +1 -7
- maxframe/tensor/linalg/__init__.py +7 -0
- maxframe/tensor/linalg/_einsumfunc.py +1025 -0
- maxframe/tensor/linalg/cholesky.py +117 -0
- maxframe/tensor/linalg/einsum.py +339 -0
- maxframe/tensor/linalg/lstsq.py +100 -0
- maxframe/tensor/linalg/matrix_norm.py +75 -0
- maxframe/tensor/linalg/norm.py +249 -0
- maxframe/tensor/linalg/solve.py +72 -0
- maxframe/tensor/linalg/solve_triangular.py +2 -2
- maxframe/tensor/linalg/vector_norm.py +113 -0
- maxframe/tensor/misc/__init__.py +24 -1
- maxframe/tensor/misc/argwhere.py +72 -0
- maxframe/tensor/misc/array_split.py +46 -0
- maxframe/tensor/misc/broadcast_arrays.py +57 -0
- maxframe/tensor/misc/copyto.py +130 -0
- maxframe/tensor/misc/delete.py +104 -0
- maxframe/tensor/misc/dsplit.py +68 -0
- maxframe/tensor/misc/ediff1d.py +74 -0
- maxframe/tensor/misc/expand_dims.py +85 -0
- maxframe/tensor/misc/flip.py +90 -0
- maxframe/tensor/misc/fliplr.py +64 -0
- maxframe/tensor/misc/flipud.py +68 -0
- maxframe/tensor/misc/hsplit.py +85 -0
- maxframe/tensor/misc/insert.py +139 -0
- maxframe/tensor/misc/moveaxis.py +83 -0
- maxframe/tensor/misc/result_type.py +88 -0
- maxframe/tensor/misc/roll.py +124 -0
- maxframe/tensor/misc/rollaxis.py +77 -0
- maxframe/tensor/misc/shape.py +89 -0
- maxframe/tensor/misc/split.py +190 -0
- maxframe/tensor/misc/tile.py +109 -0
- maxframe/tensor/misc/vsplit.py +74 -0
- maxframe/tensor/reduction/array_equal.py +2 -1
- maxframe/tensor/sort/__init__.py +2 -0
- maxframe/tensor/sort/argpartition.py +98 -0
- maxframe/tensor/sort/partition.py +228 -0
- maxframe/tensor/spatial/__init__.py +15 -0
- maxframe/tensor/spatial/distance/__init__.py +17 -0
- maxframe/tensor/spatial/distance/cdist.py +421 -0
- maxframe/tensor/spatial/distance/pdist.py +398 -0
- maxframe/tensor/spatial/distance/squareform.py +153 -0
- maxframe/tensor/special/__init__.py +159 -21
- maxframe/tensor/special/airy.py +55 -0
- maxframe/tensor/special/bessel.py +199 -0
- maxframe/tensor/special/core.py +65 -4
- maxframe/tensor/special/ellip_func_integrals.py +155 -0
- maxframe/tensor/special/ellip_harm.py +55 -0
- maxframe/tensor/special/err_fresnel.py +223 -0
- maxframe/tensor/special/gamma_funcs.py +303 -0
- maxframe/tensor/special/hypergeometric_funcs.py +69 -0
- maxframe/tensor/special/info_theory.py +189 -0
- maxframe/tensor/special/misc.py +21 -0
- maxframe/tensor/statistics/__init__.py +6 -0
- maxframe/tensor/statistics/corrcoef.py +77 -0
- maxframe/tensor/statistics/cov.py +222 -0
- maxframe/tensor/statistics/digitize.py +126 -0
- maxframe/tensor/statistics/histogram.py +520 -0
- maxframe/tensor/statistics/median.py +85 -0
- maxframe/tensor/statistics/ptp.py +89 -0
- maxframe/tensor/utils.py +3 -3
- maxframe/tests/test_udf.py +61 -0
- maxframe/tests/test_utils.py +51 -6
- maxframe/tests/utils.py +0 -2
- maxframe/typing_.py +2 -0
- maxframe/udf.py +130 -9
- maxframe/utils.py +254 -27
- {maxframe-2.0.0b2.dist-info → maxframe-2.3.0rc1.dist-info}/METADATA +3 -3
- {maxframe-2.0.0b2.dist-info → maxframe-2.3.0rc1.dist-info}/RECORD +442 -264
- maxframe_client/fetcher.py +35 -4
- maxframe_client/session/odps.py +7 -2
- maxframe_client/session/task.py +8 -1
- maxframe_client/tests/test_fetcher.py +76 -3
- maxframe_client/tests/test_session.py +28 -1
- maxframe/dataframe/arrays.py +0 -864
- /maxframe/dataframe/{misc → reshape}/melt.py +0 -0
- /maxframe/dataframe/{misc → reshape}/stack.py +0 -0
- {maxframe-2.0.0b2.dist-info → maxframe-2.3.0rc1.dist-info}/WHEEL +0 -0
- {maxframe-2.0.0b2.dist-info → maxframe-2.3.0rc1.dist-info}/top_level.txt +0 -0
maxframe/env.py
CHANGED
|
@@ -17,12 +17,14 @@ MAXFRAME_NAMESPACE = "MAXFRAME_NAMESPACE"
|
|
|
17
17
|
|
|
18
18
|
# Maxframe Service common envs
|
|
19
19
|
MAXFRAME_HTTP_PORT_FILE = "MAXFRAME_PROXY_PORT_FILE"
|
|
20
|
-
|
|
21
|
-
|
|
20
|
+
MAXFRAME_INSIDE_TASK = "MAXFRAME_INSIDE_TASK"
|
|
21
|
+
MAXFRAME_SERVICE_BASE_URL = "MF_SERVICE_BASE_URL"
|
|
22
|
+
MAXFRAME_SERVICE_ALLOW_ORIGIN = "MAXFRAME_SERVICE_ALLOW_ORIGIN"
|
|
22
23
|
MAXFRAME_SERVICE_LISTEN_ADDRESS = "MAXFRAME_SERVICE_LISTEN_ADDRESS"
|
|
23
24
|
MAXFRAME_SERVICE_LOG_CONFIG_FILE = "MAXFRAME_SERVICE_LOG_CONFIG_FILE"
|
|
24
|
-
|
|
25
|
-
|
|
25
|
+
MAXFRAME_SERVICE_PORT = "MAXFRAME_SERVICE_PORT"
|
|
26
|
+
MAXFRAME_SERVICE_PORT_RETRIES = "MAXFRAME_SERVICE_PORT_RETRIES"
|
|
27
|
+
MAXFRAME_USER_LOG_CONFIG_FILE = "MAXFRAME_USER_LOG_CONFIG_FILE"
|
|
26
28
|
|
|
27
29
|
# ODPS envs
|
|
28
30
|
ODPS_BEARER_TOKEN = "ODPS_BEARER_TOKEN"
|
|
@@ -31,4 +33,5 @@ ODPS_BEARER_TOKEN_TIMESTAMP_FILE = "ODPS_BEARER_TOKEN_TIMESTAMP_FILE"
|
|
|
31
33
|
ODPS_PROJECT_NAME = "ODPS_PROJECT_NAME"
|
|
32
34
|
ODPS_ENDPOINT = "ODPS_ENDPOINT"
|
|
33
35
|
ODPS_TUNNEL_ENDPOINT = "ODPS_TUNNEL_ENDPOINT"
|
|
36
|
+
ODPS_NAMESPACE = "ODPS_NAMESPACE"
|
|
34
37
|
ODPS_STORAGE_API_ENDPOINT = "ODPS_STORAGE_API_ENDPOINT"
|
maxframe/errors.py
CHANGED
|
@@ -43,5 +43,5 @@ class SessionAlreadyClosedError(MaxFrameError):
|
|
|
43
43
|
|
|
44
44
|
|
|
45
45
|
class EngineUnavailableError(MaxFrameIntentionalError):
|
|
46
|
-
def __init__(self,
|
|
47
|
-
super().__init__(
|
|
46
|
+
def __init__(self, msg: str):
|
|
47
|
+
super().__init__(msg)
|
maxframe/io/odpsio/schema.py
CHANGED
|
@@ -22,9 +22,10 @@ import pyarrow as pa
|
|
|
22
22
|
from odps import types as odps_types
|
|
23
23
|
from pandas.api import types as pd_types
|
|
24
24
|
|
|
25
|
+
from ...config import options
|
|
25
26
|
from ...core import TILEABLE_TYPE, OutputType
|
|
26
27
|
from ...dataframe.core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE
|
|
27
|
-
from ...lib.dtypes_extension import ArrowDtype
|
|
28
|
+
from ...lib.dtypes_extension import ArrowBlobType, ArrowDtype
|
|
28
29
|
from ...protocol import DataFrameTableMeta
|
|
29
30
|
from ...tensor.core import TENSOR_TYPE
|
|
30
31
|
from ...utils import build_temp_table_name
|
|
@@ -65,7 +66,11 @@ _odps_type_to_arrow = {
|
|
|
65
66
|
odps_types.timestamp_ntz: pa.timestamp("ns"),
|
|
66
67
|
}
|
|
67
68
|
|
|
68
|
-
|
|
69
|
+
if hasattr(odps_types, "blob"):
|
|
70
|
+
_arrow_to_odps_types[ArrowBlobType()] = odps_types.blob
|
|
71
|
+
_odps_type_to_arrow[odps_types.blob] = ArrowBlobType()
|
|
72
|
+
|
|
73
|
+
_based_for_pandas_pa_types = (pa.ListType, pa.MapType, pa.StructType)
|
|
69
74
|
|
|
70
75
|
|
|
71
76
|
def is_based_for_pandas_dtype(arrow_type: pa.DataType) -> bool:
|
|
@@ -204,9 +209,10 @@ def odps_schema_to_pandas_dtypes(
|
|
|
204
209
|
def arrow_table_to_pandas_dataframe(
|
|
205
210
|
table: pa.Table, meta: DataFrameTableMeta = None
|
|
206
211
|
) -> pd.DataFrame:
|
|
212
|
+
use_arrow_backend = options.dataframe.dtype_backend == "pyarrow"
|
|
207
213
|
df = table.to_pandas(
|
|
208
214
|
types_mapper=lambda x: (
|
|
209
|
-
ArrowDtype(x) if is_based_for_pandas_dtype(x) else None
|
|
215
|
+
ArrowDtype(x) if is_based_for_pandas_dtype(x) or use_arrow_backend else None
|
|
210
216
|
),
|
|
211
217
|
ignore_metadata=True,
|
|
212
218
|
)
|
maxframe/io/odpsio/tableio.py
CHANGED
|
@@ -274,6 +274,7 @@ class TunnelTableIO(ODPSTableIO):
|
|
|
274
274
|
full_table_name: str,
|
|
275
275
|
partitions: List[Optional[str]] = None,
|
|
276
276
|
reopen: bool = False,
|
|
277
|
+
timeout: Optional[float] = None,
|
|
277
278
|
) -> Dict[Optional[str], TableDownloadSession]:
|
|
278
279
|
table = odps_entry.get_table(full_table_name)
|
|
279
280
|
tunnel = TableTunnel(odps_entry, quota_name=options.tunnel_quota_name)
|
|
@@ -295,14 +296,18 @@ class TunnelTableIO(ODPSTableIO):
|
|
|
295
296
|
):
|
|
296
297
|
down_id = cls._down_session_ids[part_key]
|
|
297
298
|
down_session = tunnel.create_download_session(
|
|
298
|
-
table,
|
|
299
|
+
table,
|
|
300
|
+
async_mode=True,
|
|
301
|
+
partition_spec=part,
|
|
302
|
+
download_id=down_id,
|
|
303
|
+
timeout=timeout,
|
|
299
304
|
)
|
|
300
305
|
if down_session.status != TableDownloadStatus.Normal:
|
|
301
306
|
down_session = None
|
|
302
307
|
|
|
303
308
|
if down_session is None:
|
|
304
309
|
down_session = tunnel.create_download_session(
|
|
305
|
-
table, async_mode=True, partition_spec=part
|
|
310
|
+
table, async_mode=True, partition_spec=part, timeout=timeout
|
|
306
311
|
)
|
|
307
312
|
|
|
308
313
|
while len(cls._down_session_ids) >= _DOWNLOAD_ID_CACHE_SIZE:
|
|
@@ -12,6 +12,8 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
+
import os
|
|
16
|
+
|
|
15
17
|
import numpy as np
|
|
16
18
|
import pandas as pd
|
|
17
19
|
import pyarrow as pa
|
|
@@ -19,9 +21,11 @@ import pytest
|
|
|
19
21
|
from odps import types as odps_types
|
|
20
22
|
|
|
21
23
|
from .... import dataframe as md
|
|
24
|
+
from .... import env
|
|
22
25
|
from .... import tensor as mt
|
|
26
|
+
from ....config import option_context, options
|
|
23
27
|
from ....core import OutputType
|
|
24
|
-
from ....lib.dtypes_extension import ArrowDtype, dict_, list_
|
|
28
|
+
from ....lib.dtypes_extension import ArrowBlobType, ArrowDtype, dict_, list_
|
|
25
29
|
from ....utils import pd_release_version
|
|
26
30
|
from ..schema import (
|
|
27
31
|
arrow_schema_to_odps_schema,
|
|
@@ -35,6 +39,16 @@ from ..schema import (
|
|
|
35
39
|
)
|
|
36
40
|
|
|
37
41
|
|
|
42
|
+
@pytest.fixture
|
|
43
|
+
def set_dtype_backend(request):
|
|
44
|
+
os.environ[env.MAXFRAME_INSIDE_TASK] = "1"
|
|
45
|
+
with option_context({"dataframe.dtype_backend": request.param}):
|
|
46
|
+
try:
|
|
47
|
+
yield request.param
|
|
48
|
+
finally:
|
|
49
|
+
os.environ.pop(env.MAXFRAME_INSIDE_TASK)
|
|
50
|
+
|
|
51
|
+
|
|
38
52
|
def _wrap_maxframe_obj(obj, wrap="no"):
|
|
39
53
|
if wrap == "no":
|
|
40
54
|
return obj
|
|
@@ -54,7 +68,9 @@ def _wrap_maxframe_obj(obj, wrap="no"):
|
|
|
54
68
|
|
|
55
69
|
|
|
56
70
|
@pytest.mark.parametrize("wrap_obj", ["no", "yes", "data"])
|
|
57
|
-
|
|
71
|
+
@pytest.mark.parametrize("set_dtype_backend", ["numpy", "pyarrow"], indirect=True)
|
|
72
|
+
def test_pandas_to_odps_schema_dataframe(wrap_obj, set_dtype_backend):
|
|
73
|
+
# Test with a simple DataFrame
|
|
58
74
|
data = pd.DataFrame(np.random.rand(100, 5), columns=list("ABCDE"))
|
|
59
75
|
|
|
60
76
|
test_df = _wrap_maxframe_obj(data, wrap=wrap_obj)
|
|
@@ -71,6 +87,7 @@ def test_pandas_to_odps_schema_dataframe(wrap_obj):
|
|
|
71
87
|
assert meta.pd_column_level_names == [None]
|
|
72
88
|
assert meta.pd_index_level_names == [None]
|
|
73
89
|
|
|
90
|
+
# Test with ignore_index=True to exclude index from schema
|
|
74
91
|
test_df = _wrap_maxframe_obj(data, wrap=wrap_obj)
|
|
75
92
|
schema, meta = pandas_to_odps_schema(test_df, ignore_index=True)
|
|
76
93
|
assert [c.name for c in schema.columns] == list(test_df.dtypes.index.str.lower())
|
|
@@ -81,6 +98,7 @@ def test_pandas_to_odps_schema_dataframe(wrap_obj):
|
|
|
81
98
|
assert meta.pd_column_level_names == [None]
|
|
82
99
|
assert meta.pd_index_level_names == []
|
|
83
100
|
|
|
101
|
+
# Test with MultiIndex columns and index
|
|
84
102
|
data.columns = pd.MultiIndex.from_tuples(
|
|
85
103
|
[("A", "A"), ("A", "B"), ("A", "C"), ("B", "A"), ("B", "B")], names=["c1", "c2"]
|
|
86
104
|
)
|
|
@@ -105,7 +123,9 @@ def test_pandas_to_odps_schema_dataframe(wrap_obj):
|
|
|
105
123
|
|
|
106
124
|
|
|
107
125
|
@pytest.mark.parametrize("wrap_obj", ["no", "yes", "data"])
|
|
108
|
-
|
|
126
|
+
@pytest.mark.parametrize("set_dtype_backend", ["numpy", "pyarrow"], indirect=True)
|
|
127
|
+
def test_pandas_to_odps_schema_series(wrap_obj, set_dtype_backend):
|
|
128
|
+
# Test with a simple Series
|
|
109
129
|
data = pd.Series(np.random.rand(100))
|
|
110
130
|
|
|
111
131
|
test_s = _wrap_maxframe_obj(data, wrap=wrap_obj)
|
|
@@ -119,6 +139,7 @@ def test_pandas_to_odps_schema_series(wrap_obj):
|
|
|
119
139
|
assert meta.pd_column_level_names == [None]
|
|
120
140
|
assert meta.pd_index_level_names == [None]
|
|
121
141
|
|
|
142
|
+
# Test with ignore_index=True to exclude index from schema
|
|
122
143
|
schema, meta = pandas_to_odps_schema(test_s, ignore_index=True)
|
|
123
144
|
assert [c.name for c in schema.columns] == ["_data"]
|
|
124
145
|
assert [c.type.name for c in schema.columns] == ["double"]
|
|
@@ -128,6 +149,7 @@ def test_pandas_to_odps_schema_series(wrap_obj):
|
|
|
128
149
|
assert meta.pd_column_level_names == [None]
|
|
129
150
|
assert meta.pd_index_level_names == []
|
|
130
151
|
|
|
152
|
+
# Test with named Series and MultiIndex
|
|
131
153
|
data.index = pd.MultiIndex.from_arrays(
|
|
132
154
|
[np.random.choice(list("ABC"), 100), np.random.randint(0, 10, 100)],
|
|
133
155
|
names=["c1", "c2"],
|
|
@@ -146,7 +168,9 @@ def test_pandas_to_odps_schema_series(wrap_obj):
|
|
|
146
168
|
|
|
147
169
|
|
|
148
170
|
@pytest.mark.parametrize("wrap_obj", ["no", "yes", "data"])
|
|
149
|
-
|
|
171
|
+
@pytest.mark.parametrize("set_dtype_backend", ["numpy", "pyarrow"], indirect=True)
|
|
172
|
+
def test_pandas_to_odps_schema_index(wrap_obj, set_dtype_backend):
|
|
173
|
+
# Test with a simple Index
|
|
150
174
|
data = pd.Index(np.random.randint(0, 100, 100))
|
|
151
175
|
|
|
152
176
|
test_idx = _wrap_maxframe_obj(data, wrap=wrap_obj)
|
|
@@ -162,6 +186,7 @@ def test_pandas_to_odps_schema_index(wrap_obj):
|
|
|
162
186
|
assert meta.pd_column_level_names == []
|
|
163
187
|
assert meta.pd_index_level_names == [None]
|
|
164
188
|
|
|
189
|
+
# Test with MultiIndex
|
|
165
190
|
data = pd.MultiIndex.from_arrays(
|
|
166
191
|
[np.random.choice(list("ABC"), 100), np.random.randint(0, 10, 100)],
|
|
167
192
|
names=["c1", "c2"],
|
|
@@ -178,7 +203,8 @@ def test_pandas_to_odps_schema_index(wrap_obj):
|
|
|
178
203
|
|
|
179
204
|
|
|
180
205
|
@pytest.mark.parametrize("wrap_obj", ["no", "yes", "data"])
|
|
181
|
-
|
|
206
|
+
@pytest.mark.parametrize("set_dtype_backend", ["numpy", "pyarrow"], indirect=True)
|
|
207
|
+
def test_pandas_to_odps_schema_scalar(wrap_obj, set_dtype_backend):
|
|
182
208
|
data = 1234.56
|
|
183
209
|
|
|
184
210
|
test_scalar = _wrap_maxframe_obj(data, wrap=wrap_obj)
|
|
@@ -196,7 +222,8 @@ def test_pandas_to_odps_schema_scalar(wrap_obj):
|
|
|
196
222
|
|
|
197
223
|
|
|
198
224
|
@pytest.mark.parametrize("wrap_obj", ["no", "yes", "data"])
|
|
199
|
-
|
|
225
|
+
@pytest.mark.parametrize("set_dtype_backend", ["numpy", "pyarrow"], indirect=True)
|
|
226
|
+
def test_pandas_to_odps_schema_tensor(wrap_obj, set_dtype_backend):
|
|
200
227
|
data = np.array([1, 2, 3])
|
|
201
228
|
|
|
202
229
|
test_tensor = _wrap_maxframe_obj(data, wrap=wrap_obj)
|
|
@@ -214,6 +241,7 @@ def test_pandas_to_odps_schema_tensor(wrap_obj):
|
|
|
214
241
|
|
|
215
242
|
|
|
216
243
|
def test_odps_arrow_schema_conversion():
|
|
244
|
+
# Create an ODPS schema with various data types
|
|
217
245
|
odps_schema = odps_types.OdpsSchema(
|
|
218
246
|
[
|
|
219
247
|
odps_types.Column("col1", "string"),
|
|
@@ -293,110 +321,168 @@ def test_odps_arrow_schema_conversion():
|
|
|
293
321
|
c.type for c in odps_schema2.columns
|
|
294
322
|
]
|
|
295
323
|
|
|
324
|
+
# Test that unsupported data types raise TypeError
|
|
296
325
|
with pytest.raises(TypeError):
|
|
297
326
|
arrow_schema_to_odps_schema(pa.schema([("col1", pa.float16())]))
|
|
298
327
|
|
|
299
328
|
|
|
300
|
-
def
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
odps_types.Column("col4", "smallint"),
|
|
307
|
-
odps_types.Column("col5", "int"),
|
|
308
|
-
odps_types.Column("col6", "bigint"),
|
|
309
|
-
odps_types.Column("col7", "boolean"),
|
|
310
|
-
odps_types.Column("col8", "float"),
|
|
311
|
-
odps_types.Column("col9", "double"),
|
|
312
|
-
# odps_types.Column("col10", "date"),
|
|
313
|
-
odps_types.Column("col11", "datetime"),
|
|
314
|
-
odps_types.Column("col12", "timestamp"),
|
|
315
|
-
# odps_types.Column("col13", "decimal(10, 2)"),
|
|
316
|
-
odps_types.Column("col14", "array<string>"),
|
|
317
|
-
odps_types.Column("col15", "map<string, bigint>"),
|
|
318
|
-
# odps_types.Column("col16", "struct<a1: string, a2: map<string, bigint>>"),
|
|
319
|
-
# odps_types.Column("col17", "CHAR(15)"),
|
|
320
|
-
# odps_types.Column("col18", "VARCHAR(15)"),
|
|
321
|
-
# odps_types.Column("col19", "decimal"),
|
|
322
|
-
]
|
|
323
|
-
)
|
|
324
|
-
pd_dtypes = odps_schema_to_pandas_dtypes(odps_schema)
|
|
325
|
-
pd.testing.assert_series_equal(
|
|
326
|
-
pd_dtypes,
|
|
327
|
-
pd.Series(
|
|
328
|
-
[
|
|
329
|
-
np.dtype("O"), # string
|
|
330
|
-
np.dtype("O"), # binary
|
|
331
|
-
np.dtype(np.int8),
|
|
332
|
-
np.dtype(np.int16),
|
|
333
|
-
np.dtype(np.int32),
|
|
334
|
-
np.dtype(np.int64),
|
|
335
|
-
np.dtype(np.bool_),
|
|
336
|
-
np.dtype(np.float32),
|
|
337
|
-
np.dtype(np.float64),
|
|
338
|
-
np.dtype(
|
|
339
|
-
"datetime64[ms]" if pd_release_version[0] >= 2 else "datetime64[ns]"
|
|
340
|
-
),
|
|
341
|
-
np.dtype("datetime64[ns]"),
|
|
342
|
-
ArrowDtype(pa.list_(pa.string())),
|
|
343
|
-
ArrowDtype(pa.map_(pa.string(), pa.int64())),
|
|
344
|
-
],
|
|
345
|
-
index=[c.name for c in odps_schema.columns],
|
|
329
|
+
def _get_odps_schema_for_test(cast_result=False):
|
|
330
|
+
test_pyarrow = options.dataframe.dtype_backend == "pyarrow"
|
|
331
|
+
cols = [
|
|
332
|
+
odps_types.Column("col1", "string"),
|
|
333
|
+
odps_types.Column(
|
|
334
|
+
"col2", "binary" if test_pyarrow or not cast_result else "string"
|
|
346
335
|
),
|
|
347
|
-
|
|
336
|
+
odps_types.Column("col3", "tinyint"),
|
|
337
|
+
odps_types.Column("col4", "smallint"),
|
|
338
|
+
odps_types.Column("col5", "int"),
|
|
339
|
+
odps_types.Column("col6", "bigint"),
|
|
340
|
+
odps_types.Column("col7", "boolean"),
|
|
341
|
+
odps_types.Column("col8", "float"),
|
|
342
|
+
odps_types.Column("col9", "double"),
|
|
343
|
+
odps_types.Column("col10", "date") if test_pyarrow else None,
|
|
344
|
+
odps_types.Column(
|
|
345
|
+
"col11",
|
|
346
|
+
"datetime" if test_pyarrow or pd_release_version[0] >= 2 else "timestamp",
|
|
347
|
+
),
|
|
348
|
+
odps_types.Column("col12", "timestamp"),
|
|
349
|
+
odps_types.Column("col13", "decimal(10, 2)") if test_pyarrow else None,
|
|
350
|
+
odps_types.Column("col14", "array<string>"),
|
|
351
|
+
odps_types.Column("col15", "map<string, bigint>"),
|
|
352
|
+
odps_types.Column("col16", "struct<a1: string, a2: map<string, bigint>>"),
|
|
353
|
+
odps_types.Column("col17", "CHAR(15)" if not cast_result else "string")
|
|
354
|
+
if test_pyarrow
|
|
355
|
+
else None,
|
|
356
|
+
odps_types.Column("col18", "VARCHAR(15)" if not cast_result else "string")
|
|
357
|
+
if test_pyarrow
|
|
358
|
+
else None,
|
|
359
|
+
]
|
|
360
|
+
return odps_types.OdpsSchema([c for c in cols if c is not None])
|
|
348
361
|
|
|
349
|
-
|
|
362
|
+
|
|
363
|
+
def _assert_odps_schema_equal(left, right):
|
|
364
|
+
assert [c.name for c in left.columns] == [c.name for c in right.columns]
|
|
365
|
+
assert [c.type for c in left.columns] == [c.type for c in right.columns]
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
@pytest.mark.parametrize("set_dtype_backend", ["numpy"], indirect=True)
|
|
369
|
+
def test_odps_pandas_schema_conversion_with_numpy(set_dtype_backend):
|
|
370
|
+
# Create an ODPS schema with various data types
|
|
371
|
+
odps_schema = _get_odps_schema_for_test()
|
|
372
|
+
pd_dtypes = odps_schema_to_pandas_dtypes(odps_schema)
|
|
373
|
+
|
|
374
|
+
expected_series = pd.Series(
|
|
350
375
|
[
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
"col11", "datetime" if pd_release_version[0] >= 2 else "timestamp"
|
|
376
|
+
np.dtype("O"), # string
|
|
377
|
+
np.dtype("O"), # binary
|
|
378
|
+
np.dtype(np.int8),
|
|
379
|
+
np.dtype(np.int16),
|
|
380
|
+
np.dtype(np.int32),
|
|
381
|
+
np.dtype(np.int64),
|
|
382
|
+
np.dtype(np.bool_),
|
|
383
|
+
np.dtype(np.float32),
|
|
384
|
+
np.dtype(np.float64),
|
|
385
|
+
np.dtype(
|
|
386
|
+
"datetime64[ms]" if pd_release_version[0] >= 2 else "datetime64[ns]"
|
|
363
387
|
),
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
388
|
+
np.dtype("datetime64[ns]"),
|
|
389
|
+
ArrowDtype(pa.list_(pa.string())),
|
|
390
|
+
ArrowDtype(pa.map_(pa.string(), pa.int64())),
|
|
391
|
+
ArrowDtype(
|
|
392
|
+
pa.struct(
|
|
393
|
+
[
|
|
394
|
+
pa.field("a1", pa.string()),
|
|
395
|
+
pa.field("a2", pa.map_(pa.string(), pa.int64())),
|
|
396
|
+
]
|
|
397
|
+
)
|
|
398
|
+
),
|
|
399
|
+
],
|
|
400
|
+
index=[c.name for c in odps_schema.columns],
|
|
373
401
|
)
|
|
374
402
|
|
|
403
|
+
pd.testing.assert_series_equal(pd_dtypes, expected_series)
|
|
404
|
+
|
|
405
|
+
expected_odps_schema = _get_odps_schema_for_test(cast_result=True)
|
|
406
|
+
|
|
375
407
|
odps_schema2 = arrow_schema_to_odps_schema(
|
|
376
408
|
pandas_dtypes_to_arrow_schema(pd_dtypes, unknown_as_string=True)
|
|
377
409
|
)
|
|
378
|
-
|
|
379
|
-
c.name for c in odps_schema2.columns
|
|
380
|
-
]
|
|
381
|
-
assert [c.type for c in expected_odps_schema.columns] == [
|
|
382
|
-
c.type for c in odps_schema2.columns
|
|
383
|
-
]
|
|
410
|
+
_assert_odps_schema_equal(expected_odps_schema, odps_schema2)
|
|
384
411
|
|
|
412
|
+
# Test that unsupported data types raise TypeError
|
|
385
413
|
with pytest.raises(TypeError):
|
|
386
414
|
arrow_schema_to_odps_schema(pa.schema([("col1", pa.float16())]))
|
|
387
415
|
|
|
388
416
|
|
|
417
|
+
@pytest.mark.parametrize("set_dtype_backend", ["pyarrow"], indirect=True)
|
|
418
|
+
def test_odps_pandas_schema_conversion_with_pyarrow(set_dtype_backend):
|
|
419
|
+
# Create an ODPS schema with various data types
|
|
420
|
+
odps_schema = _get_odps_schema_for_test()
|
|
421
|
+
pd_dtypes = odps_schema_to_pandas_dtypes(odps_schema)
|
|
422
|
+
|
|
423
|
+
# When dtype_backend is pyarrow, complex types should be ArrowDtype
|
|
424
|
+
expected_series = pd.Series(
|
|
425
|
+
[
|
|
426
|
+
ArrowDtype(pa.string()),
|
|
427
|
+
ArrowDtype(pa.binary()),
|
|
428
|
+
ArrowDtype(pa.int8()),
|
|
429
|
+
ArrowDtype(pa.int16()),
|
|
430
|
+
ArrowDtype(pa.int32()),
|
|
431
|
+
ArrowDtype(pa.int64()),
|
|
432
|
+
ArrowDtype(pa.bool_()),
|
|
433
|
+
ArrowDtype(pa.float32()),
|
|
434
|
+
ArrowDtype(pa.float64()),
|
|
435
|
+
ArrowDtype(pa.date32()),
|
|
436
|
+
ArrowDtype(pa.timestamp("ms")),
|
|
437
|
+
ArrowDtype(pa.timestamp("ns")),
|
|
438
|
+
ArrowDtype(pa.decimal128(10, 2)),
|
|
439
|
+
ArrowDtype(pa.list_(pa.string())),
|
|
440
|
+
ArrowDtype(pa.map_(pa.string(), pa.int64())),
|
|
441
|
+
ArrowDtype(
|
|
442
|
+
pa.struct(
|
|
443
|
+
[
|
|
444
|
+
pa.field("a1", pa.string()),
|
|
445
|
+
pa.field("a2", pa.map_(pa.string(), pa.int64())),
|
|
446
|
+
]
|
|
447
|
+
)
|
|
448
|
+
),
|
|
449
|
+
ArrowDtype(pa.string()),
|
|
450
|
+
ArrowDtype(pa.string()),
|
|
451
|
+
],
|
|
452
|
+
index=[c.name for c in odps_schema.columns],
|
|
453
|
+
)
|
|
454
|
+
|
|
455
|
+
pd.testing.assert_series_equal(pd_dtypes, expected_series)
|
|
456
|
+
|
|
457
|
+
expected_odps_schema = _get_odps_schema_for_test(cast_result=True)
|
|
458
|
+
|
|
459
|
+
odps_schema2 = arrow_schema_to_odps_schema(
|
|
460
|
+
pandas_dtypes_to_arrow_schema(pd_dtypes, unknown_as_string=True)
|
|
461
|
+
)
|
|
462
|
+
_assert_odps_schema_equal(expected_odps_schema, odps_schema2)
|
|
463
|
+
|
|
464
|
+
|
|
389
465
|
def test_build_column_name():
|
|
390
466
|
records = dict()
|
|
467
|
+
# Test that long valid names are preserved
|
|
391
468
|
assert build_table_column_name(0, "a" * 127, records) == "a" * 127
|
|
469
|
+
|
|
470
|
+
# Test that valid names with underscores and alphanumeric chars are preserved
|
|
392
471
|
assert build_table_column_name(1, "_abc123", records) == "_abc123"
|
|
472
|
+
|
|
473
|
+
# Test that names with invalid characters are replaced with generated names
|
|
393
474
|
assert build_table_column_name(2, "_abc'123", records) == "_column_2"
|
|
475
|
+
|
|
476
|
+
# Test that overly long names are replaced with generated names
|
|
394
477
|
assert build_table_column_name(3, "a" * 256, records) == "_column_3"
|
|
478
|
+
|
|
479
|
+
# Test that tuple names are converted to underscore-separated strings
|
|
395
480
|
assert build_table_column_name(4, ("A", 1), records) == "a_1"
|
|
396
481
|
|
|
397
482
|
|
|
398
483
|
@pytest.mark.parametrize("wrap_obj", ["no", "yes", "data"])
|
|
399
|
-
|
|
484
|
+
@pytest.mark.parametrize("set_dtype_backend", ["numpy", "pyarrow"], indirect=True)
|
|
485
|
+
def test_build_table_meta(wrap_obj, set_dtype_backend):
|
|
400
486
|
data = pd.DataFrame(
|
|
401
487
|
np.random.rand(100, 7),
|
|
402
488
|
columns=["A", "A", "A_0", "A_1", "a_1", "B", "C"],
|
|
@@ -411,7 +497,9 @@ def test_build_table_meta(wrap_obj):
|
|
|
411
497
|
@pytest.mark.skipif(
|
|
412
498
|
pd_release_version[0] < 2, reason="only run under pandas 2.0 or greater"
|
|
413
499
|
)
|
|
414
|
-
|
|
500
|
+
@pytest.mark.parametrize("set_dtype_backend", ["numpy", "pyarrow"], indirect=True)
|
|
501
|
+
def test_table_meta_with_datetime(set_dtype_backend):
|
|
502
|
+
# Test DataFrame with datetime column
|
|
415
503
|
raw_df = pd.DataFrame(
|
|
416
504
|
[
|
|
417
505
|
[1, "abc", "2024-10-01 11:23:12"],
|
|
@@ -423,6 +511,7 @@ def test_table_meta_with_datetime():
|
|
|
423
511
|
schema, _ = pandas_to_odps_schema(df, unknown_as_string=True)
|
|
424
512
|
assert schema.columns[3].type == odps_types.datetime
|
|
425
513
|
|
|
514
|
+
# Test Series with datetime dtype
|
|
426
515
|
raw_series = pd.Series(
|
|
427
516
|
["2024-10-01 11:23:12", "2024-10-02 22:55:13"], dtype="datetime64[ms]"
|
|
428
517
|
)
|
|
@@ -430,6 +519,7 @@ def test_table_meta_with_datetime():
|
|
|
430
519
|
schema, _ = pandas_to_odps_schema(s, unknown_as_string=True)
|
|
431
520
|
assert schema.columns[1].type == odps_types.datetime
|
|
432
521
|
|
|
522
|
+
# Test Index with datetime dtype
|
|
433
523
|
raw_index = pd.Index(
|
|
434
524
|
["2024-10-01 11:23:12", "2024-10-02 22:55:13"], dtype="datetime64[ms]"
|
|
435
525
|
)
|
|
@@ -437,6 +527,7 @@ def test_table_meta_with_datetime():
|
|
|
437
527
|
schema, _ = pandas_to_odps_schema(idx, unknown_as_string=True)
|
|
438
528
|
assert schema.columns[0].type == odps_types.datetime
|
|
439
529
|
|
|
530
|
+
# Test MultiIndex with datetime column
|
|
440
531
|
src_df = pd.DataFrame(
|
|
441
532
|
[[1, "2024-10-01 11:23:12"], [3, "2024-10-02 22:55:13"]],
|
|
442
533
|
columns=["A", "B"],
|
|
@@ -463,3 +554,27 @@ def test_pandas_types_to_arrow_schema():
|
|
|
463
554
|
assert schema.field("int8").type == pa.int8()
|
|
464
555
|
assert schema.field("map").type == pa.map_(pa.string(), pa.string())
|
|
465
556
|
assert schema.field("list").type == pa.list_(pa.string())
|
|
557
|
+
|
|
558
|
+
|
|
559
|
+
@pytest.mark.skipif(
|
|
560
|
+
not hasattr(odps_types, "blob"),
|
|
561
|
+
reason="need pyodps to support blob type to run this test",
|
|
562
|
+
)
|
|
563
|
+
def test_blob_types_conversion():
|
|
564
|
+
pd_data = pd.DataFrame(
|
|
565
|
+
{
|
|
566
|
+
"int_col": pd.Series([1, 2], dtype=np.int64),
|
|
567
|
+
"blob_col": pd.Series([b"abcd", b"efgh"], dtype="blob"),
|
|
568
|
+
},
|
|
569
|
+
)
|
|
570
|
+
arrow_schema = pandas_types_to_arrow_schema(pd_data)
|
|
571
|
+
assert arrow_schema.field("int_col").type == pa.int64()
|
|
572
|
+
assert arrow_schema.field("blob_col").type == ArrowBlobType()
|
|
573
|
+
|
|
574
|
+
odps_schema = arrow_schema_to_odps_schema(arrow_schema)
|
|
575
|
+
assert odps_schema.columns[0].type == odps_types.bigint
|
|
576
|
+
assert odps_schema.columns[1].type == odps_types.blob
|
|
577
|
+
|
|
578
|
+
arrow_schema2 = odps_schema_to_arrow_schema(odps_schema)
|
|
579
|
+
assert arrow_schema2.field("int_col").type == pa.int64()
|
|
580
|
+
assert arrow_schema2.field("blob_col").type == ArrowBlobType()
|
maxframe/learn/__init__.py
CHANGED
|
@@ -12,6 +12,14 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
-
from . import
|
|
15
|
+
from . import (
|
|
16
|
+
cluster,
|
|
17
|
+
contrib,
|
|
18
|
+
linear_model,
|
|
19
|
+
metrics,
|
|
20
|
+
model_selection,
|
|
21
|
+
preprocessing,
|
|
22
|
+
utils,
|
|
23
|
+
)
|
|
16
24
|
|
|
17
|
-
del contrib, model_selection, preprocessing
|
|
25
|
+
del cluster, contrib, linear_model, metrics, model_selection, preprocessing, utils
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# Copyright 1999-2025 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from ._kmeans import KMeans, k_means
|