maxframe 0.1.0b5__cp38-cp38-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/__init__.py +32 -0
- maxframe/_utils.cpython-38-darwin.so +0 -0
- maxframe/_utils.pxd +33 -0
- maxframe/_utils.pyx +547 -0
- maxframe/codegen.py +528 -0
- maxframe/config/__init__.py +15 -0
- maxframe/config/config.py +443 -0
- maxframe/config/tests/__init__.py +13 -0
- maxframe/config/tests/test_config.py +103 -0
- maxframe/config/tests/test_validators.py +34 -0
- maxframe/config/validators.py +57 -0
- maxframe/conftest.py +139 -0
- maxframe/core/__init__.py +65 -0
- maxframe/core/base.py +156 -0
- maxframe/core/entity/__init__.py +44 -0
- maxframe/core/entity/chunks.py +68 -0
- maxframe/core/entity/core.py +152 -0
- maxframe/core/entity/executable.py +337 -0
- maxframe/core/entity/fuse.py +73 -0
- maxframe/core/entity/objects.py +100 -0
- maxframe/core/entity/output_types.py +90 -0
- maxframe/core/entity/tileables.py +438 -0
- maxframe/core/entity/utils.py +24 -0
- maxframe/core/graph/__init__.py +17 -0
- maxframe/core/graph/builder/__init__.py +16 -0
- maxframe/core/graph/builder/base.py +86 -0
- maxframe/core/graph/builder/chunk.py +430 -0
- maxframe/core/graph/builder/tileable.py +34 -0
- maxframe/core/graph/builder/utils.py +41 -0
- maxframe/core/graph/core.cpython-38-darwin.so +0 -0
- maxframe/core/graph/core.pyx +467 -0
- maxframe/core/graph/entity.py +171 -0
- maxframe/core/graph/tests/__init__.py +13 -0
- maxframe/core/graph/tests/test_graph.py +205 -0
- maxframe/core/mode.py +96 -0
- maxframe/core/operator/__init__.py +34 -0
- maxframe/core/operator/base.py +450 -0
- maxframe/core/operator/core.py +276 -0
- maxframe/core/operator/fetch.py +53 -0
- maxframe/core/operator/fuse.py +29 -0
- maxframe/core/operator/objects.py +72 -0
- maxframe/core/operator/shuffle.py +111 -0
- maxframe/core/operator/tests/__init__.py +13 -0
- maxframe/core/operator/tests/test_core.py +64 -0
- maxframe/core/tests/__init__.py +13 -0
- maxframe/core/tests/test_mode.py +75 -0
- maxframe/dataframe/__init__.py +81 -0
- maxframe/dataframe/arithmetic/__init__.py +359 -0
- maxframe/dataframe/arithmetic/abs.py +33 -0
- maxframe/dataframe/arithmetic/add.py +60 -0
- maxframe/dataframe/arithmetic/arccos.py +28 -0
- maxframe/dataframe/arithmetic/arccosh.py +28 -0
- maxframe/dataframe/arithmetic/arcsin.py +28 -0
- maxframe/dataframe/arithmetic/arcsinh.py +28 -0
- maxframe/dataframe/arithmetic/arctan.py +28 -0
- maxframe/dataframe/arithmetic/arctanh.py +28 -0
- maxframe/dataframe/arithmetic/around.py +152 -0
- maxframe/dataframe/arithmetic/bitwise_and.py +46 -0
- maxframe/dataframe/arithmetic/bitwise_or.py +50 -0
- maxframe/dataframe/arithmetic/bitwise_xor.py +46 -0
- maxframe/dataframe/arithmetic/ceil.py +28 -0
- maxframe/dataframe/arithmetic/core.py +342 -0
- maxframe/dataframe/arithmetic/cos.py +28 -0
- maxframe/dataframe/arithmetic/cosh.py +28 -0
- maxframe/dataframe/arithmetic/degrees.py +28 -0
- maxframe/dataframe/arithmetic/docstring.py +442 -0
- maxframe/dataframe/arithmetic/equal.py +56 -0
- maxframe/dataframe/arithmetic/exp.py +28 -0
- maxframe/dataframe/arithmetic/exp2.py +28 -0
- maxframe/dataframe/arithmetic/expm1.py +28 -0
- maxframe/dataframe/arithmetic/floor.py +28 -0
- maxframe/dataframe/arithmetic/floordiv.py +64 -0
- maxframe/dataframe/arithmetic/greater.py +57 -0
- maxframe/dataframe/arithmetic/greater_equal.py +57 -0
- maxframe/dataframe/arithmetic/invert.py +33 -0
- maxframe/dataframe/arithmetic/is_ufuncs.py +62 -0
- maxframe/dataframe/arithmetic/less.py +57 -0
- maxframe/dataframe/arithmetic/less_equal.py +57 -0
- maxframe/dataframe/arithmetic/log.py +28 -0
- maxframe/dataframe/arithmetic/log10.py +28 -0
- maxframe/dataframe/arithmetic/log2.py +28 -0
- maxframe/dataframe/arithmetic/mod.py +60 -0
- maxframe/dataframe/arithmetic/multiply.py +60 -0
- maxframe/dataframe/arithmetic/negative.py +33 -0
- maxframe/dataframe/arithmetic/not_equal.py +56 -0
- maxframe/dataframe/arithmetic/power.py +68 -0
- maxframe/dataframe/arithmetic/radians.py +28 -0
- maxframe/dataframe/arithmetic/sin.py +28 -0
- maxframe/dataframe/arithmetic/sinh.py +28 -0
- maxframe/dataframe/arithmetic/sqrt.py +28 -0
- maxframe/dataframe/arithmetic/subtract.py +64 -0
- maxframe/dataframe/arithmetic/tan.py +28 -0
- maxframe/dataframe/arithmetic/tanh.py +28 -0
- maxframe/dataframe/arithmetic/tests/__init__.py +13 -0
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +695 -0
- maxframe/dataframe/arithmetic/truediv.py +64 -0
- maxframe/dataframe/arithmetic/trunc.py +28 -0
- maxframe/dataframe/arrays.py +864 -0
- maxframe/dataframe/core.py +2417 -0
- maxframe/dataframe/datasource/__init__.py +15 -0
- maxframe/dataframe/datasource/core.py +81 -0
- maxframe/dataframe/datasource/dataframe.py +59 -0
- maxframe/dataframe/datasource/date_range.py +504 -0
- maxframe/dataframe/datasource/from_index.py +54 -0
- maxframe/dataframe/datasource/from_records.py +107 -0
- maxframe/dataframe/datasource/from_tensor.py +419 -0
- maxframe/dataframe/datasource/index.py +117 -0
- maxframe/dataframe/datasource/read_csv.py +528 -0
- maxframe/dataframe/datasource/read_odps_query.py +299 -0
- maxframe/dataframe/datasource/read_odps_table.py +253 -0
- maxframe/dataframe/datasource/read_parquet.py +421 -0
- maxframe/dataframe/datasource/series.py +55 -0
- maxframe/dataframe/datasource/tests/__init__.py +13 -0
- maxframe/dataframe/datasource/tests/test_datasource.py +401 -0
- maxframe/dataframe/datastore/__init__.py +26 -0
- maxframe/dataframe/datastore/core.py +19 -0
- maxframe/dataframe/datastore/to_csv.py +227 -0
- maxframe/dataframe/datastore/to_odps.py +162 -0
- maxframe/dataframe/extensions/__init__.py +41 -0
- maxframe/dataframe/extensions/accessor.py +50 -0
- maxframe/dataframe/extensions/reshuffle.py +83 -0
- maxframe/dataframe/extensions/tests/__init__.py +13 -0
- maxframe/dataframe/extensions/tests/test_extensions.py +38 -0
- maxframe/dataframe/fetch/__init__.py +15 -0
- maxframe/dataframe/fetch/core.py +86 -0
- maxframe/dataframe/groupby/__init__.py +82 -0
- maxframe/dataframe/groupby/aggregation.py +350 -0
- maxframe/dataframe/groupby/apply.py +251 -0
- maxframe/dataframe/groupby/core.py +179 -0
- maxframe/dataframe/groupby/cum.py +124 -0
- maxframe/dataframe/groupby/fill.py +141 -0
- maxframe/dataframe/groupby/getitem.py +92 -0
- maxframe/dataframe/groupby/head.py +105 -0
- maxframe/dataframe/groupby/sample.py +214 -0
- maxframe/dataframe/groupby/tests/__init__.py +13 -0
- maxframe/dataframe/groupby/tests/test_groupby.py +374 -0
- maxframe/dataframe/groupby/transform.py +255 -0
- maxframe/dataframe/indexing/__init__.py +84 -0
- maxframe/dataframe/indexing/add_prefix_suffix.py +110 -0
- maxframe/dataframe/indexing/align.py +349 -0
- maxframe/dataframe/indexing/at.py +83 -0
- maxframe/dataframe/indexing/getitem.py +204 -0
- maxframe/dataframe/indexing/iat.py +37 -0
- maxframe/dataframe/indexing/iloc.py +566 -0
- maxframe/dataframe/indexing/insert.py +86 -0
- maxframe/dataframe/indexing/loc.py +411 -0
- maxframe/dataframe/indexing/reindex.py +526 -0
- maxframe/dataframe/indexing/rename.py +462 -0
- maxframe/dataframe/indexing/rename_axis.py +209 -0
- maxframe/dataframe/indexing/reset_index.py +402 -0
- maxframe/dataframe/indexing/sample.py +221 -0
- maxframe/dataframe/indexing/set_axis.py +194 -0
- maxframe/dataframe/indexing/set_index.py +61 -0
- maxframe/dataframe/indexing/setitem.py +130 -0
- maxframe/dataframe/indexing/tests/__init__.py +13 -0
- maxframe/dataframe/indexing/tests/test_indexing.py +488 -0
- maxframe/dataframe/indexing/where.py +308 -0
- maxframe/dataframe/initializer.py +288 -0
- maxframe/dataframe/merge/__init__.py +32 -0
- maxframe/dataframe/merge/append.py +121 -0
- maxframe/dataframe/merge/concat.py +325 -0
- maxframe/dataframe/merge/merge.py +593 -0
- maxframe/dataframe/merge/tests/__init__.py +13 -0
- maxframe/dataframe/merge/tests/test_merge.py +215 -0
- maxframe/dataframe/misc/__init__.py +134 -0
- maxframe/dataframe/misc/_duplicate.py +46 -0
- maxframe/dataframe/misc/accessor.py +276 -0
- maxframe/dataframe/misc/apply.py +692 -0
- maxframe/dataframe/misc/astype.py +236 -0
- maxframe/dataframe/misc/case_when.py +141 -0
- maxframe/dataframe/misc/check_monotonic.py +84 -0
- maxframe/dataframe/misc/cut.py +383 -0
- maxframe/dataframe/misc/datetimes.py +79 -0
- maxframe/dataframe/misc/describe.py +108 -0
- maxframe/dataframe/misc/diff.py +210 -0
- maxframe/dataframe/misc/drop.py +440 -0
- maxframe/dataframe/misc/drop_duplicates.py +248 -0
- maxframe/dataframe/misc/duplicated.py +292 -0
- maxframe/dataframe/misc/eval.py +728 -0
- maxframe/dataframe/misc/explode.py +171 -0
- maxframe/dataframe/misc/get_dummies.py +208 -0
- maxframe/dataframe/misc/isin.py +217 -0
- maxframe/dataframe/misc/map.py +236 -0
- maxframe/dataframe/misc/melt.py +162 -0
- maxframe/dataframe/misc/memory_usage.py +248 -0
- maxframe/dataframe/misc/pct_change.py +150 -0
- maxframe/dataframe/misc/pivot_table.py +262 -0
- maxframe/dataframe/misc/qcut.py +104 -0
- maxframe/dataframe/misc/select_dtypes.py +104 -0
- maxframe/dataframe/misc/shift.py +256 -0
- maxframe/dataframe/misc/stack.py +238 -0
- maxframe/dataframe/misc/string_.py +221 -0
- maxframe/dataframe/misc/tests/__init__.py +13 -0
- maxframe/dataframe/misc/tests/test_misc.py +468 -0
- maxframe/dataframe/misc/to_numeric.py +178 -0
- maxframe/dataframe/misc/transform.py +361 -0
- maxframe/dataframe/misc/transpose.py +136 -0
- maxframe/dataframe/misc/value_counts.py +182 -0
- maxframe/dataframe/missing/__init__.py +53 -0
- maxframe/dataframe/missing/checkna.py +223 -0
- maxframe/dataframe/missing/dropna.py +280 -0
- maxframe/dataframe/missing/fillna.py +275 -0
- maxframe/dataframe/missing/replace.py +439 -0
- maxframe/dataframe/missing/tests/__init__.py +13 -0
- maxframe/dataframe/missing/tests/test_missing.py +89 -0
- maxframe/dataframe/operators.py +273 -0
- maxframe/dataframe/plotting/__init__.py +40 -0
- maxframe/dataframe/plotting/core.py +78 -0
- maxframe/dataframe/plotting/tests/__init__.py +13 -0
- maxframe/dataframe/plotting/tests/test_plotting.py +136 -0
- maxframe/dataframe/reduction/__init__.py +107 -0
- maxframe/dataframe/reduction/aggregation.py +344 -0
- maxframe/dataframe/reduction/all.py +78 -0
- maxframe/dataframe/reduction/any.py +78 -0
- maxframe/dataframe/reduction/core.py +837 -0
- maxframe/dataframe/reduction/count.py +59 -0
- maxframe/dataframe/reduction/cummax.py +30 -0
- maxframe/dataframe/reduction/cummin.py +30 -0
- maxframe/dataframe/reduction/cumprod.py +30 -0
- maxframe/dataframe/reduction/cumsum.py +30 -0
- maxframe/dataframe/reduction/custom_reduction.py +42 -0
- maxframe/dataframe/reduction/kurtosis.py +104 -0
- maxframe/dataframe/reduction/max.py +65 -0
- maxframe/dataframe/reduction/mean.py +61 -0
- maxframe/dataframe/reduction/min.py +65 -0
- maxframe/dataframe/reduction/nunique.py +141 -0
- maxframe/dataframe/reduction/prod.py +76 -0
- maxframe/dataframe/reduction/reduction_size.py +36 -0
- maxframe/dataframe/reduction/sem.py +69 -0
- maxframe/dataframe/reduction/skew.py +89 -0
- maxframe/dataframe/reduction/std.py +53 -0
- maxframe/dataframe/reduction/str_concat.py +48 -0
- maxframe/dataframe/reduction/sum.py +77 -0
- maxframe/dataframe/reduction/tests/__init__.py +13 -0
- maxframe/dataframe/reduction/tests/test_reduction.py +486 -0
- maxframe/dataframe/reduction/unique.py +90 -0
- maxframe/dataframe/reduction/var.py +72 -0
- maxframe/dataframe/sort/__init__.py +34 -0
- maxframe/dataframe/sort/core.py +36 -0
- maxframe/dataframe/sort/sort_index.py +153 -0
- maxframe/dataframe/sort/sort_values.py +311 -0
- maxframe/dataframe/sort/tests/__init__.py +13 -0
- maxframe/dataframe/sort/tests/test_sort.py +81 -0
- maxframe/dataframe/statistics/__init__.py +33 -0
- maxframe/dataframe/statistics/corr.py +280 -0
- maxframe/dataframe/statistics/quantile.py +341 -0
- maxframe/dataframe/statistics/tests/__init__.py +13 -0
- maxframe/dataframe/statistics/tests/test_statistics.py +82 -0
- maxframe/dataframe/tests/__init__.py +13 -0
- maxframe/dataframe/tests/test_initializer.py +29 -0
- maxframe/dataframe/tseries/__init__.py +13 -0
- maxframe/dataframe/tseries/tests/__init__.py +13 -0
- maxframe/dataframe/tseries/tests/test_tseries.py +30 -0
- maxframe/dataframe/tseries/to_datetime.py +297 -0
- maxframe/dataframe/ufunc/__init__.py +27 -0
- maxframe/dataframe/ufunc/tensor.py +54 -0
- maxframe/dataframe/ufunc/ufunc.py +52 -0
- maxframe/dataframe/utils.py +1267 -0
- maxframe/dataframe/window/__init__.py +29 -0
- maxframe/dataframe/window/aggregation.py +96 -0
- maxframe/dataframe/window/core.py +69 -0
- maxframe/dataframe/window/ewm.py +249 -0
- maxframe/dataframe/window/expanding.py +147 -0
- maxframe/dataframe/window/rolling.py +376 -0
- maxframe/dataframe/window/tests/__init__.py +13 -0
- maxframe/dataframe/window/tests/test_ewm.py +70 -0
- maxframe/dataframe/window/tests/test_expanding.py +66 -0
- maxframe/dataframe/window/tests/test_rolling.py +57 -0
- maxframe/env.py +33 -0
- maxframe/errors.py +21 -0
- maxframe/extension.py +81 -0
- maxframe/learn/__init__.py +17 -0
- maxframe/learn/contrib/__init__.py +17 -0
- maxframe/learn/contrib/pytorch/__init__.py +16 -0
- maxframe/learn/contrib/pytorch/run_function.py +110 -0
- maxframe/learn/contrib/pytorch/run_script.py +102 -0
- maxframe/learn/contrib/pytorch/tests/__init__.py +13 -0
- maxframe/learn/contrib/pytorch/tests/test_pytorch.py +42 -0
- maxframe/learn/contrib/utils.py +52 -0
- maxframe/learn/contrib/xgboost/__init__.py +26 -0
- maxframe/learn/contrib/xgboost/classifier.py +86 -0
- maxframe/learn/contrib/xgboost/core.py +156 -0
- maxframe/learn/contrib/xgboost/dmatrix.py +150 -0
- maxframe/learn/contrib/xgboost/predict.py +138 -0
- maxframe/learn/contrib/xgboost/regressor.py +78 -0
- maxframe/learn/contrib/xgboost/tests/__init__.py +13 -0
- maxframe/learn/contrib/xgboost/tests/test_core.py +43 -0
- maxframe/learn/contrib/xgboost/train.py +121 -0
- maxframe/learn/utils/__init__.py +15 -0
- maxframe/learn/utils/core.py +29 -0
- maxframe/lib/__init__.py +15 -0
- maxframe/lib/aio/__init__.py +27 -0
- maxframe/lib/aio/_runners.py +162 -0
- maxframe/lib/aio/_threads.py +35 -0
- maxframe/lib/aio/base.py +82 -0
- maxframe/lib/aio/file.py +85 -0
- maxframe/lib/aio/isolation.py +100 -0
- maxframe/lib/aio/lru.py +242 -0
- maxframe/lib/aio/parallelism.py +37 -0
- maxframe/lib/aio/tests/__init__.py +13 -0
- maxframe/lib/aio/tests/test_aio_file.py +55 -0
- maxframe/lib/compression.py +55 -0
- maxframe/lib/cython/__init__.py +13 -0
- maxframe/lib/cython/libcpp.pxd +30 -0
- maxframe/lib/filesystem/__init__.py +21 -0
- maxframe/lib/filesystem/_glob.py +173 -0
- maxframe/lib/filesystem/_oss_lib/__init__.py +13 -0
- maxframe/lib/filesystem/_oss_lib/common.py +198 -0
- maxframe/lib/filesystem/_oss_lib/glob.py +147 -0
- maxframe/lib/filesystem/_oss_lib/handle.py +156 -0
- maxframe/lib/filesystem/arrow.py +236 -0
- maxframe/lib/filesystem/base.py +263 -0
- maxframe/lib/filesystem/core.py +95 -0
- maxframe/lib/filesystem/fsmap.py +164 -0
- maxframe/lib/filesystem/hdfs.py +31 -0
- maxframe/lib/filesystem/local.py +112 -0
- maxframe/lib/filesystem/oss.py +157 -0
- maxframe/lib/filesystem/tests/__init__.py +13 -0
- maxframe/lib/filesystem/tests/test_filesystem.py +223 -0
- maxframe/lib/filesystem/tests/test_oss.py +182 -0
- maxframe/lib/functools_compat.py +81 -0
- maxframe/lib/mmh3.cpython-38-darwin.so +0 -0
- maxframe/lib/mmh3_src/MurmurHash3.cpp +339 -0
- maxframe/lib/mmh3_src/MurmurHash3.h +43 -0
- maxframe/lib/mmh3_src/mmh3module.cpp +387 -0
- maxframe/lib/sparse/__init__.py +861 -0
- maxframe/lib/sparse/array.py +1604 -0
- maxframe/lib/sparse/core.py +92 -0
- maxframe/lib/sparse/matrix.py +241 -0
- maxframe/lib/sparse/tests/__init__.py +15 -0
- maxframe/lib/sparse/tests/test_sparse.py +476 -0
- maxframe/lib/sparse/vector.py +150 -0
- maxframe/lib/tblib/LICENSE +20 -0
- maxframe/lib/tblib/__init__.py +327 -0
- maxframe/lib/tblib/cpython.py +83 -0
- maxframe/lib/tblib/decorators.py +44 -0
- maxframe/lib/tblib/pickling_support.py +90 -0
- maxframe/lib/tests/__init__.py +13 -0
- maxframe/lib/tests/test_wrapped_pickle.py +51 -0
- maxframe/lib/version.py +620 -0
- maxframe/lib/wrapped_pickle.py +139 -0
- maxframe/mixin.py +100 -0
- maxframe/odpsio/__init__.py +21 -0
- maxframe/odpsio/arrow.py +91 -0
- maxframe/odpsio/schema.py +364 -0
- maxframe/odpsio/tableio.py +322 -0
- maxframe/odpsio/tests/__init__.py +13 -0
- maxframe/odpsio/tests/test_arrow.py +88 -0
- maxframe/odpsio/tests/test_schema.py +297 -0
- maxframe/odpsio/tests/test_tableio.py +136 -0
- maxframe/odpsio/tests/test_volumeio.py +90 -0
- maxframe/odpsio/volumeio.py +95 -0
- maxframe/opcodes.py +590 -0
- maxframe/protocol.py +415 -0
- maxframe/remote/__init__.py +18 -0
- maxframe/remote/core.py +210 -0
- maxframe/remote/run_script.py +121 -0
- maxframe/serialization/__init__.py +26 -0
- maxframe/serialization/arrow.py +95 -0
- maxframe/serialization/core.cpython-38-darwin.so +0 -0
- maxframe/serialization/core.pxd +44 -0
- maxframe/serialization/core.pyi +61 -0
- maxframe/serialization/core.pyx +1094 -0
- maxframe/serialization/exception.py +86 -0
- maxframe/serialization/maxframe_objects.py +39 -0
- maxframe/serialization/numpy.py +91 -0
- maxframe/serialization/pandas.py +202 -0
- maxframe/serialization/scipy.py +71 -0
- maxframe/serialization/serializables/__init__.py +55 -0
- maxframe/serialization/serializables/core.py +262 -0
- maxframe/serialization/serializables/field.py +624 -0
- maxframe/serialization/serializables/field_type.py +589 -0
- maxframe/serialization/serializables/tests/__init__.py +13 -0
- maxframe/serialization/serializables/tests/test_field_type.py +121 -0
- maxframe/serialization/serializables/tests/test_serializable.py +250 -0
- maxframe/serialization/tests/__init__.py +13 -0
- maxframe/serialization/tests/test_serial.py +412 -0
- maxframe/session.py +1310 -0
- maxframe/tensor/__init__.py +183 -0
- maxframe/tensor/arithmetic/__init__.py +315 -0
- maxframe/tensor/arithmetic/abs.py +68 -0
- maxframe/tensor/arithmetic/absolute.py +68 -0
- maxframe/tensor/arithmetic/add.py +82 -0
- maxframe/tensor/arithmetic/angle.py +72 -0
- maxframe/tensor/arithmetic/arccos.py +104 -0
- maxframe/tensor/arithmetic/arccosh.py +91 -0
- maxframe/tensor/arithmetic/arcsin.py +94 -0
- maxframe/tensor/arithmetic/arcsinh.py +86 -0
- maxframe/tensor/arithmetic/arctan.py +106 -0
- maxframe/tensor/arithmetic/arctan2.py +128 -0
- maxframe/tensor/arithmetic/arctanh.py +86 -0
- maxframe/tensor/arithmetic/around.py +114 -0
- maxframe/tensor/arithmetic/bitand.py +95 -0
- maxframe/tensor/arithmetic/bitor.py +102 -0
- maxframe/tensor/arithmetic/bitxor.py +95 -0
- maxframe/tensor/arithmetic/cbrt.py +66 -0
- maxframe/tensor/arithmetic/ceil.py +71 -0
- maxframe/tensor/arithmetic/clip.py +165 -0
- maxframe/tensor/arithmetic/conj.py +74 -0
- maxframe/tensor/arithmetic/copysign.py +78 -0
- maxframe/tensor/arithmetic/core.py +544 -0
- maxframe/tensor/arithmetic/cos.py +85 -0
- maxframe/tensor/arithmetic/cosh.py +72 -0
- maxframe/tensor/arithmetic/deg2rad.py +72 -0
- maxframe/tensor/arithmetic/degrees.py +77 -0
- maxframe/tensor/arithmetic/divide.py +114 -0
- maxframe/tensor/arithmetic/equal.py +76 -0
- maxframe/tensor/arithmetic/exp.py +106 -0
- maxframe/tensor/arithmetic/exp2.py +67 -0
- maxframe/tensor/arithmetic/expm1.py +79 -0
- maxframe/tensor/arithmetic/fabs.py +74 -0
- maxframe/tensor/arithmetic/fix.py +69 -0
- maxframe/tensor/arithmetic/float_power.py +103 -0
- maxframe/tensor/arithmetic/floor.py +77 -0
- maxframe/tensor/arithmetic/floordiv.py +94 -0
- maxframe/tensor/arithmetic/fmax.py +105 -0
- maxframe/tensor/arithmetic/fmin.py +106 -0
- maxframe/tensor/arithmetic/fmod.py +99 -0
- maxframe/tensor/arithmetic/frexp.py +92 -0
- maxframe/tensor/arithmetic/greater.py +77 -0
- maxframe/tensor/arithmetic/greater_equal.py +69 -0
- maxframe/tensor/arithmetic/hypot.py +77 -0
- maxframe/tensor/arithmetic/i0.py +89 -0
- maxframe/tensor/arithmetic/imag.py +67 -0
- maxframe/tensor/arithmetic/invert.py +110 -0
- maxframe/tensor/arithmetic/isclose.py +115 -0
- maxframe/tensor/arithmetic/iscomplex.py +64 -0
- maxframe/tensor/arithmetic/isfinite.py +106 -0
- maxframe/tensor/arithmetic/isinf.py +103 -0
- maxframe/tensor/arithmetic/isnan.py +82 -0
- maxframe/tensor/arithmetic/isreal.py +63 -0
- maxframe/tensor/arithmetic/ldexp.py +99 -0
- maxframe/tensor/arithmetic/less.py +69 -0
- maxframe/tensor/arithmetic/less_equal.py +69 -0
- maxframe/tensor/arithmetic/log.py +92 -0
- maxframe/tensor/arithmetic/log10.py +85 -0
- maxframe/tensor/arithmetic/log1p.py +95 -0
- maxframe/tensor/arithmetic/log2.py +85 -0
- maxframe/tensor/arithmetic/logaddexp.py +80 -0
- maxframe/tensor/arithmetic/logaddexp2.py +78 -0
- maxframe/tensor/arithmetic/logical_and.py +81 -0
- maxframe/tensor/arithmetic/logical_not.py +74 -0
- maxframe/tensor/arithmetic/logical_or.py +82 -0
- maxframe/tensor/arithmetic/logical_xor.py +88 -0
- maxframe/tensor/arithmetic/lshift.py +82 -0
- maxframe/tensor/arithmetic/maximum.py +108 -0
- maxframe/tensor/arithmetic/minimum.py +108 -0
- maxframe/tensor/arithmetic/mod.py +104 -0
- maxframe/tensor/arithmetic/modf.py +83 -0
- maxframe/tensor/arithmetic/multiply.py +81 -0
- maxframe/tensor/arithmetic/nan_to_num.py +99 -0
- maxframe/tensor/arithmetic/negative.py +65 -0
- maxframe/tensor/arithmetic/nextafter.py +68 -0
- maxframe/tensor/arithmetic/not_equal.py +72 -0
- maxframe/tensor/arithmetic/positive.py +47 -0
- maxframe/tensor/arithmetic/power.py +106 -0
- maxframe/tensor/arithmetic/rad2deg.py +71 -0
- maxframe/tensor/arithmetic/radians.py +77 -0
- maxframe/tensor/arithmetic/real.py +70 -0
- maxframe/tensor/arithmetic/reciprocal.py +76 -0
- maxframe/tensor/arithmetic/rint.py +68 -0
- maxframe/tensor/arithmetic/rshift.py +81 -0
- maxframe/tensor/arithmetic/setimag.py +29 -0
- maxframe/tensor/arithmetic/setreal.py +29 -0
- maxframe/tensor/arithmetic/sign.py +81 -0
- maxframe/tensor/arithmetic/signbit.py +65 -0
- maxframe/tensor/arithmetic/sin.py +98 -0
- maxframe/tensor/arithmetic/sinc.py +102 -0
- maxframe/tensor/arithmetic/sinh.py +93 -0
- maxframe/tensor/arithmetic/spacing.py +72 -0
- maxframe/tensor/arithmetic/sqrt.py +81 -0
- maxframe/tensor/arithmetic/square.py +69 -0
- maxframe/tensor/arithmetic/subtract.py +81 -0
- maxframe/tensor/arithmetic/tan.py +88 -0
- maxframe/tensor/arithmetic/tanh.py +92 -0
- maxframe/tensor/arithmetic/tests/__init__.py +15 -0
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +414 -0
- maxframe/tensor/arithmetic/truediv.py +104 -0
- maxframe/tensor/arithmetic/trunc.py +72 -0
- maxframe/tensor/arithmetic/utils.py +65 -0
- maxframe/tensor/array_utils.py +186 -0
- maxframe/tensor/base/__init__.py +34 -0
- maxframe/tensor/base/astype.py +119 -0
- maxframe/tensor/base/atleast_1d.py +74 -0
- maxframe/tensor/base/broadcast_to.py +89 -0
- maxframe/tensor/base/ravel.py +92 -0
- maxframe/tensor/base/tests/__init__.py +13 -0
- maxframe/tensor/base/tests/test_base.py +114 -0
- maxframe/tensor/base/transpose.py +125 -0
- maxframe/tensor/base/unique.py +205 -0
- maxframe/tensor/base/where.py +127 -0
- maxframe/tensor/core.py +724 -0
- maxframe/tensor/datasource/__init__.py +32 -0
- maxframe/tensor/datasource/arange.py +156 -0
- maxframe/tensor/datasource/array.py +415 -0
- maxframe/tensor/datasource/core.py +109 -0
- maxframe/tensor/datasource/empty.py +169 -0
- maxframe/tensor/datasource/from_dataframe.py +70 -0
- maxframe/tensor/datasource/from_dense.py +54 -0
- maxframe/tensor/datasource/from_sparse.py +47 -0
- maxframe/tensor/datasource/full.py +186 -0
- maxframe/tensor/datasource/ones.py +173 -0
- maxframe/tensor/datasource/scalar.py +40 -0
- maxframe/tensor/datasource/tests/__init__.py +13 -0
- maxframe/tensor/datasource/tests/test_datasource.py +278 -0
- maxframe/tensor/datasource/zeros.py +188 -0
- maxframe/tensor/fetch/__init__.py +15 -0
- maxframe/tensor/fetch/core.py +54 -0
- maxframe/tensor/indexing/__init__.py +47 -0
- maxframe/tensor/indexing/choose.py +196 -0
- maxframe/tensor/indexing/compress.py +124 -0
- maxframe/tensor/indexing/core.py +190 -0
- maxframe/tensor/indexing/extract.py +71 -0
- maxframe/tensor/indexing/fill_diagonal.py +183 -0
- maxframe/tensor/indexing/flatnonzero.py +60 -0
- maxframe/tensor/indexing/getitem.py +175 -0
- maxframe/tensor/indexing/nonzero.py +120 -0
- maxframe/tensor/indexing/setitem.py +132 -0
- maxframe/tensor/indexing/slice.py +29 -0
- maxframe/tensor/indexing/take.py +130 -0
- maxframe/tensor/indexing/tests/__init__.py +15 -0
- maxframe/tensor/indexing/tests/test_indexing.py +234 -0
- maxframe/tensor/indexing/unravel_index.py +103 -0
- maxframe/tensor/merge/__init__.py +15 -0
- maxframe/tensor/merge/stack.py +132 -0
- maxframe/tensor/merge/tests/__init__.py +13 -0
- maxframe/tensor/merge/tests/test_merge.py +52 -0
- maxframe/tensor/operators.py +123 -0
- maxframe/tensor/random/__init__.py +168 -0
- maxframe/tensor/random/beta.py +87 -0
- maxframe/tensor/random/binomial.py +137 -0
- maxframe/tensor/random/bytes.py +39 -0
- maxframe/tensor/random/chisquare.py +110 -0
- maxframe/tensor/random/choice.py +186 -0
- maxframe/tensor/random/core.py +234 -0
- maxframe/tensor/random/dirichlet.py +123 -0
- maxframe/tensor/random/exponential.py +94 -0
- maxframe/tensor/random/f.py +135 -0
- maxframe/tensor/random/gamma.py +128 -0
- maxframe/tensor/random/geometric.py +93 -0
- maxframe/tensor/random/gumbel.py +167 -0
- maxframe/tensor/random/hypergeometric.py +148 -0
- maxframe/tensor/random/laplace.py +133 -0
- maxframe/tensor/random/logistic.py +129 -0
- maxframe/tensor/random/lognormal.py +159 -0
- maxframe/tensor/random/logseries.py +122 -0
- maxframe/tensor/random/multinomial.py +133 -0
- maxframe/tensor/random/multivariate_normal.py +192 -0
- maxframe/tensor/random/negative_binomial.py +125 -0
- maxframe/tensor/random/noncentral_chisquare.py +132 -0
- maxframe/tensor/random/noncentral_f.py +126 -0
- maxframe/tensor/random/normal.py +143 -0
- maxframe/tensor/random/pareto.py +140 -0
- maxframe/tensor/random/permutation.py +104 -0
- maxframe/tensor/random/poisson.py +111 -0
- maxframe/tensor/random/power.py +142 -0
- maxframe/tensor/random/rand.py +82 -0
- maxframe/tensor/random/randint.py +121 -0
- maxframe/tensor/random/randn.py +96 -0
- maxframe/tensor/random/random_integers.py +123 -0
- maxframe/tensor/random/random_sample.py +86 -0
- maxframe/tensor/random/rayleigh.py +110 -0
- maxframe/tensor/random/shuffle.py +61 -0
- maxframe/tensor/random/standard_cauchy.py +105 -0
- maxframe/tensor/random/standard_exponential.py +72 -0
- maxframe/tensor/random/standard_gamma.py +120 -0
- maxframe/tensor/random/standard_normal.py +74 -0
- maxframe/tensor/random/standard_t.py +135 -0
- maxframe/tensor/random/tests/__init__.py +15 -0
- maxframe/tensor/random/tests/test_random.py +167 -0
- maxframe/tensor/random/triangular.py +119 -0
- maxframe/tensor/random/uniform.py +131 -0
- maxframe/tensor/random/vonmises.py +131 -0
- maxframe/tensor/random/wald.py +114 -0
- maxframe/tensor/random/weibull.py +140 -0
- maxframe/tensor/random/zipf.py +122 -0
- maxframe/tensor/rechunk/__init__.py +26 -0
- maxframe/tensor/rechunk/rechunk.py +43 -0
- maxframe/tensor/reduction/__init__.py +66 -0
- maxframe/tensor/reduction/all.py +103 -0
- maxframe/tensor/reduction/allclose.py +88 -0
- maxframe/tensor/reduction/any.py +105 -0
- maxframe/tensor/reduction/argmax.py +103 -0
- maxframe/tensor/reduction/argmin.py +103 -0
- maxframe/tensor/reduction/array_equal.py +64 -0
- maxframe/tensor/reduction/core.py +168 -0
- maxframe/tensor/reduction/count_nonzero.py +81 -0
- maxframe/tensor/reduction/cumprod.py +97 -0
- maxframe/tensor/reduction/cumsum.py +101 -0
- maxframe/tensor/reduction/max.py +120 -0
- maxframe/tensor/reduction/mean.py +123 -0
- maxframe/tensor/reduction/min.py +120 -0
- maxframe/tensor/reduction/nanargmax.py +82 -0
- maxframe/tensor/reduction/nanargmin.py +76 -0
- maxframe/tensor/reduction/nancumprod.py +91 -0
- maxframe/tensor/reduction/nancumsum.py +94 -0
- maxframe/tensor/reduction/nanmax.py +111 -0
- maxframe/tensor/reduction/nanmean.py +106 -0
- maxframe/tensor/reduction/nanmin.py +111 -0
- maxframe/tensor/reduction/nanprod.py +94 -0
- maxframe/tensor/reduction/nanstd.py +126 -0
- maxframe/tensor/reduction/nansum.py +115 -0
- maxframe/tensor/reduction/nanvar.py +149 -0
- maxframe/tensor/reduction/prod.py +130 -0
- maxframe/tensor/reduction/std.py +134 -0
- maxframe/tensor/reduction/sum.py +125 -0
- maxframe/tensor/reduction/tests/__init__.py +13 -0
- maxframe/tensor/reduction/tests/test_reduction.py +181 -0
- maxframe/tensor/reduction/var.py +176 -0
- maxframe/tensor/reshape/__init__.py +17 -0
- maxframe/tensor/reshape/reshape.py +188 -0
- maxframe/tensor/reshape/tests/__init__.py +15 -0
- maxframe/tensor/reshape/tests/test_reshape.py +37 -0
- maxframe/tensor/statistics/__init__.py +13 -0
- maxframe/tensor/statistics/percentile.py +175 -0
- maxframe/tensor/statistics/quantile.py +288 -0
- maxframe/tensor/ufunc/__init__.py +26 -0
- maxframe/tensor/ufunc/ufunc.py +200 -0
- maxframe/tensor/utils.py +718 -0
- maxframe/tests/__init__.py +13 -0
- maxframe/tests/test_codegen.py +69 -0
- maxframe/tests/test_protocol.py +144 -0
- maxframe/tests/test_utils.py +376 -0
- maxframe/tests/utils.py +164 -0
- maxframe/typing_.py +37 -0
- maxframe/udf.py +134 -0
- maxframe/utils.py +1114 -0
- maxframe-0.1.0b5.dist-info/METADATA +104 -0
- maxframe-0.1.0b5.dist-info/RECORD +647 -0
- maxframe-0.1.0b5.dist-info/WHEEL +5 -0
- maxframe-0.1.0b5.dist-info/top_level.txt +2 -0
- maxframe_client/__init__.py +17 -0
- maxframe_client/clients/__init__.py +13 -0
- maxframe_client/clients/framedriver.py +118 -0
- maxframe_client/clients/spe.py +104 -0
- maxframe_client/conftest.py +15 -0
- maxframe_client/fetcher.py +264 -0
- maxframe_client/session/__init__.py +22 -0
- maxframe_client/session/consts.py +36 -0
- maxframe_client/session/graph.py +119 -0
- maxframe_client/session/odps.py +482 -0
- maxframe_client/session/task.py +280 -0
- maxframe_client/session/tests/__init__.py +13 -0
- maxframe_client/session/tests/test_task.py +85 -0
- maxframe_client/tests/__init__.py +13 -0
- maxframe_client/tests/test_fetcher.py +89 -0
- maxframe_client/tests/test_session.py +255 -0
|
@@ -0,0 +1,421 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
4
|
+
#
|
|
5
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
6
|
+
# you may not use this file except in compliance with the License.
|
|
7
|
+
# You may obtain a copy of the License at
|
|
8
|
+
#
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
|
+
# See the License for the specific language governing permissions and
|
|
15
|
+
# limitations under the License.
|
|
16
|
+
|
|
17
|
+
import os
|
|
18
|
+
from typing import Dict
|
|
19
|
+
from urllib.parse import urlparse
|
|
20
|
+
|
|
21
|
+
import numpy as np
|
|
22
|
+
import pandas as pd
|
|
23
|
+
|
|
24
|
+
try:
|
|
25
|
+
import pyarrow as pa
|
|
26
|
+
import pyarrow.parquet as pq
|
|
27
|
+
except ImportError:
|
|
28
|
+
pa = None
|
|
29
|
+
|
|
30
|
+
try:
|
|
31
|
+
import fastparquet
|
|
32
|
+
except ImportError:
|
|
33
|
+
fastparquet = None
|
|
34
|
+
|
|
35
|
+
from ... import opcodes
|
|
36
|
+
from ...config import options
|
|
37
|
+
from ...lib.filesystem import FileSystem, get_fs, glob, open_file
|
|
38
|
+
from ...serialization.serializables import (
|
|
39
|
+
AnyField,
|
|
40
|
+
BoolField,
|
|
41
|
+
DictField,
|
|
42
|
+
Int32Field,
|
|
43
|
+
Int64Field,
|
|
44
|
+
ListField,
|
|
45
|
+
StringField,
|
|
46
|
+
)
|
|
47
|
+
from ...utils import lazy_import
|
|
48
|
+
from ..arrays import ArrowStringDtype
|
|
49
|
+
from ..operators import OutputType
|
|
50
|
+
from ..utils import parse_index, to_arrow_dtypes
|
|
51
|
+
from .core import ColumnPruneSupportedDataSourceMixin, IncrementalIndexDatasource
|
|
52
|
+
|
|
53
|
+
PARQUET_MEMORY_SCALE = 15
|
|
54
|
+
STRING_FIELD_OVERHEAD = 50
|
|
55
|
+
cudf = lazy_import("cudf")
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def check_engine(engine):
    """
    Validate the requested parquet engine and return its concrete name.

    Parameters
    ----------
    engine : str
        One of ``"auto"``, ``"pyarrow"`` or ``"fastparquet"``.

    Returns
    -------
    str
        ``"pyarrow"`` or ``"fastparquet"``. With ``"auto"``, pyarrow is
        preferred and fastparquet is used only when pyarrow is missing.

    Raises
    ------
    RuntimeError
        If the engine name is unknown or the required library
        is not installed.
    """
    if engine == "pyarrow":
        if pa is None:  # pragma: no cover
            raise RuntimeError("Please install pyarrow first.")
        return engine

    if engine == "fastparquet":
        if fastparquet is None:  # pragma: no cover
            raise RuntimeError("Please install fastparquet first.")
        return engine

    if engine != "auto":  # pragma: no cover
        raise RuntimeError("Unsupported engine {} to read parquet.".format(engine))

    # "auto": pick whichever backend could be imported, pyarrow first
    if pa is not None:
        return "pyarrow"
    if fastparquet is not None:  # pragma: no cover
        return "fastparquet"
    raise RuntimeError(  # pragma: no cover
        "Please install either pyarrow or fastparquet."
    )
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def get_engine(engine):
    """
    Instantiate the engine object for a concrete engine name.

    Parameters
    ----------
    engine : str
        ``"pyarrow"`` or ``"fastparquet"``.

    Returns
    -------
    ParquetEngine
        A new ``ArrowEngine`` or ``FastpaquetEngine`` instance.

    Raises
    ------
    RuntimeError
        If ``engine`` is neither of the two supported names.
    """
    if engine == "pyarrow":
        return ArrowEngine()
    if engine == "fastparquet":
        return FastpaquetEngine()
    # anything else is not a known backend
    raise RuntimeError("Unsupported engine {}".format(engine))  # pragma: no cover
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class ParquetEngine:
    """
    Base class of the parquet reading backends.

    Subclasses implement the ``get_row_num`` / ``read_dtypes`` /
    ``read_*_to_pandas`` primitives; the partition-aware helpers defined
    here are built on top of those primitives.
    """

    def get_row_num(self, f):
        """Return the number of rows of parquet file ``f`` (backend specific)."""
        raise NotImplementedError

    def read_dtypes(self, f, **kwargs):
        """Return the column dtypes of parquet file ``f`` (backend specific)."""
        raise NotImplementedError

    def read_to_pandas(
        self, f, columns=None, nrows=None, use_arrow_dtype=None, **kwargs
    ):
        """Read parquet file ``f`` into a pandas DataFrame (backend specific)."""
        raise NotImplementedError

    def read_group_to_pandas(
        self, f, group_index, columns=None, nrows=None, use_arrow_dtype=None, **kwargs
    ):
        """Read one row group of ``f`` into a pandas DataFrame (backend specific)."""
        raise NotImplementedError

    def read_partitioned_to_pandas(
        self,
        f,
        partitions: Dict,
        partition_keys: Dict,
        columns=None,
        nrows=None,
        use_arrow_dtype=None,
        **kwargs,
    ):
        """
        Read one file of a partitioned dataset and append partition columns.

        Parameters
        ----------
        f
            File to read, handed to ``read_to_pandas`` unchanged.
        partitions : dict
            Maps a partition column name to an object whose ``tolist()``
            yields all categories of that column.
        partition_keys : dict
            Maps a partition column name to the value this file holds
            for that column.
        columns, nrows, use_arrow_dtype, **kwargs
            Forwarded to ``read_to_pandas``.

        Returns
        -------
        pandas.DataFrame
            The file content plus one constant categorical column per
            entry of ``partition_keys``.
        """
        raw_df = self.read_to_pandas(
            f, columns=columns, nrows=nrows, use_arrow_dtype=use_arrow_dtype, **kwargs
        )
        for col, value in partition_keys.items():
            categories = partitions[col].tolist()
            # every row of this file shares the same partition value
            raw_df[col] = pd.Series(
                value,
                dtype=pd.CategoricalDtype(categories=categories),
                index=raw_df.index,
            )
        return raw_df

    def read_partitioned_dtypes(self, fs: "FileSystem", directory, storage_options):
        """
        Infer the dtypes of a partitioned dataset from a single file.

        The directory tree is descended by always taking the first
        non-hidden entry, recording the ``name`` part of every
        ``name=value`` directory as a partition column, until a file is
        reached. Partition columns are reported as categorical.

        Parameters
        ----------
        fs : FileSystem
            Filesystem used to inspect ``directory``.
        directory
            Root path of the partitioned dataset.
        storage_options
            Passed to ``open_file`` when opening the sampled file.

        Returns
        -------
        pandas.Series
            Dtypes of the sampled file extended with the partition columns.

        Raises
        ------
        ValueError
            If a visited directory is empty, or mixes files and
            sub-directories.
        """
        # As ParquetDataset will iterate all files,
        # here we just find one file to infer dtypes
        current_path = directory
        partition_cols = []
        while fs.isdir(current_path):
            _, dirs, files = next(fs.walk(current_path))
            dirs = [d for d in dirs if not d.startswith(".")]
            files = [f for f in files if not f.startswith(".")]
            if not dirs and not files:
                # nothing to sample from; fail clearly instead of on dirs[0]
                raise ValueError(
                    f"No parquet files found under directory {current_path!r}"
                )
            if not files:
                # directory as partition
                partition_cols.append(dirs[0].split("=", 1)[0])
                current_path = os.path.join(current_path, dirs[0])
            elif not dirs:
                # parquet files in deepest directory
                current_path = os.path.join(current_path, files[0])
            else:  # pragma: no cover
                raise ValueError(
                    "Files and directories are mixed in an intermediate directory"
                )

        # current path is now a parquet file
        with open_file(current_path, storage_options=storage_options) as f:
            dtypes = self.read_dtypes(f)
        for partition in partition_cols:
            dtypes[partition] = pd.CategoricalDtype()
        return dtypes
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _parse_prefix(path):
    """
    Extract the ``scheme://netloc`` prefix of a URL-like path.

    Returns an empty string when ``path`` is not a string or carries
    no URL scheme.
    """
    if not isinstance(path, str):
        return ""
    parts = urlparse(path)
    return f"{parts.scheme}://{parts.netloc}" if parts.scheme else ""
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
class ArrowEngine(ParquetEngine):
    """Parquet backend built on ``pyarrow.parquet``."""

    def get_row_num(self, f):
        """Return the row count recorded in the metadata of file ``f``."""
        return pq.ParquetFile(f).metadata.num_rows

    def read_dtypes(self, f, **kwargs):
        """Return pandas dtypes derived from the arrow schema of ``f``."""
        # converting an empty table yields the dtypes without reading data
        empty_table = pq.ParquetFile(f).schema_arrow.empty_table()
        return empty_table.to_pandas().dtypes

    @classmethod
    def _table_to_pandas(cls, t, nrows=None, use_arrow_dtype=None):
        """
        Convert arrow table ``t`` to pandas, keeping at most ``nrows`` rows.

        When ``use_arrow_dtype`` is truthy, arrow string columns are
        mapped to ``ArrowStringDtype``.
        """
        table = t if nrows is None else t.slice(0, nrows)
        if not use_arrow_dtype:
            return table.to_pandas()
        string_mapper = {pa.string(): ArrowStringDtype()}.get
        return table.to_pandas(types_mapper=string_mapper)

    def read_to_pandas(
        self, f, columns=None, nrows=None, use_arrow_dtype=None, **kwargs
    ):
        """Read the whole file ``f`` (optionally only ``columns``) into pandas."""
        table = pq.ParquetFile(f).read(columns=columns, **kwargs)
        return self._table_to_pandas(
            table, nrows=nrows, use_arrow_dtype=use_arrow_dtype
        )

    def read_group_to_pandas(
        self, f, group_index, columns=None, nrows=None, use_arrow_dtype=None, **kwargs
    ):
        """Read row group ``group_index`` of file ``f`` into pandas."""
        table = pq.ParquetFile(f).read_row_group(
            group_index, columns=columns, **kwargs
        )
        return self._table_to_pandas(
            table, nrows=nrows, use_arrow_dtype=use_arrow_dtype
        )
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
class FastpaquetEngine(ParquetEngine):
    """Parquet backend built on ``fastparquet``."""

    def get_row_num(self, f):
        """Return the number of rows of file ``f``."""
        return fastparquet.ParquetFile(f).count()

    def read_dtypes(self, f, **kwargs):
        """Return the dtypes of ``f`` as a Series ordered like its columns."""
        parquet_file = fastparquet.ParquetFile(f)
        all_dtypes = parquet_file._dtypes()
        return pd.Series({name: all_dtypes[name] for name in parquet_file.columns})

    def read_to_pandas(
        self, f, columns=None, nrows=None, use_arrow_dtype=None, **kwargs
    ):
        """
        Read file ``f`` into a pandas DataFrame.

        The frame is truncated to ``nrows`` rows after reading, and cast
        through ``to_arrow_dtypes`` when ``use_arrow_dtype`` is truthy.
        """
        frame = fastparquet.ParquetFile(f).to_pandas(columns, **kwargs)
        if nrows is not None:
            frame = frame.head(nrows)
        if use_arrow_dtype:
            arrow_dtypes = to_arrow_dtypes(frame.dtypes).to_dict()
            frame = frame.astype(arrow_dtypes)
        return frame
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
class CudfEngine:
    """Helpers that read parquet data into cudf DataFrames."""

    @classmethod
    def read_to_cudf(cls, file, columns: list = None, nrows: int = None, **kwargs):
        """
        Read ``file`` with ``cudf.read_parquet``.

        ``columns`` and ``**kwargs`` are forwarded to cudf; the result is
        truncated to ``nrows`` rows when ``nrows`` is given.
        """
        df = cudf.read_parquet(file, columns=columns, **kwargs)
        if nrows is not None:
            df = df.head(nrows)
        return df

    # classmethod like its siblings: it uses no instance state, and this
    # keeps both ``CudfEngine.read_group_to_cudf(...)`` and instance calls working
    @classmethod
    def read_group_to_cudf(
        cls, file, group_index: int, columns: list = None, nrows: int = None, **kwargs
    ):
        """Read row group ``group_index`` of ``file`` via ``read_to_cudf``."""
        return cls.read_to_cudf(
            file, columns=columns, nrows=nrows, row_groups=group_index, **kwargs
        )

    @classmethod
    def read_partitioned_to_cudf(
        cls,
        file,
        partitions: Dict,
        partition_keys: Dict,
        columns=None,
        nrows=None,
        **kwargs,
    ):
        """
        Read one file of a partitioned dataset and append partition columns.

        Parameters
        ----------
        file
            Parquet file to read.
        partitions : dict
            Maps a partition column name to an object whose ``tolist()``
            yields all categories of that column.
        partition_keys : dict
            Maps a partition column name to the value this file holds
            for that column.
        columns, nrows, **kwargs
            Column selection, row limit and extra options for the
            pyarrow read.

        Returns
        -------
        cudf.DataFrame
            File content plus one constant categorical column per
            entry of ``partition_keys``.
        """
        # cudf will read entire partitions even if only one partition provided,
        # so we just read with pyarrow and convert to cudf DataFrame
        file = pq.ParquetFile(file)
        t = file.read(columns=columns, **kwargs)
        t = t.slice(0, nrows) if nrows is not None else t
        t = pa.table(t.columns, names=t.column_names)
        raw_df = cudf.DataFrame.from_arrow(t)
        for col, value in partition_keys.items():
            dictionary = partitions[col].tolist()
            # constant code column pointing at this file's partition value
            codes = cudf.core.column.as_column(
                dictionary.index(value), length=len(raw_df)
            )
            raw_df[col] = cudf.core.column.build_categorical_column(
                categories=dictionary,
                codes=codes,
                size=codes.size,
                offset=codes.offset,
                ordered=False,
            )
        return raw_df
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
class DataFrameReadParquet(
    IncrementalIndexDatasource,
    ColumnPruneSupportedDataSourceMixin,
):
    """Operator describing a DataFrame that is loaded from parquet files."""

    _op_type_ = opcodes.READ_PARQUET

    path = AnyField("path")
    engine = StringField("engine")
    columns = ListField("columns")
    use_arrow_dtype = BoolField("use_arrow_dtype")
    groups_as_chunks = BoolField("groups_as_chunks")
    group_index = Int32Field("group_index")
    read_kwargs = DictField("read_kwargs")
    incremental_index = BoolField("incremental_index")
    storage_options = DictField("storage_options")
    is_partitioned = BoolField("is_partitioned")
    merge_small_files = BoolField("merge_small_files")
    merge_small_file_options = DictField("merge_small_file_options")
    # for chunk
    partitions = DictField("partitions", default=None)
    partition_keys = DictField("partition_keys", default=None)
    num_group_rows = Int64Field("num_group_rows", default=None)
    # as read meta may be too time-consuming when number of files is large,
    # thus we only read first file to get row number and raw file size
    first_chunk_row_num = Int64Field("first_chunk_row_num")
    first_chunk_raw_bytes = Int64Field("first_chunk_raw_bytes")

    def get_columns(self):
        """Return the columns selected for reading."""
        return self.columns

    def set_pruned_columns(self, columns, *, keep_order=None):
        """Replace the selected columns; ``keep_order`` is accepted but unused."""
        self.columns = columns

    def __call__(self, index_value=None, columns_value=None, dtypes=None):
        """
        Build the output DataFrame for this operator.

        The first shape dimension is NaN; the second is the number of
        entries of ``dtypes``.
        """
        self._output_types = [OutputType.dataframe]
        output_shape = (np.nan, len(dtypes))
        return self.new_dataframe(
            None,
            output_shape,
            dtypes=dtypes,
            index_value=index_value,
            columns_value=columns_value,
        )
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def read_parquet(
    path,
    engine: str = "auto",
    columns: list = None,
    groups_as_chunks: bool = False,
    use_arrow_dtype: bool = None,
    incremental_index: bool = False,
    storage_options: dict = None,
    memory_scale: int = None,
    merge_small_files: bool = True,
    merge_small_file_options: dict = None,
    gpu: bool = None,
    **kwargs,
):
    """
    Load a parquet object from the file path, returning a DataFrame.

    Parameters
    ----------
    path : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL.
        For file URLs, a host is expected. A local file could be:
        ``file://localhost/path/to/table.parquet``.
        A file URL can also be a path to a directory that contains multiple
        partitioned parquet files. Both pyarrow and fastparquet support
        paths to directories as well as file URLs. A directory path could be:
        ``file://localhost/path/to/tables``.
        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handler (e.g. via builtin ``open`` function)
        or ``StringIO``.
        A list of paths is also accepted; its first element is used to
        infer the dtypes.
    engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
        Parquet library to use. The default behavior is to try 'pyarrow',
        falling back to 'fastparquet' if 'pyarrow' is unavailable.
    columns : list, default=None
        If not None, only these columns will be read from the file.
    groups_as_chunks : bool, default False
        If True, each row group corresponds to a chunk.
        If False, each file corresponds to a chunk.
        Only available for 'pyarrow' engine.
    incremental_index: bool, default False
        Ensure the range index is incremental;
        setting it to False gives slightly better performance.
    use_arrow_dtype: bool, default None
        If True, use arrow dtype to store columns. When None, the value of
        ``options.dataframe.use_arrow_dtype`` is used.
    storage_options: dict, optional
        Options for storage connection.
    memory_scale: int, optional
        Scale that real memory occupation divided with raw file size.
    merge_small_files: bool, default True
        Merge small files whose size is small.
    merge_small_file_options: dict
        Options for merging small files
    gpu: bool, default None
        Forwarded unchanged to the read operator as its ``gpu`` argument.
    **kwargs
        Any additional kwargs are passed to the engine.

    Returns
    -------
    MaxFrame DataFrame

    Raises
    ------
    RuntimeError
        If the engine is unsupported or its library is not installed.
    FileNotFoundError
        If ``path`` is a string pattern that matches no file.
    """

    engine_type = check_engine(engine)
    engine = get_engine(engine_type)

    single_path = path[0] if isinstance(path, list) else path
    fs = get_fs(single_path, storage_options)
    is_partitioned = False
    if fs.isdir(single_path):
        # inspect the same path that was tested with isdir; ``path`` itself
        # may be a list, which the filesystem calls cannot handle
        paths = fs.ls(single_path)
        if all(fs.isdir(p) for p in paths):
            # If all are directories, it is read as a partitioned dataset.
            dtypes = engine.read_partitioned_dtypes(fs, single_path, storage_options)
            is_partitioned = True
        else:
            with fs.open(paths[0], mode="rb") as f:
                dtypes = engine.read_dtypes(f)
    else:
        if not isinstance(path, list):
            matched = glob(path, storage_options=storage_options)
            if not matched:
                # fail with a meaningful error instead of IndexError on [0]
                raise FileNotFoundError(f"No parquet file matches path {path!r}")
            file_path = matched[0]
        else:
            file_path = path[0]

        with open_file(file_path, storage_options=storage_options) as f:
            dtypes = engine.read_dtypes(f)

    if columns:
        dtypes = dtypes[columns]

    if use_arrow_dtype is None:
        use_arrow_dtype = options.dataframe.use_arrow_dtype
    if use_arrow_dtype:
        dtypes = to_arrow_dtypes(dtypes)

    index_value = parse_index(pd.RangeIndex(-1))
    columns_value = parse_index(dtypes.index, store_data=True)
    op = DataFrameReadParquet(
        path=path,
        engine=engine_type,
        columns=columns,
        groups_as_chunks=groups_as_chunks,
        use_arrow_dtype=use_arrow_dtype,
        read_kwargs=kwargs,
        incremental_index=incremental_index,
        storage_options=storage_options,
        is_partitioned=is_partitioned,
        memory_scale=memory_scale,
        merge_small_files=merge_small_files,
        merge_small_file_options=merge_small_file_options,
        gpu=gpu,
    )
    return op(index_value=index_value, columns_value=columns_value, dtypes=dtypes)
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from ... import opcodes
|
|
16
|
+
from ...core import OutputType
|
|
17
|
+
from ...serialization.serializables import DataTypeField, SeriesField
|
|
18
|
+
from ..operators import DataFrameOperatorMixin
|
|
19
|
+
from ..utils import is_cudf, parse_index
|
|
20
|
+
from .core import PandasDataSourceOperator
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class SeriesDataSource(PandasDataSourceOperator, DataFrameOperatorMixin):
    """
    Data source operator wrapping a pandas Series
    """

    _op_type_ = opcodes.SERIES_DATA_SOURCE

    data = SeriesField("data")
    dtype = DataTypeField("dtype")

    def __init__(self, data=None, dtype=None, gpu=None, **kw):
        """
        Create the operator.

        ``dtype`` falls back to ``data.dtype`` when omitted, and ``gpu``
        is switched on when unset and ``is_cudf(data)`` is true.
        """
        if data is not None and dtype is None:
            dtype = data.dtype
        if gpu is None and is_cudf(data):  # pragma: no cover
            gpu = True
        super().__init__(
            data=data, dtype=dtype, gpu=gpu, _output_types=[OutputType.series], **kw
        )

    def __call__(self, shape, chunk_size=None):
        """Build the output series of ``shape``, reusing the data's index and name."""
        source = self.data
        return self.new_series(
            None,
            shape=shape,
            dtype=self.dtype,
            index_value=parse_index(source.index),
            name=source.name,
            raw_chunk_size=chunk_size,
        )
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def from_pandas(data, chunk_size=None, gpu=None, sparse=False):
    """
    Wrap ``data`` in a ``SeriesDataSource`` operator and return its output.

    ``gpu`` and ``sparse`` are passed to the operator; ``chunk_size`` is
    passed on when the operator is called with ``data.shape``.
    """
    source_op = SeriesDataSource(data=data, gpu=gpu, sparse=sparse)
    return source_op(data.shape, chunk_size=chunk_size)
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|