maxframe 0.1.0b5__cp311-cp311-macosx_11_0_arm64.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of maxframe might be problematic. Click here for more details.

Files changed (647)
  1. maxframe/__init__.py +32 -0
  2. maxframe/_utils.cpython-311-darwin.so +0 -0
  3. maxframe/_utils.pxd +33 -0
  4. maxframe/_utils.pyx +547 -0
  5. maxframe/codegen.py +528 -0
  6. maxframe/config/__init__.py +15 -0
  7. maxframe/config/config.py +443 -0
  8. maxframe/config/tests/__init__.py +13 -0
  9. maxframe/config/tests/test_config.py +103 -0
  10. maxframe/config/tests/test_validators.py +34 -0
  11. maxframe/config/validators.py +57 -0
  12. maxframe/conftest.py +139 -0
  13. maxframe/core/__init__.py +65 -0
  14. maxframe/core/base.py +156 -0
  15. maxframe/core/entity/__init__.py +44 -0
  16. maxframe/core/entity/chunks.py +68 -0
  17. maxframe/core/entity/core.py +152 -0
  18. maxframe/core/entity/executable.py +337 -0
  19. maxframe/core/entity/fuse.py +73 -0
  20. maxframe/core/entity/objects.py +100 -0
  21. maxframe/core/entity/output_types.py +90 -0
  22. maxframe/core/entity/tileables.py +438 -0
  23. maxframe/core/entity/utils.py +24 -0
  24. maxframe/core/graph/__init__.py +17 -0
  25. maxframe/core/graph/builder/__init__.py +16 -0
  26. maxframe/core/graph/builder/base.py +86 -0
  27. maxframe/core/graph/builder/chunk.py +430 -0
  28. maxframe/core/graph/builder/tileable.py +34 -0
  29. maxframe/core/graph/builder/utils.py +41 -0
  30. maxframe/core/graph/core.cpython-311-darwin.so +0 -0
  31. maxframe/core/graph/core.pyx +467 -0
  32. maxframe/core/graph/entity.py +171 -0
  33. maxframe/core/graph/tests/__init__.py +13 -0
  34. maxframe/core/graph/tests/test_graph.py +205 -0
  35. maxframe/core/mode.py +96 -0
  36. maxframe/core/operator/__init__.py +34 -0
  37. maxframe/core/operator/base.py +450 -0
  38. maxframe/core/operator/core.py +276 -0
  39. maxframe/core/operator/fetch.py +53 -0
  40. maxframe/core/operator/fuse.py +29 -0
  41. maxframe/core/operator/objects.py +72 -0
  42. maxframe/core/operator/shuffle.py +111 -0
  43. maxframe/core/operator/tests/__init__.py +13 -0
  44. maxframe/core/operator/tests/test_core.py +64 -0
  45. maxframe/core/tests/__init__.py +13 -0
  46. maxframe/core/tests/test_mode.py +75 -0
  47. maxframe/dataframe/__init__.py +81 -0
  48. maxframe/dataframe/arithmetic/__init__.py +359 -0
  49. maxframe/dataframe/arithmetic/abs.py +33 -0
  50. maxframe/dataframe/arithmetic/add.py +60 -0
  51. maxframe/dataframe/arithmetic/arccos.py +28 -0
  52. maxframe/dataframe/arithmetic/arccosh.py +28 -0
  53. maxframe/dataframe/arithmetic/arcsin.py +28 -0
  54. maxframe/dataframe/arithmetic/arcsinh.py +28 -0
  55. maxframe/dataframe/arithmetic/arctan.py +28 -0
  56. maxframe/dataframe/arithmetic/arctanh.py +28 -0
  57. maxframe/dataframe/arithmetic/around.py +152 -0
  58. maxframe/dataframe/arithmetic/bitwise_and.py +46 -0
  59. maxframe/dataframe/arithmetic/bitwise_or.py +50 -0
  60. maxframe/dataframe/arithmetic/bitwise_xor.py +46 -0
  61. maxframe/dataframe/arithmetic/ceil.py +28 -0
  62. maxframe/dataframe/arithmetic/core.py +342 -0
  63. maxframe/dataframe/arithmetic/cos.py +28 -0
  64. maxframe/dataframe/arithmetic/cosh.py +28 -0
  65. maxframe/dataframe/arithmetic/degrees.py +28 -0
  66. maxframe/dataframe/arithmetic/docstring.py +442 -0
  67. maxframe/dataframe/arithmetic/equal.py +56 -0
  68. maxframe/dataframe/arithmetic/exp.py +28 -0
  69. maxframe/dataframe/arithmetic/exp2.py +28 -0
  70. maxframe/dataframe/arithmetic/expm1.py +28 -0
  71. maxframe/dataframe/arithmetic/floor.py +28 -0
  72. maxframe/dataframe/arithmetic/floordiv.py +64 -0
  73. maxframe/dataframe/arithmetic/greater.py +57 -0
  74. maxframe/dataframe/arithmetic/greater_equal.py +57 -0
  75. maxframe/dataframe/arithmetic/invert.py +33 -0
  76. maxframe/dataframe/arithmetic/is_ufuncs.py +62 -0
  77. maxframe/dataframe/arithmetic/less.py +57 -0
  78. maxframe/dataframe/arithmetic/less_equal.py +57 -0
  79. maxframe/dataframe/arithmetic/log.py +28 -0
  80. maxframe/dataframe/arithmetic/log10.py +28 -0
  81. maxframe/dataframe/arithmetic/log2.py +28 -0
  82. maxframe/dataframe/arithmetic/mod.py +60 -0
  83. maxframe/dataframe/arithmetic/multiply.py +60 -0
  84. maxframe/dataframe/arithmetic/negative.py +33 -0
  85. maxframe/dataframe/arithmetic/not_equal.py +56 -0
  86. maxframe/dataframe/arithmetic/power.py +68 -0
  87. maxframe/dataframe/arithmetic/radians.py +28 -0
  88. maxframe/dataframe/arithmetic/sin.py +28 -0
  89. maxframe/dataframe/arithmetic/sinh.py +28 -0
  90. maxframe/dataframe/arithmetic/sqrt.py +28 -0
  91. maxframe/dataframe/arithmetic/subtract.py +64 -0
  92. maxframe/dataframe/arithmetic/tan.py +28 -0
  93. maxframe/dataframe/arithmetic/tanh.py +28 -0
  94. maxframe/dataframe/arithmetic/tests/__init__.py +13 -0
  95. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +695 -0
  96. maxframe/dataframe/arithmetic/truediv.py +64 -0
  97. maxframe/dataframe/arithmetic/trunc.py +28 -0
  98. maxframe/dataframe/arrays.py +864 -0
  99. maxframe/dataframe/core.py +2417 -0
  100. maxframe/dataframe/datasource/__init__.py +15 -0
  101. maxframe/dataframe/datasource/core.py +81 -0
  102. maxframe/dataframe/datasource/dataframe.py +59 -0
  103. maxframe/dataframe/datasource/date_range.py +504 -0
  104. maxframe/dataframe/datasource/from_index.py +54 -0
  105. maxframe/dataframe/datasource/from_records.py +107 -0
  106. maxframe/dataframe/datasource/from_tensor.py +419 -0
  107. maxframe/dataframe/datasource/index.py +117 -0
  108. maxframe/dataframe/datasource/read_csv.py +528 -0
  109. maxframe/dataframe/datasource/read_odps_query.py +299 -0
  110. maxframe/dataframe/datasource/read_odps_table.py +253 -0
  111. maxframe/dataframe/datasource/read_parquet.py +421 -0
  112. maxframe/dataframe/datasource/series.py +55 -0
  113. maxframe/dataframe/datasource/tests/__init__.py +13 -0
  114. maxframe/dataframe/datasource/tests/test_datasource.py +401 -0
  115. maxframe/dataframe/datastore/__init__.py +26 -0
  116. maxframe/dataframe/datastore/core.py +19 -0
  117. maxframe/dataframe/datastore/to_csv.py +227 -0
  118. maxframe/dataframe/datastore/to_odps.py +162 -0
  119. maxframe/dataframe/extensions/__init__.py +41 -0
  120. maxframe/dataframe/extensions/accessor.py +50 -0
  121. maxframe/dataframe/extensions/reshuffle.py +83 -0
  122. maxframe/dataframe/extensions/tests/__init__.py +13 -0
  123. maxframe/dataframe/extensions/tests/test_extensions.py +38 -0
  124. maxframe/dataframe/fetch/__init__.py +15 -0
  125. maxframe/dataframe/fetch/core.py +86 -0
  126. maxframe/dataframe/groupby/__init__.py +82 -0
  127. maxframe/dataframe/groupby/aggregation.py +350 -0
  128. maxframe/dataframe/groupby/apply.py +251 -0
  129. maxframe/dataframe/groupby/core.py +179 -0
  130. maxframe/dataframe/groupby/cum.py +124 -0
  131. maxframe/dataframe/groupby/fill.py +141 -0
  132. maxframe/dataframe/groupby/getitem.py +92 -0
  133. maxframe/dataframe/groupby/head.py +105 -0
  134. maxframe/dataframe/groupby/sample.py +214 -0
  135. maxframe/dataframe/groupby/tests/__init__.py +13 -0
  136. maxframe/dataframe/groupby/tests/test_groupby.py +374 -0
  137. maxframe/dataframe/groupby/transform.py +255 -0
  138. maxframe/dataframe/indexing/__init__.py +84 -0
  139. maxframe/dataframe/indexing/add_prefix_suffix.py +110 -0
  140. maxframe/dataframe/indexing/align.py +349 -0
  141. maxframe/dataframe/indexing/at.py +83 -0
  142. maxframe/dataframe/indexing/getitem.py +204 -0
  143. maxframe/dataframe/indexing/iat.py +37 -0
  144. maxframe/dataframe/indexing/iloc.py +566 -0
  145. maxframe/dataframe/indexing/insert.py +86 -0
  146. maxframe/dataframe/indexing/loc.py +411 -0
  147. maxframe/dataframe/indexing/reindex.py +526 -0
  148. maxframe/dataframe/indexing/rename.py +462 -0
  149. maxframe/dataframe/indexing/rename_axis.py +209 -0
  150. maxframe/dataframe/indexing/reset_index.py +402 -0
  151. maxframe/dataframe/indexing/sample.py +221 -0
  152. maxframe/dataframe/indexing/set_axis.py +194 -0
  153. maxframe/dataframe/indexing/set_index.py +61 -0
  154. maxframe/dataframe/indexing/setitem.py +130 -0
  155. maxframe/dataframe/indexing/tests/__init__.py +13 -0
  156. maxframe/dataframe/indexing/tests/test_indexing.py +488 -0
  157. maxframe/dataframe/indexing/where.py +308 -0
  158. maxframe/dataframe/initializer.py +288 -0
  159. maxframe/dataframe/merge/__init__.py +32 -0
  160. maxframe/dataframe/merge/append.py +121 -0
  161. maxframe/dataframe/merge/concat.py +325 -0
  162. maxframe/dataframe/merge/merge.py +593 -0
  163. maxframe/dataframe/merge/tests/__init__.py +13 -0
  164. maxframe/dataframe/merge/tests/test_merge.py +215 -0
  165. maxframe/dataframe/misc/__init__.py +134 -0
  166. maxframe/dataframe/misc/_duplicate.py +46 -0
  167. maxframe/dataframe/misc/accessor.py +276 -0
  168. maxframe/dataframe/misc/apply.py +692 -0
  169. maxframe/dataframe/misc/astype.py +236 -0
  170. maxframe/dataframe/misc/case_when.py +141 -0
  171. maxframe/dataframe/misc/check_monotonic.py +84 -0
  172. maxframe/dataframe/misc/cut.py +383 -0
  173. maxframe/dataframe/misc/datetimes.py +79 -0
  174. maxframe/dataframe/misc/describe.py +108 -0
  175. maxframe/dataframe/misc/diff.py +210 -0
  176. maxframe/dataframe/misc/drop.py +440 -0
  177. maxframe/dataframe/misc/drop_duplicates.py +248 -0
  178. maxframe/dataframe/misc/duplicated.py +292 -0
  179. maxframe/dataframe/misc/eval.py +728 -0
  180. maxframe/dataframe/misc/explode.py +171 -0
  181. maxframe/dataframe/misc/get_dummies.py +208 -0
  182. maxframe/dataframe/misc/isin.py +217 -0
  183. maxframe/dataframe/misc/map.py +236 -0
  184. maxframe/dataframe/misc/melt.py +162 -0
  185. maxframe/dataframe/misc/memory_usage.py +248 -0
  186. maxframe/dataframe/misc/pct_change.py +150 -0
  187. maxframe/dataframe/misc/pivot_table.py +262 -0
  188. maxframe/dataframe/misc/qcut.py +104 -0
  189. maxframe/dataframe/misc/select_dtypes.py +104 -0
  190. maxframe/dataframe/misc/shift.py +256 -0
  191. maxframe/dataframe/misc/stack.py +238 -0
  192. maxframe/dataframe/misc/string_.py +221 -0
  193. maxframe/dataframe/misc/tests/__init__.py +13 -0
  194. maxframe/dataframe/misc/tests/test_misc.py +468 -0
  195. maxframe/dataframe/misc/to_numeric.py +178 -0
  196. maxframe/dataframe/misc/transform.py +361 -0
  197. maxframe/dataframe/misc/transpose.py +136 -0
  198. maxframe/dataframe/misc/value_counts.py +182 -0
  199. maxframe/dataframe/missing/__init__.py +53 -0
  200. maxframe/dataframe/missing/checkna.py +223 -0
  201. maxframe/dataframe/missing/dropna.py +280 -0
  202. maxframe/dataframe/missing/fillna.py +275 -0
  203. maxframe/dataframe/missing/replace.py +439 -0
  204. maxframe/dataframe/missing/tests/__init__.py +13 -0
  205. maxframe/dataframe/missing/tests/test_missing.py +89 -0
  206. maxframe/dataframe/operators.py +273 -0
  207. maxframe/dataframe/plotting/__init__.py +40 -0
  208. maxframe/dataframe/plotting/core.py +78 -0
  209. maxframe/dataframe/plotting/tests/__init__.py +13 -0
  210. maxframe/dataframe/plotting/tests/test_plotting.py +136 -0
  211. maxframe/dataframe/reduction/__init__.py +107 -0
  212. maxframe/dataframe/reduction/aggregation.py +344 -0
  213. maxframe/dataframe/reduction/all.py +78 -0
  214. maxframe/dataframe/reduction/any.py +78 -0
  215. maxframe/dataframe/reduction/core.py +837 -0
  216. maxframe/dataframe/reduction/count.py +59 -0
  217. maxframe/dataframe/reduction/cummax.py +30 -0
  218. maxframe/dataframe/reduction/cummin.py +30 -0
  219. maxframe/dataframe/reduction/cumprod.py +30 -0
  220. maxframe/dataframe/reduction/cumsum.py +30 -0
  221. maxframe/dataframe/reduction/custom_reduction.py +42 -0
  222. maxframe/dataframe/reduction/kurtosis.py +104 -0
  223. maxframe/dataframe/reduction/max.py +65 -0
  224. maxframe/dataframe/reduction/mean.py +61 -0
  225. maxframe/dataframe/reduction/min.py +65 -0
  226. maxframe/dataframe/reduction/nunique.py +141 -0
  227. maxframe/dataframe/reduction/prod.py +76 -0
  228. maxframe/dataframe/reduction/reduction_size.py +36 -0
  229. maxframe/dataframe/reduction/sem.py +69 -0
  230. maxframe/dataframe/reduction/skew.py +89 -0
  231. maxframe/dataframe/reduction/std.py +53 -0
  232. maxframe/dataframe/reduction/str_concat.py +48 -0
  233. maxframe/dataframe/reduction/sum.py +77 -0
  234. maxframe/dataframe/reduction/tests/__init__.py +13 -0
  235. maxframe/dataframe/reduction/tests/test_reduction.py +486 -0
  236. maxframe/dataframe/reduction/unique.py +90 -0
  237. maxframe/dataframe/reduction/var.py +72 -0
  238. maxframe/dataframe/sort/__init__.py +34 -0
  239. maxframe/dataframe/sort/core.py +36 -0
  240. maxframe/dataframe/sort/sort_index.py +153 -0
  241. maxframe/dataframe/sort/sort_values.py +311 -0
  242. maxframe/dataframe/sort/tests/__init__.py +13 -0
  243. maxframe/dataframe/sort/tests/test_sort.py +81 -0
  244. maxframe/dataframe/statistics/__init__.py +33 -0
  245. maxframe/dataframe/statistics/corr.py +280 -0
  246. maxframe/dataframe/statistics/quantile.py +341 -0
  247. maxframe/dataframe/statistics/tests/__init__.py +13 -0
  248. maxframe/dataframe/statistics/tests/test_statistics.py +82 -0
  249. maxframe/dataframe/tests/__init__.py +13 -0
  250. maxframe/dataframe/tests/test_initializer.py +29 -0
  251. maxframe/dataframe/tseries/__init__.py +13 -0
  252. maxframe/dataframe/tseries/tests/__init__.py +13 -0
  253. maxframe/dataframe/tseries/tests/test_tseries.py +30 -0
  254. maxframe/dataframe/tseries/to_datetime.py +297 -0
  255. maxframe/dataframe/ufunc/__init__.py +27 -0
  256. maxframe/dataframe/ufunc/tensor.py +54 -0
  257. maxframe/dataframe/ufunc/ufunc.py +52 -0
  258. maxframe/dataframe/utils.py +1267 -0
  259. maxframe/dataframe/window/__init__.py +29 -0
  260. maxframe/dataframe/window/aggregation.py +96 -0
  261. maxframe/dataframe/window/core.py +69 -0
  262. maxframe/dataframe/window/ewm.py +249 -0
  263. maxframe/dataframe/window/expanding.py +147 -0
  264. maxframe/dataframe/window/rolling.py +376 -0
  265. maxframe/dataframe/window/tests/__init__.py +13 -0
  266. maxframe/dataframe/window/tests/test_ewm.py +70 -0
  267. maxframe/dataframe/window/tests/test_expanding.py +66 -0
  268. maxframe/dataframe/window/tests/test_rolling.py +57 -0
  269. maxframe/env.py +33 -0
  270. maxframe/errors.py +21 -0
  271. maxframe/extension.py +81 -0
  272. maxframe/learn/__init__.py +17 -0
  273. maxframe/learn/contrib/__init__.py +17 -0
  274. maxframe/learn/contrib/pytorch/__init__.py +16 -0
  275. maxframe/learn/contrib/pytorch/run_function.py +110 -0
  276. maxframe/learn/contrib/pytorch/run_script.py +102 -0
  277. maxframe/learn/contrib/pytorch/tests/__init__.py +13 -0
  278. maxframe/learn/contrib/pytorch/tests/test_pytorch.py +42 -0
  279. maxframe/learn/contrib/utils.py +52 -0
  280. maxframe/learn/contrib/xgboost/__init__.py +26 -0
  281. maxframe/learn/contrib/xgboost/classifier.py +86 -0
  282. maxframe/learn/contrib/xgboost/core.py +156 -0
  283. maxframe/learn/contrib/xgboost/dmatrix.py +150 -0
  284. maxframe/learn/contrib/xgboost/predict.py +138 -0
  285. maxframe/learn/contrib/xgboost/regressor.py +78 -0
  286. maxframe/learn/contrib/xgboost/tests/__init__.py +13 -0
  287. maxframe/learn/contrib/xgboost/tests/test_core.py +43 -0
  288. maxframe/learn/contrib/xgboost/train.py +121 -0
  289. maxframe/learn/utils/__init__.py +15 -0
  290. maxframe/learn/utils/core.py +29 -0
  291. maxframe/lib/__init__.py +15 -0
  292. maxframe/lib/aio/__init__.py +27 -0
  293. maxframe/lib/aio/_runners.py +162 -0
  294. maxframe/lib/aio/_threads.py +35 -0
  295. maxframe/lib/aio/base.py +82 -0
  296. maxframe/lib/aio/file.py +85 -0
  297. maxframe/lib/aio/isolation.py +100 -0
  298. maxframe/lib/aio/lru.py +242 -0
  299. maxframe/lib/aio/parallelism.py +37 -0
  300. maxframe/lib/aio/tests/__init__.py +13 -0
  301. maxframe/lib/aio/tests/test_aio_file.py +55 -0
  302. maxframe/lib/compression.py +55 -0
  303. maxframe/lib/cython/__init__.py +13 -0
  304. maxframe/lib/cython/libcpp.pxd +30 -0
  305. maxframe/lib/filesystem/__init__.py +21 -0
  306. maxframe/lib/filesystem/_glob.py +173 -0
  307. maxframe/lib/filesystem/_oss_lib/__init__.py +13 -0
  308. maxframe/lib/filesystem/_oss_lib/common.py +198 -0
  309. maxframe/lib/filesystem/_oss_lib/glob.py +147 -0
  310. maxframe/lib/filesystem/_oss_lib/handle.py +156 -0
  311. maxframe/lib/filesystem/arrow.py +236 -0
  312. maxframe/lib/filesystem/base.py +263 -0
  313. maxframe/lib/filesystem/core.py +95 -0
  314. maxframe/lib/filesystem/fsmap.py +164 -0
  315. maxframe/lib/filesystem/hdfs.py +31 -0
  316. maxframe/lib/filesystem/local.py +112 -0
  317. maxframe/lib/filesystem/oss.py +157 -0
  318. maxframe/lib/filesystem/tests/__init__.py +13 -0
  319. maxframe/lib/filesystem/tests/test_filesystem.py +223 -0
  320. maxframe/lib/filesystem/tests/test_oss.py +182 -0
  321. maxframe/lib/functools_compat.py +81 -0
  322. maxframe/lib/mmh3.cpython-311-darwin.so +0 -0
  323. maxframe/lib/mmh3_src/MurmurHash3.cpp +339 -0
  324. maxframe/lib/mmh3_src/MurmurHash3.h +43 -0
  325. maxframe/lib/mmh3_src/mmh3module.cpp +387 -0
  326. maxframe/lib/sparse/__init__.py +861 -0
  327. maxframe/lib/sparse/array.py +1604 -0
  328. maxframe/lib/sparse/core.py +92 -0
  329. maxframe/lib/sparse/matrix.py +241 -0
  330. maxframe/lib/sparse/tests/__init__.py +15 -0
  331. maxframe/lib/sparse/tests/test_sparse.py +476 -0
  332. maxframe/lib/sparse/vector.py +150 -0
  333. maxframe/lib/tblib/LICENSE +20 -0
  334. maxframe/lib/tblib/__init__.py +327 -0
  335. maxframe/lib/tblib/cpython.py +83 -0
  336. maxframe/lib/tblib/decorators.py +44 -0
  337. maxframe/lib/tblib/pickling_support.py +90 -0
  338. maxframe/lib/tests/__init__.py +13 -0
  339. maxframe/lib/tests/test_wrapped_pickle.py +51 -0
  340. maxframe/lib/version.py +620 -0
  341. maxframe/lib/wrapped_pickle.py +139 -0
  342. maxframe/mixin.py +100 -0
  343. maxframe/odpsio/__init__.py +21 -0
  344. maxframe/odpsio/arrow.py +91 -0
  345. maxframe/odpsio/schema.py +364 -0
  346. maxframe/odpsio/tableio.py +322 -0
  347. maxframe/odpsio/tests/__init__.py +13 -0
  348. maxframe/odpsio/tests/test_arrow.py +88 -0
  349. maxframe/odpsio/tests/test_schema.py +297 -0
  350. maxframe/odpsio/tests/test_tableio.py +136 -0
  351. maxframe/odpsio/tests/test_volumeio.py +90 -0
  352. maxframe/odpsio/volumeio.py +95 -0
  353. maxframe/opcodes.py +590 -0
  354. maxframe/protocol.py +415 -0
  355. maxframe/remote/__init__.py +18 -0
  356. maxframe/remote/core.py +210 -0
  357. maxframe/remote/run_script.py +121 -0
  358. maxframe/serialization/__init__.py +26 -0
  359. maxframe/serialization/arrow.py +95 -0
  360. maxframe/serialization/core.cpython-311-darwin.so +0 -0
  361. maxframe/serialization/core.pxd +44 -0
  362. maxframe/serialization/core.pyi +61 -0
  363. maxframe/serialization/core.pyx +1094 -0
  364. maxframe/serialization/exception.py +86 -0
  365. maxframe/serialization/maxframe_objects.py +39 -0
  366. maxframe/serialization/numpy.py +91 -0
  367. maxframe/serialization/pandas.py +202 -0
  368. maxframe/serialization/scipy.py +71 -0
  369. maxframe/serialization/serializables/__init__.py +55 -0
  370. maxframe/serialization/serializables/core.py +262 -0
  371. maxframe/serialization/serializables/field.py +624 -0
  372. maxframe/serialization/serializables/field_type.py +589 -0
  373. maxframe/serialization/serializables/tests/__init__.py +13 -0
  374. maxframe/serialization/serializables/tests/test_field_type.py +121 -0
  375. maxframe/serialization/serializables/tests/test_serializable.py +250 -0
  376. maxframe/serialization/tests/__init__.py +13 -0
  377. maxframe/serialization/tests/test_serial.py +412 -0
  378. maxframe/session.py +1310 -0
  379. maxframe/tensor/__init__.py +183 -0
  380. maxframe/tensor/arithmetic/__init__.py +315 -0
  381. maxframe/tensor/arithmetic/abs.py +68 -0
  382. maxframe/tensor/arithmetic/absolute.py +68 -0
  383. maxframe/tensor/arithmetic/add.py +82 -0
  384. maxframe/tensor/arithmetic/angle.py +72 -0
  385. maxframe/tensor/arithmetic/arccos.py +104 -0
  386. maxframe/tensor/arithmetic/arccosh.py +91 -0
  387. maxframe/tensor/arithmetic/arcsin.py +94 -0
  388. maxframe/tensor/arithmetic/arcsinh.py +86 -0
  389. maxframe/tensor/arithmetic/arctan.py +106 -0
  390. maxframe/tensor/arithmetic/arctan2.py +128 -0
  391. maxframe/tensor/arithmetic/arctanh.py +86 -0
  392. maxframe/tensor/arithmetic/around.py +114 -0
  393. maxframe/tensor/arithmetic/bitand.py +95 -0
  394. maxframe/tensor/arithmetic/bitor.py +102 -0
  395. maxframe/tensor/arithmetic/bitxor.py +95 -0
  396. maxframe/tensor/arithmetic/cbrt.py +66 -0
  397. maxframe/tensor/arithmetic/ceil.py +71 -0
  398. maxframe/tensor/arithmetic/clip.py +165 -0
  399. maxframe/tensor/arithmetic/conj.py +74 -0
  400. maxframe/tensor/arithmetic/copysign.py +78 -0
  401. maxframe/tensor/arithmetic/core.py +544 -0
  402. maxframe/tensor/arithmetic/cos.py +85 -0
  403. maxframe/tensor/arithmetic/cosh.py +72 -0
  404. maxframe/tensor/arithmetic/deg2rad.py +72 -0
  405. maxframe/tensor/arithmetic/degrees.py +77 -0
  406. maxframe/tensor/arithmetic/divide.py +114 -0
  407. maxframe/tensor/arithmetic/equal.py +76 -0
  408. maxframe/tensor/arithmetic/exp.py +106 -0
  409. maxframe/tensor/arithmetic/exp2.py +67 -0
  410. maxframe/tensor/arithmetic/expm1.py +79 -0
  411. maxframe/tensor/arithmetic/fabs.py +74 -0
  412. maxframe/tensor/arithmetic/fix.py +69 -0
  413. maxframe/tensor/arithmetic/float_power.py +103 -0
  414. maxframe/tensor/arithmetic/floor.py +77 -0
  415. maxframe/tensor/arithmetic/floordiv.py +94 -0
  416. maxframe/tensor/arithmetic/fmax.py +105 -0
  417. maxframe/tensor/arithmetic/fmin.py +106 -0
  418. maxframe/tensor/arithmetic/fmod.py +99 -0
  419. maxframe/tensor/arithmetic/frexp.py +92 -0
  420. maxframe/tensor/arithmetic/greater.py +77 -0
  421. maxframe/tensor/arithmetic/greater_equal.py +69 -0
  422. maxframe/tensor/arithmetic/hypot.py +77 -0
  423. maxframe/tensor/arithmetic/i0.py +89 -0
  424. maxframe/tensor/arithmetic/imag.py +67 -0
  425. maxframe/tensor/arithmetic/invert.py +110 -0
  426. maxframe/tensor/arithmetic/isclose.py +115 -0
  427. maxframe/tensor/arithmetic/iscomplex.py +64 -0
  428. maxframe/tensor/arithmetic/isfinite.py +106 -0
  429. maxframe/tensor/arithmetic/isinf.py +103 -0
  430. maxframe/tensor/arithmetic/isnan.py +82 -0
  431. maxframe/tensor/arithmetic/isreal.py +63 -0
  432. maxframe/tensor/arithmetic/ldexp.py +99 -0
  433. maxframe/tensor/arithmetic/less.py +69 -0
  434. maxframe/tensor/arithmetic/less_equal.py +69 -0
  435. maxframe/tensor/arithmetic/log.py +92 -0
  436. maxframe/tensor/arithmetic/log10.py +85 -0
  437. maxframe/tensor/arithmetic/log1p.py +95 -0
  438. maxframe/tensor/arithmetic/log2.py +85 -0
  439. maxframe/tensor/arithmetic/logaddexp.py +80 -0
  440. maxframe/tensor/arithmetic/logaddexp2.py +78 -0
  441. maxframe/tensor/arithmetic/logical_and.py +81 -0
  442. maxframe/tensor/arithmetic/logical_not.py +74 -0
  443. maxframe/tensor/arithmetic/logical_or.py +82 -0
  444. maxframe/tensor/arithmetic/logical_xor.py +88 -0
  445. maxframe/tensor/arithmetic/lshift.py +82 -0
  446. maxframe/tensor/arithmetic/maximum.py +108 -0
  447. maxframe/tensor/arithmetic/minimum.py +108 -0
  448. maxframe/tensor/arithmetic/mod.py +104 -0
  449. maxframe/tensor/arithmetic/modf.py +83 -0
  450. maxframe/tensor/arithmetic/multiply.py +81 -0
  451. maxframe/tensor/arithmetic/nan_to_num.py +99 -0
  452. maxframe/tensor/arithmetic/negative.py +65 -0
  453. maxframe/tensor/arithmetic/nextafter.py +68 -0
  454. maxframe/tensor/arithmetic/not_equal.py +72 -0
  455. maxframe/tensor/arithmetic/positive.py +47 -0
  456. maxframe/tensor/arithmetic/power.py +106 -0
  457. maxframe/tensor/arithmetic/rad2deg.py +71 -0
  458. maxframe/tensor/arithmetic/radians.py +77 -0
  459. maxframe/tensor/arithmetic/real.py +70 -0
  460. maxframe/tensor/arithmetic/reciprocal.py +76 -0
  461. maxframe/tensor/arithmetic/rint.py +68 -0
  462. maxframe/tensor/arithmetic/rshift.py +81 -0
  463. maxframe/tensor/arithmetic/setimag.py +29 -0
  464. maxframe/tensor/arithmetic/setreal.py +29 -0
  465. maxframe/tensor/arithmetic/sign.py +81 -0
  466. maxframe/tensor/arithmetic/signbit.py +65 -0
  467. maxframe/tensor/arithmetic/sin.py +98 -0
  468. maxframe/tensor/arithmetic/sinc.py +102 -0
  469. maxframe/tensor/arithmetic/sinh.py +93 -0
  470. maxframe/tensor/arithmetic/spacing.py +72 -0
  471. maxframe/tensor/arithmetic/sqrt.py +81 -0
  472. maxframe/tensor/arithmetic/square.py +69 -0
  473. maxframe/tensor/arithmetic/subtract.py +81 -0
  474. maxframe/tensor/arithmetic/tan.py +88 -0
  475. maxframe/tensor/arithmetic/tanh.py +92 -0
  476. maxframe/tensor/arithmetic/tests/__init__.py +15 -0
  477. maxframe/tensor/arithmetic/tests/test_arithmetic.py +414 -0
  478. maxframe/tensor/arithmetic/truediv.py +104 -0
  479. maxframe/tensor/arithmetic/trunc.py +72 -0
  480. maxframe/tensor/arithmetic/utils.py +65 -0
  481. maxframe/tensor/array_utils.py +186 -0
  482. maxframe/tensor/base/__init__.py +34 -0
  483. maxframe/tensor/base/astype.py +119 -0
  484. maxframe/tensor/base/atleast_1d.py +74 -0
  485. maxframe/tensor/base/broadcast_to.py +89 -0
  486. maxframe/tensor/base/ravel.py +92 -0
  487. maxframe/tensor/base/tests/__init__.py +13 -0
  488. maxframe/tensor/base/tests/test_base.py +114 -0
  489. maxframe/tensor/base/transpose.py +125 -0
  490. maxframe/tensor/base/unique.py +205 -0
  491. maxframe/tensor/base/where.py +127 -0
  492. maxframe/tensor/core.py +724 -0
  493. maxframe/tensor/datasource/__init__.py +32 -0
  494. maxframe/tensor/datasource/arange.py +156 -0
  495. maxframe/tensor/datasource/array.py +415 -0
  496. maxframe/tensor/datasource/core.py +109 -0
  497. maxframe/tensor/datasource/empty.py +169 -0
  498. maxframe/tensor/datasource/from_dataframe.py +70 -0
  499. maxframe/tensor/datasource/from_dense.py +54 -0
  500. maxframe/tensor/datasource/from_sparse.py +47 -0
  501. maxframe/tensor/datasource/full.py +186 -0
  502. maxframe/tensor/datasource/ones.py +173 -0
  503. maxframe/tensor/datasource/scalar.py +40 -0
  504. maxframe/tensor/datasource/tests/__init__.py +13 -0
  505. maxframe/tensor/datasource/tests/test_datasource.py +278 -0
  506. maxframe/tensor/datasource/zeros.py +188 -0
  507. maxframe/tensor/fetch/__init__.py +15 -0
  508. maxframe/tensor/fetch/core.py +54 -0
  509. maxframe/tensor/indexing/__init__.py +47 -0
  510. maxframe/tensor/indexing/choose.py +196 -0
  511. maxframe/tensor/indexing/compress.py +124 -0
  512. maxframe/tensor/indexing/core.py +190 -0
  513. maxframe/tensor/indexing/extract.py +71 -0
  514. maxframe/tensor/indexing/fill_diagonal.py +183 -0
  515. maxframe/tensor/indexing/flatnonzero.py +60 -0
  516. maxframe/tensor/indexing/getitem.py +175 -0
  517. maxframe/tensor/indexing/nonzero.py +120 -0
  518. maxframe/tensor/indexing/setitem.py +132 -0
  519. maxframe/tensor/indexing/slice.py +29 -0
  520. maxframe/tensor/indexing/take.py +130 -0
  521. maxframe/tensor/indexing/tests/__init__.py +15 -0
  522. maxframe/tensor/indexing/tests/test_indexing.py +234 -0
  523. maxframe/tensor/indexing/unravel_index.py +103 -0
  524. maxframe/tensor/merge/__init__.py +15 -0
  525. maxframe/tensor/merge/stack.py +132 -0
  526. maxframe/tensor/merge/tests/__init__.py +13 -0
  527. maxframe/tensor/merge/tests/test_merge.py +52 -0
  528. maxframe/tensor/operators.py +123 -0
  529. maxframe/tensor/random/__init__.py +168 -0
  530. maxframe/tensor/random/beta.py +87 -0
  531. maxframe/tensor/random/binomial.py +137 -0
  532. maxframe/tensor/random/bytes.py +39 -0
  533. maxframe/tensor/random/chisquare.py +110 -0
  534. maxframe/tensor/random/choice.py +186 -0
  535. maxframe/tensor/random/core.py +234 -0
  536. maxframe/tensor/random/dirichlet.py +123 -0
  537. maxframe/tensor/random/exponential.py +94 -0
  538. maxframe/tensor/random/f.py +135 -0
  539. maxframe/tensor/random/gamma.py +128 -0
  540. maxframe/tensor/random/geometric.py +93 -0
  541. maxframe/tensor/random/gumbel.py +167 -0
  542. maxframe/tensor/random/hypergeometric.py +148 -0
  543. maxframe/tensor/random/laplace.py +133 -0
  544. maxframe/tensor/random/logistic.py +129 -0
  545. maxframe/tensor/random/lognormal.py +159 -0
  546. maxframe/tensor/random/logseries.py +122 -0
  547. maxframe/tensor/random/multinomial.py +133 -0
  548. maxframe/tensor/random/multivariate_normal.py +192 -0
  549. maxframe/tensor/random/negative_binomial.py +125 -0
  550. maxframe/tensor/random/noncentral_chisquare.py +132 -0
  551. maxframe/tensor/random/noncentral_f.py +126 -0
  552. maxframe/tensor/random/normal.py +143 -0
  553. maxframe/tensor/random/pareto.py +140 -0
  554. maxframe/tensor/random/permutation.py +104 -0
  555. maxframe/tensor/random/poisson.py +111 -0
  556. maxframe/tensor/random/power.py +142 -0
  557. maxframe/tensor/random/rand.py +82 -0
  558. maxframe/tensor/random/randint.py +121 -0
  559. maxframe/tensor/random/randn.py +96 -0
  560. maxframe/tensor/random/random_integers.py +123 -0
  561. maxframe/tensor/random/random_sample.py +86 -0
  562. maxframe/tensor/random/rayleigh.py +110 -0
  563. maxframe/tensor/random/shuffle.py +61 -0
  564. maxframe/tensor/random/standard_cauchy.py +105 -0
  565. maxframe/tensor/random/standard_exponential.py +72 -0
  566. maxframe/tensor/random/standard_gamma.py +120 -0
  567. maxframe/tensor/random/standard_normal.py +74 -0
  568. maxframe/tensor/random/standard_t.py +135 -0
  569. maxframe/tensor/random/tests/__init__.py +15 -0
  570. maxframe/tensor/random/tests/test_random.py +167 -0
  571. maxframe/tensor/random/triangular.py +119 -0
  572. maxframe/tensor/random/uniform.py +131 -0
  573. maxframe/tensor/random/vonmises.py +131 -0
  574. maxframe/tensor/random/wald.py +114 -0
  575. maxframe/tensor/random/weibull.py +140 -0
  576. maxframe/tensor/random/zipf.py +122 -0
  577. maxframe/tensor/rechunk/__init__.py +26 -0
  578. maxframe/tensor/rechunk/rechunk.py +43 -0
  579. maxframe/tensor/reduction/__init__.py +66 -0
  580. maxframe/tensor/reduction/all.py +103 -0
  581. maxframe/tensor/reduction/allclose.py +88 -0
  582. maxframe/tensor/reduction/any.py +105 -0
  583. maxframe/tensor/reduction/argmax.py +103 -0
  584. maxframe/tensor/reduction/argmin.py +103 -0
  585. maxframe/tensor/reduction/array_equal.py +64 -0
  586. maxframe/tensor/reduction/core.py +168 -0
  587. maxframe/tensor/reduction/count_nonzero.py +81 -0
  588. maxframe/tensor/reduction/cumprod.py +97 -0
  589. maxframe/tensor/reduction/cumsum.py +101 -0
  590. maxframe/tensor/reduction/max.py +120 -0
  591. maxframe/tensor/reduction/mean.py +123 -0
  592. maxframe/tensor/reduction/min.py +120 -0
  593. maxframe/tensor/reduction/nanargmax.py +82 -0
  594. maxframe/tensor/reduction/nanargmin.py +76 -0
  595. maxframe/tensor/reduction/nancumprod.py +91 -0
  596. maxframe/tensor/reduction/nancumsum.py +94 -0
  597. maxframe/tensor/reduction/nanmax.py +111 -0
  598. maxframe/tensor/reduction/nanmean.py +106 -0
  599. maxframe/tensor/reduction/nanmin.py +111 -0
  600. maxframe/tensor/reduction/nanprod.py +94 -0
  601. maxframe/tensor/reduction/nanstd.py +126 -0
  602. maxframe/tensor/reduction/nansum.py +115 -0
  603. maxframe/tensor/reduction/nanvar.py +149 -0
  604. maxframe/tensor/reduction/prod.py +130 -0
  605. maxframe/tensor/reduction/std.py +134 -0
  606. maxframe/tensor/reduction/sum.py +125 -0
  607. maxframe/tensor/reduction/tests/__init__.py +13 -0
  608. maxframe/tensor/reduction/tests/test_reduction.py +181 -0
  609. maxframe/tensor/reduction/var.py +176 -0
  610. maxframe/tensor/reshape/__init__.py +17 -0
  611. maxframe/tensor/reshape/reshape.py +188 -0
  612. maxframe/tensor/reshape/tests/__init__.py +15 -0
  613. maxframe/tensor/reshape/tests/test_reshape.py +37 -0
  614. maxframe/tensor/statistics/__init__.py +13 -0
  615. maxframe/tensor/statistics/percentile.py +175 -0
  616. maxframe/tensor/statistics/quantile.py +288 -0
  617. maxframe/tensor/ufunc/__init__.py +26 -0
  618. maxframe/tensor/ufunc/ufunc.py +200 -0
  619. maxframe/tensor/utils.py +718 -0
  620. maxframe/tests/__init__.py +13 -0
  621. maxframe/tests/test_codegen.py +69 -0
  622. maxframe/tests/test_protocol.py +144 -0
  623. maxframe/tests/test_utils.py +376 -0
  624. maxframe/tests/utils.py +164 -0
  625. maxframe/typing_.py +37 -0
  626. maxframe/udf.py +134 -0
  627. maxframe/utils.py +1114 -0
  628. maxframe-0.1.0b5.dist-info/METADATA +104 -0
  629. maxframe-0.1.0b5.dist-info/RECORD +647 -0
  630. maxframe-0.1.0b5.dist-info/WHEEL +5 -0
  631. maxframe-0.1.0b5.dist-info/top_level.txt +3 -0
  632. maxframe_client/__init__.py +17 -0
  633. maxframe_client/clients/__init__.py +13 -0
  634. maxframe_client/clients/framedriver.py +118 -0
  635. maxframe_client/clients/spe.py +104 -0
  636. maxframe_client/conftest.py +15 -0
  637. maxframe_client/fetcher.py +264 -0
  638. maxframe_client/session/__init__.py +22 -0
  639. maxframe_client/session/consts.py +36 -0
  640. maxframe_client/session/graph.py +119 -0
  641. maxframe_client/session/odps.py +482 -0
  642. maxframe_client/session/task.py +280 -0
  643. maxframe_client/session/tests/__init__.py +13 -0
  644. maxframe_client/session/tests/test_task.py +85 -0
  645. maxframe_client/tests/__init__.py +13 -0
  646. maxframe_client/tests/test_fetcher.py +89 -0
  647. maxframe_client/tests/test_session.py +255 -0
@@ -0,0 +1,2417 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import functools
18
+ import operator
19
+ import weakref
20
+ from collections.abc import Iterable
21
+ from io import StringIO
22
+ from typing import Any, Dict, List, Tuple, Union
23
+
24
+ import numpy as np
25
+ import pandas as pd
26
+
27
+ from ..core import (
28
+ ENTITY_TYPE,
29
+ HasShapeTileable,
30
+ HasShapeTileableData,
31
+ OutputType,
32
+ Tileable,
33
+ _ExecuteAndFetchMixin,
34
+ is_build_mode,
35
+ register_output_types,
36
+ )
37
+ from ..core.entity.utils import refresh_tileable_shape
38
+ from ..protocol import DataFrameTableMeta
39
+ from ..serialization.serializables import (
40
+ AnyField,
41
+ BoolField,
42
+ DataTypeField,
43
+ DictField,
44
+ Int32Field,
45
+ IntervalArrayField,
46
+ ListField,
47
+ NDArrayField,
48
+ OneOfField,
49
+ ReferenceField,
50
+ Serializable,
51
+ SeriesField,
52
+ SliceField,
53
+ StringField,
54
+ )
55
+ from ..session import get_default_session
56
+ from ..utils import (
57
+ calc_nsplits,
58
+ ceildiv,
59
+ estimate_pandas_size,
60
+ on_serialize_numpy_type,
61
+ tokenize,
62
+ )
63
+ from .utils import (
64
+ ReprSeries,
65
+ apply_if_callable,
66
+ fetch_corner_data,
67
+ merge_index_value,
68
+ parse_index,
69
+ )
70
+
71
+
72
class IndexValue(Serializable):
    """
    Serializable metadata describing an index.

    The concrete description is one of the nested ``IndexBase`` subclasses and
    is stored in ``_index_value``; the properties defined on this class simply
    delegate to that object. Held by IndexData, SeriesData and DataFrameData.
    """

    __slots__ = ()

    class IndexBase(Serializable):
        # Fields shared by every concrete index description.
        _key = StringField("key")  # to identify if the index is the same
        _is_monotonic_increasing = BoolField("is_monotonic_increasing")
        _is_monotonic_decreasing = BoolField("is_monotonic_decreasing")
        _is_unique = BoolField("is_unique")
        _max_val = AnyField("max_val", on_serialize=on_serialize_numpy_type)
        _max_val_close = BoolField("max_val_close")
        _min_val = AnyField("min_val", on_serialize=on_serialize_numpy_type)
        _min_val_close = BoolField("min_val_close")

        @property
        def is_monotonic_increasing(self):
            return self._is_monotonic_increasing

        @property
        def is_monotonic_decreasing(self):
            return self._is_monotonic_decreasing

        @property
        def is_unique(self):
            return self._is_unique

        @property
        def min_val(self):
            return self._min_val

        @property
        def min_val_close(self):
            return self._min_val_close

        @property
        def max_val(self):
            return self._max_val

        @property
        def max_val_close(self):
            return self._max_val_close

        @property
        def key(self):
            return self._key

        @property
        def inferred_type(self):
            # subclasses override this with a type name string where one applies
            return None

        def to_pandas(self):
            """
            Build a pandas index from the fields declared on the concrete class.

            Fields that also exist on the parent class are skipped, values that
            are ``None`` are dropped, and ``data`` defaults to an empty list.
            The constructor is ``_pd_initializer`` when the class defines one,
            otherwise the attribute of ``pd`` named like the class.
            """
            # NOTE(review): ``super(type(self), self)`` resolves relative to the
            # runtime type, so this presumably assumes every concrete class
            # subclasses IndexBase directly -- confirm before adding deeper
            # inheritance.
            kw = {
                field.tag: getattr(self, attr, None)
                for attr, field in self._FIELDS.items()
                if attr not in super(type(self), self)._FIELDS
            }
            kw = {k: v for k, v in kw.items() if v is not None}
            if kw.get("data") is None:
                kw["data"] = []

            pd_initializer = getattr(self, "_pd_initializer", None)
            if pd_initializer is None:
                pd_initializer = getattr(pd, type(self).__name__)
            return pd_initializer(**kw)

    class Index(IndexBase):
        _name = AnyField("name")
        _data = NDArrayField("data")
        _dtype = DataTypeField("dtype")

    class RangeIndex(IndexBase):
        _name = AnyField("name")
        _slice = SliceField("slice")
        _dtype = DataTypeField("dtype")

        @property
        def slice(self):
            return self._slice

        @property
        def dtype(self):
            # falls back to the C ``int`` dtype when no dtype attribute is found
            return getattr(self, "_dtype", np.dtype(np.intc))

        def to_pandas(self):
            # rebuilt from the stored slice rather than from field introspection
            slc = self._slice
            return pd.RangeIndex(
                slc.start, slc.stop, slc.step, name=getattr(self, "_name", None)
            )

    class CategoricalIndex(IndexBase):
        _name = AnyField("name")
        _data = NDArrayField("data")
        _categories = AnyField("categories")
        _ordered = BoolField("ordered")

        @property
        def inferred_type(self):
            return "categorical"

    class IntervalIndex(IndexBase):
        _name = AnyField("name")
        _data = IntervalArrayField("data")
        _closed = StringField("closed")

        @property
        def inferred_type(self):
            return "interval"

    class DatetimeIndex(IndexBase):
        _name = AnyField("name")
        _data = NDArrayField("data")
        _freq = AnyField("freq")
        _start = AnyField("start")
        _periods = AnyField("periods")
        _end = AnyField("end")
        _closed = AnyField("closed")
        _tz = AnyField("tz")
        _ambiguous = AnyField("ambiguous")
        _dayfirst = BoolField("dayfirst")
        _yearfirst = BoolField("yearfirst")

        @property
        def inferred_type(self):
            return "datetime64"

        @property
        def freq(self):
            return getattr(self, "_freq", None)

    class TimedeltaIndex(IndexBase):
        _name = AnyField("name")
        _data = NDArrayField("data")
        _unit = AnyField("unit")
        _freq = AnyField("freq")
        _start = AnyField("start")
        _periods = AnyField("periods")
        _end = AnyField("end")
        _closed = AnyField("closed")

        @property
        def inferred_type(self):
            return "timedelta64"

    class PeriodIndex(IndexBase):
        _name = AnyField("name")
        _data = NDArrayField("data")
        _freq = AnyField("freq")
        _start = AnyField("start")
        _periods = AnyField("periods")
        _end = AnyField("end")
        _year = AnyField("year")
        _month = AnyField("month")
        _quarter = AnyField("quarter")
        _day = AnyField("day")
        _hour = AnyField("hour")
        _minute = AnyField("minute")
        _second = AnyField("second")
        _tz = AnyField("tz")
        _dtype = DataTypeField("dtype")

        @property
        def inferred_type(self):
            return "period"

    class Int64Index(IndexBase):
        # built through ``pd.Index`` instead of a same-named pandas class
        _pd_initializer = pd.Index

        _name = AnyField("name")
        _data = NDArrayField("data")
        _dtype = DataTypeField("dtype")

        @property
        def inferred_type(self):
            return "integer"

    class UInt64Index(IndexBase):
        # built through ``pd.Index`` instead of a same-named pandas class
        _pd_initializer = pd.Index

        _name = AnyField("name")
        _data = NDArrayField("data")
        _dtype = DataTypeField("dtype")

        @property
        def inferred_type(self):
            return "integer"

    class Float64Index(IndexBase):
        # built through ``pd.Index`` instead of a same-named pandas class
        _pd_initializer = pd.Index

        _name = AnyField("name")
        _data = NDArrayField("data")
        _dtype = DataTypeField("dtype")

        @property
        def inferred_type(self):
            return "floating"

    class MultiIndex(IndexBase):
        _names = ListField("names", on_serialize=list)
        _dtypes = ListField("dtypes", on_serialize=list)
        _data = NDArrayField("data")
        _sortorder = Int32Field("sortorder")

        @property
        def inferred_type(self):
            return "mixed"

        @property
        def names(self) -> list:
            return self._names

        def to_pandas(self):
            """
            Build a ``pd.MultiIndex``.

            Without stored data an empty index is created from one empty array
            per entry of ``_dtypes``; otherwise the stored rows are converted
            to tuples and passed to ``pd.MultiIndex.from_tuples``.
            """
            data = getattr(self, "_data", None)
            sortorder = getattr(self, "_sortorder", None)

            def _build_empty_array(dtype):
                # dtypes numpy rejects are handed to pandas instead
                try:
                    return np.array([], dtype=dtype)
                except TypeError:  # pragma: no cover
                    return pd.array([], dtype=dtype)

            if data is None:
                return pd.MultiIndex.from_arrays(
                    [_build_empty_array(dtype) for dtype in self._dtypes],
                    sortorder=sortorder,
                    names=self._names,
                )
            return pd.MultiIndex.from_tuples(
                [tuple(d) for d in data], sortorder=sortorder, names=self._names
            )

    # exactly one of the descriptions above is stored here
    _index_value = OneOfField(
        "index_value",
        index=Index,
        range_index=RangeIndex,
        categorical_index=CategoricalIndex,
        interval_index=IntervalIndex,
        datetime_index=DatetimeIndex,
        timedelta_index=TimedeltaIndex,
        period_index=PeriodIndex,
        int64_index=Int64Index,
        uint64_index=UInt64Index,
        float64_index=Float64Index,
        multi_index=MultiIndex,
    )

    def __maxframe_tokenize__(self):
        # return object for tokenize
        v = self._index_value
        return v._key

    @property
    def value(self):
        return self._index_value

    @property
    def key(self):
        return self._index_value.key

    @property
    def is_monotonic_increasing(self):
        return self._index_value.is_monotonic_increasing

    @property
    def is_monotonic_decreasing(self):
        return self._index_value.is_monotonic_decreasing

    @property
    def is_monotonic_increasing_or_decreasing(self):
        return self.is_monotonic_increasing or self.is_monotonic_decreasing

    @property
    def is_unique(self):
        return self._index_value.is_unique

    @property
    def min_val(self):
        return self._index_value.min_val

    @property
    def min_val_close(self):
        return self._index_value.min_val_close

    @property
    def max_val(self):
        return self._index_value.max_val

    @property
    def max_val_close(self):
        return self._index_value.max_val_close

    @property
    def min_max(self):
        # (min_val, min_val_close, max_val, max_val_close)
        return (
            self._index_value.min_val,
            self._index_value.min_val_close,
            self._index_value.max_val,
            self._index_value.max_val_close,
        )

    @property
    def name(self):
        return getattr(self._index_value, "_name", None)

    @property
    def names(self):
        # descriptions without ``_names`` report their single name as a list
        return getattr(self._index_value, "_names", [self.name])

    @property
    def inferred_type(self):
        return self._index_value.inferred_type

    def has_value(self):
        """
        Tell whether the description carries concrete index content.

        A RangeIndex counts as having a value unless its ``max_val`` is NaN;
        any other description counts when its ``_data`` is not ``None``.
        """
        if isinstance(self._index_value, self.RangeIndex):
            # NOTE(review): np.isnan raises TypeError for None, so this
            # presumably relies on max_val always being numeric here -- confirm.
            if np.isnan(self._index_value.max_val):
                return False
            else:
                return True
        elif getattr(self._index_value, "_data", None) is not None:
            return True
        return False

    def to_pandas(self):
        return self._index_value.to_pandas()
399
+
400
+
401
class DtypesValue(Serializable):
    """
    Serializable holder pairing a dtypes Series with an identifying key.

    When no key is supplied, one is derived by tokenizing the stored value.
    """

    __slots__ = ()

    _key = StringField("key")
    _value = SeriesField("value")

    def __init__(self, key=None, value=None, **kw):
        super().__init__(_key=key, _value=value, **kw)
        if self._key is not None:
            return
        # no explicit key given: derive it from the dtypes themselves
        self._key = tokenize(self._value)

    @property
    def key(self):
        """Key identifying this set of dtypes."""
        return self._key

    @property
    def value(self):
        """The stored dtypes Series."""
        return self._value
423
+
424
+
425
def refresh_index_value(tileable: ENTITY_TYPE):
    """
    Recompute ``tileable._index_value`` from the index values of its chunks.

    Every chunk of a 1-d tileable contributes; otherwise only chunks whose
    second index component is 0 do. The merged value keeps the key of the
    tileable's previous index value.
    """
    chunk_index_values = {
        chunk.index: chunk.index_value
        for chunk in tileable.chunks
        if chunk.ndim == 1 or chunk.index[1] == 0
    }
    merged = merge_index_value(chunk_index_values, store_data=False)
    # keep key as original index_value's
    merged._index_value._key = tileable.index_value.key
    tileable._index_value = merged
436
+
437
+
438
def refresh_dtypes(tileable: ENTITY_TYPE):
    """
    Recompute dtypes, columns value and dtypes value of ``tileable``.

    The dtypes of all chunks whose first index component is 0 are
    concatenated, and the derived metadata is stored back on the tileable.
    """
    first_row_dtypes = []
    for chunk in tileable.chunks:
        if chunk.index[0] == 0:
            first_row_dtypes.append(chunk.dtypes_value.value)
    merged_dtypes = pd.concat(first_row_dtypes)

    tileable._dtypes = merged_dtypes
    tileable._columns_value = parse_index(merged_dtypes.index, store_data=True)
    tileable._dtypes_value = DtypesValue(
        key=tokenize(merged_dtypes), value=merged_dtypes
    )
445
+
446
+
447
# Attribute names that are grouped into ``_lazy_chunk_meta_properties`` below.
# NOTE(review): presumably these name chunk metadata that is resolved lazily
# from the owning tileable -- confirm against the code consuming this tuple.
_tileable_key_property = "_tileable_key"
_tileable_dtypes_property = "_tileable_dtypes"
_tileable_index_value_property = "_tileable_index_value"
_tileable_columns_value_property = "_tileable_columns_value"
_nsplits_property = "_tileable_nsplits"
_lazy_chunk_meta_properties = (
    _tileable_key_property,
    _tileable_dtypes_property,
    _tileable_index_value_property,
    _tileable_columns_value_property,
    _nsplits_property,
)
459
+
460
+
461
@functools.lru_cache(maxsize=128)
def _get_cum_nsplit(nsplit: Tuple[int]) -> List[int]:
    """
    Return the cumulative offsets of ``nsplit``, starting with 0.

    Results are memoized per ``nsplit`` tuple, so the returned list is shared
    between calls and must not be modified by callers.
    """
    running_totals = np.cumsum(nsplit).tolist()
    return [0, *running_totals]
464
+
465
+
466
def _calc_axis_slice(nsplit: Tuple[int], index: int) -> slice:
    """
    Return the slice covered by chunk number ``index`` along one axis.

    ``nsplit`` is converted to a tuple first when needed so that it can be
    used as a key of the memoized offset computation.
    """
    split_key = nsplit if isinstance(nsplit, tuple) else tuple(nsplit)
    offsets = _get_cum_nsplit(split_key)
    start, stop = offsets[index], offsets[index + 1]
    return slice(start, stop)
471
+
472
+
473
def _on_deserialize_index_value(index_value):
    """
    Deserialization hook for index value fields.

    Returns ``index_value`` unchanged when it exposes a ``value`` attribute;
    returns ``None`` for ``None`` or for objects where reading ``value``
    raises ``AttributeError``.
    """
    if index_value is not None and hasattr(index_value, "value"):
        return index_value
    return None
481
+
482
+
483
class _ToPandasMixin(_ExecuteAndFetchMixin):
    """Mixin exposing ``to_pandas`` on top of ``_execute_and_fetch``."""

    __slots__ = ()

    def to_pandas(self, session=None, **kw):
        """Execute and fetch the result; extra keywords are forwarded as-is."""
        fetched = self._execute_and_fetch(session=session, **kw)
        return fetched
488
+
489
+
490
class _BatchedFetcher:
    """
    Mixin that fetches row-oriented results batch by batch.

    The host class is expected to provide ``shape``, ``op``, ``execute``,
    ``_fetch`` and ``_fetch_infos``.
    """

    __slots__ = ()

    def _fetch_row_range(self, start, stop, session=None, **kw):
        """Fetch the rows selected by ``iloc[start:stop]`` of this object."""
        # imported lazily: only the paths that really slice need iloc
        from .indexing.iloc import iloc

        return iloc(self)[start:stop]._fetch(session=session, **kw)

    def _iter(self, batch_size=None, session=None, **kw):
        """
        Yield the data of this object in batches.

        Parameters
        ----------
        batch_size : int, optional
            Number of rows per batch. When omitted, the first 1000 rows are
            fetched and their estimated size is used to derive a batch size
            targeting about 50 MiB per batch.
        session : optional
            Session forwarded to every fetch.
        **kw
            Extra keywords forwarded to every fetch.
        """
        size = self.shape[0]

        if batch_size is not None:
            n_batch = ceildiv(size, batch_size)

            if n_batch > 1:
                for i in range(n_batch):
                    yield self._fetch_row_range(
                        batch_size * i, batch_size * (i + 1), session=session, **kw
                    )
            else:
                yield self._fetch(session=session, **kw)
            return

        # if batch_size is not specified, use first batch to estimate
        # batch_size.
        default_batch_bytes = 50 * 1024**2
        first_batch = 1000

        if size >= first_batch:
            first_batch_data = self._fetch_row_range(
                None, first_batch, session=session, **kw
            )
            yield first_batch_data

            remaining = size - first_batch
            if remaining <= 0:
                return

            data_size = estimate_pandas_size(first_batch_data)
            if data_size > 0:
                # never let the derived batch size drop to zero rows
                batch_size = max(
                    int(default_batch_bytes / data_size * first_batch), 1
                )
            else:
                # nothing to extrapolate from: fetch the rest in one batch
                batch_size = remaining
            n_batch = ceildiv(remaining, batch_size)
            for i in range(n_batch):
                start = first_batch + batch_size * i
                yield self._fetch_row_range(
                    start, start + batch_size, session=session, **kw
                )
        else:
            yield self._fetch(session=session, **kw)

    def iterbatch(self, batch_size=None, session=None, **kw):
        """
        Execute this object and return an iterator over its batches.

        ``**kw`` is passed to ``execute`` only.

        Raises
        ------
        ValueError
            When called under build mode.
        """
        # stop triggering execution under build mode
        if is_build_mode():
            raise ValueError("Cannot fetch data under build mode")

        # trigger execution
        self.execute(session=session, **kw)
        return self._iter(batch_size=batch_size, session=session)

    def fetch(self, session=None, **kw):
        """
        Fetch the whole result, batch by batch unless this is an iloc result.

        A ``batch_size`` keyword, when present, controls the batching and is
        not forwarded to the underlying fetches.
        """
        from .indexing.iloc import DataFrameIlocGetItem, SeriesIlocGetItem

        batch_size = kw.pop("batch_size", None)
        if isinstance(self.op, (DataFrameIlocGetItem, SeriesIlocGetItem)):
            # see GH#1871
            # already iloc, do not trigger batch fetch
            return self._fetch(session=session, **kw)
        else:
            batches = list(self._iter(batch_size=batch_size, session=session, **kw))
            return pd.concat(batches) if len(batches) > 1 else batches[0]

    def fetch_infos(self, fields=None, session=None, **kw):
        """Delegate to ``_fetch_infos`` with the same arguments."""
        return self._fetch_infos(fields=fields, session=session, **kw)
553
+
554
+
555
class IndexData(HasShapeTileableData, _ToPandasMixin):
    """
    Tileable data of an index: shape, dtype, name(s) and index value metadata.
    """

    __slots__ = ()
    type_name = "Index"

    # optional field
    _dtype = DataTypeField("dtype")
    _name = AnyField("name")
    _names = AnyField("names")
    _index_value = ReferenceField(
        "index_value", IndexValue, on_deserialize=_on_deserialize_index_value
    )

    def __init__(
        self,
        op=None,
        shape=None,
        nsplits=None,
        dtype=None,
        name=None,
        names=None,
        index_value=None,
        **kw,
    ):
        super().__init__(
            _op=op,
            _shape=shape,
            _nsplits=nsplits,
            _dtype=dtype,
            _name=name,
            _names=names,
            _index_value=index_value,
            **kw,
        )

    @property
    def params(self) -> Dict[str, Any]:
        # params return the properties which useful to rebuild a new tileable object
        return {
            "shape": self.shape,
            "dtype": self.dtype,
            "name": self.name,
            "index_value": self.index_value,
        }

    @params.setter
    def params(self, new_params: Dict[str, Any]):
        # entries that are missing or None leave the current value untouched
        params = new_params.copy()
        new_shape = params.pop("shape", None)
        if new_shape is not None:
            self._shape = new_shape
        dtype = params.pop("dtype", None)
        if dtype is not None:
            self._dtype = dtype
        index_value = params.pop("index_value", None)
        if index_value is not None:
            self._index_value = index_value
        name = params.pop("name", None)
        if name is not None:
            self._name = name
        if params:  # pragma: no cover
            raise TypeError(f"Unknown params: {list(params)}")

    def refresh_params(self):
        # refresh params when chunks updated
        refresh_tileable_shape(self)
        refresh_index_value(self)
        if self._dtype is None:
            self._dtype = self.chunks[0].dtype
        if self._name is None:
            self._name = self.chunks[0].name

    def refresh_from_table_meta(self, table_meta: DataFrameTableMeta) -> None:
        pass

    def _to_str(self, representation=False):
        """
        Render this index as text.

        In build mode, or before any execution, a short description of the
        operator is returned; otherwise the data is fetched from the most
        recent executed session and rendered.
        """
        if is_build_mode() or len(self._executed_sessions) == 0:
            # in build mode, or not executed, just return representation
            if representation:
                # closing bracket added so the form matches the series repr
                return f"Index <op={type(self._op).__name__}, key={self.key}>"
            else:
                return f"Index(op={type(self._op).__name__})"
        else:
            data = self.fetch(session=self._executed_sessions[-1])
            return repr(data) or str(data)

    def __str__(self):
        return self._to_str(representation=False)

    def __repr__(self):
        return self._to_str(representation=True)

    def _to_maxframe_tensor(self, dtype=None, order="K", extract_multi_index=False):
        """Convert to a tensor, cast to ``dtype`` (tensor dtype if omitted)."""
        tensor = self.to_tensor(extract_multi_index=extract_multi_index)
        dtype = dtype if dtype is not None else tensor.dtype
        return tensor.astype(dtype=dtype, order=order, copy=False)

    def __maxframe_tensor__(self, dtype=None, order="K"):
        return self._to_maxframe_tensor(dtype=dtype, order=order)

    @property
    def dtype(self):
        return getattr(self, "_dtype", None) or self.op.dtype

    @property
    def name(self):
        return self._name

    @property
    def names(self):
        # a single-level index reports its one name as a list
        return getattr(self, "_names", None) or [self.name]

    @property
    def index_value(self) -> IndexValue:
        return self._index_value

    @property
    def inferred_type(self):
        return self._index_value.inferred_type

    def to_tensor(self, dtype=None, extract_multi_index=False):
        from ..tensor.datasource.from_dataframe import from_index

        return from_index(self, dtype=dtype, extract_multi_index=extract_multi_index)
678
+
679
+
680
class Index(HasShapeTileable, _ToPandasMixin):
    """
    Index object wrapping an ``IndexData``.

    Instantiating with data picks the subclass whose name matches the class
    of the data's index value (see ``__new__``).
    """

    __slots__ = "_df_or_series", "_parent_key", "_axis"
    _allow_data_type_ = (IndexData,)
    type_name = "Index"

    def __new__(cls, data: Union[pd.Index, IndexData] = None, **_):
        if data is not None and not isinstance(data, pd.Index):
            # create corresponding Index class
            # according to type of index_value
            clz = globals()[type(data.index_value.value).__name__]
        else:
            clz = cls
        return object.__new__(clz)

    def __len__(self):
        return len(self._data)

    def __maxframe_tensor__(self, dtype=None, order="K"):
        return self._data.__maxframe_tensor__(dtype=dtype, order=order)

    def _get_df_or_series(self):
        """Return the owning DataFrame / Series if it is still alive."""
        obj = getattr(self, "_df_or_series", None)
        if obj is not None:
            return obj()
        return None

    def _set_df_or_series(self, df_or_series, axis):
        """Remember (weakly) the object and axis this index was taken from."""
        self._df_or_series = weakref.ref(df_or_series)
        self._parent_key = df_or_series.key
        self._axis = axis

    @property
    def T(self):
        """Return the transpose, which is by definition self."""
        return self

    @property
    def name(self):
        return self._data.name

    @name.setter
    def name(self, value):
        df_or_series = self._get_df_or_series()
        # only rename through the owner while it still has the recorded key
        if df_or_series is not None and df_or_series.key == self._parent_key:
            df_or_series.rename_axis(value, axis=self._axis, inplace=True)
            self.data = df_or_series.axes[self._axis].data
        else:
            self.rename(value, inplace=True)

    @property
    def names(self):
        return self._data.names

    @names.setter
    def names(self, value):
        df_or_series = self._get_df_or_series()
        if df_or_series is not None:
            df_or_series.rename_axis(value, axis=self._axis, inplace=True)
            self.data = df_or_series.axes[self._axis].data
        else:
            self.rename(value, inplace=True)

    @property
    def values(self):
        return self.to_tensor()

    def to_frame(self, index: bool = True, name=None):
        """
        Create a DataFrame with a column containing the Index.

        Parameters
        ----------
        index : bool, default True
            Set the index of the returned DataFrame as the original Index.

        name : object, default None
            The passed name should substitute for the index name (if it has
            one). For a MultiIndex this must be a list-like with one entry
            per level.

        Returns
        -------
        DataFrame
            DataFrame containing the original Index data.

        Raises
        ------
        TypeError
            For a MultiIndex, when ``name`` is a string or not iterable.
        ValueError
            For a MultiIndex, when ``name`` does not have one entry per level.

        See Also
        --------
        Index.to_series : Convert an Index to a Series.
        Series.to_frame : Convert Series to DataFrame.

        Examples
        --------
        >>> import maxframe.dataframe as md
        >>> idx = md.Index(['Ant', 'Bear', 'Cow'], name='animal')
        >>> idx.to_frame().execute()
               animal
        animal
        Ant       Ant
        Bear     Bear
        Cow       Cow

        By default, the original Index is reused. To enforce a new Index:

        >>> idx.to_frame(index=False).execute()
          animal
        0    Ant
        1   Bear
        2    Cow

        To override the name of the resulting column, specify `name`:

        >>> idx.to_frame(index=False, name='zoo').execute()
            zoo
        0   Ant
        1  Bear
        2   Cow
        """
        from . import dataframe_from_tensor

        if isinstance(self.index_value.value, IndexValue.MultiIndex):
            old_names = self.index_value.value.names

            if isinstance(name, str) or (
                name is not None and not isinstance(name, Iterable)
            ):
                raise TypeError("'name' must be a list / sequence of column names.")

            name = list(name if name is not None else old_names)
            if len(name) != len(old_names):
                raise ValueError(
                    "'name' should have same length as number of levels on index."
                )

            # ``name`` already falls back to the level names above, so names
            # passed by the caller win; unnamed levels use their position
            columns = [new if new is not None else idx for idx, new in enumerate(name)]
        else:
            if name is not None:
                column = name
            elif self.name is not None:
                column = self.name
            else:
                column = 0
            columns = [column]
        index_ = self if index else None
        return dataframe_from_tensor(
            # no dtype is passed: the tensor keeps its own dtype
            self._data._to_maxframe_tensor(extract_multi_index=True),
            index=index_,
            columns=columns,
        )

    def to_series(self, index=None, name=None):
        """
        Create a Series with both index and values equal to the index keys.

        Useful with map for returning an indexer based on an index.

        Parameters
        ----------
        index : Index, optional
            Index of resulting Series. If None, defaults to original index.
        name : str, optional
            Name of resulting Series. If None, defaults to name of original
            index.

        Returns
        -------
        Series
            The dtype will be based on the type of the Index values.
        """
        from . import series_from_index

        return series_from_index(self, index=index, name=name)
848
+
849
+
850
class RangeIndex(Index):
    # Picked by ``Index.__new__`` (lookup by class name in ``globals()``) when
    # the wrapped data's index value is an ``IndexValue.RangeIndex``.
    __slots__ = ()
852
+
853
+
854
class CategoricalIndex(Index):
    # Picked by ``Index.__new__`` (lookup by class name in ``globals()``) when
    # the wrapped data's index value is an ``IndexValue.CategoricalIndex``.
    __slots__ = ()
856
+
857
+
858
class IntervalIndex(Index):
    # Picked by ``Index.__new__`` (lookup by class name in ``globals()``) when
    # the wrapped data's index value is an ``IndexValue.IntervalIndex``.
    __slots__ = ()
860
+
861
+
862
class DatetimeIndex(Index):
    # Picked by ``Index.__new__`` (lookup by class name in ``globals()``) when
    # the wrapped data's index value is an ``IndexValue.DatetimeIndex``.
    __slots__ = ()
864
+
865
+
866
class TimedeltaIndex(Index):
    # Picked by ``Index.__new__`` (lookup by class name in ``globals()``) when
    # the wrapped data's index value is an ``IndexValue.TimedeltaIndex``.
    __slots__ = ()
868
+
869
+
870
class PeriodIndex(Index):
    # Picked by ``Index.__new__`` (lookup by class name in ``globals()``) when
    # the wrapped data's index value is an ``IndexValue.PeriodIndex``.
    __slots__ = ()
872
+
873
+
874
class Int64Index(Index):
    # Picked by ``Index.__new__`` (lookup by class name in ``globals()``) when
    # the wrapped data's index value is an ``IndexValue.Int64Index``.
    __slots__ = ()
876
+
877
+
878
class UInt64Index(Index):
    # Picked by ``Index.__new__`` (lookup by class name in ``globals()``) when
    # the wrapped data's index value is an ``IndexValue.UInt64Index``.
    __slots__ = ()
880
+
881
+
882
class Float64Index(Index):
    # Picked by ``Index.__new__`` (lookup by class name in ``globals()``) when
    # the wrapped data's index value is an ``IndexValue.Float64Index``.
    __slots__ = ()
884
+
885
+
886
class MultiIndex(Index):
    # Picked by ``Index.__new__`` (lookup by class name in ``globals()``) when
    # the wrapped data's index value is an ``IndexValue.MultiIndex``.
    __slots__ = ()
888
+
889
+
890
class BaseSeriesData(HasShapeTileableData, _ToPandasMixin):
    """
    Shared tileable data for one-dimensional labelled results.

    Stores dtype, name and index value metadata and provides text rendering
    plus conversions from / to tensors.
    """

    __slots__ = "_cache", "_accessors"

    # optional field
    _dtype = DataTypeField("dtype")
    _name = AnyField("name")
    _index_value = ReferenceField(
        "index_value", IndexValue, on_deserialize=_on_deserialize_index_value
    )

    def __init__(
        self,
        op=None,
        shape=None,
        nsplits=None,
        dtype=None,
        name=None,
        index_value=None,
        **kw,
    ):
        super().__init__(
            _op=op,
            _shape=shape,
            _nsplits=nsplits,
            _dtype=dtype,
            _name=name,
            _index_value=index_value,
            **kw,
        )
        self._accessors = {}

    def _get_params(self) -> Dict[str, Any]:
        # the properties needed to rebuild an equivalent tileable object
        return dict(
            shape=self.shape,
            dtype=self.dtype,
            name=self.name,
            index_value=self.index_value,
        )

    def _set_params(self, new_params: Dict[str, Any]):
        # entries that are missing or None leave the current value untouched
        leftover = new_params.copy()
        for param_name, attr_name in (
            ("shape", "_shape"),
            ("dtype", "_dtype"),
            ("index_value", "_index_value"),
            ("name", "_name"),
        ):
            new_value = leftover.pop(param_name, None)
            if new_value is not None:
                setattr(self, attr_name, new_value)
        if leftover:  # pragma: no cover
            raise TypeError(f"Unknown params: {list(leftover)}")

    params = property(_get_params, _set_params)

    def refresh_params(self):
        # called after chunks changed: recompute shape and index value, and
        # fill dtype / name from the first chunk when they are still unset
        refresh_tileable_shape(self)
        refresh_index_value(self)
        first_chunk_needed = self._dtype is None
        if first_chunk_needed:
            self._dtype = self.chunks[0].dtype
        if self._name is None:
            self._name = self.chunks[0].name

    def refresh_from_table_meta(self, table_meta: DataFrameTableMeta) -> None:
        pass

    def _to_str(self, representation=False):
        """
        Render as text: an operator summary before execution (or in build
        mode), otherwise the fetched corner data formatted by pandas.
        """
        if is_build_mode() or len(self._executed_sessions) == 0:
            # nothing to fetch yet, describe the operator only
            if not representation:
                return f"{self.type_name}(op={type(self._op).__name__})"
            return (
                f"{self.type_name} <op={type(self._op).__name__}, key={self.key}>"
            )

        corner_data = fetch_corner_data(self, session=self._executed_sessions[-1])

        buf = StringIO()
        max_rows = pd.get_option("display.max_rows")
        if self.shape[0] <= max_rows or corner_data.shape[0] == 0:
            corner_max_rows = max_rows
        else:
            # make sure max_rows < corner_data
            corner_max_rows = corner_data.shape[0] - 1

        with pd.option_context("display.max_rows", corner_max_rows):
            if self.shape[0] > max_rows:
                corner_series = ReprSeries(corner_data, self.shape)
            else:
                corner_series = corner_data
            rendered = repr(corner_series) if representation else str(corner_series)
            buf.write(rendered)

        return buf.getvalue()

    def __str__(self):
        return self._to_str(representation=False)

    def __repr__(self):
        return self._to_str(representation=True)

    @property
    def dtype(self):
        return getattr(self, "_dtype", None) or getattr(self.op, "dtype", None)

    @property
    def name(self):
        return self._name

    @property
    def index_value(self):
        return self._index_value

    @property
    def index(self):
        from .datasource.index import from_tileable

        return from_tileable(self)

    @property
    def axes(self):
        return [self.index]

    @property
    def empty(self):
        # an unknown (NaN) shape cannot be judged before execution
        current_shape = getattr(self, "_shape")
        if not np.any(np.isnan(current_shape)):
            return current_shape == (0,)
        raise ValueError("Tileable object must be executed first")

    def to_tensor(self, dtype=None):
        from ..tensor.datasource.from_dataframe import from_series

        return from_series(self, dtype=dtype)

    @staticmethod
    def from_tensor(in_tensor, index=None, name=None):
        from .datasource.from_tensor import series_from_tensor

        return series_from_tensor(in_tensor, index=index, name=name)
1035
+
1036
+
1037
class SeriesData(_BatchedFetcher, BaseSeriesData):
    """Series tileable data with batched fetching and item iteration."""

    type_name = "Series"

    def __maxframe_tensor__(self, dtype=None, order="K"):
        """Convert to a tensor, cast to ``dtype`` (tensor dtype if omitted)."""
        tensor = self.to_tensor()
        dtype = dtype if dtype is not None else tensor.dtype
        return tensor.astype(dtype=dtype, order=order, copy=False)

    def iteritems(self, batch_size=10000, session=None):
        """
        Lazily iterate over ``(index, value)`` pairs, fetching data in
        batches of ``batch_size`` rows.
        """
        for batch_data in self.iterbatch(batch_size=batch_size, session=session):
            # pandas 2.0 removed ``iteritems``; ``items`` exists on old and
            # new pandas versions alike
            yield from batch_data.items()

    items = iteritems

    def to_dict(self, into=dict, batch_size=10000, session=None):
        """Fetch the series and convert it with pandas' ``to_dict``."""
        fetch_kwargs = dict(batch_size=batch_size)
        return self.to_pandas(session=session, fetch_kwargs=fetch_kwargs).to_dict(
            into=into
        )
1056
+
1057
+
1058
+ class Series(HasShapeTileable, _ToPandasMixin):
1059
+ __slots__ = ("_cache",)
1060
+ _allow_data_type_ = (SeriesData,)
1061
+ type_name = "Series"
1062
+
1063
def to_tensor(self, dtype=None):
    """Convert the underlying series data to a tensor."""
    series_data = self._data
    return series_data.to_tensor(dtype=dtype)
1065
+
1066
def from_tensor(self, in_tensor, index=None, name=None):
    """Build a series from ``in_tensor`` via the underlying data class."""
    series_data = self._data
    return series_data.from_tensor(in_tensor, index=index, name=name)
1068
+
1069
@property
def T(self):
    """Transpose of the series, which is the series itself."""
    transposed = self
    return transposed
1073
+
1074
@property
def ndim(self):
    """
    Number of axes / array dimensions, as an int.

    The value is 1 for a Series and 2 for a DataFrame.

    See Also
    --------
    ndarray.ndim : Number of array dimensions.

    Examples
    --------
    >>> import maxframe.dataframe as md
    >>> s = md.Series({'a': 1, 'b': 2, 'c': 3})
    >>> s.ndim.execute()
    1

    >>> df = md.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
    >>> df.ndim.execute()
    2
    """
    n_dimensions = super().ndim
    return n_dimensions
1097
+
1098
@property
def index(self):
    """
    The index (axis labels) of the Series.

    The returned index is linked back to this series on axis 0.
    """
    series_index = self._data.index
    series_index._set_df_or_series(self, 0)
    return series_index
1106
+
1107
@index.setter
def index(self, new_index):
    # assigning the index is an in-place ``set_axis`` on axis 0
    self.set_axis(new_index, inplace=True, axis=0)
1110
+
1111
@property
def name(self):
    """Name of the series, read from the underlying data."""
    series_data = self._data
    return series_data.name
1114
+
1115
@name.setter
def name(self, val):
    from .indexing.rename import DataFrameRename

    # renaming builds a new series through a rename operator and adopts
    # its data in place
    rename_op = DataFrameRename(new_name=val, output_types=[OutputType.series])
    renamed = rename_op(self)
    self.data = renamed.data
1122
+
1123
@property
def dtype(self):
    """Return the dtype object of the underlying data."""
    series_data = self._data
    return series_data.dtype
1129
+
1130
def copy(self, deep=True):  # pylint: disable=arguments-differ
    """
    Make a copy of this object's indices and data.

    With ``deep=True`` (the default) a new object is created with a copy
    of the calling object's data and indices; modifications to the copy
    are not reflected in the original.

    With ``deep=False`` a new object is created without copying the data
    or index (only references are copied), so changes to the data of the
    original are reflected in the shallow copy and vice versa.

    Parameters
    ----------
    deep : bool, default True
        Make a deep copy, including a copy of the data and the indices.
        With ``deep=False`` neither the indices nor the data are copied.

    Returns
    -------
    copy : Series or DataFrame
        Object type matches caller.
    """
    if not deep:
        # Shallow copy: the parent class provides a view sharing the data.
        return super()._view()
    return super().copy()
1159
+
1160
def __len__(self):
    """Return the length reported by the underlying data object."""
    series_data = self._data
    return len(series_data)
1162
+
1163
def __maxframe_tensor__(self, dtype=None, order="K"):
    """Tensor-conversion hook; forwards ``dtype`` and ``order`` to the data object."""
    series_data = self._data
    return series_data.__maxframe_tensor__(dtype=dtype, order=order)
1165
+
1166
def keys(self):
    """
    Return alias for index.

    Returns
    -------
    Index
        Index of the Series.
    """
    series_index = self.index
    return series_index
1176
+
1177
@property
def values(self):
    """Tensor form of the Series; equivalent to calling ``to_tensor()``."""
    as_tensor = self.to_tensor()
    return as_tensor
1180
+
1181
def iteritems(self, batch_size=10000, session=None):
    """
    Lazily iterate over (index, value) tuples.

    The iteration itself is performed by the underlying data object;
    ``batch_size`` and ``session`` are forwarded to it unchanged.

    Parameters
    ----------
    batch_size : int, default 10000
        Batch size forwarded to the data object's ``iteritems``.
    session : optional
        Session forwarded to the data object's ``iteritems``.

    Returns
    -------
    iterable
        Iterable of tuples containing the (index, value) pairs from a
        Series.

    See Also
    --------
    DataFrame.items : Iterate over (column name, Series) pairs.
    DataFrame.iterrows : Iterate over DataFrame rows as (index, Series) pairs.

    Examples
    --------
    >>> import maxframe.dataframe as md
    >>> s = md.Series(['A', 'B', 'C'])
    >>> for index, value in s.items():
    ...     print(f"Index : {index}, Value : {value}")
    Index : 0, Value : A
    Index : 1, Value : B
    Index : 2, Value : C
    """
    series_data = self._data
    return series_data.iteritems(batch_size=batch_size, session=session)
1210
+
1211
+ items = iteritems
1212
+
1213
def to_dict(self, into=dict, batch_size=10000, session=None):
    """
    Convert Series to {label -> value} dict or dict-like object.

    Parameters
    ----------
    into : class, default dict
        The collections.abc.Mapping subclass to use as the return
        object. Can be the actual class or an empty instance of the
        mapping type you want. If you want a collections.defaultdict,
        you must pass it initialized.
    batch_size : int, default 10000
        Forwarded to the data object's ``to_dict``.
    session : optional
        Forwarded to the data object's ``to_dict``.

    Returns
    -------
    collections.abc.Mapping
        Key-value representation of Series.

    Examples
    --------
    >>> import maxframe.dataframe as md
    >>> s = md.Series([1, 2, 3, 4])
    >>> s.to_dict()
    {0: 1, 1: 2, 2: 3, 3: 4}
    >>> from collections import OrderedDict, defaultdict
    >>> s.to_dict(OrderedDict)
    OrderedDict([(0, 1), (1, 2), (2, 3), (3, 4)])
    >>> dd = defaultdict(list)
    >>> s.to_dict(dd)
    defaultdict(<class 'list'>, {0: 1, 1: 2, 2: 3, 3: 4})
    """
    series_data = self._data
    return series_data.to_dict(into=into, batch_size=batch_size, session=session)
1244
+
1245
def to_frame(self, name=None):
    """
    Convert Series to DataFrame.

    Parameters
    ----------
    name : object, default None
        The passed name should substitute for the series name (if it has
        one). When ``None``, the Series' own name is used; when that is
        ``None`` as well, the single column is labelled ``0``.

    Returns
    -------
    DataFrame
        DataFrame representation of Series.

    Examples
    --------
    >>> import maxframe.dataframe as md
    >>> s = md.Series(["a", "b", "c"], name="vals")
    >>> s.to_frame().execute()
      vals
    0    a
    1    b
    2    c
    """
    from . import dataframe_from_tensor

    # Compare against None explicitly: chaining with ``or`` would discard
    # legitimate falsy labels such as ``0`` or ``""`` and silently replace
    # them with the next fallback.
    if name is None:
        name = self.name
    if name is None:
        name = 0
    return dataframe_from_tensor(self, columns=[name])
1274
+
1275
def between(self, left, right, inclusive="both"):
    """
    Return boolean Series equivalent to left <= series <= right.

    The result contains `True` wherever the corresponding Series element
    lies between the boundary values `left` and `right`. NA values are
    treated as `False`.

    Parameters
    ----------
    left : scalar or list-like
        Left boundary.
    right : scalar or list-like
        Right boundary.
    inclusive : {"both", "neither", "left", "right"}
        Include boundaries. Whether to set each bound as closed or open.
        A bool is also accepted: ``True`` means ``"both"`` and ``False``
        means ``"neither"``.

    Returns
    -------
    Series
        Series representing whether each element is between left and
        right (inclusive).

    Raises
    ------
    ValueError
        If `inclusive` is not one of the accepted values.

    See Also
    --------
    Series.gt : Greater than of series and other.
    Series.lt : Less than of series and other.

    Notes
    -----
    With the default `inclusive`, this function is equivalent to
    ``(left <= ser) & (ser <= right)``

    Examples
    --------
    >>> import maxframe.dataframe as md
    >>> s = md.Series([2, 0, 4, 8, np.nan])

    Boundary values are included by default:

    >>> s.between(1, 4).execute()
    0     True
    1    False
    2     True
    3    False
    4    False
    dtype: bool

    With `inclusive` set to ``"neither"`` boundary values are excluded:

    >>> s.between(1, 4, inclusive="neither").execute()
    0     True
    1    False
    2    False
    3    False
    4    False
    dtype: bool

    `left` and `right` can be any scalar value:

    >>> s = md.Series(['Alice', 'Bob', 'Carol', 'Eve'])
    >>> s.between('Anna', 'Daniel').execute()
    0    False
    1     True
    2     True
    3    False
    dtype: bool
    """
    if isinstance(inclusive, bool):  # pragma: no cover
        # for pandas < 1.3.0
        inclusive = "both" if inclusive else "neither"

    # (mode, comparison against ``left``, comparison against ``right``),
    # checked in the same order as the accepted values are documented.
    bound_rules = (
        ("both", operator.ge, operator.le),
        ("left", operator.ge, operator.lt),
        ("right", operator.gt, operator.le),
        ("neither", operator.gt, operator.lt),
    )
    for mode, lower_check, upper_check in bound_rules:
        if inclusive == mode:
            lower_mask = lower_check(self, left)
            upper_mask = upper_check(self, right)
            return lower_mask & upper_mask

    raise ValueError(
        "Inclusive has to be either string of 'both',"
        "'left', 'right', or 'neither'."
    )
1366
+
1367
+ # def median(
1368
+ # self, axis=None, skipna=True, out=None, overwrite_input=False, keepdims=False
1369
+ # ):
1370
+ # """
1371
+ # Return the median of the values over the requested axis.
1372
+ #
1373
+ # Parameters
1374
+ # ----------
1375
+ # axis : {index (0)}
1376
+ # Axis or axes along which the medians are computed. The default
1377
+ # is to compute the median along a flattened version of the tensor.
1378
+ # A sequence of axes is supported since version 1.9.0.
1379
+ # skipna : bool, optional, default True
1380
+ # Exclude NA/null values when computing the result.
1381
+ # out : Tensor, default None
1382
+ # Output tensor in which to place the result. It must
1383
+ # have the same shape and buffer length as the expected output,
1384
+ # but the type (of the output) will be cast if necessary.
1385
+ # overwrite_input : bool, default False
1386
+ # Just for compatibility with Numpy, would not take effect.
1387
+ # keepdims : bool, default False
1388
+ # If this is set to True, the axes which are reduced are left
1389
+ # in the result as dimensions with size one. With this option,
1390
+ # the result will broadcast correctly against the original `arr`.
1391
+ #
1392
+ # Returns
1393
+ # -------
1394
+ # median : scalar
1395
+ # Return the median of the values over the requested axis.
1396
+ #
1397
+ # See Also
1398
+ # --------
1399
+ # tensor.mean, tensor.percentile
1400
+ #
1401
+ # Notes
1402
+ # -----
1403
+ # Given a vector ``V`` of length ``N``, the median of ``V`` is the
1404
+ # middle value of a sorted copy of ``V``, ``V_sorted`` - i
1405
+ # e., ``V_sorted[(N-1)/2]``, when ``N`` is odd, and the average of the
1406
+ # two middle values of ``V_sorted`` when ``N`` is even.
1407
+ #
1408
+ # Examples
1409
+ # --------
1410
+ # >>> import maxframe.dataframe as md
1411
+ # >>> a = md.Series([10, 7, 4, 3, 2, 1])
1412
+ # >>> a.median().execute()
1413
+ # 2.0
1414
+ # >>> mt.median(a).execute()
1415
+ # 3.5
1416
+ # >>> a = md.Series([10, 7, 4, None, 2, 1])
1417
+ # >>> a.median().execute()
1418
+ # 4.0
1419
+ # >>> a.median(skipna=False).execute()
1420
+ # nan
1421
+ # """
1422
+ # if skipna:
1423
+ # return statistics.median(
1424
+ # self.dropna(),
1425
+ # axis=None,
1426
+ # out=None,
1427
+ # overwrite_input=False,
1428
+ # keepdims=False,
1429
+ # )
1430
+ # else:
1431
+ # return statistics.median(
1432
+ # self, axis=None, out=None, overwrite_input=False, keepdims=False
1433
+ # )
1434
+
1435
+
1436
class BaseDataFrameData(HasShapeTileableData, _ToPandasMixin):
    """
    Shared metadata holder for DataFrame-like tileable data.

    On top of what the base class stores, it keeps the column dtypes, the
    index value and the columns value, and lazily caches two derived
    objects: ``dtypes_value`` and a per-column-label dtype dictionary.
    """

    __slots__ = "_accessors", "_dtypes_value", "_dtypes_dict"

    # optional fields
    _dtypes = SeriesField("dtypes")
    _index_value = ReferenceField(
        "index_value", IndexValue, on_deserialize=_on_deserialize_index_value
    )
    _columns_value = ReferenceField("columns_value", IndexValue)

    def __init__(
        self,
        op=None,
        shape=None,
        nsplits=None,
        dtypes=None,
        index_value=None,
        columns_value=None,
        **kw,
    ):
        """Store the given metadata and reset the non-serialized caches."""
        super().__init__(
            _op=op,
            _shape=shape,
            _nsplits=nsplits,
            _dtypes=dtypes,
            _index_value=index_value,
            _columns_value=columns_value,
            **kw,
        )
        self._accessors = dict()
        self._dtypes_value = None
        self._dtypes_dict = None

    def __on_deserialize__(self):
        """Re-create the slot-only caches, which are not serialized fields."""
        super().__on_deserialize__()
        self._accessors = dict()
        self._dtypes_value = None
        self._dtypes_dict = None

    def _get_params(self) -> Dict[str, Any]:
        # params return the properties which useful to rebuild a new tileable object
        return {
            "shape": self.shape,
            "dtypes": self.dtypes,
            "index_value": self.index_value,
            "columns_value": self.columns_value,
            "dtypes_value": self.dtypes_value,
        }

    def _set_params(self, new_params: Dict[str, Any]):
        """
        Update metadata from ``new_params``; entries that are ``None`` are ignored.

        Raises
        ------
        TypeError
            If ``new_params`` contains keys other than the supported ones.
        """
        params = new_params.copy()
        new_shape = params.pop("shape", None)
        if new_shape is not None:
            self._shape = new_shape
        index_value = params.pop("index_value", None)
        if index_value is not None:
            self._index_value = index_value
        dtypes = params.pop("dtypes", None)
        if dtypes is not None:
            self._dtypes = dtypes
        columns_value = params.pop("columns_value", None)
        if columns_value is not None:
            self._columns_value = columns_value
        dtypes_value = params.pop("dtypes_value", None)
        if dtypes_value is not None:
            # ``dtypes_value`` alone is enough to restore dtypes and the
            # columns value when those were not passed explicitly.
            if dtypes is None:
                self._dtypes = dtypes_value.value
            if columns_value is None:
                self._columns_value = parse_index(self._dtypes.index, store_data=True)
            self._dtypes_value = dtypes_value
        if params:  # pragma: no cover
            raise TypeError(f"Unknown params: {list(params)}")

    params = property(_get_params, _set_params)

    def refresh_params(self):
        # refresh params when chunks updated
        refresh_tileable_shape(self)
        refresh_index_value(self)
        refresh_dtypes(self)

    def refresh_from_table_meta(self, table_meta: DataFrameTableMeta) -> None:
        """
        Refresh dtypes, columns value, dtypes value and shape from ``table_meta``.

        Only the column-related metadata is taken from
        ``table_meta.pd_column_dtypes``; the other shape entries are kept.
        """
        dtypes = table_meta.pd_column_dtypes
        self._dtypes = dtypes
        self._columns_value = parse_index(dtypes.index, store_data=True)
        self._dtypes_value = DtypesValue(key=tokenize(dtypes), value=dtypes)
        new_shape = list(self._shape)
        # ``dtypes`` has one entry per column, so its length belongs on the
        # column (last) axis; writing it to axis 0 would overwrite the row
        # count with the number of columns.
        new_shape[-1] = len(dtypes)
        self._shape = tuple(new_shape)

    @property
    def dtypes(self):
        """Column dtypes; falls back to ``self.op.dtypes`` when not stored here."""
        dt = getattr(self, "_dtypes", None)
        if dt is not None:
            return dt
        return getattr(self.op, "dtypes", None)

    @property
    def dtypes_value(self):
        """Cached ``DtypesValue`` built from ``dtypes``; ``None`` if dtypes are unknown."""
        if self._dtypes_value is not None:
            return self._dtypes_value
        # TODO(qinxuye): when creating Dataframe,
        # dtypes_value instead of dtypes later must be passed into
        dtypes = self.dtypes
        if dtypes is not None:
            self._dtypes_value = DtypesValue(key=tokenize(dtypes), value=dtypes)
            return self._dtypes_value

    @property
    def index_value(self):
        return self._index_value

    @property
    def columns_value(self):
        return self._columns_value

    @property
    def empty(self):
        """
        Whether any axis has length 0.

        Raises
        ------
        ValueError
            If the shape still contains NaN entries.
        """
        shape = getattr(self, "_shape")
        if np.any(np.isnan(shape)):
            raise ValueError("Tileable object must be executed first")
        return 0 in shape

    def to_tensor(self, dtype=None):
        from ..tensor.datasource.from_dataframe import from_dataframe

        return from_dataframe(self, dtype=dtype)

    @staticmethod
    def from_tensor(in_tensor, index=None, columns=None):
        from .datasource.from_tensor import dataframe_from_tensor

        return dataframe_from_tensor(in_tensor, index=index, columns=columns)

    @staticmethod
    def from_records(records, **kw):
        from .datasource.from_records import from_records

        return from_records(records, **kw)

    @property
    def index(self):
        from .datasource.index import from_tileable

        return from_tileable(self)

    @property
    def columns(self):
        from .datasource.index import from_pandas as from_pandas_index

        return from_pandas_index(self.dtypes.index, store_data=True)

    @property
    def axes(self):
        return [self.index, self.columns]

    def _get_dtypes_dict(self):
        """
        Return the cached mapping ``column label -> list of dtypes``.

        A list is used per label because the same label may occur more
        than once in ``dtypes``.
        """
        if self._dtypes_dict is None:
            # Build locally and publish only once complete, so a failure
            # while iterating does not leave an empty dict cached.
            grouped = dict()
            for label, col_dtype in self.dtypes.items():
                grouped.setdefault(label, []).append(col_dtype)
            self._dtypes_dict = grouped
        return self._dtypes_dict

    def _get_dtypes_by_columns(self, columns: list):
        """Return the dtypes of ``columns`` as one flat list, in the given order."""
        dtypes_dict = self._get_dtypes_dict()
        # A flat comprehension avoids the repeated list copies of a
        # ``reduce(operator.add, ...)`` concatenation.
        return [col_dtype for c in columns for col_dtype in dtypes_dict[c]]

    def _get_columns_by_columns(self, columns: list):
        """Return each label of ``columns`` repeated once per dtype stored for it."""
        dtypes_dict = self._get_dtypes_dict()
        return [c for c in columns for _ in dtypes_dict[c]]
1612
+
1613
+
1614
class DataFrameData(_BatchedFetcher, BaseDataFrameData):
    """DataFrame data object with text/HTML rendering and batched iteration."""

    type_name = "DataFrame"

    def _to_str(self, representation=False):
        """
        Render the object as text.

        In build mode, or when it was never executed, a short placeholder
        is returned. Otherwise the corner data of the most recent executed
        session is fetched and rendered with ``repr`` (``representation``
        true) or ``str``.
        """
        if is_build_mode() or len(self._executed_sessions) == 0:
            # in build mode, or not executed, just return representation
            if representation:
                return (
                    f"{self.type_name} <op={type(self._op).__name__}, key={self.key}>"
                )
            return f"{self.type_name}(op={type(self._op).__name__})"

        corner_data = fetch_corner_data(self, session=self._executed_sessions[-1])
        render = repr if representation else str

        buf = StringIO()
        max_rows = pd.get_option("display.max_rows")
        # ``display.max_rows`` may be None, which pandas treats as "no
        # limit"; comparing an int against None would raise TypeError.
        fits_in_display = max_rows is None or self.shape[0] <= max_rows

        if fits_in_display or corner_data.shape[0] == 0:
            buf.write(render(corner_data))
        else:
            # remember we cannot directly call repr(df),
            # because the [... rows x ... columns] may show wrong rows
            with pd.option_context(
                "display.show_dimensions",
                False,
                "display.max_rows",
                corner_data.shape[0] - 1,
            ):
                buf.write(render(corner_data))
            if pd.get_option("display.show_dimensions"):
                n_rows, n_cols = self.shape
                buf.write(f"\n\n[{n_rows} rows x {n_cols} columns]")

        return buf.getvalue()

    def __str__(self):
        return self._to_str(representation=False)

    def __repr__(self):
        return self._to_str(representation=True)

    def __maxframe_tensor__(self, dtype=None, order="K"):
        return self.to_tensor().astype(dtype=dtype, order=order, copy=False)

    def _repr_html_(self):
        """
        Render the object as HTML from the fetched corner data.

        Raises
        ------
        NotImplementedError
            If the object has not been executed in any session.
        """
        if len(self._executed_sessions) == 0:
            # not executed before, fall back to normal repr
            raise NotImplementedError

        corner_data = fetch_corner_data(self, session=self._executed_sessions[-1])

        buf = StringIO()
        max_rows = pd.get_option("display.max_rows")
        # Same guards as in ``_to_str``: None means unlimited rows, and
        # empty corner data must not lead to ``display.max_rows = -1``.
        fits_in_display = max_rows is None or self.shape[0] <= max_rows
        if fits_in_display or corner_data.shape[0] == 0:
            buf.write(corner_data._repr_html_())
        else:
            closing_tag = "</div>"
            with pd.option_context(
                "display.show_dimensions",
                False,
                "display.max_rows",
                corner_data.shape[0] - 1,
            ):
                html = corner_data._repr_html_().rstrip()
                # Drop exactly one trailing closing tag. ``str.rstrip``
                # with an argument strips a *set of characters*, which
                # could eat into preceding markup.
                if html.endswith(closing_tag):
                    html = html[: -len(closing_tag)]
                buf.write(html)
            if pd.get_option("display.show_dimensions"):
                n_rows, n_cols = self.shape
                buf.write(f"<p>{n_rows} rows × {n_cols} columns</p>\n")
            buf.write(closing_tag)

        return buf.getvalue()

    def items(self):
        """Yield ``(column name, self[column name])`` for every entry of the dtypes index."""
        for col_name in self.dtypes.index:
            yield col_name, self[col_name]

    iteritems = items

    def iterrows(self, batch_size=1000, session=None):
        """Yield the ``iterrows()`` results of every batch from ``iterbatch``."""
        for batch_data in self.iterbatch(batch_size=batch_size, session=session):
            yield from batch_data.iterrows()

    def itertuples(self, index=True, name="Pandas", batch_size=1000, session=None):
        """Yield the ``itertuples()`` results of every batch from ``iterbatch``."""
        for batch_data in self.iterbatch(batch_size=batch_size, session=session):
            yield from batch_data.itertuples(index=index, name=name)

    def _need_execution(self):
        # Execution is required exactly when no dtypes are stored yet.
        return self._dtypes is None
1707
+
1708
+
1709
+ class DataFrame(HasShapeTileable, _ToPandasMixin):
1710
+ __slots__ = ("_cache",)
1711
+ _allow_data_type_ = (DataFrameData,)
1712
+ type_name = "DataFrame"
1713
+
1714
def __len__(self):
    """Return the length reported by the underlying data object."""
    frame_data = self._data
    return len(frame_data)
1716
+
1717
def to_tensor(self, dtype=None):
    """
    Convert the DataFrame to a tensor.

    Parameters
    ----------
    dtype : optional
        Forwarded to the underlying data object's ``to_tensor``. Accepted
        for parity with ``Series.to_tensor`` and the data-level
        ``to_tensor``, which both take it; the default ``None`` keeps the
        previous behaviour.

    Returns
    -------
    The tensor produced by the underlying data object.
    """
    return self._data.to_tensor(dtype=dtype)
1719
+
1720
def from_tensor(self, in_tensor, index=None, columns=None):
    """Build a DataFrame from ``in_tensor`` via the data object's ``from_tensor``."""
    frame_data = self._data
    return frame_data.from_tensor(in_tensor, index=index, columns=columns)
1722
+
1723
def from_records(self, records, **kw):
    """Build a DataFrame from ``records`` via the data object's ``from_records``."""
    frame_data = self._data
    return frame_data.from_records(records, **kw)
1725
+
1726
def __maxframe_tensor__(self, dtype=None, order="K"):
    """Tensor-conversion hook; forwards ``dtype`` and ``order`` to the data object."""
    frame_data = self._data
    return frame_data.__maxframe_tensor__(dtype=dtype, order=order)
1728
+
1729
def __getattr__(self, key):
    """
    Resolve attributes missing on the DataFrame itself.

    The underlying data object is consulted first. If it lacks the
    attribute and ``key`` is a column label, the column ``self[key]`` is
    returned; otherwise the original ``AttributeError`` is re-raised.
    """
    try:
        return getattr(self._data, key)
    except AttributeError:
        column_dtypes = self.dtypes
        # dtypes may still be None; a membership test on None would raise
        # TypeError and break ``hasattr`` / ``getattr(obj, key, default)``,
        # which only expect AttributeError.
        if column_dtypes is not None and key in column_dtypes:
            return self[key]
        raise
1737
+
1738
def __dir__(self):
    """
    List attribute names, including columns usable as attributes.

    Only column labels that are strings and valid Python identifiers are
    added to the names reported by the parent class.
    """
    names = list(super().__dir__())
    column_dtypes = self.dtypes
    # dtypes may still be None; ``dir()`` should then report the regular
    # attributes rather than fail.
    if column_dtypes is not None:
        names.extend(
            k for k in column_dtypes.index if isinstance(k, str) and k.isidentifier()
        )
    return sorted(names)
1744
+
1745
@property
def T(self):
    """Shorthand for ``self.transpose()``."""
    transposed = self.transpose()
    return transposed
1748
+
1749
@property
def ndim(self):
    """
    Return an int representing the number of axes / array dimensions.

    The value is 1 for a Series and 2 for a DataFrame.

    See Also
    --------
    ndarray.ndim : Number of array dimensions.

    Examples
    --------
    >>> import maxframe.dataframe as md
    >>> s = md.Series({'a': 1, 'b': 2, 'c': 3})
    >>> s.ndim
    1

    >>> df = md.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
    >>> df.ndim
    2
    """
    # Re-declared here only to attach the docstring; the value itself comes
    # from the parent class.
    dimension_count = super().ndim
    return dimension_count
1772
+
1773
@property
def index(self):
    """
    The index (row labels) of the DataFrame.

    The index object obtained from the underlying data is linked back to
    this DataFrame on axis 0 before it is returned.
    """
    row_index = self._data.index
    row_index._set_df_or_series(self, 0)
    return row_index


@index.setter
def index(self, new_index):
    # Assigning to ``.index`` is an in-place ``set_axis`` on axis 0.
    self.set_axis(new_index, axis=0, inplace=True)
1782
+
1783
@property
def columns(self):
    """
    The column labels of the DataFrame.

    The columns object obtained from the underlying data is linked back
    to this DataFrame on axis 1 before it is returned.
    """
    column_index = self._data.columns
    column_index._set_df_or_series(self, 1)
    return column_index


@columns.setter
def columns(self, new_columns):
    # Assigning to ``.columns`` is an in-place ``set_axis`` on axis 1.
    self.set_axis(new_columns, axis=1, inplace=True)
1792
+
1793
def keys(self):
    """
    Get the 'info axis' (see Indexing for more).

    This is index for Series, columns for DataFrame.

    Returns
    -------
    Index
        Info axis.
    """
    info_axis = self.columns
    return info_axis
1805
+
1806
@property
def values(self):
    """Tensor form of the DataFrame; equivalent to calling ``to_tensor()``."""
    as_tensor = self.to_tensor()
    return as_tensor
1809
+
1810
@property
def dtypes(self):
    """
    Return the dtypes in the DataFrame.

    This returns a Series with the data type of each column. The
    result's index is the original DataFrame's columns. Columns with
    mixed types are stored with the ``object`` dtype. See
    :ref:`the User Guide <basics.dtypes>` for more.

    Returns
    -------
    pandas.Series
        The data type of each column.

    Examples
    --------
    >>> import maxframe.dataframe as md
    >>> df = md.DataFrame({'float': [1.0],
    ...                    'int': [1],
    ...                    'datetime': [md.Timestamp('20180310')],
    ...                    'string': ['foo']})
    >>> df.dtypes
    float              float64
    int                  int64
    datetime    datetime64[ns]
    string              object
    dtype: object
    """
    frame_data = self._data
    return frame_data.dtypes
1840
+
1841
def iterrows(self, batch_size=1000, session=None):
    """
    Iterate over DataFrame rows as (index, Series) pairs.

    Parameters
    ----------
    batch_size : int, default 1000
        Forwarded to the data object's ``iterrows``.
    session : optional
        Forwarded to the data object's ``iterrows``.

    Yields
    ------
    index : label or tuple of label
        The index of the row. A tuple for a `MultiIndex`.
    data : Series
        The data of the row as a Series.

    it : generator
        A generator that iterates over the rows of the frame.

    See Also
    --------
    DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values.
    DataFrame.items : Iterate over (column name, Series) pairs.

    Notes
    -----

    1. Because ``iterrows`` returns a Series for each row,
       it does **not** preserve dtypes across the rows (dtypes are
       preserved across columns for DataFrames). For example,

       >>> import maxframe.dataframe as md
       >>> df = md.DataFrame([[1, 1.5]], columns=['int', 'float'])
       >>> row = next(df.iterrows())[1]
       >>> row
       int      1.0
       float    1.5
       Name: 0, dtype: float64
       >>> print(row['int'].dtype)
       float64
       >>> print(df['int'].dtype)
       int64

       To preserve dtypes while iterating over the rows, it is better
       to use :meth:`itertuples` which returns namedtuples of the values
       and which is generally faster than ``iterrows``.

    2. You should **never modify** something you are iterating over.
       This is not guaranteed to work in all cases. Depending on the
       data types, the iterator returns a copy and not a view, and writing
       to it will have no effect.
    """
    frame_data = self._data
    return frame_data.iterrows(batch_size=batch_size, session=session)
1889
+
1890
def itertuples(self, index=True, name="Pandas", batch_size=1000, session=None):
    """
    Iterate over DataFrame rows as namedtuples.

    Parameters
    ----------
    index : bool, default True
        If True, return the index as the first element of the tuple.
    name : str or None, default "Pandas"
        The name of the returned namedtuples or None to return regular
        tuples.
    batch_size : int, default 1000
        Forwarded to the data object's ``itertuples``.
    session : optional
        Forwarded to the data object's ``itertuples``.

    Returns
    -------
    iterator
        An object to iterate over namedtuples for each row in the
        DataFrame with the first field possibly being the index and
        following fields being the column values.

    See Also
    --------
    DataFrame.iterrows : Iterate over DataFrame rows as (index, Series)
        pairs.
    DataFrame.items : Iterate over (column name, Series) pairs.

    Notes
    -----
    The column names will be renamed to positional names if they are
    invalid Python identifiers, repeated, or start with an underscore.
    On python versions < 3.7 regular tuples are returned for DataFrames
    with a large number of columns (>254).

    Examples
    --------
    >>> import maxframe.dataframe as md
    >>> df = md.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]},
    ...                   index=['dog', 'hawk'])
    >>> df.execute()
          num_legs  num_wings
    dog          4          0
    hawk         2          2
    >>> for row in df.itertuples():
    ...     print(row)
    ...
    Pandas(Index='dog', num_legs=4, num_wings=0)
    Pandas(Index='hawk', num_legs=2, num_wings=2)

    By setting the `index` parameter to False we can remove the index
    as the first element of the tuple:

    >>> for row in df.itertuples(index=False):
    ...     print(row)
    ...
    Pandas(num_legs=4, num_wings=0)
    Pandas(num_legs=2, num_wings=2)

    With the `name` parameter set we set a custom name for the yielded
    namedtuples:

    >>> for row in df.itertuples(name='Animal'):
    ...     print(row)
    ...
    Animal(Index='dog', num_legs=4, num_wings=0)
    Animal(Index='hawk', num_legs=2, num_wings=2)
    """
    frame_data = self._data
    return frame_data.itertuples(
        batch_size=batch_size, session=session, index=index, name=name
    )
1958
+
1959
def assign(self, **kwargs):
    """
    Assign new columns to a DataFrame.

    Returns a new object with all original columns in addition to new
    ones. Existing columns that are re-assigned will be overwritten.

    Parameters
    ----------
    **kwargs : dict of {str: callable or Series}
        The column names are keywords. If the values are
        callable, they are computed on the DataFrame and
        assigned to the new columns. The callable must not
        change input DataFrame (though pandas doesn't check it).
        If the values are not callable, (e.g. a Series, scalar, or array),
        they are simply assigned.

    Returns
    -------
    DataFrame
        A new DataFrame with the new columns in addition to
        all the existing columns.

    Notes
    -----
    Assigning multiple columns within the same ``assign`` is possible.
    Later items in 'kwargs' may refer to newly created or modified
    columns in 'df'; items are computed and assigned into 'df' in order.

    Examples
    --------
    >>> import maxframe.dataframe as md
    >>> df = md.DataFrame({'temp_c': [17.0, 25.0]},
    ...                   index=['Portland', 'Berkeley'])
    >>> df.execute()
              temp_c
    Portland    17.0
    Berkeley    25.0

    Where the value is a callable, evaluated on `df`:

    >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32).execute()
              temp_c  temp_f
    Portland    17.0    62.6
    Berkeley    25.0    77.0

    Alternatively, the same behavior can be achieved by directly
    referencing an existing Series or sequence:

    >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32).execute()
              temp_c  temp_f
    Portland    17.0    62.6
    Berkeley    25.0    77.0

    You can create multiple columns within the same assign where one
    of the columns depends on another one defined within the same assign:

    >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32,
    ...           temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9).execute()
              temp_c  temp_f  temp_k
    Portland    17.0    62.6  290.15
    Berkeley    25.0    77.0  298.15
    """
    result = self.copy()

    # Assign in keyword order so that a later callable sees the columns
    # written by earlier keywords.
    for column_name in kwargs:
        result[column_name] = apply_if_callable(kwargs[column_name], result)
    return result
2027
+
2028
+
2029
class DataFrameGroupByData(BaseDataFrameData):
    """DataFrame-groupby data object; adds ``key_dtypes`` and ``selection``."""

    type_name = "DataFrameGroupBy"

    _key_dtypes = SeriesField("key_dtypes")
    _selection = AnyField("selection")

    def __init__(self, key_dtypes=None, selection=None, **kw):
        super().__init__(_key_dtypes=key_dtypes, _selection=selection, **kw)

    @property
    def key_dtypes(self):
        return self._key_dtypes

    @property
    def selection(self):
        return self._selection

    def _get_params(self) -> Dict[str, Any]:
        # Parent params extended with the two groupby-specific entries.
        collected = super()._get_params()
        collected["key_dtypes"] = self.key_dtypes
        collected["selection"] = self.selection
        return collected

    def _set_params(self, new_params: Dict[str, Any]):
        # Consume the groupby-specific entries (ignoring None values) and
        # hand everything else to the parent implementation.
        remaining = new_params.copy()
        key_dtypes = remaining.pop("key_dtypes", None)
        if key_dtypes is not None:
            self._key_dtypes = key_dtypes
        selection = remaining.pop("selection", None)
        if selection is not None:
            self._selection = selection
        super()._set_params(remaining)

    params = property(_get_params, _set_params)

    def _equal(self, o):
        # FIXME: a true `==` operator still needs to be implemented for
        # DataFrameGroupBy; build mode falls back to identity.
        if not is_build_mode():
            return self == o
        return self is o
2069
+
2070
+
2071
class SeriesGroupByData(BaseSeriesData):
    """Series-groupby data object; adds ``key_dtypes``."""

    type_name = "SeriesGroupBy"

    _key_dtypes = AnyField("key_dtypes")

    def __init__(self, key_dtypes=None, **kw):
        super().__init__(_key_dtypes=key_dtypes, **kw)

    @property
    def key_dtypes(self):
        return self._key_dtypes

    def _get_params(self) -> Dict[str, Any]:
        # Parent params extended with the groupby-specific entry.
        collected = super()._get_params()
        collected["key_dtypes"] = self.key_dtypes
        return collected

    def _set_params(self, new_params: Dict[str, Any]):
        # Consume ``key_dtypes`` (ignoring None) and hand everything else
        # to the parent implementation.
        remaining = new_params.copy()
        key_dtypes = remaining.pop("key_dtypes", None)
        if key_dtypes is not None:
            self._key_dtypes = key_dtypes
        super()._set_params(remaining)

    params = property(_get_params, _set_params)

    def _equal(self, o):
        # FIXME: a true `==` operator still needs to be implemented for
        # SeriesGroupBy; build mode falls back to identity.
        if not is_build_mode():
            return self == o
        return self is o
2103
+
2104
+
2105
class GroupBy(Tileable, _ToPandasMixin):
    """Common base class of the groupby tileable wrappers."""

    __slots__ = ()
2107
+
2108
+
2109
class DataFrameGroupBy(GroupBy):
    """Groupby wrapper whose columns are also reachable as attributes."""

    __slots__ = ()
    _allow_data_type_ = (DataFrameGroupByData,)
    type_name = "DataFrameGroupBy"

    def __eq__(self, other):
        return self._equal(other)

    def __hash__(self):
        # NB: we have customized __eq__ explicitly, thus we need define __hash__ explicitly as well.
        return super().__hash__()

    def __getattr__(self, item):
        """
        Fall back to column selection for attributes the parent cannot resolve.

        If ``item`` is a column label, ``self[item]`` is returned;
        otherwise the original ``AttributeError`` is re-raised.
        """
        try:
            return super().__getattr__(item)
        except AttributeError:
            column_dtypes = self.dtypes
            # A membership test on None would raise TypeError and break
            # ``hasattr`` / ``getattr(obj, item, default)``.
            if column_dtypes is not None and item in column_dtypes:
                return self[item]
            raise

    def __dir__(self):
        """List attribute names plus string column labels that are valid identifiers."""
        names = list(super().__dir__())
        column_dtypes = self.dtypes
        # Without dtypes, report only the regular attributes rather than fail.
        if column_dtypes is not None:
            names.extend(
                k
                for k in column_dtypes.index
                if isinstance(k, str) and k.isidentifier()
            )
        return sorted(names)
2136
+
2137
+
2138
class SeriesGroupBy(GroupBy):
    """Groupby tileable whose data must be a ``SeriesGroupByData``."""

    __slots__ = ()
    _allow_data_type_ = (SeriesGroupByData,)
    type_name = "SeriesGroupBy"

    def __hash__(self):
        # NB: __eq__ is customized below, so __hash__ has to be defined
        # explicitly as well (otherwise Python would drop it).
        return super().__hash__()

    def __eq__(self, other):
        # `_equal` is not defined on this class; presumably it is resolved
        # on the wrapped data object -- TODO confirm.
        return self._equal(other)
2149
+
2150
+
2151
class CategoricalData(HasShapeTileableData, _ToPandasMixin):
    """Tileable data describing a categorical value.

    Besides the shape handled by the parent class it stores ``dtype`` and
    ``categories_value`` (an ``IndexValue``), both of which are optional.
    """

    __slots__ = ("_cache",)
    type_name = "Categorical"

    # optional field
    _dtype = DataTypeField("dtype")
    _categories_value = ReferenceField(
        "categories_value", IndexValue, on_deserialize=_on_deserialize_index_value
    )

    def __init__(
        self,
        op=None,
        shape=None,
        nsplits=None,
        dtype=None,
        categories_value=None,
        **kw,
    ):
        super().__init__(
            _op=op,
            _shape=shape,
            _nsplits=nsplits,
            _dtype=dtype,
            _categories_value=categories_value,
            **kw,
        )

    @property
    def params(self) -> Dict[str, Any]:
        """Properties that are useful to rebuild a new tileable object."""
        return {
            "shape": self.shape,
            "dtype": self.dtype,
            "categories_value": self.categories_value,
        }

    @params.setter
    def params(self, new_params: Dict[str, Any]):
        """Update shape, dtype and categories_value from ``new_params``.

        Entries that are missing or None leave the current value untouched.

        Raises:
            TypeError: if ``new_params`` contains any other key.
        """
        # Work on a copy so the caller's dict is not mutated by ``pop``.
        params = new_params.copy()
        new_shape = params.pop("shape", None)
        if new_shape is not None:
            self._shape = new_shape
        dtype = params.pop("dtype", None)
        if dtype is not None:
            self._dtype = dtype
        categories_value = params.pop("categories_value", None)
        if categories_value is not None:
            self._categories_value = categories_value
        if params:  # pragma: no cover
            raise TypeError(f"Unknown params: {list(params)}")

    def refresh_params(self):
        """Refresh params when chunks are updated.

        The shape is always refreshed; dtype and categories_value are only
        derived from the chunks when they are still None.
        """
        refresh_tileable_shape(self)
        if self._dtype is None:
            self._dtype = self.chunks[0].dtype
        if self._categories_value is None:
            # Gather the categories of every chunk, then let pandas build the
            # resulting categories index from the combined values.
            categories = []
            for chunk in self.chunks:
                categories.extend(chunk.categories_value.to_pandas())
            self._categories_value = parse_index(
                pd.Categorical(categories).categories, store_data=True
            )

    def refresh_from_table_meta(self, table_meta: DataFrameTableMeta) -> None:
        """No-op: nothing is refreshed from table metadata for this type."""
        pass

    def _to_str(self, representation=False):
        """Build the text used by ``__str__`` and ``__repr__``.

        Args:
            representation: True for the ``repr`` flavour, False for ``str``.

        Returns:
            A short op-based description in build mode or before any
            execution; otherwise the ``repr``/``str`` of the data fetched
            from the most recently used session.
        """
        if is_build_mode() or len(self._executed_sessions) == 0:
            # in build mode, or not executed, just return representation
            if representation:
                return f"{self.type_name} <op={type(self.op).__name__}, key={self.key}>"
            return f"{self.type_name}(op={type(self.op).__name__})"

        data = self.fetch(session=self._executed_sessions[-1])
        # Honour the requested flavour. The previous expression
        # `repr(data) if repr(data) else str(data)` ignored `representation`
        # and evaluated `repr` twice.
        return repr(data) if representation else str(data)

    def __str__(self):
        return self._to_str(representation=False)

    def __repr__(self):
        return self._to_str(representation=True)

    def _equal(self, o):
        """Identity comparison in build mode, ``==`` otherwise."""
        # FIXME We need to implement a true `==` operator for Categorical
        if is_build_mode():
            return self is o
        else:  # pragma: no cover
            return self == o

    @property
    def dtype(self):
        """The stored dtype, falling back to ``self.op.dtype`` when unset."""
        dtype = getattr(self, "_dtype", None)
        # Compare against None explicitly: truthiness is unreliable for dtype
        # objects (e.g. non-structured NumPy dtypes have length 0 and are falsy).
        return dtype if dtype is not None else self.op.dtype

    @property
    def categories_value(self):
        """Value stored in the ``_categories_value`` field."""
        return self._categories_value

    def __eq__(self, other):
        return self._equal(other)

    def __hash__(self):
        # NB: __eq__ is customized above, so __hash__ has to be defined
        # explicitly as well (otherwise Python would drop it).
        return super().__hash__()
2257
+
2258
+
2259
class Categorical(HasShapeTileable, _ToPandasMixin):
    """Tileable whose data must be a ``CategoricalData``."""

    __slots__ = ()
    _allow_data_type_ = (CategoricalData,)
    type_name = "Categorical"

    def __hash__(self):
        # NB: __eq__ is customized below, so __hash__ has to be defined
        # explicitly as well (otherwise Python would drop it).
        return super().__hash__()

    def __eq__(self, other):
        # `_equal` is not defined on this class; presumably it is resolved
        # on the wrapped data object -- TODO confirm.
        return self._equal(other)

    def __len__(self):
        """Length of the wrapped data object."""
        return len(self._data)
2273
+
2274
+
2275
class DataFrameOrSeriesData(HasShapeTileableData, _ToPandasMixin):
    """Tileable data whose concrete kind is recorded at runtime.

    ``_data_type`` holds a string tag (this class itself assigns
    ``"dataframe"`` or ``"series"``) and ``_data_params`` holds the parameters
    of that kind. Keys of ``_data_params`` can also be read as attributes
    through ``__getattr__``.
    """

    __slots__ = ()

    _data_type = StringField("data_type")
    _data_params = DictField("data_params")

    def __init__(
        self,
        op=None,
        data_type=None,
        data_params=None,
        **kw,
    ):
        self._data_type = data_type
        self._data_params = data_params or dict()
        super().__init__(
            _op=op,
            **kw,
        )

    def __getattr__(self, item):
        """Expose the entries of ``_data_params`` as attributes.

        Raises:
            AttributeError: if ``item`` is not a key of ``_data_params``.
        """
        # __getattr__ only runs after normal lookup has failed. If the name
        # that failed is `_data_params` itself, reading `self._data_params`
        # below would re-enter this method without end, so skip the lookup.
        if item != "_data_params" and item in self._data_params:
            return self._data_params[item]
        raise AttributeError(
            f"'{type(self).__name__}' object has no attribute '{item}'"
        )

    @property
    def shape(self):
        """``"shape"`` entry of ``_data_params``, or None when absent."""
        return self._data_params.get("shape", None)

    @property
    def nsplits(self):
        """``"nsplits"`` entry of ``_data_params``, or None when absent."""
        return self._data_params.get("nsplits", None)

    @property
    def data_type(self):
        """Value stored in the ``_data_type`` field."""
        return self._data_type

    @property
    def data_params(self):
        """The ``_data_params`` dict itself (not a copy)."""
        return self._data_params

    @property
    def params(self) -> Dict[str, Any]:
        """Return ``data_type`` and ``data_params`` in a single dict."""
        return {"data_type": self._data_type, "data_params": self._data_params}

    @params.setter
    def params(self, new_params: Dict[str, Any]):
        # After execution, create DataFrameFetch, and the data
        # corresponding to the original key is still DataFrameOrSeries type,
        # so when restoring DataFrameOrSeries type,
        # there is no "data_type" field in params.
        if "data_type" not in new_params:
            # Infer the kind from the params: a "dtype" key means series.
            self._data_type = "series" if "dtype" in new_params else "dataframe"
            self._data_params = new_params.copy()
        else:
            self._data_type = new_params.get("data_type")
            # Shallow copy so later updates do not touch the caller's dict.
            self._data_params = dict(new_params.get("data_params", {}))

    def refresh_params(self):
        """Recompute ``_data_type`` and ``_data_params`` from the chunks."""
        # Keep the index value of every 1-d chunk, and of the chunks whose
        # second index component is 0 otherwise.
        index_to_index_values = {
            chunk.index: chunk.index_value
            for chunk in self.chunks
            if chunk.ndim == 1 or chunk.index[1] == 0
        }
        index_value = merge_index_value(index_to_index_values, store_data=False)
        nsplits = calc_nsplits({c.index: c.shape for c in self.chunks})
        shape = tuple(sum(ns) for ns in nsplits)

        data_params = {
            "nsplits": nsplits,
            "shape": shape,
            "index_value": index_value,
        }

        self._data_type = self._chunks[0]._data_type
        if self.data_type == "dataframe":
            # Concatenate the dtypes of the chunks whose first index component is 0.
            all_dtypes = [c.dtypes_value.value for c in self.chunks if c.index[0] == 0]
            dtypes = pd.concat(all_dtypes)
            data_params["dtypes"] = dtypes
            data_params["columns_value"] = parse_index(dtypes.index, store_data=True)
            data_params["dtypes_value"] = DtypesValue(
                key=tokenize(dtypes), value=dtypes
            )
        else:
            first_chunk = self.chunks[0]
            data_params["dtype"] = first_chunk.dtype
            data_params["name"] = first_chunk.name
        self._data_params.update(data_params)

    def refresh_from_table_meta(self, table_meta: DataFrameTableMeta) -> None:
        """No-op: nothing is refreshed from table metadata for this type."""
        pass

    def ensure_data(self):
        """Execute this tileable and return a fetch tileable for its result.

        The returned tileable is built by ``DataFrameFetch`` with the output
        type named by ``data_type``, reuses this object's key and is attached
        to the default session.
        """
        from .fetch.core import DataFrameFetch

        self.execute()
        default_sess = get_default_session()
        self._detach_session(default_sess._session)

        fetch_tileable = default_sess._session._tileable_to_fetch[self]
        # Merge instead of passing `nsplits=...` next to `**self.data_params`:
        # `refresh_params` stores an "nsplits" entry in the params, and a
        # duplicated keyword would raise TypeError. The explicit values win.
        tileable_kwargs = dict(self.data_params)
        tileable_kwargs.update(
            _key=self.key,
            chunks=fetch_tileable.chunks,
            nsplits=fetch_tileable.nsplits,
        )
        new = DataFrameFetch(
            output_types=[getattr(OutputType, self.data_type)]
        ).new_tileable([], **tileable_kwargs)
        new._attach_session(default_sess._session)
        return new
2391
+
2392
+
2393
class DataFrameOrSeries(HasShapeTileable, _ToPandasMixin):
    """Tileable whose data must be a ``DataFrameOrSeriesData``."""

    __slots__ = ()
    _allow_data_type_ = (DataFrameOrSeriesData,)
    type_name = "DataFrameOrSeries"
2397
+
2398
+
2399
# Each tuple pairs a tileable class with its data class, for use in type checks.
INDEX_TYPE = (Index, IndexData)
SERIES_TYPE = (Series, SeriesData)
DATAFRAME_OR_SERIES_TYPE = (DataFrameOrSeries, DataFrameOrSeriesData)
DATAFRAME_TYPE = (DataFrame, DataFrameData)
DATAFRAME_GROUPBY_TYPE = (DataFrameGroupBy, DataFrameGroupByData)
SERIES_GROUPBY_TYPE = (SeriesGroupBy, SeriesGroupByData)
GROUPBY_TYPE = (GroupBy, *DATAFRAME_GROUPBY_TYPE, *SERIES_GROUPBY_TYPE)
CATEGORICAL_TYPE = (Categorical, CategoricalData)
# NOTE(review): DATAFRAME_OR_SERIES_TYPE is not part of TILEABLE_TYPE --
# confirm this is intentional.
TILEABLE_TYPE = (
    *INDEX_TYPE,
    *SERIES_TYPE,
    *DATAFRAME_TYPE,
    *GROUPBY_TYPE,
    *CATEGORICAL_TYPE,
)
2410
+
2411
# Register, in this order, the type tuple that belongs to each output type.
for _output_type, _tileable_types in (
    (OutputType.dataframe, DATAFRAME_TYPE),
    (OutputType.series, SERIES_TYPE),
    (OutputType.df_or_series, DATAFRAME_OR_SERIES_TYPE),
    (OutputType.index, INDEX_TYPE),
    (OutputType.categorical, CATEGORICAL_TYPE),
    (OutputType.dataframe_groupby, DATAFRAME_GROUPBY_TYPE),
    (OutputType.series_groupby, SERIES_GROUPBY_TYPE),
):
    register_output_types(_output_type, _tileable_types)
# Do not leave the loop variables behind in the module namespace.
del _output_type, _tileable_types