maxframe 2.0.0b2__cp37-cp37m-win32.whl → 2.3.0rc1__cp37-cp37m-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of maxframe might be problematic. Click here for more details.

Files changed (443) hide show
  1. maxframe/__init__.py +1 -0
  2. maxframe/_utils.cp37-win32.pyd +0 -0
  3. maxframe/_utils.pyx +14 -1
  4. maxframe/codegen/core.py +9 -8
  5. maxframe/codegen/spe/core.py +1 -1
  6. maxframe/codegen/spe/dataframe/__init__.py +1 -0
  7. maxframe/codegen/spe/dataframe/accessors/base.py +18 -0
  8. maxframe/codegen/spe/dataframe/accessors/dict_.py +25 -130
  9. maxframe/codegen/spe/dataframe/accessors/list_.py +12 -48
  10. maxframe/codegen/spe/dataframe/accessors/struct_.py +28 -0
  11. maxframe/codegen/spe/dataframe/arithmetic.py +7 -2
  12. maxframe/codegen/spe/dataframe/groupby.py +88 -0
  13. maxframe/codegen/spe/dataframe/indexing.py +99 -4
  14. maxframe/codegen/spe/dataframe/merge.py +38 -1
  15. maxframe/codegen/spe/dataframe/misc.py +11 -33
  16. maxframe/codegen/spe/dataframe/reduction.py +32 -9
  17. maxframe/codegen/spe/dataframe/reshape.py +46 -0
  18. maxframe/codegen/spe/dataframe/sort.py +39 -18
  19. maxframe/codegen/spe/dataframe/tests/accessors/test_dict.py +9 -15
  20. maxframe/codegen/spe/dataframe/tests/accessors/test_list.py +4 -7
  21. maxframe/codegen/spe/dataframe/tests/accessors/test_struct.py +75 -0
  22. maxframe/codegen/spe/dataframe/tests/indexing/test_iloc.py +20 -1
  23. maxframe/codegen/spe/dataframe/tests/indexing/test_loc.py +35 -0
  24. maxframe/codegen/spe/dataframe/tests/misc/test_misc.py +0 -32
  25. maxframe/codegen/spe/dataframe/tests/test_groupby.py +81 -18
  26. maxframe/codegen/spe/dataframe/tests/test_merge.py +27 -1
  27. maxframe/codegen/spe/dataframe/tests/test_reduction.py +13 -0
  28. maxframe/codegen/spe/dataframe/tests/test_reshape.py +79 -0
  29. maxframe/codegen/spe/dataframe/tests/test_sort.py +20 -0
  30. maxframe/codegen/spe/dataframe/tseries.py +9 -0
  31. maxframe/codegen/spe/learn/contrib/lightgbm.py +4 -3
  32. maxframe/codegen/spe/learn/contrib/tests/test_xgboost.py +2 -1
  33. maxframe/codegen/spe/learn/metrics/__init__.py +1 -1
  34. maxframe/codegen/spe/learn/metrics/_ranking.py +76 -0
  35. maxframe/codegen/spe/learn/metrics/pairwise.py +51 -0
  36. maxframe/codegen/spe/learn/metrics/tests/test_pairwise.py +36 -0
  37. maxframe/codegen/spe/learn/metrics/tests/test_ranking.py +59 -0
  38. maxframe/codegen/spe/tensor/__init__.py +3 -0
  39. maxframe/codegen/spe/tensor/datasource.py +1 -0
  40. maxframe/codegen/spe/tensor/fft.py +74 -0
  41. maxframe/codegen/spe/tensor/linalg.py +29 -2
  42. maxframe/codegen/spe/tensor/misc.py +79 -25
  43. maxframe/codegen/spe/tensor/spatial.py +45 -0
  44. maxframe/codegen/spe/tensor/statistics.py +44 -0
  45. maxframe/codegen/spe/tensor/tests/test_fft.py +64 -0
  46. maxframe/codegen/spe/tensor/tests/test_linalg.py +15 -1
  47. maxframe/codegen/spe/tensor/tests/test_misc.py +52 -2
  48. maxframe/codegen/spe/tensor/tests/test_spatial.py +33 -0
  49. maxframe/codegen/spe/tensor/tests/test_statistics.py +15 -1
  50. maxframe/codegen/spe/tests/test_spe_codegen.py +6 -12
  51. maxframe/codegen/spe/utils.py +2 -0
  52. maxframe/config/config.py +73 -9
  53. maxframe/config/tests/test_validators.py +13 -1
  54. maxframe/config/validators.py +49 -0
  55. maxframe/conftest.py +54 -17
  56. maxframe/core/accessor.py +2 -2
  57. maxframe/core/base.py +2 -1
  58. maxframe/core/entity/core.py +5 -0
  59. maxframe/core/entity/tileables.py +3 -1
  60. maxframe/core/graph/core.cp37-win32.pyd +0 -0
  61. maxframe/core/graph/entity.py +8 -3
  62. maxframe/core/mode.py +6 -1
  63. maxframe/core/operator/base.py +9 -2
  64. maxframe/core/operator/core.py +10 -2
  65. maxframe/core/operator/utils.py +13 -0
  66. maxframe/dataframe/__init__.py +12 -5
  67. maxframe/dataframe/accessors/__init__.py +1 -1
  68. maxframe/dataframe/accessors/compat.py +45 -0
  69. maxframe/dataframe/accessors/datetime_/__init__.py +4 -1
  70. maxframe/dataframe/accessors/dict_/contains.py +7 -16
  71. maxframe/dataframe/accessors/dict_/core.py +48 -0
  72. maxframe/dataframe/accessors/dict_/getitem.py +17 -21
  73. maxframe/dataframe/accessors/dict_/length.py +7 -16
  74. maxframe/dataframe/accessors/dict_/remove.py +6 -18
  75. maxframe/dataframe/accessors/dict_/setitem.py +8 -18
  76. maxframe/dataframe/accessors/dict_/tests/test_dict_accessor.py +62 -22
  77. maxframe/dataframe/accessors/list_/__init__.py +2 -2
  78. maxframe/dataframe/accessors/list_/core.py +48 -0
  79. maxframe/dataframe/accessors/list_/getitem.py +12 -19
  80. maxframe/dataframe/accessors/list_/length.py +7 -16
  81. maxframe/dataframe/accessors/list_/tests/test_list_accessor.py +11 -9
  82. maxframe/dataframe/accessors/string_/__init__.py +4 -1
  83. maxframe/dataframe/accessors/struct_/__init__.py +37 -0
  84. maxframe/dataframe/accessors/struct_/accessor.py +39 -0
  85. maxframe/dataframe/accessors/struct_/core.py +43 -0
  86. maxframe/dataframe/accessors/struct_/dtypes.py +53 -0
  87. maxframe/dataframe/accessors/struct_/field.py +123 -0
  88. maxframe/dataframe/accessors/struct_/tests/__init__.py +13 -0
  89. maxframe/dataframe/accessors/struct_/tests/test_struct_accessor.py +91 -0
  90. maxframe/dataframe/arithmetic/__init__.py +18 -4
  91. maxframe/dataframe/arithmetic/between.py +106 -0
  92. maxframe/dataframe/arithmetic/dot.py +237 -0
  93. maxframe/dataframe/arithmetic/maximum.py +33 -0
  94. maxframe/dataframe/arithmetic/minimum.py +33 -0
  95. maxframe/dataframe/arithmetic/{around.py → round.py} +11 -7
  96. maxframe/dataframe/core.py +161 -224
  97. maxframe/dataframe/datasource/__init__.py +18 -0
  98. maxframe/dataframe/datasource/core.py +6 -0
  99. maxframe/dataframe/datasource/direct.py +57 -0
  100. maxframe/dataframe/datasource/from_dict.py +124 -0
  101. maxframe/dataframe/datasource/from_index.py +1 -1
  102. maxframe/dataframe/datasource/from_records.py +77 -0
  103. maxframe/dataframe/datasource/from_tensor.py +109 -41
  104. maxframe/dataframe/datasource/read_csv.py +21 -14
  105. maxframe/dataframe/datasource/read_odps_query.py +29 -6
  106. maxframe/dataframe/datasource/read_odps_table.py +32 -10
  107. maxframe/dataframe/datasource/read_parquet.py +38 -39
  108. maxframe/dataframe/datasource/tests/test_datasource.py +37 -0
  109. maxframe/dataframe/datastore/__init__.py +11 -1
  110. maxframe/dataframe/datastore/direct.py +268 -0
  111. maxframe/dataframe/datastore/to_csv.py +29 -41
  112. maxframe/dataframe/datastore/to_odps.py +36 -4
  113. maxframe/dataframe/extensions/__init__.py +20 -4
  114. maxframe/dataframe/extensions/apply_chunk.py +32 -6
  115. maxframe/dataframe/extensions/cartesian_chunk.py +153 -0
  116. maxframe/dataframe/extensions/collect_kv.py +126 -0
  117. maxframe/dataframe/extensions/extract_kv.py +177 -0
  118. maxframe/dataframe/extensions/flatjson.py +2 -1
  119. maxframe/dataframe/extensions/map_reduce.py +263 -0
  120. maxframe/dataframe/extensions/rebalance.py +62 -0
  121. maxframe/dataframe/extensions/tests/test_apply_chunk.py +9 -2
  122. maxframe/dataframe/extensions/tests/test_extensions.py +54 -0
  123. maxframe/dataframe/extensions/tests/test_map_reduce.py +135 -0
  124. maxframe/dataframe/groupby/__init__.py +17 -2
  125. maxframe/dataframe/groupby/aggregation.py +86 -49
  126. maxframe/dataframe/groupby/apply.py +1 -1
  127. maxframe/dataframe/groupby/apply_chunk.py +19 -5
  128. maxframe/dataframe/groupby/core.py +116 -16
  129. maxframe/dataframe/groupby/cum.py +4 -25
  130. maxframe/dataframe/groupby/expanding.py +264 -0
  131. maxframe/dataframe/groupby/fill.py +1 -1
  132. maxframe/dataframe/groupby/getitem.py +12 -5
  133. maxframe/dataframe/groupby/head.py +11 -1
  134. maxframe/dataframe/groupby/rank.py +136 -0
  135. maxframe/dataframe/groupby/rolling.py +206 -0
  136. maxframe/dataframe/groupby/shift.py +114 -0
  137. maxframe/dataframe/groupby/tests/test_groupby.py +0 -5
  138. maxframe/dataframe/indexing/__init__.py +22 -2
  139. maxframe/dataframe/indexing/droplevel.py +195 -0
  140. maxframe/dataframe/indexing/filter.py +169 -0
  141. maxframe/dataframe/indexing/get_level_values.py +76 -0
  142. maxframe/dataframe/indexing/iat.py +45 -0
  143. maxframe/dataframe/indexing/iloc.py +152 -12
  144. maxframe/dataframe/indexing/insert.py +46 -18
  145. maxframe/dataframe/indexing/loc.py +287 -7
  146. maxframe/dataframe/indexing/reindex.py +14 -5
  147. maxframe/dataframe/indexing/rename.py +6 -0
  148. maxframe/dataframe/indexing/rename_axis.py +2 -2
  149. maxframe/dataframe/indexing/reorder_levels.py +143 -0
  150. maxframe/dataframe/indexing/reset_index.py +33 -6
  151. maxframe/dataframe/indexing/sample.py +8 -0
  152. maxframe/dataframe/indexing/setitem.py +3 -3
  153. maxframe/dataframe/indexing/swaplevel.py +185 -0
  154. maxframe/dataframe/indexing/take.py +99 -0
  155. maxframe/dataframe/indexing/truncate.py +140 -0
  156. maxframe/dataframe/indexing/where.py +0 -11
  157. maxframe/dataframe/indexing/xs.py +148 -0
  158. maxframe/dataframe/merge/__init__.py +15 -1
  159. maxframe/dataframe/merge/append.py +97 -98
  160. maxframe/dataframe/merge/combine.py +244 -0
  161. maxframe/dataframe/merge/combine_first.py +120 -0
  162. maxframe/dataframe/merge/compare.py +387 -0
  163. maxframe/dataframe/merge/concat.py +183 -0
  164. maxframe/dataframe/merge/update.py +271 -0
  165. maxframe/dataframe/misc/__init__.py +28 -11
  166. maxframe/dataframe/misc/_duplicate.py +10 -4
  167. maxframe/dataframe/misc/apply.py +1 -1
  168. maxframe/dataframe/misc/check_unique.py +82 -0
  169. maxframe/dataframe/misc/clip.py +145 -0
  170. maxframe/dataframe/misc/describe.py +175 -9
  171. maxframe/dataframe/misc/drop.py +31 -0
  172. maxframe/dataframe/misc/drop_duplicates.py +2 -2
  173. maxframe/dataframe/misc/duplicated.py +2 -2
  174. maxframe/dataframe/misc/get_dummies.py +5 -1
  175. maxframe/dataframe/misc/infer_dtypes.py +251 -0
  176. maxframe/dataframe/misc/isin.py +2 -2
  177. maxframe/dataframe/misc/map.py +125 -18
  178. maxframe/dataframe/misc/repeat.py +159 -0
  179. maxframe/dataframe/misc/tests/test_misc.py +48 -3
  180. maxframe/dataframe/misc/to_numeric.py +3 -0
  181. maxframe/dataframe/misc/transform.py +12 -5
  182. maxframe/dataframe/misc/transpose.py +13 -1
  183. maxframe/dataframe/misc/valid_index.py +115 -0
  184. maxframe/dataframe/misc/value_counts.py +38 -4
  185. maxframe/dataframe/missing/checkna.py +14 -6
  186. maxframe/dataframe/missing/dropna.py +5 -0
  187. maxframe/dataframe/missing/fillna.py +1 -1
  188. maxframe/dataframe/missing/replace.py +7 -4
  189. maxframe/dataframe/reduction/__init__.py +35 -16
  190. maxframe/dataframe/reduction/aggregation.py +43 -14
  191. maxframe/dataframe/reduction/all.py +2 -2
  192. maxframe/dataframe/reduction/any.py +2 -2
  193. maxframe/dataframe/reduction/argmax.py +103 -0
  194. maxframe/dataframe/reduction/argmin.py +103 -0
  195. maxframe/dataframe/reduction/core.py +80 -24
  196. maxframe/dataframe/reduction/count.py +13 -9
  197. maxframe/dataframe/reduction/cov.py +166 -0
  198. maxframe/dataframe/reduction/cummax.py +2 -2
  199. maxframe/dataframe/reduction/cummin.py +2 -2
  200. maxframe/dataframe/reduction/cumprod.py +2 -2
  201. maxframe/dataframe/reduction/cumsum.py +2 -2
  202. maxframe/dataframe/reduction/custom_reduction.py +2 -2
  203. maxframe/dataframe/reduction/idxmax.py +185 -0
  204. maxframe/dataframe/reduction/idxmin.py +185 -0
  205. maxframe/dataframe/reduction/kurtosis.py +37 -30
  206. maxframe/dataframe/reduction/max.py +2 -2
  207. maxframe/dataframe/reduction/mean.py +9 -7
  208. maxframe/dataframe/reduction/median.py +2 -2
  209. maxframe/dataframe/reduction/min.py +2 -2
  210. maxframe/dataframe/reduction/mode.py +144 -0
  211. maxframe/dataframe/reduction/nunique.py +19 -11
  212. maxframe/dataframe/reduction/prod.py +18 -13
  213. maxframe/dataframe/reduction/reduction_size.py +2 -2
  214. maxframe/dataframe/reduction/sem.py +13 -9
  215. maxframe/dataframe/reduction/skew.py +31 -27
  216. maxframe/dataframe/reduction/str_concat.py +10 -7
  217. maxframe/dataframe/reduction/sum.py +18 -14
  218. maxframe/dataframe/reduction/tests/test_reduction.py +12 -0
  219. maxframe/dataframe/reduction/unique.py +20 -3
  220. maxframe/dataframe/reduction/var.py +16 -12
  221. maxframe/dataframe/reshape/__init__.py +38 -0
  222. maxframe/dataframe/{misc → reshape}/pivot.py +1 -0
  223. maxframe/dataframe/{misc → reshape}/pivot_table.py +1 -0
  224. maxframe/dataframe/reshape/unstack.py +114 -0
  225. maxframe/dataframe/sort/__init__.py +16 -1
  226. maxframe/dataframe/sort/argsort.py +68 -0
  227. maxframe/dataframe/sort/core.py +2 -1
  228. maxframe/dataframe/sort/nlargest.py +238 -0
  229. maxframe/dataframe/sort/nsmallest.py +228 -0
  230. maxframe/dataframe/sort/rank.py +147 -0
  231. maxframe/dataframe/statistics/__init__.py +3 -3
  232. maxframe/dataframe/statistics/corr.py +1 -0
  233. maxframe/dataframe/statistics/quantile.py +2 -2
  234. maxframe/dataframe/tests/test_typing.py +104 -0
  235. maxframe/dataframe/tests/test_utils.py +66 -2
  236. maxframe/dataframe/tseries/__init__.py +19 -0
  237. maxframe/dataframe/tseries/at_time.py +61 -0
  238. maxframe/dataframe/tseries/between_time.py +122 -0
  239. maxframe/dataframe/typing_.py +185 -0
  240. maxframe/dataframe/utils.py +125 -52
  241. maxframe/dataframe/window/aggregation.py +8 -4
  242. maxframe/dataframe/window/core.py +14 -1
  243. maxframe/dataframe/window/ewm.py +1 -3
  244. maxframe/dataframe/window/expanding.py +37 -35
  245. maxframe/dataframe/window/rolling.py +49 -39
  246. maxframe/dataframe/window/tests/test_expanding.py +1 -7
  247. maxframe/dataframe/window/tests/test_rolling.py +1 -1
  248. maxframe/env.py +7 -4
  249. maxframe/errors.py +2 -2
  250. maxframe/io/odpsio/schema.py +9 -3
  251. maxframe/io/odpsio/tableio.py +7 -2
  252. maxframe/io/odpsio/tests/test_schema.py +198 -83
  253. maxframe/learn/__init__.py +10 -2
  254. maxframe/learn/cluster/__init__.py +15 -0
  255. maxframe/learn/cluster/_kmeans.py +782 -0
  256. maxframe/learn/contrib/llm/core.py +18 -7
  257. maxframe/learn/contrib/llm/deploy/__init__.py +13 -0
  258. maxframe/learn/contrib/llm/deploy/config.py +221 -0
  259. maxframe/learn/contrib/llm/deploy/core.py +247 -0
  260. maxframe/learn/contrib/llm/deploy/framework.py +35 -0
  261. maxframe/learn/contrib/llm/deploy/loader.py +360 -0
  262. maxframe/learn/contrib/llm/deploy/tests/__init__.py +13 -0
  263. maxframe/learn/contrib/llm/deploy/tests/test_register_models.py +359 -0
  264. maxframe/learn/contrib/llm/models/__init__.py +1 -0
  265. maxframe/learn/contrib/llm/models/dashscope.py +12 -6
  266. maxframe/learn/contrib/llm/models/managed.py +76 -11
  267. maxframe/learn/contrib/llm/models/openai.py +72 -0
  268. maxframe/learn/contrib/llm/tests/__init__.py +13 -0
  269. maxframe/learn/contrib/llm/tests/test_core.py +34 -0
  270. maxframe/learn/contrib/llm/tests/test_openai.py +187 -0
  271. maxframe/learn/contrib/llm/tests/test_text_gen.py +155 -0
  272. maxframe/learn/contrib/llm/text.py +348 -42
  273. maxframe/learn/contrib/models.py +4 -1
  274. maxframe/learn/contrib/xgboost/classifier.py +2 -0
  275. maxframe/learn/contrib/xgboost/core.py +113 -4
  276. maxframe/learn/contrib/xgboost/predict.py +4 -2
  277. maxframe/learn/contrib/xgboost/regressor.py +5 -0
  278. maxframe/learn/contrib/xgboost/train.py +7 -2
  279. maxframe/learn/core.py +66 -0
  280. maxframe/learn/linear_model/_base.py +58 -1
  281. maxframe/learn/linear_model/_lin_reg.py +1 -1
  282. maxframe/learn/metrics/__init__.py +6 -0
  283. maxframe/learn/metrics/_classification.py +145 -0
  284. maxframe/learn/metrics/_ranking.py +477 -0
  285. maxframe/learn/metrics/_scorer.py +60 -0
  286. maxframe/learn/metrics/pairwise/__init__.py +21 -0
  287. maxframe/learn/metrics/pairwise/core.py +77 -0
  288. maxframe/learn/metrics/pairwise/cosine.py +115 -0
  289. maxframe/learn/metrics/pairwise/euclidean.py +176 -0
  290. maxframe/learn/metrics/pairwise/haversine.py +96 -0
  291. maxframe/learn/metrics/pairwise/manhattan.py +80 -0
  292. maxframe/learn/metrics/pairwise/pairwise.py +127 -0
  293. maxframe/learn/metrics/pairwise/pairwise_distances_topk.py +121 -0
  294. maxframe/learn/metrics/pairwise/rbf_kernel.py +51 -0
  295. maxframe/learn/metrics/tests/__init__.py +13 -0
  296. maxframe/learn/metrics/tests/test_scorer.py +26 -0
  297. maxframe/learn/preprocessing/_data/min_max_scaler.py +34 -23
  298. maxframe/learn/preprocessing/_data/standard_scaler.py +34 -25
  299. maxframe/learn/utils/__init__.py +2 -1
  300. maxframe/learn/utils/checks.py +1 -2
  301. maxframe/learn/utils/core.py +59 -0
  302. maxframe/learn/utils/extmath.py +79 -9
  303. maxframe/learn/utils/odpsio.py +262 -0
  304. maxframe/learn/utils/validation.py +2 -2
  305. maxframe/lib/compat.py +40 -0
  306. maxframe/lib/dtypes_extension/__init__.py +16 -1
  307. maxframe/lib/dtypes_extension/_fake_arrow_dtype.py +604 -0
  308. maxframe/lib/dtypes_extension/blob.py +304 -0
  309. maxframe/lib/dtypes_extension/dtypes.py +40 -0
  310. maxframe/lib/dtypes_extension/tests/test_blob.py +88 -0
  311. maxframe/lib/dtypes_extension/tests/test_dtypes.py +16 -1
  312. maxframe/lib/dtypes_extension/tests/test_fake_arrow_dtype.py +75 -0
  313. maxframe/lib/filesystem/_oss_lib/common.py +124 -50
  314. maxframe/lib/filesystem/_oss_lib/glob.py +1 -1
  315. maxframe/lib/filesystem/_oss_lib/handle.py +21 -25
  316. maxframe/lib/filesystem/base.py +1 -1
  317. maxframe/lib/filesystem/core.py +1 -1
  318. maxframe/lib/filesystem/oss.py +115 -46
  319. maxframe/lib/filesystem/tests/test_oss.py +74 -36
  320. maxframe/lib/mmh3.cp37-win32.pyd +0 -0
  321. maxframe/lib/wrapped_pickle.py +10 -0
  322. maxframe/opcodes.py +41 -15
  323. maxframe/protocol.py +12 -0
  324. maxframe/remote/core.py +4 -0
  325. maxframe/serialization/__init__.py +11 -2
  326. maxframe/serialization/arrow.py +38 -13
  327. maxframe/serialization/blob.py +32 -0
  328. maxframe/serialization/core.cp37-win32.pyd +0 -0
  329. maxframe/serialization/core.pyx +39 -1
  330. maxframe/serialization/exception.py +2 -4
  331. maxframe/serialization/numpy.py +11 -0
  332. maxframe/serialization/pandas.py +46 -9
  333. maxframe/serialization/serializables/core.py +2 -2
  334. maxframe/serialization/tests/test_serial.py +31 -4
  335. maxframe/tensor/__init__.py +38 -8
  336. maxframe/tensor/arithmetic/__init__.py +19 -10
  337. maxframe/tensor/arithmetic/core.py +2 -2
  338. maxframe/tensor/arithmetic/iscomplexobj.py +53 -0
  339. maxframe/tensor/arithmetic/tests/test_arithmetic.py +6 -9
  340. maxframe/tensor/core.py +6 -2
  341. maxframe/tensor/datasource/tests/test_datasource.py +2 -1
  342. maxframe/tensor/extensions/__init__.py +2 -0
  343. maxframe/tensor/extensions/apply_chunk.py +3 -3
  344. maxframe/tensor/extensions/rebalance.py +65 -0
  345. maxframe/tensor/fft/__init__.py +32 -0
  346. maxframe/tensor/fft/core.py +168 -0
  347. maxframe/tensor/fft/fft.py +112 -0
  348. maxframe/tensor/fft/fft2.py +118 -0
  349. maxframe/tensor/fft/fftfreq.py +80 -0
  350. maxframe/tensor/fft/fftn.py +123 -0
  351. maxframe/tensor/fft/fftshift.py +79 -0
  352. maxframe/tensor/fft/hfft.py +112 -0
  353. maxframe/tensor/fft/ifft.py +114 -0
  354. maxframe/tensor/fft/ifft2.py +115 -0
  355. maxframe/tensor/fft/ifftn.py +123 -0
  356. maxframe/tensor/fft/ifftshift.py +73 -0
  357. maxframe/tensor/fft/ihfft.py +93 -0
  358. maxframe/tensor/fft/irfft.py +118 -0
  359. maxframe/tensor/fft/irfft2.py +62 -0
  360. maxframe/tensor/fft/irfftn.py +114 -0
  361. maxframe/tensor/fft/rfft.py +116 -0
  362. maxframe/tensor/fft/rfft2.py +63 -0
  363. maxframe/tensor/fft/rfftfreq.py +87 -0
  364. maxframe/tensor/fft/rfftn.py +113 -0
  365. maxframe/tensor/indexing/fill_diagonal.py +1 -7
  366. maxframe/tensor/linalg/__init__.py +7 -0
  367. maxframe/tensor/linalg/_einsumfunc.py +1025 -0
  368. maxframe/tensor/linalg/cholesky.py +117 -0
  369. maxframe/tensor/linalg/einsum.py +339 -0
  370. maxframe/tensor/linalg/lstsq.py +100 -0
  371. maxframe/tensor/linalg/matrix_norm.py +75 -0
  372. maxframe/tensor/linalg/norm.py +249 -0
  373. maxframe/tensor/linalg/solve.py +72 -0
  374. maxframe/tensor/linalg/solve_triangular.py +2 -2
  375. maxframe/tensor/linalg/vector_norm.py +113 -0
  376. maxframe/tensor/misc/__init__.py +24 -1
  377. maxframe/tensor/misc/argwhere.py +72 -0
  378. maxframe/tensor/misc/array_split.py +46 -0
  379. maxframe/tensor/misc/broadcast_arrays.py +57 -0
  380. maxframe/tensor/misc/copyto.py +130 -0
  381. maxframe/tensor/misc/delete.py +104 -0
  382. maxframe/tensor/misc/dsplit.py +68 -0
  383. maxframe/tensor/misc/ediff1d.py +74 -0
  384. maxframe/tensor/misc/expand_dims.py +85 -0
  385. maxframe/tensor/misc/flip.py +90 -0
  386. maxframe/tensor/misc/fliplr.py +64 -0
  387. maxframe/tensor/misc/flipud.py +68 -0
  388. maxframe/tensor/misc/hsplit.py +85 -0
  389. maxframe/tensor/misc/insert.py +139 -0
  390. maxframe/tensor/misc/moveaxis.py +83 -0
  391. maxframe/tensor/misc/result_type.py +88 -0
  392. maxframe/tensor/misc/roll.py +124 -0
  393. maxframe/tensor/misc/rollaxis.py +77 -0
  394. maxframe/tensor/misc/shape.py +89 -0
  395. maxframe/tensor/misc/split.py +190 -0
  396. maxframe/tensor/misc/tile.py +109 -0
  397. maxframe/tensor/misc/vsplit.py +74 -0
  398. maxframe/tensor/reduction/array_equal.py +2 -1
  399. maxframe/tensor/sort/__init__.py +2 -0
  400. maxframe/tensor/sort/argpartition.py +98 -0
  401. maxframe/tensor/sort/partition.py +228 -0
  402. maxframe/tensor/spatial/__init__.py +15 -0
  403. maxframe/tensor/spatial/distance/__init__.py +17 -0
  404. maxframe/tensor/spatial/distance/cdist.py +421 -0
  405. maxframe/tensor/spatial/distance/pdist.py +398 -0
  406. maxframe/tensor/spatial/distance/squareform.py +153 -0
  407. maxframe/tensor/special/__init__.py +159 -21
  408. maxframe/tensor/special/airy.py +55 -0
  409. maxframe/tensor/special/bessel.py +199 -0
  410. maxframe/tensor/special/core.py +65 -4
  411. maxframe/tensor/special/ellip_func_integrals.py +155 -0
  412. maxframe/tensor/special/ellip_harm.py +55 -0
  413. maxframe/tensor/special/err_fresnel.py +223 -0
  414. maxframe/tensor/special/gamma_funcs.py +303 -0
  415. maxframe/tensor/special/hypergeometric_funcs.py +69 -0
  416. maxframe/tensor/special/info_theory.py +189 -0
  417. maxframe/tensor/special/misc.py +21 -0
  418. maxframe/tensor/statistics/__init__.py +6 -0
  419. maxframe/tensor/statistics/corrcoef.py +77 -0
  420. maxframe/tensor/statistics/cov.py +222 -0
  421. maxframe/tensor/statistics/digitize.py +126 -0
  422. maxframe/tensor/statistics/histogram.py +520 -0
  423. maxframe/tensor/statistics/median.py +85 -0
  424. maxframe/tensor/statistics/ptp.py +89 -0
  425. maxframe/tensor/utils.py +3 -3
  426. maxframe/tests/test_udf.py +61 -0
  427. maxframe/tests/test_utils.py +51 -6
  428. maxframe/tests/utils.py +0 -2
  429. maxframe/typing_.py +2 -0
  430. maxframe/udf.py +130 -9
  431. maxframe/utils.py +254 -27
  432. {maxframe-2.0.0b2.dist-info → maxframe-2.3.0rc1.dist-info}/METADATA +3 -3
  433. {maxframe-2.0.0b2.dist-info → maxframe-2.3.0rc1.dist-info}/RECORD +442 -264
  434. maxframe_client/fetcher.py +35 -4
  435. maxframe_client/session/odps.py +7 -2
  436. maxframe_client/session/task.py +8 -1
  437. maxframe_client/tests/test_fetcher.py +76 -3
  438. maxframe_client/tests/test_session.py +28 -1
  439. maxframe/dataframe/arrays.py +0 -864
  440. /maxframe/dataframe/{misc → reshape}/melt.py +0 -0
  441. /maxframe/dataframe/{misc → reshape}/stack.py +0 -0
  442. {maxframe-2.0.0b2.dist-info → maxframe-2.3.0rc1.dist-info}/WHEEL +0 -0
  443. {maxframe-2.0.0b2.dist-info → maxframe-2.3.0rc1.dist-info}/top_level.txt +0 -0
@@ -27,7 +27,7 @@ def test_rolling():
27
27
  expected = df.rolling(
28
28
  3, min_periods=1, center=True, win_type="triang", closed="both"
29
29
  )
30
- assert repr(r) == repr(expected)
30
+ assert repr(r).split(",")[:4] == repr(expected).rsplit(",")[:4]
31
31
 
32
32
  assert "b" in dir(r)
33
33
 
maxframe/env.py CHANGED
@@ -17,12 +17,14 @@ MAXFRAME_NAMESPACE = "MAXFRAME_NAMESPACE"
17
17
 
18
18
  # Maxframe Service common envs
19
19
  MAXFRAME_HTTP_PORT_FILE = "MAXFRAME_PROXY_PORT_FILE"
20
- MAXFRAME_SERVICE_PORT = "MAXFRAME_SERVICE_PORT"
21
- MAXFRAME_SERVICE_PORT_RETRIES = "MAXFRAME_SERVICE_PORT_RETRIES"
20
+ MAXFRAME_INSIDE_TASK = "MAXFRAME_INSIDE_TASK"
21
+ MAXFRAME_SERVICE_BASE_URL = "MF_SERVICE_BASE_URL"
22
+ MAXFRAME_SERVICE_ALLOW_ORIGIN = "MAXFRAME_SERVICE_ALLOW_ORIGIN"
22
23
  MAXFRAME_SERVICE_LISTEN_ADDRESS = "MAXFRAME_SERVICE_LISTEN_ADDRESS"
23
24
  MAXFRAME_SERVICE_LOG_CONFIG_FILE = "MAXFRAME_SERVICE_LOG_CONFIG_FILE"
24
- MAXFRAME_SERVICE_ALLOW_ORIGIN = "MAXFRAME_SERVICE_ALLOW_ORIGIN"
25
- MAXFRAME_SERVICE_BASE_URL = "MF_SERVICE_BASE_URL"
25
+ MAXFRAME_SERVICE_PORT = "MAXFRAME_SERVICE_PORT"
26
+ MAXFRAME_SERVICE_PORT_RETRIES = "MAXFRAME_SERVICE_PORT_RETRIES"
27
+ MAXFRAME_USER_LOG_CONFIG_FILE = "MAXFRAME_USER_LOG_CONFIG_FILE"
26
28
 
27
29
  # ODPS envs
28
30
  ODPS_BEARER_TOKEN = "ODPS_BEARER_TOKEN"
@@ -31,4 +33,5 @@ ODPS_BEARER_TOKEN_TIMESTAMP_FILE = "ODPS_BEARER_TOKEN_TIMESTAMP_FILE"
31
33
  ODPS_PROJECT_NAME = "ODPS_PROJECT_NAME"
32
34
  ODPS_ENDPOINT = "ODPS_ENDPOINT"
33
35
  ODPS_TUNNEL_ENDPOINT = "ODPS_TUNNEL_ENDPOINT"
36
+ ODPS_NAMESPACE = "ODPS_NAMESPACE"
34
37
  ODPS_STORAGE_API_ENDPOINT = "ODPS_STORAGE_API_ENDPOINT"
maxframe/errors.py CHANGED
@@ -43,5 +43,5 @@ class SessionAlreadyClosedError(MaxFrameError):
43
43
 
44
44
 
45
45
  class EngineUnavailableError(MaxFrameIntentionalError):
46
- def __init__(self, engine_type: str):
47
- super().__init__(f"Engine {engine_type} is not ready")
46
+ def __init__(self, msg: str):
47
+ super().__init__(msg)
@@ -22,9 +22,10 @@ import pyarrow as pa
22
22
  from odps import types as odps_types
23
23
  from pandas.api import types as pd_types
24
24
 
25
+ from ...config import options
25
26
  from ...core import TILEABLE_TYPE, OutputType
26
27
  from ...dataframe.core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE
27
- from ...lib.dtypes_extension import ArrowDtype
28
+ from ...lib.dtypes_extension import ArrowBlobType, ArrowDtype
28
29
  from ...protocol import DataFrameTableMeta
29
30
  from ...tensor.core import TENSOR_TYPE
30
31
  from ...utils import build_temp_table_name
@@ -65,7 +66,11 @@ _odps_type_to_arrow = {
65
66
  odps_types.timestamp_ntz: pa.timestamp("ns"),
66
67
  }
67
68
 
68
- _based_for_pandas_pa_types = (pa.ListType, pa.MapType)
69
+ if hasattr(odps_types, "blob"):
70
+ _arrow_to_odps_types[ArrowBlobType()] = odps_types.blob
71
+ _odps_type_to_arrow[odps_types.blob] = ArrowBlobType()
72
+
73
+ _based_for_pandas_pa_types = (pa.ListType, pa.MapType, pa.StructType)
69
74
 
70
75
 
71
76
  def is_based_for_pandas_dtype(arrow_type: pa.DataType) -> bool:
@@ -204,9 +209,10 @@ def odps_schema_to_pandas_dtypes(
204
209
  def arrow_table_to_pandas_dataframe(
205
210
  table: pa.Table, meta: DataFrameTableMeta = None
206
211
  ) -> pd.DataFrame:
212
+ use_arrow_backend = options.dataframe.dtype_backend == "pyarrow"
207
213
  df = table.to_pandas(
208
214
  types_mapper=lambda x: (
209
- ArrowDtype(x) if is_based_for_pandas_dtype(x) else None
215
+ ArrowDtype(x) if is_based_for_pandas_dtype(x) or use_arrow_backend else None
210
216
  ),
211
217
  ignore_metadata=True,
212
218
  )
@@ -274,6 +274,7 @@ class TunnelTableIO(ODPSTableIO):
274
274
  full_table_name: str,
275
275
  partitions: List[Optional[str]] = None,
276
276
  reopen: bool = False,
277
+ timeout: Optional[float] = None,
277
278
  ) -> Dict[Optional[str], TableDownloadSession]:
278
279
  table = odps_entry.get_table(full_table_name)
279
280
  tunnel = TableTunnel(odps_entry, quota_name=options.tunnel_quota_name)
@@ -295,14 +296,18 @@ class TunnelTableIO(ODPSTableIO):
295
296
  ):
296
297
  down_id = cls._down_session_ids[part_key]
297
298
  down_session = tunnel.create_download_session(
298
- table, async_mode=True, partition_spec=part, download_id=down_id
299
+ table,
300
+ async_mode=True,
301
+ partition_spec=part,
302
+ download_id=down_id,
303
+ timeout=timeout,
299
304
  )
300
305
  if down_session.status != TableDownloadStatus.Normal:
301
306
  down_session = None
302
307
 
303
308
  if down_session is None:
304
309
  down_session = tunnel.create_download_session(
305
- table, async_mode=True, partition_spec=part
310
+ table, async_mode=True, partition_spec=part, timeout=timeout
306
311
  )
307
312
 
308
313
  while len(cls._down_session_ids) >= _DOWNLOAD_ID_CACHE_SIZE:
@@ -12,6 +12,8 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
+ import os
16
+
15
17
  import numpy as np
16
18
  import pandas as pd
17
19
  import pyarrow as pa
@@ -19,9 +21,11 @@ import pytest
19
21
  from odps import types as odps_types
20
22
 
21
23
  from .... import dataframe as md
24
+ from .... import env
22
25
  from .... import tensor as mt
26
+ from ....config import option_context, options
23
27
  from ....core import OutputType
24
- from ....lib.dtypes_extension import ArrowDtype, dict_, list_
28
+ from ....lib.dtypes_extension import ArrowBlobType, ArrowDtype, dict_, list_
25
29
  from ....utils import pd_release_version
26
30
  from ..schema import (
27
31
  arrow_schema_to_odps_schema,
@@ -35,6 +39,16 @@ from ..schema import (
35
39
  )
36
40
 
37
41
 
42
+ @pytest.fixture
43
+ def set_dtype_backend(request):
44
+ os.environ[env.MAXFRAME_INSIDE_TASK] = "1"
45
+ with option_context({"dataframe.dtype_backend": request.param}):
46
+ try:
47
+ yield request.param
48
+ finally:
49
+ os.environ.pop(env.MAXFRAME_INSIDE_TASK)
50
+
51
+
38
52
  def _wrap_maxframe_obj(obj, wrap="no"):
39
53
  if wrap == "no":
40
54
  return obj
@@ -54,7 +68,9 @@ def _wrap_maxframe_obj(obj, wrap="no"):
54
68
 
55
69
 
56
70
  @pytest.mark.parametrize("wrap_obj", ["no", "yes", "data"])
57
- def test_pandas_to_odps_schema_dataframe(wrap_obj):
71
+ @pytest.mark.parametrize("set_dtype_backend", ["numpy", "pyarrow"], indirect=True)
72
+ def test_pandas_to_odps_schema_dataframe(wrap_obj, set_dtype_backend):
73
+ # Test with a simple DataFrame
58
74
  data = pd.DataFrame(np.random.rand(100, 5), columns=list("ABCDE"))
59
75
 
60
76
  test_df = _wrap_maxframe_obj(data, wrap=wrap_obj)
@@ -71,6 +87,7 @@ def test_pandas_to_odps_schema_dataframe(wrap_obj):
71
87
  assert meta.pd_column_level_names == [None]
72
88
  assert meta.pd_index_level_names == [None]
73
89
 
90
+ # Test with ignore_index=True to exclude index from schema
74
91
  test_df = _wrap_maxframe_obj(data, wrap=wrap_obj)
75
92
  schema, meta = pandas_to_odps_schema(test_df, ignore_index=True)
76
93
  assert [c.name for c in schema.columns] == list(test_df.dtypes.index.str.lower())
@@ -81,6 +98,7 @@ def test_pandas_to_odps_schema_dataframe(wrap_obj):
81
98
  assert meta.pd_column_level_names == [None]
82
99
  assert meta.pd_index_level_names == []
83
100
 
101
+ # Test with MultiIndex columns and index
84
102
  data.columns = pd.MultiIndex.from_tuples(
85
103
  [("A", "A"), ("A", "B"), ("A", "C"), ("B", "A"), ("B", "B")], names=["c1", "c2"]
86
104
  )
@@ -105,7 +123,9 @@ def test_pandas_to_odps_schema_dataframe(wrap_obj):
105
123
 
106
124
 
107
125
  @pytest.mark.parametrize("wrap_obj", ["no", "yes", "data"])
108
- def test_pandas_to_odps_schema_series(wrap_obj):
126
+ @pytest.mark.parametrize("set_dtype_backend", ["numpy", "pyarrow"], indirect=True)
127
+ def test_pandas_to_odps_schema_series(wrap_obj, set_dtype_backend):
128
+ # Test with a simple Series
109
129
  data = pd.Series(np.random.rand(100))
110
130
 
111
131
  test_s = _wrap_maxframe_obj(data, wrap=wrap_obj)
@@ -119,6 +139,7 @@ def test_pandas_to_odps_schema_series(wrap_obj):
119
139
  assert meta.pd_column_level_names == [None]
120
140
  assert meta.pd_index_level_names == [None]
121
141
 
142
+ # Test with ignore_index=True to exclude index from schema
122
143
  schema, meta = pandas_to_odps_schema(test_s, ignore_index=True)
123
144
  assert [c.name for c in schema.columns] == ["_data"]
124
145
  assert [c.type.name for c in schema.columns] == ["double"]
@@ -128,6 +149,7 @@ def test_pandas_to_odps_schema_series(wrap_obj):
128
149
  assert meta.pd_column_level_names == [None]
129
150
  assert meta.pd_index_level_names == []
130
151
 
152
+ # Test with named Series and MultiIndex
131
153
  data.index = pd.MultiIndex.from_arrays(
132
154
  [np.random.choice(list("ABC"), 100), np.random.randint(0, 10, 100)],
133
155
  names=["c1", "c2"],
@@ -146,7 +168,9 @@ def test_pandas_to_odps_schema_series(wrap_obj):
146
168
 
147
169
 
148
170
  @pytest.mark.parametrize("wrap_obj", ["no", "yes", "data"])
149
- def test_pandas_to_odps_schema_index(wrap_obj):
171
+ @pytest.mark.parametrize("set_dtype_backend", ["numpy", "pyarrow"], indirect=True)
172
+ def test_pandas_to_odps_schema_index(wrap_obj, set_dtype_backend):
173
+ # Test with a simple Index
150
174
  data = pd.Index(np.random.randint(0, 100, 100))
151
175
 
152
176
  test_idx = _wrap_maxframe_obj(data, wrap=wrap_obj)
@@ -162,6 +186,7 @@ def test_pandas_to_odps_schema_index(wrap_obj):
162
186
  assert meta.pd_column_level_names == []
163
187
  assert meta.pd_index_level_names == [None]
164
188
 
189
+ # Test with MultiIndex
165
190
  data = pd.MultiIndex.from_arrays(
166
191
  [np.random.choice(list("ABC"), 100), np.random.randint(0, 10, 100)],
167
192
  names=["c1", "c2"],
@@ -178,7 +203,8 @@ def test_pandas_to_odps_schema_index(wrap_obj):
178
203
 
179
204
 
180
205
  @pytest.mark.parametrize("wrap_obj", ["no", "yes", "data"])
181
- def test_pandas_to_odps_schema_scalar(wrap_obj):
206
+ @pytest.mark.parametrize("set_dtype_backend", ["numpy", "pyarrow"], indirect=True)
207
+ def test_pandas_to_odps_schema_scalar(wrap_obj, set_dtype_backend):
182
208
  data = 1234.56
183
209
 
184
210
  test_scalar = _wrap_maxframe_obj(data, wrap=wrap_obj)
@@ -196,7 +222,8 @@ def test_pandas_to_odps_schema_scalar(wrap_obj):
196
222
 
197
223
 
198
224
  @pytest.mark.parametrize("wrap_obj", ["no", "yes", "data"])
199
- def test_pandas_to_odps_schema_tensor(wrap_obj):
225
+ @pytest.mark.parametrize("set_dtype_backend", ["numpy", "pyarrow"], indirect=True)
226
+ def test_pandas_to_odps_schema_tensor(wrap_obj, set_dtype_backend):
200
227
  data = np.array([1, 2, 3])
201
228
 
202
229
  test_tensor = _wrap_maxframe_obj(data, wrap=wrap_obj)
@@ -214,6 +241,7 @@ def test_pandas_to_odps_schema_tensor(wrap_obj):
214
241
 
215
242
 
216
243
  def test_odps_arrow_schema_conversion():
244
+ # Create an ODPS schema with various data types
217
245
  odps_schema = odps_types.OdpsSchema(
218
246
  [
219
247
  odps_types.Column("col1", "string"),
@@ -293,110 +321,168 @@ def test_odps_arrow_schema_conversion():
293
321
  c.type for c in odps_schema2.columns
294
322
  ]
295
323
 
324
+ # Test that unsupported data types raise TypeError
296
325
  with pytest.raises(TypeError):
297
326
  arrow_schema_to_odps_schema(pa.schema([("col1", pa.float16())]))
298
327
 
299
328
 
300
- def test_odps_pandas_schema_conversion():
301
- odps_schema = odps_types.OdpsSchema(
302
- [
303
- odps_types.Column("col1", "string"),
304
- odps_types.Column("col2", "binary"),
305
- odps_types.Column("col3", "tinyint"),
306
- odps_types.Column("col4", "smallint"),
307
- odps_types.Column("col5", "int"),
308
- odps_types.Column("col6", "bigint"),
309
- odps_types.Column("col7", "boolean"),
310
- odps_types.Column("col8", "float"),
311
- odps_types.Column("col9", "double"),
312
- # odps_types.Column("col10", "date"),
313
- odps_types.Column("col11", "datetime"),
314
- odps_types.Column("col12", "timestamp"),
315
- # odps_types.Column("col13", "decimal(10, 2)"),
316
- odps_types.Column("col14", "array<string>"),
317
- odps_types.Column("col15", "map<string, bigint>"),
318
- # odps_types.Column("col16", "struct<a1: string, a2: map<string, bigint>>"),
319
- # odps_types.Column("col17", "CHAR(15)"),
320
- # odps_types.Column("col18", "VARCHAR(15)"),
321
- # odps_types.Column("col19", "decimal"),
322
- ]
323
- )
324
- pd_dtypes = odps_schema_to_pandas_dtypes(odps_schema)
325
- pd.testing.assert_series_equal(
326
- pd_dtypes,
327
- pd.Series(
328
- [
329
- np.dtype("O"), # string
330
- np.dtype("O"), # binary
331
- np.dtype(np.int8),
332
- np.dtype(np.int16),
333
- np.dtype(np.int32),
334
- np.dtype(np.int64),
335
- np.dtype(np.bool_),
336
- np.dtype(np.float32),
337
- np.dtype(np.float64),
338
- np.dtype(
339
- "datetime64[ms]" if pd_release_version[0] >= 2 else "datetime64[ns]"
340
- ),
341
- np.dtype("datetime64[ns]"),
342
- ArrowDtype(pa.list_(pa.string())),
343
- ArrowDtype(pa.map_(pa.string(), pa.int64())),
344
- ],
345
- index=[c.name for c in odps_schema.columns],
329
+ def _get_odps_schema_for_test(cast_result=False):
330
+ test_pyarrow = options.dataframe.dtype_backend == "pyarrow"
331
+ cols = [
332
+ odps_types.Column("col1", "string"),
333
+ odps_types.Column(
334
+ "col2", "binary" if test_pyarrow or not cast_result else "string"
346
335
  ),
347
- )
336
+ odps_types.Column("col3", "tinyint"),
337
+ odps_types.Column("col4", "smallint"),
338
+ odps_types.Column("col5", "int"),
339
+ odps_types.Column("col6", "bigint"),
340
+ odps_types.Column("col7", "boolean"),
341
+ odps_types.Column("col8", "float"),
342
+ odps_types.Column("col9", "double"),
343
+ odps_types.Column("col10", "date") if test_pyarrow else None,
344
+ odps_types.Column(
345
+ "col11",
346
+ "datetime" if test_pyarrow or pd_release_version[0] >= 2 else "timestamp",
347
+ ),
348
+ odps_types.Column("col12", "timestamp"),
349
+ odps_types.Column("col13", "decimal(10, 2)") if test_pyarrow else None,
350
+ odps_types.Column("col14", "array<string>"),
351
+ odps_types.Column("col15", "map<string, bigint>"),
352
+ odps_types.Column("col16", "struct<a1: string, a2: map<string, bigint>>"),
353
+ odps_types.Column("col17", "CHAR(15)" if not cast_result else "string")
354
+ if test_pyarrow
355
+ else None,
356
+ odps_types.Column("col18", "VARCHAR(15)" if not cast_result else "string")
357
+ if test_pyarrow
358
+ else None,
359
+ ]
360
+ return odps_types.OdpsSchema([c for c in cols if c is not None])
348
361
 
349
- expected_odps_schema = odps_types.OdpsSchema(
362
+
363
+ def _assert_odps_schema_equal(left, right):
364
+ assert [c.name for c in left.columns] == [c.name for c in right.columns]
365
+ assert [c.type for c in left.columns] == [c.type for c in right.columns]
366
+
367
+
368
+ @pytest.mark.parametrize("set_dtype_backend", ["numpy"], indirect=True)
369
+ def test_odps_pandas_schema_conversion_with_numpy(set_dtype_backend):
370
+ # Create an ODPS schema with various data types
371
+ odps_schema = _get_odps_schema_for_test()
372
+ pd_dtypes = odps_schema_to_pandas_dtypes(odps_schema)
373
+
374
+ expected_series = pd.Series(
350
375
  [
351
- odps_types.Column("col1", "string"),
352
- odps_types.Column("col2", "string"), # binary
353
- odps_types.Column("col3", "tinyint"),
354
- odps_types.Column("col4", "smallint"),
355
- odps_types.Column("col5", "int"),
356
- odps_types.Column("col6", "bigint"),
357
- odps_types.Column("col7", "boolean"),
358
- odps_types.Column("col8", "float"),
359
- odps_types.Column("col9", "double"),
360
- # odps_types.Column("col10", "date"),
361
- odps_types.Column(
362
- "col11", "datetime" if pd_release_version[0] >= 2 else "timestamp"
376
+ np.dtype("O"), # string
377
+ np.dtype("O"), # binary
378
+ np.dtype(np.int8),
379
+ np.dtype(np.int16),
380
+ np.dtype(np.int32),
381
+ np.dtype(np.int64),
382
+ np.dtype(np.bool_),
383
+ np.dtype(np.float32),
384
+ np.dtype(np.float64),
385
+ np.dtype(
386
+ "datetime64[ms]" if pd_release_version[0] >= 2 else "datetime64[ns]"
363
387
  ),
364
- odps_types.Column("col12", "timestamp"),
365
- # odps_types.Column("col13", "decimal(10, 2)"),
366
- odps_types.Column("col14", "array<string>"),
367
- odps_types.Column("col15", "map<string, bigint>"),
368
- # odps_types.Column("col16", "struct<a1: string, a2: map<string, bigint>>"),
369
- # odps_types.Column("col17", "string"),
370
- # odps_types.Column("col18", "string"),
371
- # odps_types.Column("col19", "decimal(38, 18)"),
372
- ]
388
+ np.dtype("datetime64[ns]"),
389
+ ArrowDtype(pa.list_(pa.string())),
390
+ ArrowDtype(pa.map_(pa.string(), pa.int64())),
391
+ ArrowDtype(
392
+ pa.struct(
393
+ [
394
+ pa.field("a1", pa.string()),
395
+ pa.field("a2", pa.map_(pa.string(), pa.int64())),
396
+ ]
397
+ )
398
+ ),
399
+ ],
400
+ index=[c.name for c in odps_schema.columns],
373
401
  )
374
402
 
403
+ pd.testing.assert_series_equal(pd_dtypes, expected_series)
404
+
405
+ expected_odps_schema = _get_odps_schema_for_test(cast_result=True)
406
+
375
407
  odps_schema2 = arrow_schema_to_odps_schema(
376
408
  pandas_dtypes_to_arrow_schema(pd_dtypes, unknown_as_string=True)
377
409
  )
378
- assert [c.name for c in expected_odps_schema.columns] == [
379
- c.name for c in odps_schema2.columns
380
- ]
381
- assert [c.type for c in expected_odps_schema.columns] == [
382
- c.type for c in odps_schema2.columns
383
- ]
410
+ _assert_odps_schema_equal(expected_odps_schema, odps_schema2)
384
411
 
412
+ # Test that unsupported data types raise TypeError
385
413
  with pytest.raises(TypeError):
386
414
  arrow_schema_to_odps_schema(pa.schema([("col1", pa.float16())]))
387
415
 
388
416
 
417
+ @pytest.mark.parametrize("set_dtype_backend", ["pyarrow"], indirect=True)
418
+ def test_odps_pandas_schema_conversion_with_pyarrow(set_dtype_backend):
419
+ # Create an ODPS schema with various data types
420
+ odps_schema = _get_odps_schema_for_test()
421
+ pd_dtypes = odps_schema_to_pandas_dtypes(odps_schema)
422
+
423
+ # When dtype_backend is pyarrow, complex types should be ArrowDtype
424
+ expected_series = pd.Series(
425
+ [
426
+ ArrowDtype(pa.string()),
427
+ ArrowDtype(pa.binary()),
428
+ ArrowDtype(pa.int8()),
429
+ ArrowDtype(pa.int16()),
430
+ ArrowDtype(pa.int32()),
431
+ ArrowDtype(pa.int64()),
432
+ ArrowDtype(pa.bool_()),
433
+ ArrowDtype(pa.float32()),
434
+ ArrowDtype(pa.float64()),
435
+ ArrowDtype(pa.date32()),
436
+ ArrowDtype(pa.timestamp("ms")),
437
+ ArrowDtype(pa.timestamp("ns")),
438
+ ArrowDtype(pa.decimal128(10, 2)),
439
+ ArrowDtype(pa.list_(pa.string())),
440
+ ArrowDtype(pa.map_(pa.string(), pa.int64())),
441
+ ArrowDtype(
442
+ pa.struct(
443
+ [
444
+ pa.field("a1", pa.string()),
445
+ pa.field("a2", pa.map_(pa.string(), pa.int64())),
446
+ ]
447
+ )
448
+ ),
449
+ ArrowDtype(pa.string()),
450
+ ArrowDtype(pa.string()),
451
+ ],
452
+ index=[c.name for c in odps_schema.columns],
453
+ )
454
+
455
+ pd.testing.assert_series_equal(pd_dtypes, expected_series)
456
+
457
+ expected_odps_schema = _get_odps_schema_for_test(cast_result=True)
458
+
459
+ odps_schema2 = arrow_schema_to_odps_schema(
460
+ pandas_dtypes_to_arrow_schema(pd_dtypes, unknown_as_string=True)
461
+ )
462
+ _assert_odps_schema_equal(expected_odps_schema, odps_schema2)
463
+
464
+
389
465
  def test_build_column_name():
390
466
  records = dict()
467
+ # Test that long valid names are preserved
391
468
  assert build_table_column_name(0, "a" * 127, records) == "a" * 127
469
+
470
+ # Test that valid names with underscores and alphanumeric chars are preserved
392
471
  assert build_table_column_name(1, "_abc123", records) == "_abc123"
472
+
473
+ # Test that names with invalid characters are replaced with generated names
393
474
  assert build_table_column_name(2, "_abc'123", records) == "_column_2"
475
+
476
+ # Test that overly long names are replaced with generated names
394
477
  assert build_table_column_name(3, "a" * 256, records) == "_column_3"
478
+
479
+ # Test that tuple names are converted to underscore-separated strings
395
480
  assert build_table_column_name(4, ("A", 1), records) == "a_1"
396
481
 
397
482
 
398
483
  @pytest.mark.parametrize("wrap_obj", ["no", "yes", "data"])
399
- def test_build_table_meta(wrap_obj):
484
+ @pytest.mark.parametrize("set_dtype_backend", ["numpy", "pyarrow"], indirect=True)
485
+ def test_build_table_meta(wrap_obj, set_dtype_backend):
400
486
  data = pd.DataFrame(
401
487
  np.random.rand(100, 7),
402
488
  columns=["A", "A", "A_0", "A_1", "a_1", "B", "C"],
@@ -411,7 +497,9 @@ def test_build_table_meta(wrap_obj):
411
497
  @pytest.mark.skipif(
412
498
  pd_release_version[0] < 2, reason="only run under pandas 2.0 or greater"
413
499
  )
414
- def test_table_meta_with_datetime():
500
+ @pytest.mark.parametrize("set_dtype_backend", ["numpy", "pyarrow"], indirect=True)
501
+ def test_table_meta_with_datetime(set_dtype_backend):
502
+ # Test DataFrame with datetime column
415
503
  raw_df = pd.DataFrame(
416
504
  [
417
505
  [1, "abc", "2024-10-01 11:23:12"],
@@ -423,6 +511,7 @@ def test_table_meta_with_datetime():
423
511
  schema, _ = pandas_to_odps_schema(df, unknown_as_string=True)
424
512
  assert schema.columns[3].type == odps_types.datetime
425
513
 
514
+ # Test Series with datetime dtype
426
515
  raw_series = pd.Series(
427
516
  ["2024-10-01 11:23:12", "2024-10-02 22:55:13"], dtype="datetime64[ms]"
428
517
  )
@@ -430,6 +519,7 @@ def test_table_meta_with_datetime():
430
519
  schema, _ = pandas_to_odps_schema(s, unknown_as_string=True)
431
520
  assert schema.columns[1].type == odps_types.datetime
432
521
 
522
+ # Test Index with datetime dtype
433
523
  raw_index = pd.Index(
434
524
  ["2024-10-01 11:23:12", "2024-10-02 22:55:13"], dtype="datetime64[ms]"
435
525
  )
@@ -437,6 +527,7 @@ def test_table_meta_with_datetime():
437
527
  schema, _ = pandas_to_odps_schema(idx, unknown_as_string=True)
438
528
  assert schema.columns[0].type == odps_types.datetime
439
529
 
530
+ # Test MultiIndex with datetime column
440
531
  src_df = pd.DataFrame(
441
532
  [[1, "2024-10-01 11:23:12"], [3, "2024-10-02 22:55:13"]],
442
533
  columns=["A", "B"],
@@ -463,3 +554,27 @@ def test_pandas_types_to_arrow_schema():
463
554
  assert schema.field("int8").type == pa.int8()
464
555
  assert schema.field("map").type == pa.map_(pa.string(), pa.string())
465
556
  assert schema.field("list").type == pa.list_(pa.string())
557
+
558
+
559
+ @pytest.mark.skipif(
560
+ not hasattr(odps_types, "blob"),
561
+ reason="need pyodps to support blob type to run this test",
562
+ )
563
+ def test_blob_types_conversion():
564
+ pd_data = pd.DataFrame(
565
+ {
566
+ "int_col": pd.Series([1, 2], dtype=np.int64),
567
+ "blob_col": pd.Series([b"abcd", b"efgh"], dtype="blob"),
568
+ },
569
+ )
570
+ arrow_schema = pandas_types_to_arrow_schema(pd_data)
571
+ assert arrow_schema.field("int_col").type == pa.int64()
572
+ assert arrow_schema.field("blob_col").type == ArrowBlobType()
573
+
574
+ odps_schema = arrow_schema_to_odps_schema(arrow_schema)
575
+ assert odps_schema.columns[0].type == odps_types.bigint
576
+ assert odps_schema.columns[1].type == odps_types.blob
577
+
578
+ arrow_schema2 = odps_schema_to_arrow_schema(odps_schema)
579
+ assert arrow_schema2.field("int_col").type == pa.int64()
580
+ assert arrow_schema2.field("blob_col").type == ArrowBlobType()
@@ -12,6 +12,14 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- from . import contrib, model_selection, preprocessing
15
+ from . import (
16
+ cluster,
17
+ contrib,
18
+ linear_model,
19
+ metrics,
20
+ model_selection,
21
+ preprocessing,
22
+ utils,
23
+ )
16
24
 
17
- del contrib, model_selection, preprocessing
25
+ del cluster, contrib, linear_model, metrics, model_selection, preprocessing, utils
@@ -0,0 +1,15 @@
1
+ # Copyright 1999-2025 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from ._kmeans import KMeans, k_means