maxframe: 2.2.0-cp39-cp39-macosx_10_9_universal2.whl → 2.3.0rc1-cp39-cp39-macosx_10_9_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of maxframe might be problematic.

Files changed (114)
  1. maxframe/_utils.cpython-39-darwin.so +0 -0
  2. maxframe/codegen/core.py +3 -2
  3. maxframe/codegen/spe/dataframe/merge.py +4 -0
  4. maxframe/codegen/spe/dataframe/misc.py +2 -0
  5. maxframe/codegen/spe/dataframe/reduction.py +18 -0
  6. maxframe/codegen/spe/dataframe/sort.py +9 -1
  7. maxframe/codegen/spe/dataframe/tests/test_reduction.py +13 -0
  8. maxframe/codegen/spe/dataframe/tseries.py +9 -0
  9. maxframe/codegen/spe/learn/contrib/lightgbm.py +4 -3
  10. maxframe/codegen/spe/tensor/datasource.py +1 -0
  11. maxframe/config/config.py +3 -0
  12. maxframe/conftest.py +10 -0
  13. maxframe/core/base.py +2 -1
  14. maxframe/core/entity/tileables.py +2 -0
  15. maxframe/core/graph/core.cpython-39-darwin.so +0 -0
  16. maxframe/core/graph/entity.py +7 -1
  17. maxframe/core/mode.py +6 -1
  18. maxframe/dataframe/__init__.py +2 -2
  19. maxframe/dataframe/arithmetic/__init__.py +4 -0
  20. maxframe/dataframe/arithmetic/maximum.py +33 -0
  21. maxframe/dataframe/arithmetic/minimum.py +33 -0
  22. maxframe/dataframe/core.py +98 -106
  23. maxframe/dataframe/datasource/core.py +6 -0
  24. maxframe/dataframe/datasource/direct.py +57 -0
  25. maxframe/dataframe/datasource/read_csv.py +19 -11
  26. maxframe/dataframe/datasource/read_odps_query.py +29 -6
  27. maxframe/dataframe/datasource/read_odps_table.py +32 -10
  28. maxframe/dataframe/datasource/read_parquet.py +38 -39
  29. maxframe/dataframe/datastore/__init__.py +6 -0
  30. maxframe/dataframe/datastore/direct.py +268 -0
  31. maxframe/dataframe/datastore/to_odps.py +6 -0
  32. maxframe/dataframe/extensions/flatjson.py +2 -1
  33. maxframe/dataframe/groupby/__init__.py +5 -1
  34. maxframe/dataframe/groupby/aggregation.py +10 -6
  35. maxframe/dataframe/groupby/apply_chunk.py +1 -3
  36. maxframe/dataframe/groupby/core.py +20 -4
  37. maxframe/dataframe/indexing/__init__.py +2 -1
  38. maxframe/dataframe/indexing/insert.py +45 -17
  39. maxframe/dataframe/merge/__init__.py +3 -0
  40. maxframe/dataframe/merge/combine.py +244 -0
  41. maxframe/dataframe/misc/__init__.py +14 -3
  42. maxframe/dataframe/misc/check_unique.py +41 -10
  43. maxframe/dataframe/misc/drop.py +31 -0
  44. maxframe/dataframe/misc/infer_dtypes.py +251 -0
  45. maxframe/dataframe/misc/map.py +31 -18
  46. maxframe/dataframe/misc/repeat.py +159 -0
  47. maxframe/dataframe/misc/tests/test_misc.py +35 -1
  48. maxframe/dataframe/missing/checkna.py +3 -2
  49. maxframe/dataframe/reduction/__init__.py +10 -5
  50. maxframe/dataframe/reduction/aggregation.py +6 -6
  51. maxframe/dataframe/reduction/argmax.py +7 -4
  52. maxframe/dataframe/reduction/argmin.py +7 -4
  53. maxframe/dataframe/reduction/core.py +18 -9
  54. maxframe/dataframe/reduction/mode.py +144 -0
  55. maxframe/dataframe/reduction/nunique.py +10 -3
  56. maxframe/dataframe/reduction/tests/test_reduction.py +12 -0
  57. maxframe/dataframe/sort/__init__.py +9 -2
  58. maxframe/dataframe/sort/argsort.py +7 -1
  59. maxframe/dataframe/sort/core.py +1 -1
  60. maxframe/dataframe/sort/rank.py +147 -0
  61. maxframe/dataframe/tseries/__init__.py +19 -0
  62. maxframe/dataframe/tseries/at_time.py +61 -0
  63. maxframe/dataframe/tseries/between_time.py +122 -0
  64. maxframe/dataframe/utils.py +30 -26
  65. maxframe/learn/contrib/llm/core.py +16 -7
  66. maxframe/learn/contrib/llm/deploy/__init__.py +13 -0
  67. maxframe/learn/contrib/llm/deploy/config.py +221 -0
  68. maxframe/learn/contrib/llm/deploy/core.py +247 -0
  69. maxframe/learn/contrib/llm/deploy/framework.py +35 -0
  70. maxframe/learn/contrib/llm/deploy/loader.py +360 -0
  71. maxframe/learn/contrib/llm/deploy/tests/__init__.py +13 -0
  72. maxframe/learn/contrib/llm/deploy/tests/test_register_models.py +359 -0
  73. maxframe/learn/contrib/llm/models/__init__.py +1 -0
  74. maxframe/learn/contrib/llm/models/dashscope.py +12 -6
  75. maxframe/learn/contrib/llm/models/managed.py +76 -11
  76. maxframe/learn/contrib/llm/models/openai.py +72 -0
  77. maxframe/learn/contrib/llm/tests/__init__.py +13 -0
  78. maxframe/learn/contrib/llm/tests/test_core.py +34 -0
  79. maxframe/learn/contrib/llm/tests/test_openai.py +187 -0
  80. maxframe/learn/contrib/llm/tests/test_text_gen.py +155 -0
  81. maxframe/learn/contrib/llm/text.py +348 -42
  82. maxframe/learn/contrib/models.py +4 -1
  83. maxframe/learn/contrib/xgboost/classifier.py +2 -0
  84. maxframe/learn/contrib/xgboost/core.py +31 -7
  85. maxframe/learn/contrib/xgboost/predict.py +4 -2
  86. maxframe/learn/contrib/xgboost/regressor.py +5 -0
  87. maxframe/learn/contrib/xgboost/train.py +2 -0
  88. maxframe/learn/preprocessing/_data/min_max_scaler.py +34 -23
  89. maxframe/learn/preprocessing/_data/standard_scaler.py +34 -25
  90. maxframe/learn/utils/__init__.py +1 -0
  91. maxframe/learn/utils/extmath.py +42 -9
  92. maxframe/learn/utils/odpsio.py +80 -11
  93. maxframe/lib/filesystem/_oss_lib/common.py +2 -0
  94. maxframe/lib/mmh3.cpython-39-darwin.so +0 -0
  95. maxframe/opcodes.py +9 -1
  96. maxframe/remote/core.py +4 -0
  97. maxframe/serialization/core.cpython-39-darwin.so +0 -0
  98. maxframe/serialization/tests/test_serial.py +2 -2
  99. maxframe/tensor/arithmetic/__init__.py +1 -1
  100. maxframe/tensor/arithmetic/core.py +2 -2
  101. maxframe/tensor/arithmetic/tests/test_arithmetic.py +0 -9
  102. maxframe/tensor/core.py +3 -0
  103. maxframe/tensor/misc/copyto.py +1 -1
  104. maxframe/tests/test_udf.py +61 -0
  105. maxframe/tests/test_utils.py +8 -5
  106. maxframe/udf.py +103 -7
  107. maxframe/utils.py +61 -8
  108. {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/METADATA +1 -2
  109. {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/RECORD +113 -90
  110. maxframe_client/session/task.py +8 -1
  111. maxframe_client/tests/test_session.py +24 -0
  112. maxframe/dataframe/arrays.py +0 -864
  113. {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/WHEEL +0 -0
  114. {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/top_level.txt +0 -0
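
Judging from the new modules listed above (dataframe/sort/rank.py, dataframe/reduction/mode.py, dataframe/tseries/at_time.py and between_time.py, dataframe/misc/repeat.py, dataframe/arithmetic/maximum.py and minimum.py), this release fills in more of the pandas-compatible DataFrame surface. A minimal sketch of how such operators would typically be used, assuming they mirror their pandas counterparts the way the existing maxframe API does; the method names and signatures below are assumptions inferred from the file names, not confirmed by this diff:

    import maxframe.dataframe as md

    df = md.read_odps_table("my_table")          # reader already present in 2.2.0, touched in this diff
    ranked = df["score"].rank()                  # dataframe/sort/rank.py, assumed pandas-style rank
    modes = df["score"].mode()                   # dataframe/reduction/mode.py, assumed pandas-style mode
    morning = df.between_time("09:00", "11:00")  # dataframe/tseries/between_time.py, needs a datetime-like index
    ranked.execute()                             # maxframe builds a lazy graph; execute() submits it for evaluation

The only file diff reproduced in full below is the removal of maxframe/dataframe/arrays.py (item 112).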
@@ -1,864 +0,0 @@
- # Copyright 1999-2025 Alibaba Group Holding Ltd.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import itertools
- import operator
- import re
- from copy import copy as copy_obj
- from numbers import Integral
- from typing import Sequence, Type
-
- import numpy as np
- import pandas as pd
- from pandas._libs import lib
- from pandas.api.extensions import (
-     ExtensionArray,
-     ExtensionDtype,
-     register_extension_dtype,
- )
- from pandas.api.indexers import check_array_indexer
- from pandas.api.types import (
-     is_array_like,
-     is_list_like,
-     is_scalar,
-     is_string_dtype,
-     pandas_dtype,
- )
- from pandas.arrays import StringArray as StringArrayBase
- from pandas.compat import set_function_name
- from pandas.core import ops
- from pandas.core.algorithms import take
-
- try:
-     from pandas._libs.arrays import NDArrayBacked
- except ImportError:
-     NDArrayBacked = None
- try:
-     import pyarrow as pa
-
-     pa_null = pa.NULL
- except ImportError:  # pragma: no cover
-     pa = None
-     pa_null = None
- try:
-     import pyarrow.compute as pc
- except ImportError:  # pragma: no cover
-     pc = None
-
- from ..config import options
- from ..core import is_kernel_mode
- from ..utils import pd_release_version, tokenize
-
- _use_bool_any_all = pd_release_version[:2] >= (1, 3)
- _use_extension_index = pd_release_version[:2] >= (1, 4)
- _object_engine_for_string_array = pd_release_version[:2] >= (1, 5)
-
- if _object_engine_for_string_array:
-     StringArrayBase = type(StringArrayBase)(
-         "StringArrayBase", StringArrayBase.__bases__, dict(StringArrayBase.__dict__)
-     )
-
-
- class ArrowDtype(ExtensionDtype):
-     @property
-     def arrow_type(self):  # pragma: no cover
-         raise NotImplementedError
-
-     def __from_arrow__(self, array):
-         return self.construct_array_type()(array)
-
-
- @register_extension_dtype
- class ArrowStringDtype(ArrowDtype):
-     """
-     Extension dtype for arrow string data.
-
-     .. warning::
-
-        ArrowStringDtype is considered experimental. The implementation and
-        parts of the API may change without warning.
-
-        In particular, ArrowStringDtype.na_value may change to no longer be
-        ``numpy.nan``.
-
-     Attributes
-     ----------
-     None
-
-     Methods
-     -------
-     None
-
-     Examples
-     --------
-     >>> import maxframe.dataframe as md
-     >>> md.ArrowStringDtype()
-     ArrowStringDtype
-     """
-
-     type = str
-     kind = "U"
-     name = "Arrow[string]"
-     na_value = pa_null
-
-     @classmethod
-     def construct_from_string(cls, string):
-         if string == cls.name:
-             return cls()
-         else:
-             raise TypeError(f"Cannot construct a '{cls}' from '{string}'")
-
-     @classmethod
-     def construct_array_type(cls) -> "Type[ArrowStringArray]":
-         return ArrowStringArray
-
-     @property
-     def arrow_type(self):
-         return pa.string()
-
-
- @register_extension_dtype
- class ArrowStringDtypeAlias(ArrowStringDtype):
-     name = "arrow_string"  # register an alias name for compatibility
-
-
- class ArrowListDtypeType(type):
-     """
-     the type of ArrowListDtype, this metaclass determines subclass ability
-     """
-
-     pass
-
-
- class ArrowListDtype(ArrowDtype):
-     _metadata = ("_value_type",)
-
-     def __init__(self, dtype):
-         if isinstance(dtype, type(self)):
-             dtype = dtype.value_type
-         if pa and isinstance(dtype, pa.DataType):
-             dtype = dtype.to_pandas_dtype()
-
-         dtype = pandas_dtype(dtype)
-         if is_string_dtype(dtype) and not isinstance(dtype, ArrowStringDtype):
-             # convert string dtype to arrow string dtype
-             dtype = ArrowStringDtype()
-
-         self._value_type = dtype
-
-     @property
-     def value_type(self):
-         return self._value_type
-
-     @property
-     def kind(self):
-         return "O"
-
-     @property
-     def type(self):
-         return ArrowListDtypeType
-
-     @property
-     def name(self):
-         return f"Arrow[List[{self.value_type.name}]]"
-
-     @property
-     def arrow_type(self):
-         if isinstance(self._value_type, ArrowDtype):
-             arrow_subdtype = self._value_type.arrow_type
-         else:
-             arrow_subdtype = pa.from_numpy_dtype(self._value_type)
-         return pa.list_(arrow_subdtype)
-
-     def __repr__(self) -> str:
-         return self.name
-
-     @classmethod
-     def construct_array_type(cls) -> "Type[ArrowListArray]":
-         return ArrowListArray
-
-     @classmethod
-     def construct_from_string(cls, string):
-         msg = f"Cannot construct a 'ArrowListDtype' from '{string}'"
-         xpr = re.compile(r"Arrow\[List\[(?P<value_type>[^,]*)\]\]$")
-         m = xpr.match(string)
-         if m:
-             value_type = m.groupdict()["value_type"]
-             return ArrowListDtype(value_type)
-         else:
-             raise TypeError(msg)
-
-     @classmethod
-     def is_dtype(cls, dtype) -> bool:
-         dtype = getattr(dtype, "dtype", dtype)
-         if isinstance(dtype, str):
-             try:
-                 cls.construct_from_string(dtype)
-             except TypeError:
-                 return False
-             else:
-                 return True
-         else:
-             return isinstance(dtype, cls)
-
-     def __hash__(self):
-         return super().__hash__()
-
-     def __eq__(self, other):
-         if not isinstance(other, ArrowListDtype):
-             return False
-
-         value_type = self._value_type
-         other_value_type = other._value_type
-
-         try:
-             return value_type == other_value_type
-         except TypeError:
-             # cannot compare numpy dtype and extension dtype
-             return other_value_type == value_type
-
-
- class ArrowArray(ExtensionArray):
-     _arrow_type = None
-
-     def __init__(self, values, dtype: ArrowDtype = None, copy=False):
-         pandas_only = self._pandas_only()
-
-         if pa is not None and not pandas_only:
-             self._init_by_arrow(values, dtype=dtype, copy=copy)
-         elif not is_kernel_mode():
-             # not in kernel mode, allow to use numpy handle data
-             # just for infer dtypes purpose
-             self._init_by_numpy(values, dtype=dtype, copy=copy)
-         else:
-             raise ImportError("Cannot create ArrowArray when `pyarrow` not installed")
-
-         # for test purpose
-         self._force_use_pandas = pandas_only
-
-     def _init_by_arrow(self, values, dtype: ArrowDtype = None, copy=False):
-         if isinstance(values, (pd.Index, pd.Series)):
-             # for pandas Index and Series,
-             # convert to PandasArray
-             values = values.array
-
-         if isinstance(values, type(self)):
-             arrow_array = values._arrow_array
-         elif isinstance(values, ExtensionArray):
-             # if come from pandas object like index,
-             # convert to pandas StringArray first,
-             # validation will be done in construct
-             arrow_array = pa.chunked_array([pa.array(values, from_pandas=True)])
-         elif isinstance(values, pa.ChunkedArray):
-             arrow_array = values
-         elif isinstance(values, pa.Array):
-             arrow_array = pa.chunked_array([values])
-         elif len(values) == 0:  # pragma: no cover
-             arrow_array = pa.chunked_array([pa.array([], type=dtype.arrow_type)])
-         else:
-             arrow_array = pa.chunked_array([pa.array(values, type=dtype.arrow_type)])
-
-         if copy:
-             arrow_array = copy_obj(arrow_array)
-
-         self._use_arrow = True
-         self._arrow_array = arrow_array
-
-         if NDArrayBacked is not None and isinstance(self, NDArrayBacked):
-             NDArrayBacked.__init__(self, np.array([]), dtype)
-         else:
-             self._dtype = dtype
-
-     def _init_by_numpy(self, values, dtype: ArrowDtype = None, copy=False):
-         self._use_arrow = False
-
-         ndarray = np.array(values, copy=copy)
-         if NDArrayBacked is not None and isinstance(self, NDArrayBacked):
-             NDArrayBacked.__init__(self, ndarray, dtype)
-         else:
-             self._dtype = dtype
-         self._ndarray = np.array(values, copy=copy)
-
-     @classmethod
-     def _pandas_only(cls):
-         return options.dataframe.arrow_array.pandas_only
-
-     def __repr__(self):
-         return f"{type(self).__name__}({repr(self._array)})"
-
-     @property
-     def _array(self):
-         return self._arrow_array if self._use_arrow else self._ndarray
-
-     @property
-     def dtype(self) -> "Type[ArrowDtype]":
-         return self._dtype
-
-     @property
-     def nbytes(self) -> int:
-         if self._use_arrow:
-             return sum(
-                 x.size
-                 for chunk in self._arrow_array.chunks
-                 for x in chunk.buffers()
-                 if x is not None
-             )
-         else:
-             return self._ndarray.nbytes
-
-     @property
-     def shape(self):
-         if self._use_arrow:
-             return (self._arrow_array.length(),)
-         else:
-             return self._ndarray.shape
-
-     def memory_usage(self, deep=True) -> int:
-         if self._use_arrow:
-             return self.nbytes
-         else:
-             return pd.Series(self._ndarray).memory_usage(index=False, deep=deep)
-
-     @classmethod
-     def _to_arrow_array(cls, scalars):
-         return pa.array(scalars)
-
-     @classmethod
-     def _from_sequence(cls, scalars, dtype=None, copy=False):
-         if pa is None or cls._pandas_only():
-             # pyarrow not installed, just return numpy
-             ret = np.empty(len(scalars), dtype=object)
-             ret[:] = scalars
-             return cls(ret)
-
-         if pa_null is not None and isinstance(scalars, type(pa_null)):
-             scalars = []
-         elif not hasattr(scalars, "dtype"):
-             ret = np.empty(len(scalars), dtype=object)
-             for i, s in enumerate(scalars):
-                 ret[i] = s
-             scalars = ret
-         elif isinstance(scalars, cls):
-             if copy:
-                 scalars = scalars.copy()
-             return scalars
-         arrow_array = pa.chunked_array([cls._to_arrow_array(scalars)])
-         return cls(arrow_array, dtype=dtype, copy=copy)
-
-     @classmethod
-     def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
-         return cls._from_sequence(strings, dtype=dtype, copy=copy)
-
-     @staticmethod
-     def _can_process_slice_via_arrow(slc):
-         if not isinstance(slc, slice):
-             return False
-         if slc.step is not None and slc.step != 1:
-             return False
-         if slc.start is not None and not isinstance(
-             slc.start, Integral
-         ):  # pragma: no cover
-             return False
-         if slc.stop is not None and not isinstance(
-             slc.stop, Integral
-         ):  # pragma: no cover
-             return False
-         return True
-
-     def _values_for_factorize(self):
-         arr = self.to_numpy()
-         mask = self.isna()
-         arr[mask] = -1
-         return arr, -1
-
-     def _values_for_argsort(self):
-         return self.to_numpy()
-
-     @classmethod
-     def _from_factorized(cls, values, original):
-         return cls(values)
-
-     @staticmethod
-     def _process_pos(pos, length, is_start):
-         if pos is None:
-             return 0 if is_start else length
-         return pos + length if pos < 0 else pos
-
-     @classmethod
-     def _post_scalar_getitem(cls, lst):
-         return lst.to_pandas()[0]
-
-     def __getitem__(self, item):
-         cls = type(self)
-
-         if pa is None or self._force_use_pandas:
-             # pyarrow not installed
-             result = self._ndarray[item]
-             if pd.api.types.is_scalar(item):
-                 return result
-             else:
-                 return type(self)(result)
-
-         has_take = hasattr(self._arrow_array, "take")
-         if not self._force_use_pandas and has_take:
-             if pd.api.types.is_scalar(item):
-                 item = item + len(self) if item < 0 else item
-                 return self._post_scalar_getitem(self._arrow_array.take([item]))
-             elif self._can_process_slice_via_arrow(item):
-                 length = len(self)
-                 start, stop = item.start, item.stop
-                 start = self._process_pos(start, length, True)
-                 stop = self._process_pos(stop, length, False)
-                 return cls(
-                     self._arrow_array.slice(offset=start, length=stop - start),
-                     dtype=self._dtype,
-                 )
-             elif hasattr(item, "dtype") and np.issubdtype(item.dtype, np.bool_):
-                 return cls(
-                     self._arrow_array.filter(pa.array(item, from_pandas=True)),
-                     dtype=self._dtype,
-                 )
-             elif hasattr(item, "dtype"):
-                 length = len(self)
-                 item = np.where(item < 0, item + length, item)
-                 return cls(self._arrow_array.take(item), dtype=self._dtype)
-
-         array = np.asarray(self._arrow_array.to_pandas())
-         return cls(array[item], dtype=self._dtype)
-
-     @classmethod
-     def _concat_same_type(cls, to_concat: Sequence["ArrowArray"]) -> "ArrowArray":
-         if pa is None or cls._pandas_only():
-             # pyarrow not installed
-             return cls(np.concatenate([x._array for x in to_concat]))
-
-         chunks = list(
-             itertools.chain.from_iterable(x._arrow_array.chunks for x in to_concat)
-         )
-         if len(chunks) == 0:
-             chunks = [pa.array([], type=to_concat[0].dtype.arrow_type)]
-         return cls(pa.chunked_array(chunks))
-
-     def __len__(self):
-         return len(self._array)
-
-     def __array__(self, dtype=None):
-         return self.to_numpy(dtype=dtype)
-
-     def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default):
-         if self._use_arrow:
-             array = np.asarray(self._arrow_array.to_pandas())
-         else:
-             array = self._ndarray
-         if copy or na_value is not lib.no_default:
-             array = array.copy()
-         if na_value is not lib.no_default:
-             array[self.isna()] = na_value
-         return array
-
-     @classmethod
-     def _array_fillna(cls, array, value):
-         return array.fillna(value)
-
-     def fillna(self, value=None, method=None, limit=None):
-         cls = type(self)
-
-         if pa is None or self._force_use_pandas:
-             # pyarrow not installed
-             return cls(
-                 pd.Series(self.to_numpy()).fillna(
-                     value=value, method=method, limit=limit
-                 )
-             )
-
-         chunks = []
-         for chunk_array in self._arrow_array.chunks:
-             array = chunk_array.to_pandas()
-             if method is None:
-                 result_array = self._array_fillna(array, value)
-             else:
-                 result_array = array.fillna(value=value, method=method, limit=limit)
-             chunks.append(pa.array(result_array, from_pandas=True))
-         return cls(pa.chunked_array(chunks), dtype=self._dtype)
-
-     def astype(self, dtype, copy=True):
-         dtype = pandas_dtype(dtype)
-         if isinstance(dtype, ArrowStringDtype):
-             if copy:
-                 return self.copy()
-             return self
-
-         if pa is None or self._force_use_pandas:
-             # pyarrow not installed
-             if isinstance(dtype, ArrowDtype):
-                 dtype = dtype.type
-             return type(self)(pd.Series(self.to_numpy()).astype(dtype, copy=copy))
-
-         # try to slice 1 record to get the result dtype
-         test_array = self._arrow_array.slice(0, 1).to_pandas()
-         test_result_array = test_array.astype(dtype).array
-         if _use_extension_index:
-             test_result_type = type(test_array.astype(dtype).values)
-             if test_result_type is np.ndarray:
-                 test_result_type = np.array
-         else:
-             test_result_type = type(test_result_array)
-
-         result_array = test_result_type(
-             np.full(
-                 self.shape,
-                 test_result_array.dtype.na_value,
-                 dtype=np.asarray(test_result_array).dtype,
-             )
-         )
-
-         start = 0
-         # use chunks to do astype
-         for chunk_array in self._arrow_array.chunks:
-             result_array[start : start + len(chunk_array)] = (
-                 chunk_array.to_pandas().astype(dtype).array
-             )
-             start += len(chunk_array)
-         return result_array
-
-     def isna(self):
-         if (
-             not self._force_use_pandas
-             and self._use_arrow
-             and hasattr(self._arrow_array, "is_null")
-         ):
-             return self._arrow_array.is_null().to_pandas().to_numpy()
-         elif self._use_arrow:
-             return pd.isna(self._arrow_array.to_pandas()).to_numpy()
-         else:
-             return pd.isna(self._ndarray)
-
-     def take(self, indices, allow_fill=False, fill_value=None):
-         if (
-             allow_fill is False or (allow_fill and fill_value is self.dtype.na_value)
-         ) and len(self) > 0:
-             return type(self)(self[indices], dtype=self._dtype)
-
-         if self._use_arrow:
-             array = self._arrow_array.to_pandas().to_numpy()
-         else:
-             array = self._ndarray
-
-         replace = False
-         if allow_fill and (fill_value is None or fill_value == self._dtype.na_value):
-             fill_value = self.dtype.na_value
-             replace = True
-
-         result = take(array, indices, fill_value=fill_value, allow_fill=allow_fill)
-         del array
-         if replace and pa is not None:
-             # pyarrow cannot recognize pa.NULL
-             result[result == self.dtype.na_value] = None
-         return type(self)(result, dtype=self._dtype)
-
-     def copy(self):
-         if self._use_arrow:
-             return type(self)(copy_obj(self._arrow_array))
-         else:
-             return type(self)(self._ndarray.copy())
-
-     def unique(self):
-         if self._force_use_pandas or not self._use_arrow or not hasattr(pc, "unique"):
-             return type(self)(np.unique(self.to_numpy()), dtype=self._dtype)
-         return type(self)(pc.unique(self._arrow_array), dtype=self._dtype)
-
-     def value_counts(self, dropna=False):
-         if self._use_arrow:
-             series = self._arrow_array.to_pandas()
-         else:
-             series = pd.Series(self._ndarray)
-         return type(self)(series.value_counts(dropna=dropna), dtype=self._dtype)
-
-     if _use_bool_any_all:
-
-         def any(self, axis=0, out=None):
-             return self.to_numpy().astype(bool).any(axis=axis, out=out)
-
-         def all(self, axis=0, out=None):
-             return self.to_numpy().astype(bool).all(axis=axis, out=out)
-
-     else:
-
-         def any(self, axis=0, out=None):
-             return self.to_numpy().any(axis=axis, out=out)
-
-         def all(self, axis=0, out=None):
-             return self.to_numpy().all(axis=axis, out=out)
-
-     def __maxframe_tokenize__(self):
-         if self._use_arrow:
-             return tokenize(
-                 [
-                     memoryview(x)
-                     for chunk in self._arrow_array.chunks
-                     for x in chunk.buffers()
-                     if x is not None
-                 ]
-             )
-         else:
-             return self._ndarray
-
-
- class ArrowStringArray(ArrowArray, StringArrayBase):
-     def __init__(self, values, dtype=None, copy=False):
-         if dtype is not None:
-             assert isinstance(dtype, ArrowStringDtype)
-         ArrowArray.__init__(self, values, ArrowStringDtype(), copy=copy)
-
-     @classmethod
-     def from_scalars(cls, values):
-         if pa is None or cls._pandas_only():
-             return cls._from_sequence(values)
-         else:
-             arrow_array = pa.chunked_array([cls._to_arrow_array(values)])
-             return cls(arrow_array)
-
-     @classmethod
-     def _to_arrow_array(cls, scalars):
-         return pa.array(scalars).cast(pa.string())
-
-     def __setitem__(self, key, value):
-         if isinstance(value, (pd.Index, pd.Series)):
-             value = value.to_numpy()
-         if isinstance(value, type(self)):
-             value = value.to_numpy()
-
-         key = check_array_indexer(self, key)
-         scalar_key = is_scalar(key)
-         scalar_value = is_scalar(value)
-         if scalar_key and not scalar_value:
-             raise ValueError("setting an array element with a sequence.")
-
-         # validate new items
-         if scalar_value:
-             if pd.isna(value):
-                 value = None
-             elif not isinstance(value, str):
-                 raise ValueError(
-                     f"Cannot set non-string value '{value}' into a ArrowStringArray."
-                 )
-         else:
-             if not is_array_like(value):
-                 value = np.asarray(value, dtype=object)
-             if len(value) and not lib.is_string_array(value, skipna=True):
-                 raise ValueError("Must provide strings.")
-
-         if self._use_arrow:
-             string_array = np.asarray(self._arrow_array.to_pandas())
-             string_array[key] = value
-             self._arrow_array = pa.chunked_array([pa.array(string_array)])
-         else:
-             self._ndarray[key] = value
-
-     # Override parent because we have different return types.
-     @classmethod
-     def _create_arithmetic_method(cls, op):
-         # Note: this handles both arithmetic and comparison methods.
-         def method(self, other):
-             is_arithmetic = True if op.__name__ in ops.ARITHMETIC_BINOPS else False
-             pandas_only = cls._pandas_only()
-
-             is_other_array = False
-             if not is_scalar(other):
-                 is_other_array = True
-                 other = np.asarray(other)
-
-             self_is_na = self.isna()
-             other_is_na = pd.isna(other)
-             mask = self_is_na | other_is_na
-
-             if pa is None or pandas_only:
-                 if is_arithmetic:
-                     ret = np.empty(self.shape, dtype=object)
-                 else:
-                     ret = np.zeros(self.shape, dtype=bool)
-                 valid = ~mask
-                 arr = (
-                     self._arrow_array.to_pandas().to_numpy()
-                     if self._use_arrow
-                     else self._ndarray
-                 )
-                 o = other[valid] if is_other_array else other
-                 ret[valid] = op(arr[valid], o)
-                 if is_arithmetic:
-                     return ArrowStringArray(ret)
-                 else:
-                     return pd.arrays.BooleanArray(ret, mask)
-
-             chunks = []
-             mask_chunks = []
-             start = 0
-             for chunk_array in self._arrow_array.chunks:
-                 chunk_array = np.asarray(chunk_array.to_pandas())
-                 end = start + len(chunk_array)
-                 chunk_mask = mask[start:end]
-                 chunk_valid = ~chunk_mask
-
-                 if is_arithmetic:
-                     result = np.empty(chunk_array.shape, dtype=object)
-                 else:
-                     result = np.zeros(chunk_array.shape, dtype=bool)
-
-                 chunk_other = other
-                 if is_other_array:
-                     chunk_other = other[start:end]
-                     chunk_other = chunk_other[chunk_valid]
-
-                 # calculate only for both not None
-                 result[chunk_valid] = op(chunk_array[chunk_valid], chunk_other)
-
-                 if is_arithmetic:
-                     chunks.append(pa.array(result, type=pa.string(), from_pandas=True))
-                 else:
-                     chunks.append(result)
-                     mask_chunks.append(chunk_mask)
-
-             if is_arithmetic:
-                 return ArrowStringArray(pa.chunked_array(chunks))
-             else:
-                 return pd.arrays.BooleanArray(
-                     np.concatenate(chunks), np.concatenate(mask_chunks)
-                 )
-
-         return set_function_name(method, f"__{op.__name__}__", cls)
-
-     def shift(self, periods: int = 1, fill_value: object = None) -> "ArrowStringArray":
-         return ExtensionArray.shift(self, periods=periods, fill_value=fill_value)
-
-     @classmethod
-     def _add_arithmetic_ops(cls):
-         cls.__add__ = cls._create_arithmetic_method(operator.add)
-         cls.__radd__ = cls._create_arithmetic_method(ops.radd)
-
-         cls.__mul__ = cls._create_arithmetic_method(operator.mul)
-         cls.__rmul__ = cls._create_arithmetic_method(ops.rmul)
-
-     @classmethod
-     def _add_comparison_ops(cls):
-         cls.__eq__ = cls._create_comparison_method(operator.eq)
-         cls.__ne__ = cls._create_comparison_method(operator.ne)
-         cls.__lt__ = cls._create_comparison_method(operator.lt)
-         cls.__gt__ = cls._create_comparison_method(operator.gt)
-         cls.__le__ = cls._create_comparison_method(operator.le)
-         cls.__ge__ = cls._create_comparison_method(operator.ge)
-
-     _create_comparison_method = _create_arithmetic_method
-
-
- ArrowStringArray._add_arithmetic_ops()
- ArrowStringArray._add_comparison_ops()
-
-
- class ArrowListArray(ArrowArray):
-     def __init__(self, values, dtype: ArrowListDtype = None, copy=False):
-         if dtype is None:
-             if isinstance(values, type(self)):
-                 dtype = values.dtype
-             elif pa is not None:
-                 if isinstance(values, pa.Array):
-                     dtype = ArrowListDtype(values.type.value_type)
-                 elif isinstance(values, pa.ChunkedArray):
-                     dtype = ArrowListDtype(values.type.value_type)
-                 else:
-                     values = pa.array(values)
-                     if values.type == pa.null():
-                         dtype = ArrowListDtype(pa.string())
-                     else:
-                         dtype = ArrowListDtype(values.type.value_type)
-             else:
-                 value_type = np.asarray(values[0]).dtype
-                 dtype = ArrowListDtype(value_type)
-
-         super().__init__(values, dtype=dtype, copy=copy)
-
-     def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default):
-         if self._use_arrow:
-             s = self._arrow_array.to_pandas()
-         else:
-             s = pd.Series(self._ndarray)
-         s = s.map(lambda x: x.tolist() if hasattr(x, "tolist") else x)
-         if copy or na_value is not lib.no_default:
-             s = s.copy()
-         if na_value is not lib.no_default:
-             s[self.isna()] = na_value
-         return np.asarray(s)
-
-     @classmethod
-     def _post_scalar_getitem(cls, lst):
-         return lst[0].as_py()
-
-     def __setitem__(self, key, value):
-         if isinstance(value, (pd.Index, pd.Series)):
-             value = value.to_numpy()
-
-         key = check_array_indexer(self, key)
-         scalar_key = is_scalar(key)
-
-         # validate new items
-         if scalar_key:
-             if pd.isna(value):
-                 value = None
-             elif not is_list_like(value):
-                 raise ValueError("Must provide list.")
-
-         if self._use_arrow:
-             array = np.asarray(self._arrow_array.to_pandas())
-             array[key] = value
-             self._arrow_array = pa.chunked_array(
-                 [pa.array(array, type=self.dtype.arrow_type)]
-             )
-         else:
-             self._ndarray[key] = value
-
-     @classmethod
-     def _array_fillna(cls, series, value):
-         # cannot fillna directly, because value is a list-like object
-         return series.apply(lambda x: x if is_list_like(x) or not pd.isna(x) else value)
-
-     def astype(self, dtype, copy=True):
-         msg = f"cannot astype from {self.dtype} to {dtype}"
-         dtype = pandas_dtype(dtype)
-         if isinstance(dtype, ArrowListDtype):
-             if self.dtype == dtype:
-                 if copy:
-                     return self.copy()
-                 return self
-             else:
-                 if self._use_arrow:
-                     try:
-                         arrow_array = self._arrow_array.cast(dtype.arrow_type)
-                         return ArrowListArray(arrow_array)
-                     except (NotImplementedError, pa.ArrowInvalid):
-                         raise TypeError(msg)
-                 else:
-
-                     def f(x):
-                         return pd.Series(x).astype(dtype.value_type.type).tolist()
-
-                     try:
-                         arr = pd.Series(self._ndarray)
-                         ret = arr.map(f).to_numpy()
-                         return ArrowStringArray(ret)
-                     except ValueError:
-                         raise TypeError(msg)
-
-         try:
-             return super().astype(dtype, copy=copy)
-         except ValueError:
-             raise TypeError(msg)
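
For context on what the removal takes away: maxframe/dataframe/arrays.py provided the pyarrow-backed pandas extension dtypes defined above. A minimal sketch of how they were used in 2.2.0, grounded in the class definitions in this diff (the md.ArrowStringDtype exposure follows the docstring above; the direct module import path is simply the file path shown in the listing):

    import pandas as pd
    import maxframe.dataframe as md                       # importing registers the dtypes in 2.2.0
    from maxframe.dataframe.arrays import ArrowListArray  # module removed in 2.3.0rc1

    # ArrowStringDtype is registered via @register_extension_dtype under the
    # name "Arrow[string]" (with "arrow_string" kept as a compatibility alias):
    s = pd.Series(["foo", "bar", None], dtype=md.ArrowStringDtype())

    # ArrowListArray infers an ArrowListDtype from its values when no dtype
    # is passed (see ArrowListArray.__init__ above):
    lst = ArrowListArray([[1, 2], [3]])                   # dtype becomes Arrow[List[int64]]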