maxframe 2.2.0__cp39-cp39-macosx_10_9_universal2.whl → 2.3.0rc1__cp39-cp39-macosx_10_9_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/_utils.cpython-39-darwin.so +0 -0
- maxframe/codegen/core.py +3 -2
- maxframe/codegen/spe/dataframe/merge.py +4 -0
- maxframe/codegen/spe/dataframe/misc.py +2 -0
- maxframe/codegen/spe/dataframe/reduction.py +18 -0
- maxframe/codegen/spe/dataframe/sort.py +9 -1
- maxframe/codegen/spe/dataframe/tests/test_reduction.py +13 -0
- maxframe/codegen/spe/dataframe/tseries.py +9 -0
- maxframe/codegen/spe/learn/contrib/lightgbm.py +4 -3
- maxframe/codegen/spe/tensor/datasource.py +1 -0
- maxframe/config/config.py +3 -0
- maxframe/conftest.py +10 -0
- maxframe/core/base.py +2 -1
- maxframe/core/entity/tileables.py +2 -0
- maxframe/core/graph/core.cpython-39-darwin.so +0 -0
- maxframe/core/graph/entity.py +7 -1
- maxframe/core/mode.py +6 -1
- maxframe/dataframe/__init__.py +2 -2
- maxframe/dataframe/arithmetic/__init__.py +4 -0
- maxframe/dataframe/arithmetic/maximum.py +33 -0
- maxframe/dataframe/arithmetic/minimum.py +33 -0
- maxframe/dataframe/core.py +98 -106
- maxframe/dataframe/datasource/core.py +6 -0
- maxframe/dataframe/datasource/direct.py +57 -0
- maxframe/dataframe/datasource/read_csv.py +19 -11
- maxframe/dataframe/datasource/read_odps_query.py +29 -6
- maxframe/dataframe/datasource/read_odps_table.py +32 -10
- maxframe/dataframe/datasource/read_parquet.py +38 -39
- maxframe/dataframe/datastore/__init__.py +6 -0
- maxframe/dataframe/datastore/direct.py +268 -0
- maxframe/dataframe/datastore/to_odps.py +6 -0
- maxframe/dataframe/extensions/flatjson.py +2 -1
- maxframe/dataframe/groupby/__init__.py +5 -1
- maxframe/dataframe/groupby/aggregation.py +10 -6
- maxframe/dataframe/groupby/apply_chunk.py +1 -3
- maxframe/dataframe/groupby/core.py +20 -4
- maxframe/dataframe/indexing/__init__.py +2 -1
- maxframe/dataframe/indexing/insert.py +45 -17
- maxframe/dataframe/merge/__init__.py +3 -0
- maxframe/dataframe/merge/combine.py +244 -0
- maxframe/dataframe/misc/__init__.py +14 -3
- maxframe/dataframe/misc/check_unique.py +41 -10
- maxframe/dataframe/misc/drop.py +31 -0
- maxframe/dataframe/misc/infer_dtypes.py +251 -0
- maxframe/dataframe/misc/map.py +31 -18
- maxframe/dataframe/misc/repeat.py +159 -0
- maxframe/dataframe/misc/tests/test_misc.py +35 -1
- maxframe/dataframe/missing/checkna.py +3 -2
- maxframe/dataframe/reduction/__init__.py +10 -5
- maxframe/dataframe/reduction/aggregation.py +6 -6
- maxframe/dataframe/reduction/argmax.py +7 -4
- maxframe/dataframe/reduction/argmin.py +7 -4
- maxframe/dataframe/reduction/core.py +18 -9
- maxframe/dataframe/reduction/mode.py +144 -0
- maxframe/dataframe/reduction/nunique.py +10 -3
- maxframe/dataframe/reduction/tests/test_reduction.py +12 -0
- maxframe/dataframe/sort/__init__.py +9 -2
- maxframe/dataframe/sort/argsort.py +7 -1
- maxframe/dataframe/sort/core.py +1 -1
- maxframe/dataframe/sort/rank.py +147 -0
- maxframe/dataframe/tseries/__init__.py +19 -0
- maxframe/dataframe/tseries/at_time.py +61 -0
- maxframe/dataframe/tseries/between_time.py +122 -0
- maxframe/dataframe/utils.py +30 -26
- maxframe/learn/contrib/llm/core.py +16 -7
- maxframe/learn/contrib/llm/deploy/__init__.py +13 -0
- maxframe/learn/contrib/llm/deploy/config.py +221 -0
- maxframe/learn/contrib/llm/deploy/core.py +247 -0
- maxframe/learn/contrib/llm/deploy/framework.py +35 -0
- maxframe/learn/contrib/llm/deploy/loader.py +360 -0
- maxframe/learn/contrib/llm/deploy/tests/__init__.py +13 -0
- maxframe/learn/contrib/llm/deploy/tests/test_register_models.py +359 -0
- maxframe/learn/contrib/llm/models/__init__.py +1 -0
- maxframe/learn/contrib/llm/models/dashscope.py +12 -6
- maxframe/learn/contrib/llm/models/managed.py +76 -11
- maxframe/learn/contrib/llm/models/openai.py +72 -0
- maxframe/learn/contrib/llm/tests/__init__.py +13 -0
- maxframe/learn/contrib/llm/tests/test_core.py +34 -0
- maxframe/learn/contrib/llm/tests/test_openai.py +187 -0
- maxframe/learn/contrib/llm/tests/test_text_gen.py +155 -0
- maxframe/learn/contrib/llm/text.py +348 -42
- maxframe/learn/contrib/models.py +4 -1
- maxframe/learn/contrib/xgboost/classifier.py +2 -0
- maxframe/learn/contrib/xgboost/core.py +31 -7
- maxframe/learn/contrib/xgboost/predict.py +4 -2
- maxframe/learn/contrib/xgboost/regressor.py +5 -0
- maxframe/learn/contrib/xgboost/train.py +2 -0
- maxframe/learn/preprocessing/_data/min_max_scaler.py +34 -23
- maxframe/learn/preprocessing/_data/standard_scaler.py +34 -25
- maxframe/learn/utils/__init__.py +1 -0
- maxframe/learn/utils/extmath.py +42 -9
- maxframe/learn/utils/odpsio.py +80 -11
- maxframe/lib/filesystem/_oss_lib/common.py +2 -0
- maxframe/lib/mmh3.cpython-39-darwin.so +0 -0
- maxframe/opcodes.py +9 -1
- maxframe/remote/core.py +4 -0
- maxframe/serialization/core.cpython-39-darwin.so +0 -0
- maxframe/serialization/tests/test_serial.py +2 -2
- maxframe/tensor/arithmetic/__init__.py +1 -1
- maxframe/tensor/arithmetic/core.py +2 -2
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +0 -9
- maxframe/tensor/core.py +3 -0
- maxframe/tensor/misc/copyto.py +1 -1
- maxframe/tests/test_udf.py +61 -0
- maxframe/tests/test_utils.py +8 -5
- maxframe/udf.py +103 -7
- maxframe/utils.py +61 -8
- {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/METADATA +1 -2
- {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/RECORD +113 -90
- maxframe_client/session/task.py +8 -1
- maxframe_client/tests/test_session.py +24 -0
- maxframe/dataframe/arrays.py +0 -864
- {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/WHEEL +0 -0
- {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/top_level.txt +0 -0
maxframe/dataframe/arrays.py
DELETED
|
@@ -1,864 +0,0 @@
|
|
|
1
|
-
# Copyright 1999-2025 Alibaba Group Holding Ltd.
|
|
2
|
-
#
|
|
3
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
-
# you may not use this file except in compliance with the License.
|
|
5
|
-
# You may obtain a copy of the License at
|
|
6
|
-
#
|
|
7
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
-
#
|
|
9
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
-
# See the License for the specific language governing permissions and
|
|
13
|
-
# limitations under the License.
|
|
14
|
-
|
|
15
|
-
import itertools
|
|
16
|
-
import operator
|
|
17
|
-
import re
|
|
18
|
-
from copy import copy as copy_obj
|
|
19
|
-
from numbers import Integral
|
|
20
|
-
from typing import Sequence, Type
|
|
21
|
-
|
|
22
|
-
import numpy as np
|
|
23
|
-
import pandas as pd
|
|
24
|
-
from pandas._libs import lib
|
|
25
|
-
from pandas.api.extensions import (
|
|
26
|
-
ExtensionArray,
|
|
27
|
-
ExtensionDtype,
|
|
28
|
-
register_extension_dtype,
|
|
29
|
-
)
|
|
30
|
-
from pandas.api.indexers import check_array_indexer
|
|
31
|
-
from pandas.api.types import (
|
|
32
|
-
is_array_like,
|
|
33
|
-
is_list_like,
|
|
34
|
-
is_scalar,
|
|
35
|
-
is_string_dtype,
|
|
36
|
-
pandas_dtype,
|
|
37
|
-
)
|
|
38
|
-
from pandas.arrays import StringArray as StringArrayBase
|
|
39
|
-
from pandas.compat import set_function_name
|
|
40
|
-
from pandas.core import ops
|
|
41
|
-
from pandas.core.algorithms import take
|
|
42
|
-
|
|
43
|
-
try:
|
|
44
|
-
from pandas._libs.arrays import NDArrayBacked
|
|
45
|
-
except ImportError:
|
|
46
|
-
NDArrayBacked = None
|
|
47
|
-
try:
|
|
48
|
-
import pyarrow as pa
|
|
49
|
-
|
|
50
|
-
pa_null = pa.NULL
|
|
51
|
-
except ImportError: # pragma: no cover
|
|
52
|
-
pa = None
|
|
53
|
-
pa_null = None
|
|
54
|
-
try:
|
|
55
|
-
import pyarrow.compute as pc
|
|
56
|
-
except ImportError: # pragma: no cover
|
|
57
|
-
pc = None
|
|
58
|
-
|
|
59
|
-
from ..config import options
|
|
60
|
-
from ..core import is_kernel_mode
|
|
61
|
-
from ..utils import pd_release_version, tokenize
|
|
62
|
-
|
|
63
|
-
_use_bool_any_all = pd_release_version[:2] >= (1, 3)
|
|
64
|
-
_use_extension_index = pd_release_version[:2] >= (1, 4)
|
|
65
|
-
_object_engine_for_string_array = pd_release_version[:2] >= (1, 5)
|
|
66
|
-
|
|
67
|
-
if _object_engine_for_string_array:
|
|
68
|
-
StringArrayBase = type(StringArrayBase)(
|
|
69
|
-
"StringArrayBase", StringArrayBase.__bases__, dict(StringArrayBase.__dict__)
|
|
70
|
-
)
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
class ArrowDtype(ExtensionDtype):
|
|
74
|
-
@property
|
|
75
|
-
def arrow_type(self): # pragma: no cover
|
|
76
|
-
raise NotImplementedError
|
|
77
|
-
|
|
78
|
-
def __from_arrow__(self, array):
|
|
79
|
-
return self.construct_array_type()(array)
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
@register_extension_dtype
|
|
83
|
-
class ArrowStringDtype(ArrowDtype):
|
|
84
|
-
"""
|
|
85
|
-
Extension dtype for arrow string data.
|
|
86
|
-
|
|
87
|
-
.. warning::
|
|
88
|
-
|
|
89
|
-
ArrowStringDtype is considered experimental. The implementation and
|
|
90
|
-
parts of the API may change without warning.
|
|
91
|
-
|
|
92
|
-
In particular, ArrowStringDtype.na_value may change to no longer be
|
|
93
|
-
``numpy.nan``.
|
|
94
|
-
|
|
95
|
-
Attributes
|
|
96
|
-
----------
|
|
97
|
-
None
|
|
98
|
-
|
|
99
|
-
Methods
|
|
100
|
-
-------
|
|
101
|
-
None
|
|
102
|
-
|
|
103
|
-
Examples
|
|
104
|
-
--------
|
|
105
|
-
>>> import maxframe.dataframe as md
|
|
106
|
-
>>> md.ArrowStringDtype()
|
|
107
|
-
ArrowStringDtype
|
|
108
|
-
"""
|
|
109
|
-
|
|
110
|
-
type = str
|
|
111
|
-
kind = "U"
|
|
112
|
-
name = "Arrow[string]"
|
|
113
|
-
na_value = pa_null
|
|
114
|
-
|
|
115
|
-
@classmethod
|
|
116
|
-
def construct_from_string(cls, string):
|
|
117
|
-
if string == cls.name:
|
|
118
|
-
return cls()
|
|
119
|
-
else:
|
|
120
|
-
raise TypeError(f"Cannot construct a '{cls}' from '{string}'")
|
|
121
|
-
|
|
122
|
-
@classmethod
|
|
123
|
-
def construct_array_type(cls) -> "Type[ArrowStringArray]":
|
|
124
|
-
return ArrowStringArray
|
|
125
|
-
|
|
126
|
-
@property
|
|
127
|
-
def arrow_type(self):
|
|
128
|
-
return pa.string()
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
@register_extension_dtype
|
|
132
|
-
class ArrowStringDtypeAlias(ArrowStringDtype):
|
|
133
|
-
name = "arrow_string" # register an alias name for compatibility
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
class ArrowListDtypeType(type):
|
|
137
|
-
"""
|
|
138
|
-
the type of ArrowListDtype, this metaclass determines subclass ability
|
|
139
|
-
"""
|
|
140
|
-
|
|
141
|
-
pass
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
class ArrowListDtype(ArrowDtype):
|
|
145
|
-
_metadata = ("_value_type",)
|
|
146
|
-
|
|
147
|
-
def __init__(self, dtype):
|
|
148
|
-
if isinstance(dtype, type(self)):
|
|
149
|
-
dtype = dtype.value_type
|
|
150
|
-
if pa and isinstance(dtype, pa.DataType):
|
|
151
|
-
dtype = dtype.to_pandas_dtype()
|
|
152
|
-
|
|
153
|
-
dtype = pandas_dtype(dtype)
|
|
154
|
-
if is_string_dtype(dtype) and not isinstance(dtype, ArrowStringDtype):
|
|
155
|
-
# convert string dtype to arrow string dtype
|
|
156
|
-
dtype = ArrowStringDtype()
|
|
157
|
-
|
|
158
|
-
self._value_type = dtype
|
|
159
|
-
|
|
160
|
-
@property
|
|
161
|
-
def value_type(self):
|
|
162
|
-
return self._value_type
|
|
163
|
-
|
|
164
|
-
@property
|
|
165
|
-
def kind(self):
|
|
166
|
-
return "O"
|
|
167
|
-
|
|
168
|
-
@property
|
|
169
|
-
def type(self):
|
|
170
|
-
return ArrowListDtypeType
|
|
171
|
-
|
|
172
|
-
@property
|
|
173
|
-
def name(self):
|
|
174
|
-
return f"Arrow[List[{self.value_type.name}]]"
|
|
175
|
-
|
|
176
|
-
@property
|
|
177
|
-
def arrow_type(self):
|
|
178
|
-
if isinstance(self._value_type, ArrowDtype):
|
|
179
|
-
arrow_subdtype = self._value_type.arrow_type
|
|
180
|
-
else:
|
|
181
|
-
arrow_subdtype = pa.from_numpy_dtype(self._value_type)
|
|
182
|
-
return pa.list_(arrow_subdtype)
|
|
183
|
-
|
|
184
|
-
def __repr__(self) -> str:
|
|
185
|
-
return self.name
|
|
186
|
-
|
|
187
|
-
@classmethod
|
|
188
|
-
def construct_array_type(cls) -> "Type[ArrowListArray]":
|
|
189
|
-
return ArrowListArray
|
|
190
|
-
|
|
191
|
-
@classmethod
|
|
192
|
-
def construct_from_string(cls, string):
|
|
193
|
-
msg = f"Cannot construct a 'ArrowListDtype' from '{string}'"
|
|
194
|
-
xpr = re.compile(r"Arrow\[List\[(?P<value_type>[^,]*)\]\]$")
|
|
195
|
-
m = xpr.match(string)
|
|
196
|
-
if m:
|
|
197
|
-
value_type = m.groupdict()["value_type"]
|
|
198
|
-
return ArrowListDtype(value_type)
|
|
199
|
-
else:
|
|
200
|
-
raise TypeError(msg)
|
|
201
|
-
|
|
202
|
-
@classmethod
|
|
203
|
-
def is_dtype(cls, dtype) -> bool:
|
|
204
|
-
dtype = getattr(dtype, "dtype", dtype)
|
|
205
|
-
if isinstance(dtype, str):
|
|
206
|
-
try:
|
|
207
|
-
cls.construct_from_string(dtype)
|
|
208
|
-
except TypeError:
|
|
209
|
-
return False
|
|
210
|
-
else:
|
|
211
|
-
return True
|
|
212
|
-
else:
|
|
213
|
-
return isinstance(dtype, cls)
|
|
214
|
-
|
|
215
|
-
def __hash__(self):
|
|
216
|
-
return super().__hash__()
|
|
217
|
-
|
|
218
|
-
def __eq__(self, other):
|
|
219
|
-
if not isinstance(other, ArrowListDtype):
|
|
220
|
-
return False
|
|
221
|
-
|
|
222
|
-
value_type = self._value_type
|
|
223
|
-
other_value_type = other._value_type
|
|
224
|
-
|
|
225
|
-
try:
|
|
226
|
-
return value_type == other_value_type
|
|
227
|
-
except TypeError:
|
|
228
|
-
# cannot compare numpy dtype and extension dtype
|
|
229
|
-
return other_value_type == value_type
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
class ArrowArray(ExtensionArray):
|
|
233
|
-
_arrow_type = None
|
|
234
|
-
|
|
235
|
-
def __init__(self, values, dtype: ArrowDtype = None, copy=False):
|
|
236
|
-
pandas_only = self._pandas_only()
|
|
237
|
-
|
|
238
|
-
if pa is not None and not pandas_only:
|
|
239
|
-
self._init_by_arrow(values, dtype=dtype, copy=copy)
|
|
240
|
-
elif not is_kernel_mode():
|
|
241
|
-
# not in kernel mode, allow to use numpy handle data
|
|
242
|
-
# just for infer dtypes purpose
|
|
243
|
-
self._init_by_numpy(values, dtype=dtype, copy=copy)
|
|
244
|
-
else:
|
|
245
|
-
raise ImportError("Cannot create ArrowArray when `pyarrow` not installed")
|
|
246
|
-
|
|
247
|
-
# for test purpose
|
|
248
|
-
self._force_use_pandas = pandas_only
|
|
249
|
-
|
|
250
|
-
def _init_by_arrow(self, values, dtype: ArrowDtype = None, copy=False):
|
|
251
|
-
if isinstance(values, (pd.Index, pd.Series)):
|
|
252
|
-
# for pandas Index and Series,
|
|
253
|
-
# convert to PandasArray
|
|
254
|
-
values = values.array
|
|
255
|
-
|
|
256
|
-
if isinstance(values, type(self)):
|
|
257
|
-
arrow_array = values._arrow_array
|
|
258
|
-
elif isinstance(values, ExtensionArray):
|
|
259
|
-
# if come from pandas object like index,
|
|
260
|
-
# convert to pandas StringArray first,
|
|
261
|
-
# validation will be done in construct
|
|
262
|
-
arrow_array = pa.chunked_array([pa.array(values, from_pandas=True)])
|
|
263
|
-
elif isinstance(values, pa.ChunkedArray):
|
|
264
|
-
arrow_array = values
|
|
265
|
-
elif isinstance(values, pa.Array):
|
|
266
|
-
arrow_array = pa.chunked_array([values])
|
|
267
|
-
elif len(values) == 0: # pragma: no cover
|
|
268
|
-
arrow_array = pa.chunked_array([pa.array([], type=dtype.arrow_type)])
|
|
269
|
-
else:
|
|
270
|
-
arrow_array = pa.chunked_array([pa.array(values, type=dtype.arrow_type)])
|
|
271
|
-
|
|
272
|
-
if copy:
|
|
273
|
-
arrow_array = copy_obj(arrow_array)
|
|
274
|
-
|
|
275
|
-
self._use_arrow = True
|
|
276
|
-
self._arrow_array = arrow_array
|
|
277
|
-
|
|
278
|
-
if NDArrayBacked is not None and isinstance(self, NDArrayBacked):
|
|
279
|
-
NDArrayBacked.__init__(self, np.array([]), dtype)
|
|
280
|
-
else:
|
|
281
|
-
self._dtype = dtype
|
|
282
|
-
|
|
283
|
-
def _init_by_numpy(self, values, dtype: ArrowDtype = None, copy=False):
|
|
284
|
-
self._use_arrow = False
|
|
285
|
-
|
|
286
|
-
ndarray = np.array(values, copy=copy)
|
|
287
|
-
if NDArrayBacked is not None and isinstance(self, NDArrayBacked):
|
|
288
|
-
NDArrayBacked.__init__(self, ndarray, dtype)
|
|
289
|
-
else:
|
|
290
|
-
self._dtype = dtype
|
|
291
|
-
self._ndarray = np.array(values, copy=copy)
|
|
292
|
-
|
|
293
|
-
@classmethod
|
|
294
|
-
def _pandas_only(cls):
|
|
295
|
-
return options.dataframe.arrow_array.pandas_only
|
|
296
|
-
|
|
297
|
-
def __repr__(self):
|
|
298
|
-
return f"{type(self).__name__}({repr(self._array)})"
|
|
299
|
-
|
|
300
|
-
@property
|
|
301
|
-
def _array(self):
|
|
302
|
-
return self._arrow_array if self._use_arrow else self._ndarray
|
|
303
|
-
|
|
304
|
-
@property
|
|
305
|
-
def dtype(self) -> "Type[ArrowDtype]":
|
|
306
|
-
return self._dtype
|
|
307
|
-
|
|
308
|
-
@property
|
|
309
|
-
def nbytes(self) -> int:
|
|
310
|
-
if self._use_arrow:
|
|
311
|
-
return sum(
|
|
312
|
-
x.size
|
|
313
|
-
for chunk in self._arrow_array.chunks
|
|
314
|
-
for x in chunk.buffers()
|
|
315
|
-
if x is not None
|
|
316
|
-
)
|
|
317
|
-
else:
|
|
318
|
-
return self._ndarray.nbytes
|
|
319
|
-
|
|
320
|
-
@property
|
|
321
|
-
def shape(self):
|
|
322
|
-
if self._use_arrow:
|
|
323
|
-
return (self._arrow_array.length(),)
|
|
324
|
-
else:
|
|
325
|
-
return self._ndarray.shape
|
|
326
|
-
|
|
327
|
-
def memory_usage(self, deep=True) -> int:
|
|
328
|
-
if self._use_arrow:
|
|
329
|
-
return self.nbytes
|
|
330
|
-
else:
|
|
331
|
-
return pd.Series(self._ndarray).memory_usage(index=False, deep=deep)
|
|
332
|
-
|
|
333
|
-
@classmethod
|
|
334
|
-
def _to_arrow_array(cls, scalars):
|
|
335
|
-
return pa.array(scalars)
|
|
336
|
-
|
|
337
|
-
@classmethod
|
|
338
|
-
def _from_sequence(cls, scalars, dtype=None, copy=False):
|
|
339
|
-
if pa is None or cls._pandas_only():
|
|
340
|
-
# pyarrow not installed, just return numpy
|
|
341
|
-
ret = np.empty(len(scalars), dtype=object)
|
|
342
|
-
ret[:] = scalars
|
|
343
|
-
return cls(ret)
|
|
344
|
-
|
|
345
|
-
if pa_null is not None and isinstance(scalars, type(pa_null)):
|
|
346
|
-
scalars = []
|
|
347
|
-
elif not hasattr(scalars, "dtype"):
|
|
348
|
-
ret = np.empty(len(scalars), dtype=object)
|
|
349
|
-
for i, s in enumerate(scalars):
|
|
350
|
-
ret[i] = s
|
|
351
|
-
scalars = ret
|
|
352
|
-
elif isinstance(scalars, cls):
|
|
353
|
-
if copy:
|
|
354
|
-
scalars = scalars.copy()
|
|
355
|
-
return scalars
|
|
356
|
-
arrow_array = pa.chunked_array([cls._to_arrow_array(scalars)])
|
|
357
|
-
return cls(arrow_array, dtype=dtype, copy=copy)
|
|
358
|
-
|
|
359
|
-
@classmethod
|
|
360
|
-
def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
|
|
361
|
-
return cls._from_sequence(strings, dtype=dtype, copy=copy)
|
|
362
|
-
|
|
363
|
-
@staticmethod
|
|
364
|
-
def _can_process_slice_via_arrow(slc):
|
|
365
|
-
if not isinstance(slc, slice):
|
|
366
|
-
return False
|
|
367
|
-
if slc.step is not None and slc.step != 1:
|
|
368
|
-
return False
|
|
369
|
-
if slc.start is not None and not isinstance(
|
|
370
|
-
slc.start, Integral
|
|
371
|
-
): # pragma: no cover
|
|
372
|
-
return False
|
|
373
|
-
if slc.stop is not None and not isinstance(
|
|
374
|
-
slc.stop, Integral
|
|
375
|
-
): # pragma: no cover
|
|
376
|
-
return False
|
|
377
|
-
return True
|
|
378
|
-
|
|
379
|
-
def _values_for_factorize(self):
|
|
380
|
-
arr = self.to_numpy()
|
|
381
|
-
mask = self.isna()
|
|
382
|
-
arr[mask] = -1
|
|
383
|
-
return arr, -1
|
|
384
|
-
|
|
385
|
-
def _values_for_argsort(self):
|
|
386
|
-
return self.to_numpy()
|
|
387
|
-
|
|
388
|
-
@classmethod
|
|
389
|
-
def _from_factorized(cls, values, original):
|
|
390
|
-
return cls(values)
|
|
391
|
-
|
|
392
|
-
@staticmethod
|
|
393
|
-
def _process_pos(pos, length, is_start):
|
|
394
|
-
if pos is None:
|
|
395
|
-
return 0 if is_start else length
|
|
396
|
-
return pos + length if pos < 0 else pos
|
|
397
|
-
|
|
398
|
-
@classmethod
|
|
399
|
-
def _post_scalar_getitem(cls, lst):
|
|
400
|
-
return lst.to_pandas()[0]
|
|
401
|
-
|
|
402
|
-
def __getitem__(self, item):
|
|
403
|
-
cls = type(self)
|
|
404
|
-
|
|
405
|
-
if pa is None or self._force_use_pandas:
|
|
406
|
-
# pyarrow not installed
|
|
407
|
-
result = self._ndarray[item]
|
|
408
|
-
if pd.api.types.is_scalar(item):
|
|
409
|
-
return result
|
|
410
|
-
else:
|
|
411
|
-
return type(self)(result)
|
|
412
|
-
|
|
413
|
-
has_take = hasattr(self._arrow_array, "take")
|
|
414
|
-
if not self._force_use_pandas and has_take:
|
|
415
|
-
if pd.api.types.is_scalar(item):
|
|
416
|
-
item = item + len(self) if item < 0 else item
|
|
417
|
-
return self._post_scalar_getitem(self._arrow_array.take([item]))
|
|
418
|
-
elif self._can_process_slice_via_arrow(item):
|
|
419
|
-
length = len(self)
|
|
420
|
-
start, stop = item.start, item.stop
|
|
421
|
-
start = self._process_pos(start, length, True)
|
|
422
|
-
stop = self._process_pos(stop, length, False)
|
|
423
|
-
return cls(
|
|
424
|
-
self._arrow_array.slice(offset=start, length=stop - start),
|
|
425
|
-
dtype=self._dtype,
|
|
426
|
-
)
|
|
427
|
-
elif hasattr(item, "dtype") and np.issubdtype(item.dtype, np.bool_):
|
|
428
|
-
return cls(
|
|
429
|
-
self._arrow_array.filter(pa.array(item, from_pandas=True)),
|
|
430
|
-
dtype=self._dtype,
|
|
431
|
-
)
|
|
432
|
-
elif hasattr(item, "dtype"):
|
|
433
|
-
length = len(self)
|
|
434
|
-
item = np.where(item < 0, item + length, item)
|
|
435
|
-
return cls(self._arrow_array.take(item), dtype=self._dtype)
|
|
436
|
-
|
|
437
|
-
array = np.asarray(self._arrow_array.to_pandas())
|
|
438
|
-
return cls(array[item], dtype=self._dtype)
|
|
439
|
-
|
|
440
|
-
@classmethod
|
|
441
|
-
def _concat_same_type(cls, to_concat: Sequence["ArrowArray"]) -> "ArrowArray":
|
|
442
|
-
if pa is None or cls._pandas_only():
|
|
443
|
-
# pyarrow not installed
|
|
444
|
-
return cls(np.concatenate([x._array for x in to_concat]))
|
|
445
|
-
|
|
446
|
-
chunks = list(
|
|
447
|
-
itertools.chain.from_iterable(x._arrow_array.chunks for x in to_concat)
|
|
448
|
-
)
|
|
449
|
-
if len(chunks) == 0:
|
|
450
|
-
chunks = [pa.array([], type=to_concat[0].dtype.arrow_type)]
|
|
451
|
-
return cls(pa.chunked_array(chunks))
|
|
452
|
-
|
|
453
|
-
def __len__(self):
|
|
454
|
-
return len(self._array)
|
|
455
|
-
|
|
456
|
-
def __array__(self, dtype=None):
|
|
457
|
-
return self.to_numpy(dtype=dtype)
|
|
458
|
-
|
|
459
|
-
def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default):
|
|
460
|
-
if self._use_arrow:
|
|
461
|
-
array = np.asarray(self._arrow_array.to_pandas())
|
|
462
|
-
else:
|
|
463
|
-
array = self._ndarray
|
|
464
|
-
if copy or na_value is not lib.no_default:
|
|
465
|
-
array = array.copy()
|
|
466
|
-
if na_value is not lib.no_default:
|
|
467
|
-
array[self.isna()] = na_value
|
|
468
|
-
return array
|
|
469
|
-
|
|
470
|
-
@classmethod
|
|
471
|
-
def _array_fillna(cls, array, value):
|
|
472
|
-
return array.fillna(value)
|
|
473
|
-
|
|
474
|
-
def fillna(self, value=None, method=None, limit=None):
|
|
475
|
-
cls = type(self)
|
|
476
|
-
|
|
477
|
-
if pa is None or self._force_use_pandas:
|
|
478
|
-
# pyarrow not installed
|
|
479
|
-
return cls(
|
|
480
|
-
pd.Series(self.to_numpy()).fillna(
|
|
481
|
-
value=value, method=method, limit=limit
|
|
482
|
-
)
|
|
483
|
-
)
|
|
484
|
-
|
|
485
|
-
chunks = []
|
|
486
|
-
for chunk_array in self._arrow_array.chunks:
|
|
487
|
-
array = chunk_array.to_pandas()
|
|
488
|
-
if method is None:
|
|
489
|
-
result_array = self._array_fillna(array, value)
|
|
490
|
-
else:
|
|
491
|
-
result_array = array.fillna(value=value, method=method, limit=limit)
|
|
492
|
-
chunks.append(pa.array(result_array, from_pandas=True))
|
|
493
|
-
return cls(pa.chunked_array(chunks), dtype=self._dtype)
|
|
494
|
-
|
|
495
|
-
def astype(self, dtype, copy=True):
|
|
496
|
-
dtype = pandas_dtype(dtype)
|
|
497
|
-
if isinstance(dtype, ArrowStringDtype):
|
|
498
|
-
if copy:
|
|
499
|
-
return self.copy()
|
|
500
|
-
return self
|
|
501
|
-
|
|
502
|
-
if pa is None or self._force_use_pandas:
|
|
503
|
-
# pyarrow not installed
|
|
504
|
-
if isinstance(dtype, ArrowDtype):
|
|
505
|
-
dtype = dtype.type
|
|
506
|
-
return type(self)(pd.Series(self.to_numpy()).astype(dtype, copy=copy))
|
|
507
|
-
|
|
508
|
-
# try to slice 1 record to get the result dtype
|
|
509
|
-
test_array = self._arrow_array.slice(0, 1).to_pandas()
|
|
510
|
-
test_result_array = test_array.astype(dtype).array
|
|
511
|
-
if _use_extension_index:
|
|
512
|
-
test_result_type = type(test_array.astype(dtype).values)
|
|
513
|
-
if test_result_type is np.ndarray:
|
|
514
|
-
test_result_type = np.array
|
|
515
|
-
else:
|
|
516
|
-
test_result_type = type(test_result_array)
|
|
517
|
-
|
|
518
|
-
result_array = test_result_type(
|
|
519
|
-
np.full(
|
|
520
|
-
self.shape,
|
|
521
|
-
test_result_array.dtype.na_value,
|
|
522
|
-
dtype=np.asarray(test_result_array).dtype,
|
|
523
|
-
)
|
|
524
|
-
)
|
|
525
|
-
|
|
526
|
-
start = 0
|
|
527
|
-
# use chunks to do astype
|
|
528
|
-
for chunk_array in self._arrow_array.chunks:
|
|
529
|
-
result_array[start : start + len(chunk_array)] = (
|
|
530
|
-
chunk_array.to_pandas().astype(dtype).array
|
|
531
|
-
)
|
|
532
|
-
start += len(chunk_array)
|
|
533
|
-
return result_array
|
|
534
|
-
|
|
535
|
-
def isna(self):
|
|
536
|
-
if (
|
|
537
|
-
not self._force_use_pandas
|
|
538
|
-
and self._use_arrow
|
|
539
|
-
and hasattr(self._arrow_array, "is_null")
|
|
540
|
-
):
|
|
541
|
-
return self._arrow_array.is_null().to_pandas().to_numpy()
|
|
542
|
-
elif self._use_arrow:
|
|
543
|
-
return pd.isna(self._arrow_array.to_pandas()).to_numpy()
|
|
544
|
-
else:
|
|
545
|
-
return pd.isna(self._ndarray)
|
|
546
|
-
|
|
547
|
-
def take(self, indices, allow_fill=False, fill_value=None):
|
|
548
|
-
if (
|
|
549
|
-
allow_fill is False or (allow_fill and fill_value is self.dtype.na_value)
|
|
550
|
-
) and len(self) > 0:
|
|
551
|
-
return type(self)(self[indices], dtype=self._dtype)
|
|
552
|
-
|
|
553
|
-
if self._use_arrow:
|
|
554
|
-
array = self._arrow_array.to_pandas().to_numpy()
|
|
555
|
-
else:
|
|
556
|
-
array = self._ndarray
|
|
557
|
-
|
|
558
|
-
replace = False
|
|
559
|
-
if allow_fill and (fill_value is None or fill_value == self._dtype.na_value):
|
|
560
|
-
fill_value = self.dtype.na_value
|
|
561
|
-
replace = True
|
|
562
|
-
|
|
563
|
-
result = take(array, indices, fill_value=fill_value, allow_fill=allow_fill)
|
|
564
|
-
del array
|
|
565
|
-
if replace and pa is not None:
|
|
566
|
-
# pyarrow cannot recognize pa.NULL
|
|
567
|
-
result[result == self.dtype.na_value] = None
|
|
568
|
-
return type(self)(result, dtype=self._dtype)
|
|
569
|
-
|
|
570
|
-
def copy(self):
|
|
571
|
-
if self._use_arrow:
|
|
572
|
-
return type(self)(copy_obj(self._arrow_array))
|
|
573
|
-
else:
|
|
574
|
-
return type(self)(self._ndarray.copy())
|
|
575
|
-
|
|
576
|
-
def unique(self):
|
|
577
|
-
if self._force_use_pandas or not self._use_arrow or not hasattr(pc, "unique"):
|
|
578
|
-
return type(self)(np.unique(self.to_numpy()), dtype=self._dtype)
|
|
579
|
-
return type(self)(pc.unique(self._arrow_array), dtype=self._dtype)
|
|
580
|
-
|
|
581
|
-
def value_counts(self, dropna=False):
|
|
582
|
-
if self._use_arrow:
|
|
583
|
-
series = self._arrow_array.to_pandas()
|
|
584
|
-
else:
|
|
585
|
-
series = pd.Series(self._ndarray)
|
|
586
|
-
return type(self)(series.value_counts(dropna=dropna), dtype=self._dtype)
|
|
587
|
-
|
|
588
|
-
if _use_bool_any_all:
|
|
589
|
-
|
|
590
|
-
def any(self, axis=0, out=None):
|
|
591
|
-
return self.to_numpy().astype(bool).any(axis=axis, out=out)
|
|
592
|
-
|
|
593
|
-
def all(self, axis=0, out=None):
|
|
594
|
-
return self.to_numpy().astype(bool).all(axis=axis, out=out)
|
|
595
|
-
|
|
596
|
-
else:
|
|
597
|
-
|
|
598
|
-
def any(self, axis=0, out=None):
|
|
599
|
-
return self.to_numpy().any(axis=axis, out=out)
|
|
600
|
-
|
|
601
|
-
def all(self, axis=0, out=None):
|
|
602
|
-
return self.to_numpy().all(axis=axis, out=out)
|
|
603
|
-
|
|
604
|
-
def __maxframe_tokenize__(self):
|
|
605
|
-
if self._use_arrow:
|
|
606
|
-
return tokenize(
|
|
607
|
-
[
|
|
608
|
-
memoryview(x)
|
|
609
|
-
for chunk in self._arrow_array.chunks
|
|
610
|
-
for x in chunk.buffers()
|
|
611
|
-
if x is not None
|
|
612
|
-
]
|
|
613
|
-
)
|
|
614
|
-
else:
|
|
615
|
-
return self._ndarray
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
class ArrowStringArray(ArrowArray, StringArrayBase):
|
|
619
|
-
def __init__(self, values, dtype=None, copy=False):
|
|
620
|
-
if dtype is not None:
|
|
621
|
-
assert isinstance(dtype, ArrowStringDtype)
|
|
622
|
-
ArrowArray.__init__(self, values, ArrowStringDtype(), copy=copy)
|
|
623
|
-
|
|
624
|
-
@classmethod
|
|
625
|
-
def from_scalars(cls, values):
|
|
626
|
-
if pa is None or cls._pandas_only():
|
|
627
|
-
return cls._from_sequence(values)
|
|
628
|
-
else:
|
|
629
|
-
arrow_array = pa.chunked_array([cls._to_arrow_array(values)])
|
|
630
|
-
return cls(arrow_array)
|
|
631
|
-
|
|
632
|
-
@classmethod
|
|
633
|
-
def _to_arrow_array(cls, scalars):
|
|
634
|
-
return pa.array(scalars).cast(pa.string())
|
|
635
|
-
|
|
636
|
-
def __setitem__(self, key, value):
|
|
637
|
-
if isinstance(value, (pd.Index, pd.Series)):
|
|
638
|
-
value = value.to_numpy()
|
|
639
|
-
if isinstance(value, type(self)):
|
|
640
|
-
value = value.to_numpy()
|
|
641
|
-
|
|
642
|
-
key = check_array_indexer(self, key)
|
|
643
|
-
scalar_key = is_scalar(key)
|
|
644
|
-
scalar_value = is_scalar(value)
|
|
645
|
-
if scalar_key and not scalar_value:
|
|
646
|
-
raise ValueError("setting an array element with a sequence.")
|
|
647
|
-
|
|
648
|
-
# validate new items
|
|
649
|
-
if scalar_value:
|
|
650
|
-
if pd.isna(value):
|
|
651
|
-
value = None
|
|
652
|
-
elif not isinstance(value, str):
|
|
653
|
-
raise ValueError(
|
|
654
|
-
f"Cannot set non-string value '{value}' into a ArrowStringArray."
|
|
655
|
-
)
|
|
656
|
-
else:
|
|
657
|
-
if not is_array_like(value):
|
|
658
|
-
value = np.asarray(value, dtype=object)
|
|
659
|
-
if len(value) and not lib.is_string_array(value, skipna=True):
|
|
660
|
-
raise ValueError("Must provide strings.")
|
|
661
|
-
|
|
662
|
-
if self._use_arrow:
|
|
663
|
-
string_array = np.asarray(self._arrow_array.to_pandas())
|
|
664
|
-
string_array[key] = value
|
|
665
|
-
self._arrow_array = pa.chunked_array([pa.array(string_array)])
|
|
666
|
-
else:
|
|
667
|
-
self._ndarray[key] = value
|
|
668
|
-
|
|
669
|
-
# Override parent because we have different return types.
|
|
670
|
-
@classmethod
def _create_arithmetic_method(cls, op):
    """Build the dunder method that implements binary operator ``op``.

    The generated function serves both arithmetic operators (those whose
    ``__name__`` is listed in ``ops.ARITHMETIC_BINOPS``), which return an
    ``ArrowStringArray``, and comparison operators, which return a
    ``pd.arrays.BooleanArray``. ``op`` is only applied at positions where
    neither operand is NA; the remaining positions stay ``None``
    (arithmetic) or are masked (comparison).

    Parameters
    ----------
    op : callable
        Binary operator; its ``__name__`` selects the result kind and
        names the generated method.

    Returns
    -------
    callable
        The method, renamed to ``__<op.__name__>__`` via
        ``set_function_name``.
    """

    # Note: this handles both arithmetic and comparison methods.
    def method(self, other):
        is_arithmetic = op.__name__ in ops.ARITHMETIC_BINOPS
        pandas_only = cls._pandas_only()

        is_other_array = False
        if not is_scalar(other):
            is_other_array = True
            other = np.asarray(other)

        self_is_na = self.isna()
        other_is_na = pd.isna(other)
        mask = self_is_na | other_is_na

        if pa is None or pandas_only:
            if is_arithmetic:
                ret = np.empty(self.shape, dtype=object)
            else:
                ret = np.zeros(self.shape, dtype=bool)
            valid = ~mask
            arr = (
                self._arrow_array.to_pandas().to_numpy()
                if self._use_arrow
                else self._ndarray
            )
            o = other[valid] if is_other_array else other
            ret[valid] = op(arr[valid], o)
            if is_arithmetic:
                return ArrowStringArray(ret)
            else:
                return pd.arrays.BooleanArray(ret, mask)

        chunks = []
        mask_chunks = []
        start = 0
        for chunk_array in self._arrow_array.chunks:
            chunk_array = np.asarray(chunk_array.to_pandas())
            end = start + len(chunk_array)
            chunk_mask = mask[start:end]
            chunk_valid = ~chunk_mask

            if is_arithmetic:
                result = np.empty(chunk_array.shape, dtype=object)
            else:
                result = np.zeros(chunk_array.shape, dtype=bool)

            chunk_other = other
            if is_other_array:
                chunk_other = other[start:end]
                chunk_other = chunk_other[chunk_valid]

            # calculate only for both not None
            result[chunk_valid] = op(chunk_array[chunk_valid], chunk_other)

            if is_arithmetic:
                chunks.append(pa.array(result, type=pa.string(), from_pandas=True))
            else:
                chunks.append(result)
                mask_chunks.append(chunk_mask)

            # Move the window on; otherwise every chunk after the first
            # would reuse the mask/operand slice of the first chunk.
            start = end

        if is_arithmetic:
            # Explicit type keeps a zero-chunk array constructible.
            return ArrowStringArray(pa.chunked_array(chunks, type=pa.string()))

        if not chunks:
            # np.concatenate rejects an empty list; return an empty result.
            return pd.arrays.BooleanArray(
                np.zeros(0, dtype=bool), np.zeros(0, dtype=bool)
            )
        return pd.arrays.BooleanArray(
            np.concatenate(chunks), np.concatenate(mask_chunks)
        )

    return set_function_name(method, f"__{op.__name__}__", cls)
|
|
740
|
-
|
|
741
|
-
def shift(self, periods: int = 1, fill_value: object = None) -> "ArrowStringArray":
    """Shift values by ``periods`` by calling ``ExtensionArray.shift`` directly."""
    shifted = ExtensionArray.shift(self, periods=periods, fill_value=fill_value)
    return shifted
|
|
743
|
-
|
|
744
|
-
@classmethod
def _add_arithmetic_ops(cls):
    """Attach ``__add__``, ``__radd__``, ``__mul__`` and ``__rmul__`` to ``cls``."""
    bindings = (
        ("__add__", operator.add),
        ("__radd__", ops.radd),
        ("__mul__", operator.mul),
        ("__rmul__", ops.rmul),
    )
    for dunder_name, binary_op in bindings:
        setattr(cls, dunder_name, cls._create_arithmetic_method(binary_op))
|
|
751
|
-
|
|
752
|
-
@classmethod
def _add_comparison_ops(cls):
    """Attach the six rich-comparison dunders (eq, ne, lt, gt, le, ge) to ``cls``."""
    comparisons = (
        ("__eq__", operator.eq),
        ("__ne__", operator.ne),
        ("__lt__", operator.lt),
        ("__gt__", operator.gt),
        ("__le__", operator.le),
        ("__ge__", operator.ge),
    )
    for dunder_name, compare in comparisons:
        setattr(cls, dunder_name, cls._create_comparison_method(compare))
|
|
760
|
-
|
|
761
|
-
# Comparison dunders are produced by the same factory as the arithmetic ones.
_create_comparison_method = _create_arithmetic_method
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
# Attach __add__/__radd__/__mul__/__rmul__ to ArrowStringArray.
ArrowStringArray._add_arithmetic_ops()
|
|
765
|
-
# Attach __eq__/__ne__/__lt__/__gt__/__le__/__ge__ to ArrowStringArray.
ArrowStringArray._add_comparison_ops()
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
class ArrowListArray(ArrowArray):
|
|
769
|
-
def __init__(self, values, dtype: ArrowListDtype = None, copy=False):
    """Create a list array, inferring ``dtype`` when it is not supplied.

    Inference order when ``dtype`` is None:

    * another instance of this class: reuse its ``dtype``;
    * pyarrow unavailable: element dtype of ``np.asarray(values[0])``;
    * a pyarrow ``Array``/``ChunkedArray``: its ``type.value_type``;
    * anything else: converted with ``pa.array``; a resulting ``null``
      type maps to a list of strings, otherwise ``type.value_type`` is used.

    ``values`` (possibly converted to a pyarrow array), ``dtype`` and
    ``copy`` are then forwarded to the parent initializer.
    """
    if dtype is None:
        if isinstance(values, type(self)):
            dtype = values.dtype
        elif pa is None:
            first_item = np.asarray(values[0])
            dtype = ArrowListDtype(first_item.dtype)
        else:
            is_arrow_input = isinstance(values, (pa.Array, pa.ChunkedArray))
            if not is_arrow_input:
                values = pa.array(values)
            # The null-type fallback only applies to values converted here.
            if not is_arrow_input and values.type == pa.null():
                element_type = pa.string()
            else:
                element_type = values.type.value_type
            dtype = ArrowListDtype(element_type)

    super().__init__(values, dtype=dtype, copy=copy)
|
|
789
|
-
|
|
790
|
-
def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default):
    """Return the data as a NumPy array of Python lists.

    Elements exposing ``tolist`` are converted to plain lists; other
    elements are passed through unchanged. When ``na_value`` is given,
    positions reported by ``self.isna()`` are overwritten with it on a
    copy. ``dtype`` is accepted but not used by this implementation.
    """

    def _as_list(item):
        return item.tolist() if hasattr(item, "tolist") else item

    if self._use_arrow:
        series = self._arrow_array.to_pandas()
    else:
        series = pd.Series(self._ndarray)
    series = series.map(_as_list)

    replace_na = na_value is not lib.no_default
    if copy or replace_na:
        series = series.copy()
    if replace_na:
        series[self.isna()] = na_value
    return np.asarray(series)
|
|
801
|
-
|
|
802
|
-
@classmethod
def _post_scalar_getitem(cls, lst):
    """Return the first element of ``lst`` converted with its ``as_py()`` method."""
    first = lst[0]
    return first.as_py()
|
|
805
|
-
|
|
806
|
-
def __setitem__(self, key, value):
    """Assign list value(s) at ``key`` in place.

    Parameters
    ----------
    key : scalar, slice or array-like indexer
        Normalised through ``check_array_indexer``.
    value : list-like, NA scalar, or sequence of those
        ``pd.Index``/``pd.Series`` inputs are first converted with
        ``to_numpy()``. With a scalar key, a list-like is stored as-is and
        an NA scalar is stored as ``None``.

    Raises
    ------
    ValueError
        If ``key`` is scalar and ``value`` is neither list-like nor NA.
    """
    if isinstance(value, (pd.Index, pd.Series)):
        value = value.to_numpy()

    key = check_array_indexer(self, key)
    scalar_key = is_scalar(key)

    # validate new items
    # List-likeness is tested before pd.isna(): on a list, pd.isna() returns
    # an array whose truth value is ambiguous for len > 1 and would wrongly
    # turn a one-element ``[None]`` into a null.
    if scalar_key and not is_list_like(value):
        if pd.isna(value):
            value = None
        else:
            raise ValueError("Must provide list.")

    if self._use_arrow:
        # Arrow data is rewritten wholesale: materialise, patch, rebuild.
        array = np.asarray(self._arrow_array.to_pandas())
        array[key] = value
        self._arrow_array = pa.chunked_array(
            [pa.array(array, type=self.dtype.arrow_type)]
        )
    else:
        self._ndarray[key] = value
|
|
828
|
-
|
|
829
|
-
@classmethod
def _array_fillna(cls, series, value):
    """Replace NA scalars in ``series`` with ``value``, element by element.

    A plain ``fillna`` is not usable because ``value`` is itself list-like,
    so each element is inspected: list-likes and non-NA scalars are kept.
    """

    def _fill(item):
        if is_list_like(item) or not pd.isna(item):
            return item
        return value

    return series.apply(_fill)
|
|
833
|
-
|
|
834
|
-
def astype(self, dtype, copy=True):
    """Cast the array to ``dtype``.

    Parameters
    ----------
    dtype : str or dtype
        Target type, normalised through ``pandas_dtype``.
    copy : bool, default True
        When the target equals the current list dtype, return a copy
        (True) or ``self`` (False). Forwarded to the parent ``astype`` for
        non-list targets.

    Returns
    -------
    ArrowListArray for list targets, otherwise whatever the parent
    ``astype`` returns.

    Raises
    ------
    TypeError
        If the conversion is not possible; the original error is chained
        as the cause.
    """
    msg = f"cannot astype from {self.dtype} to {dtype}"
    dtype = pandas_dtype(dtype)
    if isinstance(dtype, ArrowListDtype):
        if self.dtype == dtype:
            if copy:
                return self.copy()
            return self

        if self._use_arrow:
            try:
                arrow_array = self._arrow_array.cast(dtype.arrow_type)
                return ArrowListArray(arrow_array)
            except (NotImplementedError, pa.ArrowInvalid) as err:
                raise TypeError(msg) from err

        def f(x):
            return pd.Series(x).astype(dtype.value_type.type).tolist()

        try:
            arr = pd.Series(self._ndarray)
            ret = arr.map(f).to_numpy()
            # The result holds lists, so it must be wrapped as a list array
            # of the requested dtype, not as a string array.
            return ArrowListArray(ret, dtype=dtype)
        except ValueError as err:
            raise TypeError(msg) from err

    try:
        return super().astype(dtype, copy=copy)
    except ValueError as err:
        raise TypeError(msg) from err
|