maxframe-1.0.0rc3-cp37-cp37m-win32.whl → maxframe-1.1.0-cp37-cp37m-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of maxframe has been flagged; see the registry's advisory page for details.

Files changed (112)
  1. maxframe/_utils.cp37-win32.pyd +0 -0
  2. maxframe/codegen.py +1 -0
  3. maxframe/config/config.py +16 -1
  4. maxframe/conftest.py +52 -14
  5. maxframe/core/entity/executable.py +1 -1
  6. maxframe/core/graph/core.cp37-win32.pyd +0 -0
  7. maxframe/core/operator/base.py +2 -0
  8. maxframe/dataframe/arithmetic/docstring.py +26 -2
  9. maxframe/dataframe/arithmetic/equal.py +4 -2
  10. maxframe/dataframe/arithmetic/greater.py +4 -2
  11. maxframe/dataframe/arithmetic/greater_equal.py +4 -2
  12. maxframe/dataframe/arithmetic/less.py +2 -2
  13. maxframe/dataframe/arithmetic/less_equal.py +4 -2
  14. maxframe/dataframe/arithmetic/not_equal.py +4 -2
  15. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +17 -16
  16. maxframe/dataframe/core.py +26 -2
  17. maxframe/dataframe/datasource/read_odps_query.py +116 -28
  18. maxframe/dataframe/datasource/read_odps_table.py +3 -1
  19. maxframe/dataframe/datasource/tests/test_datasource.py +93 -12
  20. maxframe/dataframe/datastore/to_odps.py +7 -0
  21. maxframe/dataframe/extensions/__init__.py +8 -0
  22. maxframe/dataframe/extensions/apply_chunk.py +649 -0
  23. maxframe/dataframe/extensions/flatjson.py +131 -0
  24. maxframe/dataframe/extensions/flatmap.py +314 -0
  25. maxframe/dataframe/extensions/reshuffle.py +1 -1
  26. maxframe/dataframe/extensions/tests/test_apply_chunk.py +186 -0
  27. maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
  28. maxframe/dataframe/groupby/__init__.py +1 -0
  29. maxframe/dataframe/groupby/aggregation.py +1 -0
  30. maxframe/dataframe/groupby/apply.py +9 -1
  31. maxframe/dataframe/groupby/core.py +1 -1
  32. maxframe/dataframe/groupby/fill.py +4 -1
  33. maxframe/dataframe/groupby/getitem.py +6 -0
  34. maxframe/dataframe/groupby/tests/test_groupby.py +1 -1
  35. maxframe/dataframe/groupby/transform.py +8 -2
  36. maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
  37. maxframe/dataframe/indexing/loc.py +6 -4
  38. maxframe/dataframe/indexing/rename.py +11 -0
  39. maxframe/dataframe/initializer.py +11 -1
  40. maxframe/dataframe/merge/__init__.py +9 -1
  41. maxframe/dataframe/merge/concat.py +41 -31
  42. maxframe/dataframe/merge/merge.py +1 -1
  43. maxframe/dataframe/merge/tests/test_merge.py +3 -1
  44. maxframe/dataframe/misc/apply.py +3 -0
  45. maxframe/dataframe/misc/drop_duplicates.py +23 -2
  46. maxframe/dataframe/misc/map.py +3 -1
  47. maxframe/dataframe/misc/tests/test_misc.py +24 -2
  48. maxframe/dataframe/misc/transform.py +22 -13
  49. maxframe/dataframe/reduction/__init__.py +3 -0
  50. maxframe/dataframe/reduction/aggregation.py +1 -0
  51. maxframe/dataframe/reduction/median.py +56 -0
  52. maxframe/dataframe/reduction/tests/test_reduction.py +17 -7
  53. maxframe/dataframe/statistics/quantile.py +8 -2
  54. maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
  55. maxframe/dataframe/tests/test_initializer.py +33 -2
  56. maxframe/dataframe/tests/test_utils.py +60 -0
  57. maxframe/dataframe/utils.py +110 -7
  58. maxframe/dataframe/window/expanding.py +5 -3
  59. maxframe/dataframe/window/tests/test_expanding.py +2 -2
  60. maxframe/io/objects/tests/test_object_io.py +39 -12
  61. maxframe/io/odpsio/arrow.py +30 -2
  62. maxframe/io/odpsio/schema.py +28 -8
  63. maxframe/io/odpsio/tableio.py +55 -133
  64. maxframe/io/odpsio/tests/test_schema.py +40 -4
  65. maxframe/io/odpsio/tests/test_tableio.py +5 -5
  66. maxframe/io/odpsio/tests/test_volumeio.py +35 -11
  67. maxframe/io/odpsio/volumeio.py +36 -6
  68. maxframe/learn/contrib/__init__.py +3 -1
  69. maxframe/learn/contrib/graph/__init__.py +15 -0
  70. maxframe/learn/contrib/graph/connected_components.py +215 -0
  71. maxframe/learn/contrib/graph/tests/__init__.py +13 -0
  72. maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
  73. maxframe/learn/contrib/llm/__init__.py +16 -0
  74. maxframe/learn/contrib/llm/core.py +54 -0
  75. maxframe/learn/contrib/llm/models/__init__.py +14 -0
  76. maxframe/learn/contrib/llm/models/dashscope.py +73 -0
  77. maxframe/learn/contrib/llm/multi_modal.py +42 -0
  78. maxframe/learn/contrib/llm/text.py +42 -0
  79. maxframe/learn/contrib/xgboost/classifier.py +3 -3
  80. maxframe/learn/contrib/xgboost/predict.py +8 -39
  81. maxframe/learn/contrib/xgboost/train.py +4 -3
  82. maxframe/lib/mmh3.cp37-win32.pyd +0 -0
  83. maxframe/lib/sparse/tests/test_sparse.py +15 -15
  84. maxframe/opcodes.py +10 -1
  85. maxframe/protocol.py +6 -1
  86. maxframe/serialization/core.cp37-win32.pyd +0 -0
  87. maxframe/serialization/core.pyx +13 -1
  88. maxframe/serialization/pandas.py +50 -20
  89. maxframe/serialization/serializables/core.py +24 -5
  90. maxframe/serialization/serializables/field_type.py +4 -1
  91. maxframe/serialization/serializables/tests/test_serializable.py +8 -1
  92. maxframe/serialization/tests/test_serial.py +2 -1
  93. maxframe/session.py +9 -2
  94. maxframe/tensor/__init__.py +19 -7
  95. maxframe/tensor/indexing/getitem.py +2 -0
  96. maxframe/tensor/merge/concatenate.py +23 -20
  97. maxframe/tensor/merge/vstack.py +5 -1
  98. maxframe/tensor/misc/transpose.py +1 -1
  99. maxframe/tests/utils.py +16 -0
  100. maxframe/udf.py +27 -0
  101. maxframe/utils.py +64 -14
  102. {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/METADATA +2 -2
  103. {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/RECORD +112 -96
  104. maxframe_client/clients/framedriver.py +4 -1
  105. maxframe_client/fetcher.py +28 -10
  106. maxframe_client/session/consts.py +3 -0
  107. maxframe_client/session/odps.py +104 -20
  108. maxframe_client/session/task.py +42 -26
  109. maxframe_client/session/tests/test_task.py +0 -4
  110. maxframe_client/tests/test_session.py +44 -12
  111. {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/WHEEL +0 -0
  112. {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/top_level.txt +0 -0
maxframe/learn/contrib/llm/multi_modal.py ADDED
@@ -0,0 +1,42 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict
+
+from ....dataframe.core import DATAFRAME_TYPE, SERIES_TYPE
+from .core import LLM
+
+
+class MultiModalLLM(LLM):
+    def generate(
+        self,
+        data,
+        prompt_template: Dict[str, Any],
+        params: Dict[str, Any] = None,
+    ):
+        raise NotImplementedError
+
+
+def generate(
+    data,
+    model: MultiModalLLM,
+    prompt_template: Dict[str, Any],
+    params: Dict[str, Any] = None,
+):
+    if not isinstance(data, DATAFRAME_TYPE) and not isinstance(data, SERIES_TYPE):
+        raise ValueError("data must be a maxframe dataframe or series object")
+    if not isinstance(model, MultiModalLLM):
+        raise ValueError("model must be a MultiModalLLM object")
+    params = params if params is not None else dict()
+    model.validate_params(params)
+    return model.generate(data, prompt_template, params)
maxframe/learn/contrib/llm/text.py ADDED
@@ -0,0 +1,42 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict
+
+from ....dataframe.core import DATAFRAME_TYPE, SERIES_TYPE
+from .core import LLM
+
+
+class TextLLM(LLM):
+    def generate(
+        self,
+        data,
+        prompt_template: Dict[str, Any],
+        params: Dict[str, Any] = None,
+    ):
+        raise NotImplementedError
+
+
+def generate(
+    data,
+    model: TextLLM,
+    prompt_template: Dict[str, Any],
+    params: Dict[str, Any] = None,
+):
+    if not isinstance(data, DATAFRAME_TYPE) and not isinstance(data, SERIES_TYPE):
+        raise ValueError("data must be a maxframe dataframe or series object")
+    if not isinstance(model, TextLLM):
+        raise ValueError("model must be a TextLLM object")
+    params = params if params is not None else dict()
+    model.validate_params(params)
+    return model.generate(data, prompt_template, params)
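
A hedged sketch of how the new module-level generate() entry point is meant to be called (multi_modal.py above follows the same pattern). EchoLLM is a made-up stub for illustration only; the real models ship under maxframe.learn.contrib.llm.models (e.g. the DashScope backends added in this release), and their constructor arguments are not shown in this diff:

import pandas as pd
import maxframe.dataframe as md
from maxframe.learn.contrib.llm.text import TextLLM, generate

class EchoLLM(TextLLM):
    # Stub only: a real model would emit a text-generation operator.
    def validate_params(self, params):
        pass

    def generate(self, data, prompt_template, params=None):
        return data

df = md.DataFrame(pd.DataFrame({"q": ["What is MaxFrame?"]}))
out = generate(df, model=EchoLLM(), prompt_template={"prompt": "{q}"})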
maxframe/learn/contrib/xgboost/classifier.py CHANGED
@@ -14,7 +14,8 @@
 
 import numpy as np
 
-from ....tensor import argmax, transpose, vstack
+from ....tensor import argmax, transpose
+from ....tensor.merge.vstack import _vstack
 from ..utils import make_import_error_func
 from .core import XGBScikitLearnBase, xgboost
 
@@ -89,7 +90,6 @@ else:
             if ntree_limit is not None:
                 raise NotImplementedError("ntree_limit is not currently supported")
             prediction = predict(self.get_booster(), data, flag=flag, **kw)
-
             if len(prediction.shape) == 2 and prediction.shape[1] == self.n_classes_:
                 # multi-class
                 return prediction
@@ -103,7 +103,7 @@ else:
             # binary logistic function
             classone_probs = prediction
             classzero_probs = 1.0 - classone_probs
-            return transpose(vstack((classzero_probs, classone_probs)))
+            return transpose(_vstack((classzero_probs, classone_probs)))
 
     @property
     def classes_(self) -> np.ndarray:
maxframe/learn/contrib/xgboost/predict.py CHANGED
@@ -14,20 +14,18 @@
 
 
 import numpy as np
-import pandas as pd
 
 from .... import opcodes
 from ....core.entity.output_types import OutputType
 from ....core.operator.base import Operator
 from ....core.operator.core import TileableOperatorMixin
-from ....dataframe.utils import parse_index
 from ....serialization.serializables import (
     BoolField,
     KeyField,
     ReferenceField,
     TupleField,
 )
-from ....tensor.core import TENSOR_TYPE, TensorOrder
+from ....tensor.core import TensorOrder
 from .core import BoosterData
 from .dmatrix import check_data
 
@@ -65,35 +63,12 @@ class XGBPredict(Operator, TileableOperatorMixin):
         else:
             shape = (self.data.shape[0],)
         inputs = [self.data, self.model]
-        if self.output_types[0] == OutputType.tensor:
-            # tensor
-            return self.new_tileable(
-                inputs,
-                shape=shape,
-                dtype=self.output_dtype,
-                order=TensorOrder.C_ORDER,
-            )
-        elif self.output_types[0] == OutputType.dataframe:
-            # dataframe
-            dtypes = pd.DataFrame(
-                np.random.rand(0, num_class), dtype=self.output_dtype
-            ).dtypes
-            return self.new_tileable(
-                inputs,
-                shape=shape,
-                dtypes=dtypes,
-                columns_value=parse_index(dtypes.index),
-                index_value=self.data.index_value,
-            )
-        else:
-            # series
-            return self.new_tileable(
-                inputs,
-                shape=shape,
-                index_value=self.data.index_value,
-                name="predictions",
-                dtype=self.output_dtype,
-            )
+        return self.new_tileable(
+            inputs,
+            shape=shape,
+            dtype=self.output_dtype,
+            order=TensorOrder.C_ORDER,
+        )
 
 
 def predict(
@@ -124,13 +99,7 @@ def predict(
     data = check_data(data)
    # TODO: check model datatype
 
-    num_class = getattr(model.op, "num_class", None)
-    if isinstance(data, TENSOR_TYPE):
-        output_types = [OutputType.tensor]
-    elif num_class is not None:
-        output_types = [OutputType.dataframe]
-    else:
-        output_types = [OutputType.series]
+    output_types = [OutputType.tensor]
 
     iteration_range = iteration_range or (0, 0)
 
maxframe/learn/contrib/xgboost/train.py CHANGED
@@ -102,7 +102,7 @@ def train(params, dtrain, evals=None, evals_result=None, num_class=None, **kwarg
     Parameters
     ----------
     Parameters are the same as `xgboost.train`. Note that train is an eager-execution
-    API. The call will be blocked until training finished.
+    API if evals is passed, thus the call will be blocked until training finished.
 
     Returns
     -------
@@ -121,11 +121,12 @@
             processed_evals.append((eval_dmatrix, name))
         else:
            processed_evals.append((to_dmatrix(eval_dmatrix), name))
-    return XGBTrain(
+    data = XGBTrain(
         params=params,
         dtrain=dtrain,
         evals=processed_evals,
         evals_result=evals_result,
         num_class=num_class,
         **kwargs,
-    )(evals_result).execute(session=session, **run_kwargs)
+    )(evals_result)
+    return data.execute(session=session, **run_kwargs) if evals else data
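
A hedged usage sketch of the lazy/eager split introduced above. The import path mirrors this diff's module layout, and dtrain/dval/dtest are assumed to be DMatrix-like objects prepared beforehand (e.g. via the module's to_dmatrix helper); exact public names may differ in the released API:

from maxframe.learn.contrib import xgboost as xgb

params = {"objective": "binary:logistic"}
booster = xgb.train(params, dtrain)       # no evals: returns without executing
booster.execute()                         # run explicitly when needed
blocked = xgb.train(params, dtrain, evals=[(dval, "val")])  # evals: eager, blocks
prob = xgb.predict(booster, dtest)        # per the predict.py change, always a tensor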
maxframe/lib/mmh3.cp37-win32.pyd CHANGED
Binary file
maxframe/lib/sparse/tests/test_sparse.py CHANGED
@@ -55,13 +55,13 @@ def test_sparse_creation():
     s = SparseNDArray(s1_data)
     assert s.ndim == 2
     assert isinstance(s, SparseMatrix)
-    assert_array_equal(s.toarray(), s1_data.A)
-    assert_array_equal(s.todense(), s1_data.A)
+    assert_array_equal(s.toarray(), s1_data.toarray())
+    assert_array_equal(s.todense(), s1_data.toarray())
 
     ss = pickle.loads(pickle.dumps(s))
     assert s == ss
-    assert_array_equal(ss.toarray(), s1_data.A)
-    assert_array_equal(ss.todense(), s1_data.A)
+    assert_array_equal(ss.toarray(), s1_data.toarray())
+    assert_array_equal(ss.todense(), s1_data.toarray())
 
     v = SparseNDArray(v1, shape=(3,))
     assert s.ndim
@@ -331,12 +331,12 @@ def test_sparse_dot():
 
     assert_array_equal(mls.dot(s1, v1_s), s1.dot(v1_data))
     assert_array_equal(mls.dot(s2, v1_s), s2.dot(v1_data))
-    assert_array_equal(mls.dot(v2_s, s1), v2_data.dot(s1_data.A))
-    assert_array_equal(mls.dot(v2_s, s2), v2_data.dot(s2_data.A))
+    assert_array_equal(mls.dot(v2_s, s1), v2_data.dot(s1_data.toarray()))
+    assert_array_equal(mls.dot(v2_s, s2), v2_data.dot(s2_data.toarray()))
     assert_array_equal(mls.dot(v1_s, v1_s), v1_data.dot(v1_data), almost=True)
     assert_array_equal(mls.dot(v2_s, v2_s), v2_data.dot(v2_data), almost=True)
 
-    assert_array_equal(mls.dot(v2_s, s1, sparse=False), v2_data.dot(s1_data.A))
+    assert_array_equal(mls.dot(v2_s, s1, sparse=False), v2_data.dot(s1_data.toarray()))
     assert_array_equal(mls.dot(v1_s, v1_s, sparse=False), v1_data.dot(v1_data))
 
 
@@ -390,7 +390,7 @@ def test_sparse_fill_diagonal():
     arr = SparseNDArray(s1)
     arr.fill_diagonal(3)
 
-    expected = s1.copy().A
+    expected = s1.copy().toarray()
     np.fill_diagonal(expected, 3)
 
     np.testing.assert_array_equal(arr.toarray(), expected)
@@ -399,7 +399,7 @@ def test_sparse_fill_diagonal():
     arr = SparseNDArray(s1)
     arr.fill_diagonal(3, wrap=True)
 
-    expected = s1.copy().A
+    expected = s1.copy().toarray()
     np.fill_diagonal(expected, 3, wrap=True)
 
     np.testing.assert_array_equal(arr.toarray(), expected)
@@ -408,7 +408,7 @@ def test_sparse_fill_diagonal():
     arr = SparseNDArray(s1)
     arr.fill_diagonal([1, 2, 3])
 
-    expected = s1.copy().A
+    expected = s1.copy().toarray()
     np.fill_diagonal(expected, [1, 2, 3])
 
     np.testing.assert_array_equal(arr.toarray(), expected)
@@ -417,7 +417,7 @@ def test_sparse_fill_diagonal():
     arr = SparseNDArray(s1)
     arr.fill_diagonal([1, 2, 3], wrap=True)
 
-    expected = s1.copy().A
+    expected = s1.copy().toarray()
     np.fill_diagonal(expected, [1, 2, 3], wrap=True)
 
     np.testing.assert_array_equal(arr.toarray(), expected)
@@ -427,7 +427,7 @@ def test_sparse_fill_diagonal():
     arr = SparseNDArray(s1)
     arr.fill_diagonal(val)
 
-    expected = s1.copy().A
+    expected = s1.copy().toarray()
     np.fill_diagonal(expected, val)
 
     np.testing.assert_array_equal(arr.toarray(), expected)
@@ -437,7 +437,7 @@ def test_sparse_fill_diagonal():
     arr = SparseNDArray(s1)
     arr.fill_diagonal(val, wrap=True)
 
-    expected = s1.copy().A
+    expected = s1.copy().toarray()
     np.fill_diagonal(expected, val, wrap=True)
 
     np.testing.assert_array_equal(arr.toarray(), expected)
@@ -447,7 +447,7 @@ def test_sparse_fill_diagonal():
     arr = SparseNDArray(s1)
     arr.fill_diagonal(val)
 
-    expected = s1.copy().A
+    expected = s1.copy().toarray()
     np.fill_diagonal(expected, val)
 
     np.testing.assert_array_equal(arr.toarray(), expected)
@@ -457,7 +457,7 @@ def test_sparse_fill_diagonal():
     arr = SparseNDArray(s1)
     arr.fill_diagonal(val, wrap=True)
 
-    expected = s1.copy().A
+    expected = s1.copy().toarray()
     np.fill_diagonal(expected, val, wrap=True)
 
     np.testing.assert_array_equal(arr.toarray(), expected)
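
These edits track SciPy's deprecation of the .A shorthand (absent from the newer sparse-array API); .toarray() is the stable spelling on both APIs. A minimal illustration:

import numpy as np
import scipy.sparse as sps

m = sps.csr_matrix(np.eye(2))
dense = m.toarray()  # preferred over the deprecated m.A shorthand
np.testing.assert_array_equal(dense, np.eye(2))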
maxframe/opcodes.py CHANGED
@@ -270,6 +270,7 @@ KURTOSIS = 351
 SEM = 352
 STR_CONCAT = 353
 MAD = 354
+MEDIAN = 355
 
 # tensor operator
 RESHAPE = 401
@@ -377,7 +378,6 @@ DROP_DUPLICATES = 728
 MELT = 729
 RENAME = 731
 INSERT = 732
-MAP_CHUNK = 733
 CARTESIAN_CHUNK = 734
 EXPLODE = 735
 REPLACE = 736
@@ -392,6 +392,10 @@ PIVOT_TABLE = 744
 
 FUSE = 801
 
+# LLM
+DASHSCOPE_TEXT_GENERATION = 810
+DASHSCOPE_MULTI_MODAL_GENERATION = 811
+
 # table like input for tensor
 TABLE_COO = 1003
 # store tensor as coo format
@@ -532,6 +536,8 @@ STATSMODELS_TRAIN = 3012
 STATSMODELS_PREDICT = 3013
 
 # learn
+CONNECTED_COMPONENTS = 3100
+
 # checks
 CHECK_NON_NEGATIVE = 3300
 # classifier check targets
@@ -566,6 +572,9 @@ CHOLESKY_FUSE = 999988
 
 # MaxFrame-dedicated functions
 DATAFRAME_RESHUFFLE = 10001
+FLATMAP = 10002
+FLATJSON = 10003
+APPLY_CHUNK = 10004
 
 # MaxFrame internal operators
 DATAFRAME_PROJECTION_SAME_INDEX_MERGE = 100001
maxframe/protocol.py CHANGED
@@ -375,6 +375,11 @@ class ExecuteDagRequest(Serializable):
         value_type=FieldTypes.reference,
         default=None,
     )
+    new_settings: Dict[str, Any] = DictField(
+        "new_settings",
+        key_type=FieldTypes.string,
+        default=None,
+    )
 
 
 class SubDagSubmitInstanceInfo(JsonSerializable):
@@ -511,7 +516,7 @@ class DataFrameTableMeta(JsonSerializable):
         return True
 
     def to_json(self) -> dict:
-        b64_pk = lambda x: base64.b64encode(pickle.dumps(x))
+        b64_pk = lambda x: base64.b64encode(pickle.dumps(x)).decode()
         ret = {
             "table_name": self.table_name,
             "type": self.type.value,
maxframe/serialization/core.cp37-win32.pyd CHANGED
Binary file
maxframe/serialization/core.pyx CHANGED
@@ -37,7 +37,7 @@ from .._utils import NamedType
 from .._utils cimport TypeDispatcher
 
 from ..lib import wrapped_pickle as pickle
-from ..utils import arrow_type_from_str
+from ..utils import NoDefault, arrow_type_from_str, no_default
 
 try:
     from pandas import ArrowDtype
@@ -94,6 +94,7 @@ cdef:
     int COMPLEX_SERIALIZER = 12
     int SLICE_SERIALIZER = 13
     int REGEX_SERIALIZER = 14
+    int NO_DEFAULT_SERIALIZER = 15
     int PLACEHOLDER_SERIALIZER = 4096
 
 
@@ -803,6 +804,16 @@ cdef class RegexSerializer(Serializer):
         return re.compile((<bytes>(subs[0])).decode(), serialized[0])
 
 
+cdef class NoDefaultSerializer(Serializer):
+    serializer_id = NO_DEFAULT_SERIALIZER
+
+    cpdef serial(self, object obj, dict context):
+        return [], [], True
+
+    cpdef deserial(self, list obj, dict context, list subs):
+        return no_default
+
+
 cdef class Placeholder:
     """
     Placeholder object to reduce duplicated serialization
@@ -857,6 +868,7 @@ DtypeSerializer.register(ExtensionDtype)
 ComplexSerializer.register(complex)
 SliceSerializer.register(slice)
 RegexSerializer.register(re.Pattern)
+NoDefaultSerializer.register(NoDefault)
 PlaceholderSerializer.register(Placeholder)
 
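A pure-Python analogue of the new NoDefaultSerializer, to show the idea: a sentinel needs no payload at all, and deserialization hands back the shared singleton so identity checks like `value is no_default` survive a round trip:

class _NoDefault:
    pass

NO_DEFAULT = _NoDefault()  # stand-in for maxframe's no_default singleton

def serial(obj):
    return []  # nothing to encode; the serializer id alone identifies it

def deserial(payload):
    return NO_DEFAULT  # always the same object, never a fresh instance

assert deserial(serial(NO_DEFAULT)) is NO_DEFAULT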
maxframe/serialization/pandas.py CHANGED
@@ -134,8 +134,10 @@ class ArraySerializer(Serializer):
                 data_parts = [obj.tolist()]
             else:
                 data_parts = [obj.to_numpy().tolist()]
-        else:
+        elif hasattr(obj, "_data"):
             data_parts = [getattr(obj, "_data")]
+        else:
+            data_parts = [getattr(obj, "_pa_array")]
         return [ser_type], [dtype] + data_parts, False
 
     def deserial(self, serialized: List, context: Dict, subs: List):
@@ -155,38 +157,66 @@ class PdTimestampSerializer(Serializer):
         else:
             zone_info = []
         ts = obj.to_pydatetime().timestamp()
-        return (
-            [int(ts), obj.microsecond, obj.nanosecond],
-            zone_info,
-            bool(zone_info),
-        )
+        elements = [int(ts), obj.microsecond, obj.nanosecond]
+        if hasattr(obj, "unit"):
+            elements.append(str(obj.unit))
+        return elements, zone_info, bool(zone_info)
 
     def deserial(self, serialized: List, context: Dict, subs: List):
         if subs:
-            val = pd.Timestamp.utcfromtimestamp(serialized[0]).replace(
-                microsecond=serialized[1], nanosecond=serialized[2]
-            )
-            val = val.replace(tzinfo=datetime.timezone.utc).tz_convert(subs[0])
+            pydt = datetime.datetime.utcfromtimestamp(serialized[0])
+            kwargs = {
+                "year": pydt.year,
+                "month": pydt.month,
+                "day": pydt.day,
+                "hour": pydt.hour,
+                "minute": pydt.minute,
+                "second": pydt.second,
+                "microsecond": serialized[1],
+                "nanosecond": serialized[2],
+                "tzinfo": datetime.timezone.utc,
+            }
+            if len(serialized) > 3:
+                kwargs["unit"] = serialized[3]
+            val = pd.Timestamp(**kwargs).tz_convert(subs[0])
         else:
-            val = pd.Timestamp.fromtimestamp(serialized[0]).replace(
-                microsecond=serialized[1], nanosecond=serialized[2]
-            )
+            pydt = datetime.datetime.fromtimestamp(serialized[0])
+            kwargs = {
+                "year": pydt.year,
+                "month": pydt.month,
+                "day": pydt.day,
+                "hour": pydt.hour,
+                "minute": pydt.minute,
+                "second": pydt.second,
+                "microsecond": serialized[1],
+                "nanosecond": serialized[2],
+            }
+            if len(serialized) >= 4:
+                kwargs["unit"] = serialized[3]
+            val = pd.Timestamp(**kwargs)
         return val
 
 
 class PdTimedeltaSerializer(Serializer):
     def serial(self, obj: pd.Timedelta, context: Dict):
-        return [int(obj.seconds), obj.microseconds, obj.nanoseconds, obj.days], [], True
+        elements = [int(obj.seconds), obj.microseconds, obj.nanoseconds, obj.days]
+        if hasattr(obj, "unit"):
+            elements.append(str(obj.unit))
+        return elements, [], True
 
     def deserial(self, serialized: List, context: Dict, subs: List):
         days = 0 if len(serialized) < 4 else serialized[3]
+        unit = None if len(serialized) < 5 else serialized[4]
         seconds, microseconds, nanoseconds = serialized[:3]
-        return pd.Timedelta(
-            days=days,
-            seconds=seconds,
-            microseconds=microseconds,
-            nanoseconds=nanoseconds,
-        )
+        kwargs = {
+            "days": days,
+            "seconds": seconds,
+            "microseconds": microseconds,
+            "nanoseconds": nanoseconds,
+        }
+        if unit is not None:
+            kwargs["unit"] = unit
+        return pd.Timedelta(**kwargs)
 
 
 class NoDefaultSerializer(Serializer):
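
The extra unit element exists because pandas 2.x timestamps and timedeltas carry a resolution, while pandas 1.x objects have no unit attribute; both ends therefore feature-detect with hasattr. For example:

import pandas as pd

ts = pd.Timestamp("2024-01-01 00:00:00.123456789")
if hasattr(ts, "unit"):        # pandas >= 2.0
    print(ts.unit)             # resolution of this value, e.g. "ns"
    coarse = ts.as_unit("s")   # a coarser resolution must survive the round trip
    print(coarse.unit)         # "s"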
maxframe/serialization/serializables/core.py CHANGED
@@ -19,6 +19,7 @@ from typing import Any, Dict, List, Optional, Tuple, Type
 import msgpack
 
 from ...lib.mmh3 import hash
+from ...utils import no_default
 from ..core import Placeholder, Serializer, buffered, load_type
 from .field import Field
 from .field_type import DictType, ListType, PrimitiveFieldType, TupleType
@@ -211,6 +212,22 @@ class _NoFieldValue:
 _no_field_value = _NoFieldValue()
 
 
+def _to_primitive_placeholder(v: Any) -> Any:
+    if v is _no_field_value or v is no_default:
+        return {}
+    return v
+
+
+def _restore_primitive_placeholder(v: Any) -> Any:
+    if type(v) is dict:
+        if v == {}:
+            return _no_field_value
+        else:
+            return v
+    else:
+        return v
+
+
 class SerializableSerializer(Serializer):
     """
     Leverage DictSerializer to perform serde.
@@ -241,9 +258,7 @@ class SerializableSerializer(Serializer):
         else:
             primitive_vals = self._get_field_values(obj, obj._PRIMITIVE_FIELDS)
             # replace _no_field_value as {} to make them msgpack-serializable
-            primitive_vals = [
-                v if v is not _no_field_value else {} for v in primitive_vals
-            ]
+            primitive_vals = [_to_primitive_placeholder(v) for v in primitive_vals]
         if obj._cache_primitive_serial:
             primitive_vals = msgpack.dumps(primitive_vals)
             _primitive_serial_cache[obj] = primitive_vals
@@ -311,7 +326,9 @@ class SerializableSerializer(Serializer):
             cls_fields = server_fields[server_field_num : field_num + count]
             cls_values = values[field_num : field_num + count]
             for field, value in zip(cls_fields, cls_values):
-                if not is_primitive or value != {}:
+                if is_primitive:
+                    value = _restore_primitive_placeholder(value)
+                if not is_primitive or value is not _no_field_value:
                     cls._set_field_value(obj, field, value)
             field_num += count
         try:
@@ -356,7 +373,9 @@ class SerializableSerializer(Serializer):
             server_fields + deprecated_fields, key=lambda f: f.name
         )
         for field, value in zip(server_fields, values):
-            if not is_primitive or value != {}:
+            if is_primitive:
+                value = _restore_primitive_placeholder(value)
+            if not is_primitive or value is not _no_field_value:
                 try:
                     cls._set_field_value(obj, field, value)
                 except AttributeError:  # pragma: no cover
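
A sketch of the placeholder round trip these helpers implement: msgpack cannot encode custom sentinels, so unset fields (and now no_default) travel as {} and are mapped back on the receiving side. This stays unambiguous because primitive fields never legitimately hold dicts:

import msgpack

UNSET = object()  # stand-in for _no_field_value

def pack(values):
    return msgpack.dumps([{} if v is UNSET else v for v in values])

def unpack(buf):
    return [UNSET if v == {} else v for v in msgpack.loads(buf)]

assert unpack(pack([1, UNSET, "x"]))[1] is UNSET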
maxframe/serialization/serializables/field_type.py CHANGED
@@ -46,6 +46,9 @@ class PrimitiveType(Enum):
     complex128 = 25
 
 
+_np_unicode = np.unicode_ if hasattr(np, "unicode_") else np.str_
+
+
 _primitive_type_to_valid_types = {
     PrimitiveType.bool: (bool, np.bool_),
     PrimitiveType.int8: (int, np.int8),
@@ -60,7 +63,7 @@ _primitive_type_to_valid_types = {
     PrimitiveType.float32: (float, np.float32),
     PrimitiveType.float64: (float, np.float64),
     PrimitiveType.bytes: (bytes, np.bytes_),
-    PrimitiveType.string: (str, np.unicode_),
+    PrimitiveType.string: (str, _np_unicode),
     PrimitiveType.complex64: (complex, np.complex64),
     PrimitiveType.complex128: (complex, np.complex128),
 }
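
The indirection is needed because NumPy 2.0 removed the np.unicode_ alias (it was the same object as np.str_); resolving the name once keeps this lookup table importable on both old and new NumPy:

import numpy as np

np_unicode = np.unicode_ if hasattr(np, "unicode_") else np.str_
assert isinstance(np.str_("abc"), (str, np_unicode))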
maxframe/serialization/serializables/tests/test_serializable.py CHANGED
@@ -21,6 +21,7 @@ import pytest
 
 from ....core import EntityData
 from ....lib.wrapped_pickle import switch_unpickle
+from ....utils import no_default
 from ... import deserialize, serialize
 from .. import (
     AnyField,
@@ -143,6 +144,7 @@ class MySerializable(Serializable):
         oneof1_val=f"{__name__}.MySerializable",
         oneof2_val=MySimpleSerializable,
     )
+    _no_default_val = Float64Field("no_default_val", default=no_default)
 
 
 @pytest.mark.parametrize("set_is_ci", [False, True], indirect=True)
@@ -187,6 +189,7 @@ def test_serializable(set_is_ci):
         _dict_val={"a": b"bytes_value"},
         _ref_val=MySerializable(),
         _oneof_val=MySerializable(_id="2"),
+        _no_default_val=no_default,
     )
 
     header, buffers = serialize(my_serializable)
@@ -234,7 +237,11 @@ def _assert_serializable_eq(my_serializable, my_serializable2):
         if not hasattr(my_serializable, field.name):
             continue
         expect_value = getattr(my_serializable, field_name)
-        actual_value = getattr(my_serializable2, field_name)
+        if expect_value is no_default:
+            assert not hasattr(my_serializable2, field.name)
+            continue
+        else:
+            actual_value = getattr(my_serializable2, field_name)
         if isinstance(expect_value, np.ndarray):
             np.testing.assert_array_equal(expect_value, actual_value)
         elif isinstance(expect_value, pd.DataFrame):
maxframe/serialization/tests/test_serial.py CHANGED
@@ -42,7 +42,7 @@ except ImportError:
 from ...lib.sparse import SparseMatrix
 from ...lib.wrapped_pickle import switch_unpickle
 from ...tests.utils import require_cudf, require_cupy
-from ...utils import lazy_import
+from ...utils import lazy_import, no_default
 from .. import (
     PickleContainer,
     RemoteException,
@@ -90,6 +90,7 @@ class CustomNamedTuple(NamedTuple):
         pd.Timedelta(102.234154131),
         {"abc": 5.6, "def": [3.4], "gh": None, "ijk": {}},
         OrderedDict([("abcd", 5.6)]),
+        no_default,
     ],
 )
 @switch_unpickle