maxframe 0.1.0b5__cp311-cp311-macosx_10_9_universal2.whl → 1.0.0__cp311-cp311-macosx_10_9_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of maxframe might be problematic. More details are linked from the original registry diff page.

Files changed (203)
  1. maxframe/_utils.cpython-311-darwin.so +0 -0
  2. maxframe/codegen.py +10 -4
  3. maxframe/config/config.py +68 -10
  4. maxframe/config/validators.py +42 -11
  5. maxframe/conftest.py +58 -14
  6. maxframe/core/__init__.py +2 -16
  7. maxframe/core/entity/__init__.py +1 -12
  8. maxframe/core/entity/executable.py +1 -1
  9. maxframe/core/entity/objects.py +46 -45
  10. maxframe/core/entity/output_types.py +0 -3
  11. maxframe/core/entity/tests/test_objects.py +43 -0
  12. maxframe/core/entity/tileables.py +5 -78
  13. maxframe/core/graph/__init__.py +2 -2
  14. maxframe/core/graph/builder/__init__.py +0 -1
  15. maxframe/core/graph/builder/base.py +5 -4
  16. maxframe/core/graph/builder/tileable.py +4 -4
  17. maxframe/core/graph/builder/utils.py +4 -8
  18. maxframe/core/graph/core.cpython-311-darwin.so +0 -0
  19. maxframe/core/graph/core.pyx +4 -4
  20. maxframe/core/graph/entity.py +9 -33
  21. maxframe/core/operator/__init__.py +2 -9
  22. maxframe/core/operator/base.py +3 -5
  23. maxframe/core/operator/objects.py +0 -9
  24. maxframe/core/operator/utils.py +55 -0
  25. maxframe/dataframe/__init__.py +1 -1
  26. maxframe/dataframe/arithmetic/around.py +5 -17
  27. maxframe/dataframe/arithmetic/core.py +15 -7
  28. maxframe/dataframe/arithmetic/docstring.py +7 -33
  29. maxframe/dataframe/arithmetic/equal.py +4 -2
  30. maxframe/dataframe/arithmetic/greater.py +4 -2
  31. maxframe/dataframe/arithmetic/greater_equal.py +4 -2
  32. maxframe/dataframe/arithmetic/less.py +2 -2
  33. maxframe/dataframe/arithmetic/less_equal.py +4 -2
  34. maxframe/dataframe/arithmetic/not_equal.py +4 -2
  35. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +39 -16
  36. maxframe/dataframe/core.py +31 -7
  37. maxframe/dataframe/datasource/date_range.py +2 -2
  38. maxframe/dataframe/datasource/read_odps_query.py +117 -23
  39. maxframe/dataframe/datasource/read_odps_table.py +6 -3
  40. maxframe/dataframe/datasource/tests/test_datasource.py +103 -8
  41. maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
  42. maxframe/dataframe/datastore/to_odps.py +28 -0
  43. maxframe/dataframe/extensions/__init__.py +5 -0
  44. maxframe/dataframe/extensions/flatjson.py +131 -0
  45. maxframe/dataframe/extensions/flatmap.py +317 -0
  46. maxframe/dataframe/extensions/reshuffle.py +1 -1
  47. maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
  48. maxframe/dataframe/groupby/core.py +1 -1
  49. maxframe/dataframe/groupby/cum.py +0 -1
  50. maxframe/dataframe/groupby/fill.py +4 -1
  51. maxframe/dataframe/groupby/getitem.py +6 -0
  52. maxframe/dataframe/groupby/tests/test_groupby.py +5 -1
  53. maxframe/dataframe/groupby/transform.py +5 -1
  54. maxframe/dataframe/indexing/align.py +1 -1
  55. maxframe/dataframe/indexing/loc.py +6 -4
  56. maxframe/dataframe/indexing/rename.py +5 -28
  57. maxframe/dataframe/indexing/sample.py +0 -1
  58. maxframe/dataframe/indexing/set_index.py +68 -1
  59. maxframe/dataframe/initializer.py +11 -1
  60. maxframe/dataframe/merge/__init__.py +9 -1
  61. maxframe/dataframe/merge/concat.py +41 -31
  62. maxframe/dataframe/merge/merge.py +237 -3
  63. maxframe/dataframe/merge/tests/test_merge.py +126 -1
  64. maxframe/dataframe/misc/apply.py +5 -10
  65. maxframe/dataframe/misc/case_when.py +1 -1
  66. maxframe/dataframe/misc/describe.py +2 -2
  67. maxframe/dataframe/misc/drop_duplicates.py +8 -8
  68. maxframe/dataframe/misc/eval.py +4 -0
  69. maxframe/dataframe/misc/memory_usage.py +2 -2
  70. maxframe/dataframe/misc/pct_change.py +1 -83
  71. maxframe/dataframe/misc/tests/test_misc.py +33 -2
  72. maxframe/dataframe/misc/transform.py +1 -30
  73. maxframe/dataframe/misc/value_counts.py +4 -17
  74. maxframe/dataframe/missing/dropna.py +1 -1
  75. maxframe/dataframe/missing/fillna.py +5 -5
  76. maxframe/dataframe/operators.py +1 -17
  77. maxframe/dataframe/reduction/core.py +2 -2
  78. maxframe/dataframe/reduction/tests/test_reduction.py +2 -4
  79. maxframe/dataframe/sort/sort_values.py +1 -11
  80. maxframe/dataframe/statistics/corr.py +3 -3
  81. maxframe/dataframe/statistics/quantile.py +13 -19
  82. maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
  83. maxframe/dataframe/tests/test_initializer.py +33 -2
  84. maxframe/dataframe/utils.py +26 -11
  85. maxframe/dataframe/window/expanding.py +5 -3
  86. maxframe/dataframe/window/tests/test_expanding.py +2 -2
  87. maxframe/errors.py +13 -0
  88. maxframe/extension.py +12 -0
  89. maxframe/io/__init__.py +13 -0
  90. maxframe/io/objects/__init__.py +24 -0
  91. maxframe/io/objects/core.py +140 -0
  92. maxframe/io/objects/tensor.py +76 -0
  93. maxframe/io/objects/tests/__init__.py +13 -0
  94. maxframe/io/objects/tests/test_object_io.py +97 -0
  95. maxframe/{odpsio → io/odpsio}/__init__.py +3 -1
  96. maxframe/{odpsio → io/odpsio}/arrow.py +42 -10
  97. maxframe/{odpsio → io/odpsio}/schema.py +38 -16
  98. maxframe/io/odpsio/tableio.py +719 -0
  99. maxframe/io/odpsio/tests/__init__.py +13 -0
  100. maxframe/{odpsio → io/odpsio}/tests/test_schema.py +59 -22
  101. maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +50 -23
  102. maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
  103. maxframe/io/odpsio/volumeio.py +63 -0
  104. maxframe/learn/contrib/__init__.py +3 -1
  105. maxframe/learn/contrib/graph/__init__.py +15 -0
  106. maxframe/learn/contrib/graph/connected_components.py +215 -0
  107. maxframe/learn/contrib/graph/tests/__init__.py +13 -0
  108. maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
  109. maxframe/learn/contrib/llm/__init__.py +16 -0
  110. maxframe/learn/contrib/llm/core.py +54 -0
  111. maxframe/learn/contrib/llm/models/__init__.py +14 -0
  112. maxframe/learn/contrib/llm/models/dashscope.py +73 -0
  113. maxframe/learn/contrib/llm/multi_modal.py +42 -0
  114. maxframe/learn/contrib/llm/text.py +42 -0
  115. maxframe/learn/contrib/xgboost/classifier.py +26 -2
  116. maxframe/learn/contrib/xgboost/core.py +87 -2
  117. maxframe/learn/contrib/xgboost/dmatrix.py +3 -6
  118. maxframe/learn/contrib/xgboost/predict.py +29 -46
  119. maxframe/learn/contrib/xgboost/regressor.py +3 -10
  120. maxframe/learn/contrib/xgboost/train.py +29 -18
  121. maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
  122. maxframe/lib/mmh3.cpython-311-darwin.so +0 -0
  123. maxframe/lib/mmh3.pyi +43 -0
  124. maxframe/lib/sparse/tests/test_sparse.py +15 -15
  125. maxframe/lib/wrapped_pickle.py +2 -1
  126. maxframe/opcodes.py +8 -0
  127. maxframe/protocol.py +154 -27
  128. maxframe/remote/core.py +4 -8
  129. maxframe/serialization/__init__.py +1 -0
  130. maxframe/serialization/core.cpython-311-darwin.so +0 -0
  131. maxframe/serialization/core.pxd +3 -0
  132. maxframe/serialization/core.pyi +3 -0
  133. maxframe/serialization/core.pyx +67 -26
  134. maxframe/serialization/exception.py +1 -1
  135. maxframe/serialization/pandas.py +52 -17
  136. maxframe/serialization/serializables/core.py +180 -15
  137. maxframe/serialization/serializables/field_type.py +4 -1
  138. maxframe/serialization/serializables/tests/test_serializable.py +54 -5
  139. maxframe/serialization/tests/test_serial.py +2 -1
  140. maxframe/session.py +9 -2
  141. maxframe/tensor/__init__.py +81 -2
  142. maxframe/tensor/arithmetic/isclose.py +1 -0
  143. maxframe/tensor/arithmetic/tests/test_arithmetic.py +22 -18
  144. maxframe/tensor/core.py +5 -136
  145. maxframe/tensor/datasource/array.py +3 -0
  146. maxframe/tensor/datasource/full.py +1 -1
  147. maxframe/tensor/datasource/tests/test_datasource.py +1 -1
  148. maxframe/tensor/indexing/flatnonzero.py +1 -1
  149. maxframe/tensor/indexing/getitem.py +2 -0
  150. maxframe/tensor/merge/__init__.py +2 -0
  151. maxframe/tensor/merge/concatenate.py +101 -0
  152. maxframe/tensor/merge/tests/test_merge.py +30 -1
  153. maxframe/tensor/merge/vstack.py +74 -0
  154. maxframe/tensor/{base → misc}/__init__.py +2 -0
  155. maxframe/tensor/{base → misc}/atleast_1d.py +1 -3
  156. maxframe/tensor/misc/atleast_2d.py +70 -0
  157. maxframe/tensor/misc/atleast_3d.py +85 -0
  158. maxframe/tensor/misc/tests/__init__.py +13 -0
  159. maxframe/tensor/{base → misc}/transpose.py +22 -18
  160. maxframe/tensor/{base → misc}/unique.py +3 -3
  161. maxframe/tensor/operators.py +1 -7
  162. maxframe/tensor/random/core.py +1 -1
  163. maxframe/tensor/reduction/count_nonzero.py +2 -1
  164. maxframe/tensor/reduction/mean.py +1 -0
  165. maxframe/tensor/reduction/nanmean.py +1 -0
  166. maxframe/tensor/reduction/nanvar.py +2 -0
  167. maxframe/tensor/reduction/tests/test_reduction.py +12 -1
  168. maxframe/tensor/reduction/var.py +2 -0
  169. maxframe/tensor/statistics/quantile.py +2 -2
  170. maxframe/tensor/utils.py +2 -22
  171. maxframe/tests/test_protocol.py +34 -0
  172. maxframe/tests/test_utils.py +0 -12
  173. maxframe/tests/utils.py +17 -2
  174. maxframe/typing_.py +4 -1
  175. maxframe/udf.py +8 -9
  176. maxframe/utils.py +106 -86
  177. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/METADATA +25 -25
  178. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/RECORD +197 -173
  179. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/WHEEL +1 -1
  180. maxframe_client/__init__.py +0 -1
  181. maxframe_client/clients/framedriver.py +4 -1
  182. maxframe_client/fetcher.py +81 -74
  183. maxframe_client/session/consts.py +3 -0
  184. maxframe_client/session/graph.py +8 -2
  185. maxframe_client/session/odps.py +194 -40
  186. maxframe_client/session/task.py +94 -39
  187. maxframe_client/tests/test_fetcher.py +21 -3
  188. maxframe_client/tests/test_session.py +109 -8
  189. maxframe/core/entity/chunks.py +0 -68
  190. maxframe/core/entity/fuse.py +0 -73
  191. maxframe/core/graph/builder/chunk.py +0 -430
  192. maxframe/odpsio/tableio.py +0 -322
  193. maxframe/odpsio/volumeio.py +0 -95
  194. maxframe_client/clients/spe.py +0 -104
  195. /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
  196. /maxframe/{tensor/base → dataframe/datastore}/tests/__init__.py +0 -0
  197. /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
  198. /maxframe/tensor/{base → misc}/astype.py +0 -0
  199. /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
  200. /maxframe/tensor/{base → misc}/ravel.py +0 -0
  201. /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
  202. /maxframe/tensor/{base → misc}/where.py +0 -0
  203. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/top_level.txt +0 -0
maxframe/remote/core.py CHANGED
@@ -15,7 +15,7 @@
15
15
  from functools import partial
16
16
 
17
17
  from .. import opcodes
18
- from ..core import ENTITY_TYPE, ChunkData
18
+ from ..core import ENTITY_TYPE
19
19
  from ..core.operator import ObjectOperator, ObjectOperatorMixin
20
20
  from ..dataframe.core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE
21
21
  from ..serialization.serializables import (
@@ -26,7 +26,7 @@ from ..serialization.serializables import (
26
26
  ListField,
27
27
  )
28
28
  from ..tensor.core import TENSOR_TYPE
29
- from ..utils import build_fetch_tileable, find_objects, replace_objects
29
+ from ..utils import find_objects, replace_objects
30
30
 
31
31
 
32
32
  class RemoteFunction(ObjectOperatorMixin, ObjectOperator):
@@ -63,12 +63,8 @@ class RemoteFunction(ObjectOperatorMixin, ObjectOperator):
63
63
  if raw_inputs is not None:
64
64
  for raw_inp in raw_inputs:
65
65
  if self._no_prepare(raw_inp):
66
- if not isinstance(self._inputs[0], ChunkData):
67
- # not in tile, set_inputs from tileable
68
- mapping[raw_inp] = next(function_inputs)
69
- else:
70
- # in tile, set_inputs from chunk
71
- mapping[raw_inp] = build_fetch_tileable(raw_inp)
66
+ # not in tile, set_inputs from tileable
67
+ mapping[raw_inp] = next(function_inputs)
72
68
  else:
73
69
  mapping[raw_inp] = next(function_inputs)
74
70
  self.function_args = replace_objects(self.function_args, mapping)
@@ -17,6 +17,7 @@ from .core import (
17
17
  PickleContainer,
18
18
  Serializer,
19
19
  deserialize,
20
+ load_type,
20
21
  pickle_buffers,
21
22
  serialize,
22
23
  serialize_with_spawn,
@@ -18,6 +18,9 @@ from libc.stdint cimport int32_t, uint64_t
18
18
  cdef class Serializer:
19
19
  cdef int _serializer_id
20
20
 
21
+ cpdef bint is_public_data_exist(self, dict context, object key)
22
+ cpdef put_public_data(self, dict context, object key, object value)
23
+ cpdef get_public_data(self, dict context, object key)
21
24
  cpdef serial(self, object obj, dict context)
22
25
  cpdef deserial(self, list serialized, dict context, list subs)
23
26
  cpdef on_deserial_error(
@@ -29,6 +29,9 @@ class PickleContainer:
29
29
 
30
30
  class Serializer:
31
31
  serializer_id: int
32
+ def is_public_data_exist(self, context: Dict, key: Any) -> bool: ...
33
+ def put_public_data(self, context: Dict, key: Any, value: Any) -> None: ...
34
+ def get_public_data(self, context: Dict, key: Any) -> Any: ...
32
35
  def serial(self, obj: Any, context: Dict): ...
33
36
  def deserial(self, serialized: List, context: Dict, subs: List[Any]): ...
34
37
  def on_deserial_error(
@@ -37,7 +37,7 @@ from .._utils import NamedType
37
37
  from .._utils cimport TypeDispatcher
38
38
 
39
39
  from ..lib import wrapped_pickle as pickle
40
- from ..utils import arrow_type_from_str
40
+ from ..utils import NoDefault, arrow_type_from_str, no_default
41
41
 
42
42
  try:
43
43
  from pandas import ArrowDtype
@@ -94,6 +94,7 @@ cdef:
94
94
  int COMPLEX_SERIALIZER = 12
95
95
  int SLICE_SERIALIZER = 13
96
96
  int REGEX_SERIALIZER = 14
97
+ int NO_DEFAULT_SERIALIZER = 15
97
98
  int PLACEHOLDER_SERIALIZER = 4096
98
99
 
99
100
 
@@ -130,11 +131,30 @@ cdef Serializer get_deserializer(int32_t deserializer_id):
130
131
 
131
132
  cdef class Serializer:
132
133
  serializer_id = None
134
+ _public_data_context_key = 0x7fffffff - 1
133
135
 
134
136
  def __cinit__(self):
135
137
  # make the value can be referenced with C code
136
138
  self._serializer_id = self.serializer_id
137
139
 
140
+ cpdef bint is_public_data_exist(self, dict context, object key):
141
+ cdef dict public_dict = context.get(self._public_data_context_key, None)
142
+ if public_dict is None:
143
+ return False
144
+ return key in public_dict
145
+
146
+ cpdef put_public_data(self, dict context, object key, object value):
147
+ cdef dict public_dict = context.get(self._public_data_context_key, None)
148
+ if public_dict is None:
149
+ public_dict = context[self._public_data_context_key] = {}
150
+ public_dict[key] = value
151
+
152
+ cpdef get_public_data(self, dict context, object key):
153
+ cdef dict public_dict = context.get(self._public_data_context_key, None)
154
+ if public_dict is None:
155
+ return None
156
+ return public_dict.get(key)
157
+
138
158
  cpdef serial(self, object obj, dict context):
139
159
  """
140
160
  Returns intermediate serialization result of certain object.
@@ -784,6 +804,16 @@ cdef class RegexSerializer(Serializer):
784
804
  return re.compile((<bytes>(subs[0])).decode(), serialized[0])
785
805
 
786
806
 
807
+ cdef class NoDefaultSerializer(Serializer):
808
+ serializer_id = NO_DEFAULT_SERIALIZER
809
+
810
+ cpdef serial(self, object obj, dict context):
811
+ return [], [], True
812
+
813
+ cpdef deserial(self, list obj, dict context, list subs):
814
+ return no_default
815
+
816
+
787
817
  cdef class Placeholder:
788
818
  """
789
819
  Placeholder object to reduce duplicated serialization
@@ -838,6 +868,7 @@ DtypeSerializer.register(ExtensionDtype)
838
868
  ComplexSerializer.register(complex)
839
869
  SliceSerializer.register(slice)
840
870
  RegexSerializer.register(re.Pattern)
871
+ NoDefaultSerializer.register(NoDefault)
841
872
  PlaceholderSerializer.register(Placeholder)
842
873
 
843
874
 
@@ -993,17 +1024,20 @@ def serialize(obj, dict context = None):
993
1024
  cdef list subs
994
1025
  cdef bint final
995
1026
  cdef _IdContextHolder id_context_holder = _IdContextHolder()
1027
+ cdef tuple result
996
1028
 
997
1029
  context = context if context is not None else dict()
998
1030
  serialized, subs, final = _serial_single(obj, context, id_context_holder)
999
1031
  if final or not subs:
1000
1032
  # marked as a leaf node, return directly
1001
- return [{}, serialized], subs
1002
-
1003
- serial_stack.append(_SerialStackItem(serialized, subs))
1004
- return _serialize_with_stack(
1005
- serial_stack, None, context, id_context_holder, result_bufs_list
1006
- )
1033
+ result = [{}, serialized], subs
1034
+ else:
1035
+ serial_stack.append(_SerialStackItem(serialized, subs))
1036
+ result = _serialize_with_stack(
1037
+ serial_stack, None, context, id_context_holder, result_bufs_list
1038
+ )
1039
+ result[0][0]["_PUB"] = context.get(Serializer._public_data_context_key)
1040
+ return result
1007
1041
 
1008
1042
 
1009
1043
  async def serialize_with_spawn(
@@ -1036,31 +1070,38 @@ async def serialize_with_spawn(
1036
1070
  cdef list subs
1037
1071
  cdef bint final
1038
1072
  cdef _IdContextHolder id_context_holder = _IdContextHolder()
1073
+ cdef tuple result
1039
1074
 
1040
1075
  context = context if context is not None else dict()
1041
1076
  serialized, subs, final = _serial_single(obj, context, id_context_holder)
1042
1077
  if final or not subs:
1043
1078
  # marked as a leaf node, return directly
1044
- return [{}, serialized], subs
1045
-
1046
- serial_stack.append(_SerialStackItem(serialized, subs))
1079
+ result = [{}, serialized], subs
1080
+ else:
1081
+ serial_stack.append(_SerialStackItem(serialized, subs))
1047
1082
 
1048
- try:
1049
- result = _serialize_with_stack(
1050
- serial_stack, None, context, id_context_holder, result_bufs_list, spawn_threshold
1051
- )
1052
- except _SerializeObjectOverflow as ex:
1053
- result = await asyncio.get_running_loop().run_in_executor(
1054
- executor,
1055
- _serialize_with_stack,
1056
- serial_stack,
1057
- ex.cur_serialized,
1058
- context,
1059
- id_context_holder,
1060
- result_bufs_list,
1061
- 0,
1062
- ex.num_total_serialized,
1063
- )
1083
+ try:
1084
+ result = _serialize_with_stack(
1085
+ serial_stack,
1086
+ None,
1087
+ context,
1088
+ id_context_holder,
1089
+ result_bufs_list,
1090
+ spawn_threshold,
1091
+ )
1092
+ except _SerializeObjectOverflow as ex:
1093
+ result = await asyncio.get_running_loop().run_in_executor(
1094
+ executor,
1095
+ _serialize_with_stack,
1096
+ serial_stack,
1097
+ ex.cur_serialized,
1098
+ context,
1099
+ id_context_holder,
1100
+ result_bufs_list,
1101
+ 0,
1102
+ ex.num_total_serialized,
1103
+ )
1104
+ result[0][0]["_PUB"] = context.get(Serializer._public_data_context_key)
1064
1105
  return result
1065
1106
 
1066
1107
 
@@ -35,7 +35,7 @@ class RemoteException(MaxFrameError):
35
35
  def from_exception(cls, exc: Exception):
36
36
  try:
37
37
  buffers = pickle_buffers(exc)
38
- except (TypeError, pickle.PicklingError):
38
+ except:
39
39
  logger.exception("Cannot pickle exception %s", exc)
40
40
  buffers = []
41
41
 
@@ -134,8 +134,10 @@ class ArraySerializer(Serializer):
134
134
  data_parts = [obj.tolist()]
135
135
  else:
136
136
  data_parts = [obj.to_numpy().tolist()]
137
- else:
137
+ elif hasattr(obj, "_data"):
138
138
  data_parts = [getattr(obj, "_data")]
139
+ else:
140
+ data_parts = [getattr(obj, "_pa_array")]
139
141
  return [ser_type], [dtype] + data_parts, False
140
142
 
141
143
  def deserial(self, serialized: List, context: Dict, subs: List):
@@ -155,33 +157,66 @@ class PdTimestampSerializer(Serializer):
155
157
  else:
156
158
  zone_info = []
157
159
  ts = obj.to_pydatetime().timestamp()
158
- return (
159
- [int(ts), obj.microsecond, obj.nanosecond],
160
- zone_info,
161
- bool(zone_info),
162
- )
160
+ elements = [int(ts), obj.microsecond, obj.nanosecond]
161
+ if hasattr(obj, "unit"):
162
+ elements.append(str(obj.unit))
163
+ return elements, zone_info, bool(zone_info)
163
164
 
164
165
  def deserial(self, serialized: List, context: Dict, subs: List):
165
166
  if subs:
166
- val = pd.Timestamp.utcfromtimestamp(serialized[0]).replace(
167
- microsecond=serialized[1], nanosecond=serialized[2]
168
- )
169
- val = val.replace(tzinfo=datetime.timezone.utc).tz_convert(subs[0])
167
+ pydt = datetime.datetime.utcfromtimestamp(serialized[0])
168
+ kwargs = {
169
+ "year": pydt.year,
170
+ "month": pydt.month,
171
+ "day": pydt.day,
172
+ "hour": pydt.hour,
173
+ "minute": pydt.minute,
174
+ "second": pydt.second,
175
+ "microsecond": serialized[1],
176
+ "nanosecond": serialized[2],
177
+ "tzinfo": datetime.timezone.utc,
178
+ }
179
+ if len(serialized) > 3:
180
+ kwargs["unit"] = serialized[3]
181
+ val = pd.Timestamp(**kwargs).tz_convert(subs[0])
170
182
  else:
171
- val = pd.Timestamp.fromtimestamp(serialized[0]).replace(
172
- microsecond=serialized[1], nanosecond=serialized[2]
173
- )
183
+ pydt = datetime.datetime.fromtimestamp(serialized[0])
184
+ kwargs = {
185
+ "year": pydt.year,
186
+ "month": pydt.month,
187
+ "day": pydt.day,
188
+ "hour": pydt.hour,
189
+ "minute": pydt.minute,
190
+ "second": pydt.second,
191
+ "microsecond": serialized[1],
192
+ "nanosecond": serialized[2],
193
+ }
194
+ if len(serialized) >= 4:
195
+ kwargs["unit"] = serialized[3]
196
+ val = pd.Timestamp(**kwargs)
174
197
  return val
175
198
 
176
199
 
177
200
  class PdTimedeltaSerializer(Serializer):
178
201
  def serial(self, obj: pd.Timedelta, context: Dict):
179
- return [int(obj.seconds), obj.microseconds, obj.nanoseconds], [], True
202
+ elements = [int(obj.seconds), obj.microseconds, obj.nanoseconds, obj.days]
203
+ if hasattr(obj, "unit"):
204
+ elements.append(str(obj.unit))
205
+ return elements, [], True
180
206
 
181
207
  def deserial(self, serialized: List, context: Dict, subs: List):
182
- return pd.Timedelta(
183
- seconds=serialized[0], microseconds=serialized[1], nanoseconds=serialized[2]
184
- )
208
+ days = 0 if len(serialized) < 4 else serialized[3]
209
+ unit = None if len(serialized) < 5 else serialized[4]
210
+ seconds, microseconds, nanoseconds = serialized[:3]
211
+ kwargs = {
212
+ "days": days,
213
+ "seconds": seconds,
214
+ "microseconds": microseconds,
215
+ "nanoseconds": nanoseconds,
216
+ }
217
+ if unit is not None:
218
+ kwargs["unit"] = unit
219
+ return pd.Timedelta(**kwargs)
185
220
 
186
221
 
187
222
  class NoDefaultSerializer(Serializer):
@@ -12,12 +12,14 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- import operator
16
15
  import weakref
17
- from typing import Dict, List, Tuple, Type
16
+ from collections import defaultdict
17
+ from typing import Any, Dict, List, Optional, Tuple, Type
18
18
 
19
19
  import msgpack
20
20
 
21
+ from ...lib.mmh3 import hash
22
+ from ...utils import no_default
21
23
  from ..core import Placeholder, Serializer, buffered, load_type
22
24
  from .field import Field
23
25
  from .field_type import DictType, ListType, PrimitiveFieldType, TupleType
@@ -50,11 +52,19 @@ def _is_field_primitive_compound(field: Field):
50
52
  class SerializableMeta(type):
51
53
  def __new__(mcs, name: str, bases: Tuple[Type], properties: Dict):
52
54
  # All the fields including misc fields.
55
+ legacy_name_hash = hash(f"{properties.get('__module__')}.{name}")
56
+ name_hash = hash(
57
+ f"{properties.get('__module__')}.{properties.get('__qualname__')}"
58
+ )
53
59
  all_fields = dict()
60
+ # mapping field names to base classes
61
+ field_to_cls_hash = dict()
54
62
 
55
63
  for base in bases:
56
- if hasattr(base, "_FIELDS"):
57
- all_fields.update(base._FIELDS)
64
+ if not hasattr(base, "_FIELDS"):
65
+ continue
66
+ all_fields.update(base._FIELDS)
67
+ field_to_cls_hash.update(base._FIELD_TO_NAME_HASH)
58
68
 
59
69
  properties_without_fields = {}
60
70
  properties_field_slot_names = []
@@ -64,6 +74,8 @@ class SerializableMeta(type):
64
74
  continue
65
75
 
66
76
  field = all_fields.get(k)
77
+ # record the field for the class being created
78
+ field_to_cls_hash[k] = name_hash
67
79
  if field is None:
68
80
  properties_field_slot_names.append(k)
69
81
  else:
@@ -75,23 +87,44 @@ class SerializableMeta(type):
75
87
 
76
88
  # Make field order deterministic to serialize it as list instead of dict.
77
89
  field_order = list(all_fields)
78
- all_fields = dict(sorted(all_fields.items(), key=operator.itemgetter(0)))
79
90
  primitive_fields = []
91
+ primitive_field_names = set()
80
92
  non_primitive_fields = []
81
- for v in all_fields.values():
93
+ for field_name, v in all_fields.items():
82
94
  if _is_field_primitive_compound(v):
83
95
  primitive_fields.append(v)
96
+ primitive_field_names.add(field_name)
84
97
  else:
85
98
  non_primitive_fields.append(v)
86
99
 
100
+ # count number of fields for every base class
101
+ cls_to_primitive_field_count = defaultdict(lambda: 0)
102
+ cls_to_non_primitive_field_count = defaultdict(lambda: 0)
103
+ for field_name in field_order:
104
+ cls_hash = field_to_cls_hash[field_name]
105
+ if field_name in primitive_field_names:
106
+ cls_to_primitive_field_count[cls_hash] += 1
107
+ else:
108
+ cls_to_non_primitive_field_count[cls_hash] += 1
109
+
87
110
  slots = set(properties.pop("__slots__", set()))
88
111
  slots.update(properties_field_slot_names)
89
112
 
90
113
  properties = properties_without_fields
114
+
115
+ # todo remove this prop when all versions below v1.0.0rc1 is eliminated
116
+ properties["_LEGACY_NAME_HASH"] = legacy_name_hash
117
+
118
+ properties["_NAME_HASH"] = name_hash
91
119
  properties["_FIELDS"] = all_fields
92
120
  properties["_FIELD_ORDER"] = field_order
121
+ properties["_FIELD_TO_NAME_HASH"] = field_to_cls_hash
93
122
  properties["_PRIMITIVE_FIELDS"] = primitive_fields
123
+ properties["_CLS_TO_PRIMITIVE_FIELD_COUNT"] = dict(cls_to_primitive_field_count)
94
124
  properties["_NON_PRIMITIVE_FIELDS"] = non_primitive_fields
125
+ properties["_CLS_TO_NON_PRIMITIVE_FIELD_COUNT"] = dict(
126
+ cls_to_non_primitive_field_count
127
+ )
95
128
  properties["__slots__"] = tuple(slots)
96
129
 
97
130
  clz = type.__new__(mcs, name, bases, properties)
@@ -114,10 +147,14 @@ class Serializable(metaclass=SerializableMeta):
114
147
  _cache_primitive_serial = False
115
148
  _ignore_non_existing_keys = False
116
149
 
150
+ _NAME_HASH: int
117
151
  _FIELDS: Dict[str, Field]
118
152
  _FIELD_ORDER: List[str]
153
+ _FIELD_TO_NAME_HASH: Dict[str, int]
119
154
  _PRIMITIVE_FIELDS: List[str]
155
+ _CLS_TO_PRIMITIVE_FIELD_COUNT: Dict[int, int]
120
156
  _NON_PRIMITIVE_FIELDS: List[str]
157
+ _CLS_TO_NON_PRIMITIVE_FIELD_COUNT: Dict[int, int]
121
158
 
122
159
  def __init__(self, *args, **kwargs):
123
160
  fields = self._FIELDS
@@ -175,11 +212,31 @@ class _NoFieldValue:
175
212
  _no_field_value = _NoFieldValue()
176
213
 
177
214
 
215
+ def _to_primitive_placeholder(v: Any) -> Any:
216
+ if v is _no_field_value or v is no_default:
217
+ return {}
218
+ return v
219
+
220
+
221
+ def _restore_primitive_placeholder(v: Any) -> Any:
222
+ if type(v) is dict:
223
+ if v == {}:
224
+ return _no_field_value
225
+ else:
226
+ return v
227
+ else:
228
+ return v
229
+
230
+
178
231
  class SerializableSerializer(Serializer):
179
232
  """
180
233
  Leverage DictSerializer to perform serde.
181
234
  """
182
235
 
236
+ @classmethod
237
+ def _get_obj_field_count_key(cls, obj: Serializable, legacy: bool = False):
238
+ return f"FC_{obj._NAME_HASH if not legacy else obj._LEGACY_NAME_HASH}"
239
+
183
240
  @classmethod
184
241
  def _get_field_values(cls, obj: Serializable, fields):
185
242
  values = []
@@ -201,15 +258,25 @@ class SerializableSerializer(Serializer):
201
258
  else:
202
259
  primitive_vals = self._get_field_values(obj, obj._PRIMITIVE_FIELDS)
203
260
  # replace _no_field_value as {} to make them msgpack-serializable
204
- primitive_vals = [
205
- v if v is not _no_field_value else {} for v in primitive_vals
206
- ]
261
+ primitive_vals = [_to_primitive_placeholder(v) for v in primitive_vals]
207
262
  if obj._cache_primitive_serial:
208
263
  primitive_vals = msgpack.dumps(primitive_vals)
209
264
  _primitive_serial_cache[obj] = primitive_vals
210
265
 
211
266
  compound_vals = self._get_field_values(obj, obj._NON_PRIMITIVE_FIELDS)
212
267
  cls_module = f"{type(obj).__module__}#{type(obj).__qualname__}"
268
+
269
+ field_count_key = self._get_obj_field_count_key(obj)
270
+ if not self.is_public_data_exist(context, field_count_key):
271
+ # store field distribution for current Serializable
272
+ counts = [
273
+ list(obj._CLS_TO_PRIMITIVE_FIELD_COUNT.items()),
274
+ list(obj._CLS_TO_NON_PRIMITIVE_FIELD_COUNT.items()),
275
+ ]
276
+ field_count_data = msgpack.dumps(counts)
277
+ self.put_public_data(
278
+ context, self._get_obj_field_count_key(obj), field_count_data
279
+ )
213
280
  return [cls_module, primitive_vals], [compound_vals], False
214
281
 
215
282
  @staticmethod
@@ -229,6 +296,92 @@ class SerializableSerializer(Serializer):
229
296
  else:
230
297
  field.set(obj, value)
231
298
 
299
+ @classmethod
300
+ def _set_field_values(
301
+ cls,
302
+ obj: Serializable,
303
+ values: List[Any],
304
+ client_cls_to_field_count: Optional[Dict[str, int]],
305
+ is_primitive: bool = True,
306
+ ):
307
+ obj_class = type(obj)
308
+ if is_primitive:
309
+ server_cls_to_field_count = obj_class._CLS_TO_PRIMITIVE_FIELD_COUNT
310
+ server_fields = obj_class._PRIMITIVE_FIELDS
311
+ else:
312
+ server_cls_to_field_count = obj_class._CLS_TO_NON_PRIMITIVE_FIELD_COUNT
313
+ server_fields = obj_class._NON_PRIMITIVE_FIELDS
314
+
315
+ legacy_to_new_hash = {
316
+ c._LEGACY_NAME_HASH: c._NAME_HASH
317
+ for c in obj_class.__mro__
318
+ if hasattr(c, "_NAME_HASH") and c._LEGACY_NAME_HASH != c._NAME_HASH
319
+ }
320
+
321
+ if client_cls_to_field_count:
322
+ field_num, server_field_num = 0, 0
323
+ for cls_hash, count in client_cls_to_field_count.items():
324
+ # cut values and fields given field distribution
325
+ # at client and server end
326
+ cls_fields = server_fields[server_field_num : field_num + count]
327
+ cls_values = values[field_num : field_num + count]
328
+ for field, value in zip(cls_fields, cls_values):
329
+ if is_primitive:
330
+ value = _restore_primitive_placeholder(value)
331
+ if not is_primitive or value is not _no_field_value:
332
+ cls._set_field_value(obj, field, value)
333
+ field_num += count
334
+ try:
335
+ server_field_num += server_cls_to_field_count[cls_hash]
336
+ except KeyError:
337
+ try:
338
+ # todo remove this fallback when all
339
+ # versions below v1.0.0rc1 is eliminated
340
+ server_field_num += server_cls_to_field_count[
341
+ legacy_to_new_hash[cls_hash]
342
+ ]
343
+ except KeyError:
344
+ # it is possible that certain type of field does not exist
345
+ # at server side
346
+ pass
347
+ else:
348
+ # handle legacy serialization style, with all fields sorted by name
349
+ # todo remove this branch when all versions below v0.1.0b5 is eliminated
350
+ from .field import AnyField
351
+
352
+ if is_primitive:
353
+ new_field_attr = "_legacy_new_primitives"
354
+ deprecated_field_attr = "_legacy_deprecated_primitives"
355
+ else:
356
+ new_field_attr = "_legacy_new_non_primitives"
357
+ deprecated_field_attr = "_legacy_deprecated_non_primitives"
358
+
359
+ # remove fields added on later releases
360
+ new_names = set(getattr(obj_class, new_field_attr, None) or [])
361
+ server_fields = [f for f in server_fields if f.name not in new_names]
362
+
363
+ # fill fields deprecated on later releases
364
+ deprecated_fields = []
365
+ deprecated_names = set()
366
+ if hasattr(obj_class, deprecated_field_attr):
367
+ deprecated_names = set(getattr(obj_class, deprecated_field_attr))
368
+ for field_name in deprecated_names:
369
+ field = AnyField(tag=field_name)
370
+ field.name = field_name
371
+ deprecated_fields.append(field)
372
+ server_fields = sorted(
373
+ server_fields + deprecated_fields, key=lambda f: f.name
374
+ )
375
+ for field, value in zip(server_fields, values):
376
+ if is_primitive:
377
+ value = _restore_primitive_placeholder(value)
378
+ if not is_primitive or value is not _no_field_value:
379
+ try:
380
+ cls._set_field_value(obj, field, value)
381
+ except AttributeError: # pragma: no cover
382
+ if field.name not in deprecated_names:
383
+ raise
384
+
232
385
  def deserial(self, serialized: List, context: Dict, subs: List) -> Serializable:
233
386
  obj_class_name, primitives = serialized
234
387
  obj_class = load_type(obj_class_name, Serializable)
@@ -238,14 +391,26 @@ class SerializableSerializer(Serializer):
238
391
 
239
392
  obj = obj_class.__new__(obj_class)
240
393
 
241
- if primitives:
242
- for field, value in zip(obj_class._PRIMITIVE_FIELDS, primitives):
243
- if value != {}:
244
- self._set_field_value(obj, field, value)
394
+ field_count_data = self.get_public_data(
395
+ context, self._get_obj_field_count_key(obj)
396
+ )
397
+ if field_count_data is None:
398
+ # todo remove this fallback when all
399
+ # versions below v1.0.0rc1 is eliminated
400
+ field_count_data = self.get_public_data(
401
+ context, self._get_obj_field_count_key(obj, legacy=True)
402
+ )
403
+ if field_count_data is not None:
404
+ cls_to_prim_key, cls_to_non_prim_key = msgpack.loads(field_count_data)
405
+ cls_to_prim_key = dict(cls_to_prim_key)
406
+ cls_to_non_prim_key = dict(cls_to_non_prim_key)
407
+ else:
408
+ cls_to_prim_key, cls_to_non_prim_key = None, None
245
409
 
410
+ if primitives:
411
+ self._set_field_values(obj, primitives, cls_to_prim_key, True)
246
412
  if obj_class._NON_PRIMITIVE_FIELDS:
247
- for field, value in zip(obj_class._NON_PRIMITIVE_FIELDS, subs[0]):
248
- self._set_field_value(obj, field, value)
413
+ self._set_field_values(obj, subs[0], cls_to_non_prim_key, False)
249
414
  obj.__on_deserialize__()
250
415
  return obj
251
416
 
@@ -46,6 +46,9 @@ class PrimitiveType(Enum):
46
46
  complex128 = 25
47
47
 
48
48
 
49
+ _np_unicode = np.unicode_ if hasattr(np, "unicode_") else np.str_
50
+
51
+
49
52
  _primitive_type_to_valid_types = {
50
53
  PrimitiveType.bool: (bool, np.bool_),
51
54
  PrimitiveType.int8: (int, np.int8),
@@ -60,7 +63,7 @@ _primitive_type_to_valid_types = {
60
63
  PrimitiveType.float32: (float, np.float32),
61
64
  PrimitiveType.float64: (float, np.float64),
62
65
  PrimitiveType.bytes: (bytes, np.bytes_),
63
- PrimitiveType.string: (str, np.unicode_),
66
+ PrimitiveType.string: (str, _np_unicode),
64
67
  PrimitiveType.complex64: (complex, np.complex64),
65
68
  PrimitiveType.complex128: (complex, np.complex128),
66
69
  }