rasa-pro 3.10.10__py3-none-any.whl → 3.10.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rasa-pro might be problematic. Click here for more details.

Files changed (38) hide show
  1. README.md +17 -396
  2. rasa/cli/arguments/train.py +9 -3
  3. rasa/cli/train.py +40 -2
  4. rasa/cli/utils.py +7 -5
  5. rasa/constants.py +1 -1
  6. rasa/core/featurizers/single_state_featurizer.py +22 -1
  7. rasa/core/featurizers/tracker_featurizers.py +115 -18
  8. rasa/core/policies/ted_policy.py +58 -33
  9. rasa/core/policies/unexpected_intent_policy.py +15 -7
  10. rasa/dialogue_understanding/commands/change_flow_command.py +6 -0
  11. rasa/dialogue_understanding/generator/multi_step/multi_step_llm_command_generator.py +20 -3
  12. rasa/dialogue_understanding/generator/single_step/single_step_llm_command_generator.py +29 -4
  13. rasa/e2e_test/e2e_test_runner.py +2 -2
  14. rasa/engine/storage/local_model_storage.py +41 -12
  15. rasa/model_training.py +10 -3
  16. rasa/nlu/classifiers/diet_classifier.py +38 -25
  17. rasa/nlu/classifiers/logistic_regression_classifier.py +22 -9
  18. rasa/nlu/classifiers/sklearn_intent_classifier.py +37 -16
  19. rasa/nlu/extractors/crf_entity_extractor.py +93 -50
  20. rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +45 -16
  21. rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py +52 -17
  22. rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +5 -3
  23. rasa/nlu/persistor.py +37 -15
  24. rasa/shared/constants.py +4 -1
  25. rasa/shared/importers/importer.py +7 -8
  26. rasa/shared/nlu/training_data/features.py +120 -2
  27. rasa/shared/utils/io.py +1 -0
  28. rasa/utils/io.py +0 -66
  29. rasa/utils/tensorflow/feature_array.py +366 -0
  30. rasa/utils/tensorflow/model_data.py +2 -193
  31. rasa/version.py +1 -1
  32. rasa_pro-3.10.12.dist-info/METADATA +196 -0
  33. {rasa_pro-3.10.10.dist-info → rasa_pro-3.10.12.dist-info}/RECORD +36 -36
  34. rasa/shared/importers/remote_importer.py +0 -196
  35. rasa_pro-3.10.10.dist-info/METADATA +0 -575
  36. {rasa_pro-3.10.10.dist-info → rasa_pro-3.10.12.dist-info}/NOTICE +0 -0
  37. {rasa_pro-3.10.10.dist-info → rasa_pro-3.10.12.dist-info}/WHEEL +0 -0
  38. {rasa_pro-3.10.10.dist-info → rasa_pro-3.10.12.dist-info}/entry_points.txt +0 -0
rasa/shared/constants.py CHANGED
@@ -111,7 +111,10 @@ CONFIG_KEYS_NLU = ["language", "pipeline"] + CONFIG_MANDATORY_COMMON_KEYS
111
111
  CONFIG_KEYS = CONFIG_KEYS_CORE + CONFIG_KEYS_NLU
112
112
  CONFIG_MANDATORY_KEYS_CORE: List[Text] = [] + CONFIG_MANDATORY_COMMON_KEYS
113
113
  CONFIG_MANDATORY_KEYS_NLU = ["language"] + CONFIG_MANDATORY_COMMON_KEYS
114
- CONFIG_MANDATORY_KEYS = CONFIG_MANDATORY_KEYS_CORE + CONFIG_MANDATORY_KEYS_NLU
114
# The core and NLU lists both include the common mandatory keys, so the
# concatenation contains duplicates. Deduplicate with `dict.fromkeys`, which
# keeps the first-seen order; going through `set` would make the order of this
# list vary between interpreter runs (string hashing is randomised per process).
CONFIG_MANDATORY_KEYS = list(
    dict.fromkeys(CONFIG_MANDATORY_KEYS_CORE + CONFIG_MANDATORY_KEYS_NLU)
)
115
118
 
116
119
  # Keys related to Forms (in the Domain)
117
120
  REQUIRED_SLOTS_KEY = "required_slots"
rasa/shared/importers/importer.py CHANGED
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
2
4
  from abc import ABC, abstractmethod
3
5
  from functools import reduce
@@ -114,7 +116,7 @@ class TrainingDataImporter(ABC):
114
116
  domain_path: Optional[Text] = None,
115
117
  training_data_paths: Optional[List[Text]] = None,
116
118
  args: Optional[Dict[Text, Any]] = {},
117
- ) -> "TrainingDataImporter":
119
+ ) -> TrainingDataImporter:
118
120
  """Loads a `TrainingDataImporter` instance from a configuration file."""
119
121
  config = read_config_file(config_path)
120
122
  return TrainingDataImporter.load_from_dict(
@@ -127,7 +129,7 @@ class TrainingDataImporter(ABC):
127
129
  domain_path: Optional[Text] = None,
128
130
  training_data_paths: Optional[List[Text]] = None,
129
131
  args: Optional[Dict[Text, Any]] = {},
130
- ) -> "TrainingDataImporter":
132
+ ) -> TrainingDataImporter:
131
133
  """Loads core `TrainingDataImporter` instance.
132
134
 
133
135
  Instance loaded from configuration file will only read Core training data.
@@ -143,7 +145,7 @@ class TrainingDataImporter(ABC):
143
145
  domain_path: Optional[Text] = None,
144
146
  training_data_paths: Optional[List[Text]] = None,
145
147
  args: Optional[Dict[Text, Any]] = {},
146
- ) -> "TrainingDataImporter":
148
+ ) -> TrainingDataImporter:
147
149
  """Loads nlu `TrainingDataImporter` instance.
148
150
 
149
151
  Instance loaded from configuration file will only read NLU training data.
@@ -166,7 +168,7 @@ class TrainingDataImporter(ABC):
166
168
  domain_path: Optional[Text] = None,
167
169
  training_data_paths: Optional[List[Text]] = None,
168
170
  args: Optional[Dict[Text, Any]] = None,
169
- ) -> "TrainingDataImporter":
171
+ ) -> TrainingDataImporter:
170
172
  """Loads a `TrainingDataImporter` instance from a dictionary."""
171
173
  from rasa.shared.importers.rasa import RasaFileImporter
172
174
 
@@ -195,18 +197,15 @@ class TrainingDataImporter(ABC):
195
197
  domain_path: Optional[Text] = None,
196
198
  training_data_paths: Optional[List[Text]] = None,
197
199
  args: Optional[Dict[Text, Any]] = None,
198
- ) -> Optional["TrainingDataImporter"]:
200
+ ) -> Optional[TrainingDataImporter]:
199
201
  from rasa.shared.importers.multi_project import MultiProjectImporter
200
202
  from rasa.shared.importers.rasa import RasaFileImporter
201
- from rasa.shared.importers.remote_importer import RemoteTrainingDataImporter
202
203
 
203
204
  module_path = importer_config.pop("name", None)
204
205
  if module_path == RasaFileImporter.__name__:
205
206
  importer_class: Type[TrainingDataImporter] = RasaFileImporter
206
207
  elif module_path == MultiProjectImporter.__name__:
207
208
  importer_class = MultiProjectImporter
208
- elif module_path == RemoteTrainingDataImporter.__name__:
209
- importer_class = RemoteTrainingDataImporter
210
209
  else:
211
210
  try:
212
211
  importer_class = rasa.shared.utils.common.class_from_module_path(
rasa/shared/nlu/training_data/features.py CHANGED
@@ -1,15 +1,133 @@
1
1
  from __future__ import annotations
2
- from typing import Iterable, Union, Text, Optional, List, Any, Tuple, Dict, Set
2
+
3
3
  import itertools
4
+ from dataclasses import dataclass
5
+ from typing import Iterable, Union, Text, Optional, List, Any, Tuple, Dict, Set
4
6
 
5
7
  import numpy as np
6
8
  import scipy.sparse
9
+ from safetensors.numpy import save_file, load_file
7
10
 
8
- import rasa.shared.utils.io
9
11
  import rasa.shared.nlu.training_data.util
12
+ import rasa.shared.utils.io
10
13
  from rasa.shared.nlu.constants import FEATURE_TYPE_SEQUENCE, FEATURE_TYPE_SENTENCE
11
14
 
12
15
 
16
@dataclass
class FeatureMetadata:
    """Bookkeeping record for one `Features` object written by `save_features`.

    The tensor values themselves live in the safetensors file; this record
    holds everything else `load_features` reads to rebuild the object.
    """

    # Value of `Features.type` at save time.
    data_type: str
    # Value of `Features.attribute` at save time.
    attribute: str
    # Value of `Features.origin` at save time.
    origin: Union[str, List[str]]
    # Whether the feature matrix was sparse (stored as COO triplets) or dense.
    is_sparse: bool
    # Shape of the feature matrix; needed to rebuild a sparse matrix.
    shape: tuple
    # Base key under which the tensor(s) are stored in the safetensors file.
    safetensors_key: str
24
+
25
+
26
def save_features(
    features_dict: Dict[Text, List[Features]], file_name: str
) -> Dict[str, Any]:
    """Write all feature matrices to one safetensors file and describe them.

    Dense matrices are stored as a single tensor. Sparse matrices are NOT
    densified: they are converted to COO form and stored as three tensors
    (`<key>_data`, `<key>_row`, `<key>_col`).

    Args:
        features_dict: Dictionary mapping strings to lists of Features objects.
        file_name: Path of the safetensors file to write.

    Returns:
        For every key of `features_dict`, a list with one metadata dictionary
        (the fields of `FeatureMetadata`) per feature, in the original order.
        `load_features` needs this to rebuild the features.
    """
    tensors: Dict[str, Any] = {}
    metadata: Dict[str, Any] = {}

    for name, feature_group in features_dict.items():
        entries = []

        for position, feature in enumerate(feature_group):
            # Unique base name for this feature inside the safetensors file.
            tensor_key = f"{name}_{position}"
            sparse = feature.is_sparse()

            if not sparse:
                tensors[tensor_key] = feature.features
            else:
                coo = feature.features.tocoo()  # type:ignore[union-attr]
                tensors[f"{tensor_key}_data"] = coo.data
                tensors[f"{tensor_key}_row"] = coo.row
                tensors[f"{tensor_key}_col"] = coo.col

            record = FeatureMetadata(
                data_type=feature.type,
                attribute=feature.attribute,
                origin=feature.origin,
                is_sparse=sparse,
                shape=feature.features.shape,
                safetensors_key=tensor_key,
            )
            entries.append(vars(record))

        metadata[name] = entries

    save_file(tensors, file_name)

    return metadata
78
+
79
+
80
def load_features(
    filename: str, metadata: Dict[str, Any]
) -> Dict[Text, List[Features]]:
    """Rebuild the features dictionary written by `save_features`.

    Args:
        filename: Path of the safetensors file holding the tensors.
        metadata: The metadata returned by `save_features` for that file.

    Returns:
        Dictionary mapping strings to lists of Features objects. Sparse
        features come back as CSR matrices.
    """
    stored = load_file(filename)

    def _rebuild(entry: Dict[str, Any]) -> Features:
        """Create one Features object from its metadata entry."""
        tensor_key = entry["safetensors_key"]

        if entry["is_sparse"]:
            # Sparse matrices were saved as COO triplets.
            values = stored[f"{tensor_key}_data"]
            rows = stored[f"{tensor_key}_row"]
            cols = stored[f"{tensor_key}_col"]
            matrix = scipy.sparse.coo_matrix(
                (values, (rows, cols)), shape=tuple(entry["shape"])
            ).tocsr()
        else:
            matrix = stored[tensor_key]

        return Features(
            features=matrix,
            feature_type=entry["data_type"],
            attribute=entry["attribute"],
            origin=entry["origin"],
        )

    return {
        name: [_rebuild(entry) for entry in entries]
        for name, entries in metadata.items()
    }
129
+
130
+
13
131
  class Features:
14
132
  """Stores the features produced by any featurizer."""
15
133
 
rasa/shared/utils/io.py CHANGED
@@ -13,6 +13,7 @@ from typing import Any, cast, Callable, Dict, List, Optional, Text, Type, TypeVa
13
13
  import warnings
14
14
  import random
15
15
  import string
16
+
16
17
  import portalocker
17
18
 
18
19
  from rasa.shared.constants import (
rasa/utils/io.py CHANGED
@@ -2,7 +2,6 @@ import asyncio
2
2
  import filecmp
3
3
  import logging
4
4
  import os
5
- import pickle
6
5
  import tempfile
7
6
  import warnings
8
7
  import re
@@ -98,29 +97,6 @@ def enable_async_loop_debugging(
98
97
  return event_loop
99
98
 
100
99
 
101
- def pickle_dump(filename: Union[Text, Path], obj: Any) -> None:
102
- """Saves object to file.
103
-
104
- Args:
105
- filename: the filename to save the object to
106
- obj: the object to store
107
- """
108
- with open(filename, "wb") as f:
109
- pickle.dump(obj, f)
110
-
111
-
112
- def pickle_load(filename: Union[Text, Path]) -> Any:
113
- """Loads an object from a file.
114
-
115
- Args:
116
- filename: the filename to load the object from
117
-
118
- Returns: the loaded object
119
- """
120
- with open(filename, "rb") as f:
121
- return pickle.load(f)
122
-
123
-
124
100
  def create_temporary_file(data: Any, suffix: Text = "", mode: Text = "w+") -> Text:
125
101
  """Creates a tempfile.NamedTemporaryFile object for data."""
126
102
  encoding = None if "b" in mode else rasa.shared.utils.io.DEFAULT_ENCODING
@@ -191,48 +167,6 @@ def create_validator(
191
167
  return FunctionValidator
192
168
 
193
169
 
194
- def json_unpickle(
195
- file_name: Union[Text, Path], encode_non_string_keys: bool = False
196
- ) -> Any:
197
- """Unpickle an object from file using json.
198
-
199
- Args:
200
- file_name: the file to load the object from
201
- encode_non_string_keys: If set to `True` then jsonpickle will encode non-string
202
- dictionary keys instead of coercing them into strings via `repr()`.
203
-
204
- Returns: the object
205
- """
206
- import jsonpickle.ext.numpy as jsonpickle_numpy
207
- import jsonpickle
208
-
209
- jsonpickle_numpy.register_handlers()
210
-
211
- file_content = rasa.shared.utils.io.read_file(file_name)
212
- return jsonpickle.loads(file_content, keys=encode_non_string_keys)
213
-
214
-
215
- def json_pickle(
216
- file_name: Union[Text, Path], obj: Any, encode_non_string_keys: bool = False
217
- ) -> None:
218
- """Pickle an object to a file using json.
219
-
220
- Args:
221
- file_name: the file to store the object to
222
- obj: the object to store
223
- encode_non_string_keys: If set to `True` then jsonpickle will encode non-string
224
- dictionary keys instead of coercing them into strings via `repr()`.
225
- """
226
- import jsonpickle.ext.numpy as jsonpickle_numpy
227
- import jsonpickle
228
-
229
- jsonpickle_numpy.register_handlers()
230
-
231
- rasa.shared.utils.io.write_text_file(
232
- jsonpickle.dumps(obj, keys=encode_non_string_keys), file_name
233
- )
234
-
235
-
236
170
  def get_emoji_regex() -> Pattern:
237
171
  """Returns regex to identify emojis."""
238
172
  return re.compile(
rasa/utils/tensorflow/feature_array.py ADDED
@@ -0,0 +1,366 @@
1
+ from typing import Dict, Any, List, Tuple, Optional, Union
2
+
3
+ from safetensors.numpy import save_file
4
+ import numpy as np
5
+ from safetensors.numpy import load_file
6
+ import scipy.sparse
7
+
8
+ import rasa.shared.utils.io
9
+
10
+
11
+ def _recursive_serialize(
12
+ array: Any, prefix: str, data_dict: Dict[str, Any], metadata: List[Dict[str, Any]]
13
+ ) -> None:
14
+ """Recursively serialize arrays and matrices for high dimensional data."""
15
+ if isinstance(array, np.ndarray) and array.ndim <= 2:
16
+ data_key = f"{prefix}_array"
17
+ data_dict[data_key] = array
18
+ metadata.append({"type": "dense", "key": data_key, "shape": array.shape})
19
+
20
+ elif isinstance(array, list) and all([isinstance(v, float) for v in array]):
21
+ data_key = f"{prefix}_list"
22
+ data_dict[data_key] = np.array(array, dtype=np.float32)
23
+ metadata.append({"type": "list", "key": data_key})
24
+
25
+ elif isinstance(array, list) and all([isinstance(v, int) for v in array]):
26
+ data_key = f"{prefix}_list"
27
+ data_dict[data_key] = np.array(array, dtype=np.int64)
28
+ metadata.append({"type": "list", "key": data_key})
29
+
30
+ elif isinstance(array, scipy.sparse.spmatrix):
31
+ data_key_data = f"{prefix}_data"
32
+ data_key_row = f"{prefix}_row"
33
+ data_key_col = f"{prefix}_col"
34
+ array = array.tocoo()
35
+ data_dict.update(
36
+ {
37
+ data_key_data: array.data,
38
+ data_key_row: array.row,
39
+ data_key_col: array.col,
40
+ }
41
+ )
42
+ metadata.append({"type": "sparse", "key": prefix, "shape": array.shape})
43
+
44
+ elif isinstance(array, list) or isinstance(array, np.ndarray):
45
+ group_metadata = {"type": "group", "subcomponents": []}
46
+ for idx, item in enumerate(array):
47
+ new_prefix = f"{prefix}_{idx}"
48
+ _recursive_serialize(
49
+ item, new_prefix, data_dict, group_metadata["subcomponents"]
50
+ )
51
+ metadata.append(group_metadata)
52
+
53
+
54
def _serialize_nested_data(
    nested_data: Dict[str, Dict[str, List["FeatureArray"]]],
    prefix: str,
    data_dict: Dict[str, np.ndarray],
    metadata: List[Dict[str, Union[str, List]]],
) -> None:
    """Serialize a two-level dictionary of FeatureArray lists.

    One entry per outer key is appended to `metadata`; it lists, per inner
    key, the number of dimensions (read from the first FeatureArray of that
    list) and the serialized descriptors of all its FeatureArrays. The tensors
    are stored in `data_dict` under
    `<prefix>_<outer key>_<inner key>_<index>...`.

    Args:
        nested_data: The nested feature arrays to serialize.
        prefix: Key prefix for everything stored in `data_dict`.
        data_dict: Receives the tensors.
        metadata: Receives one descriptor per outer key.
    """
    for outer_key, inner_dict in nested_data.items():
        components: List[Dict[str, Any]] = []

        for inner_key, feature_arrays in inner_dict.items():
            serialized: List[Dict[str, Any]] = []
            component = {
                "key": inner_key,
                "number_of_dimensions": feature_arrays[0].number_of_dimensions,
                "features": serialized,
            }

            for position, feature_array in enumerate(feature_arrays):
                _recursive_serialize(
                    feature_array.tolist(),
                    f"{prefix}_{outer_key}_{inner_key}_{position}",
                    data_dict,
                    serialized,
                )

            components.append(component)

        metadata.append({"key": outer_key, "components": components})
83
+
84
+
85
def serialize_nested_feature_arrays(
    nested_feature_array: Dict[str, Dict[str, List["FeatureArray"]]],
    data_filename: str,
    metadata_filename: str,
) -> None:
    """Persist nested feature arrays as a safetensors file plus a JSON file.

    Args:
        nested_feature_array: The nested feature arrays to store.
        data_filename: Path of the safetensors file receiving the tensors.
        metadata_filename: Path of the JSON file receiving the metadata that
            describes how to rebuild the nested structure.
    """
    tensors: Dict[str, np.ndarray] = {}
    structure: List[Dict[str, Union[str, List]]] = []

    _serialize_nested_data(nested_feature_array, "component", tensors, structure)

    # Tensors and their structural description go into separate files.
    save_file(tensors, data_filename)
    rasa.shared.utils.io.dump_obj_as_json_to_file(metadata_filename, structure)
98
+
99
+
100
+ def _recursive_deserialize(
101
+ metadata: List[Dict[str, Any]], data: Dict[str, Any]
102
+ ) -> List[Any]:
103
+ """Recursively deserialize arrays and matrices for high dimensional data."""
104
+ result = []
105
+
106
+ for item in metadata:
107
+ if item["type"] == "dense":
108
+ key = item["key"]
109
+ array = np.asarray(data[key]).reshape(item["shape"])
110
+ result.append(array)
111
+
112
+ elif item["type"] == "list":
113
+ key = item["key"]
114
+ result.append(list(data[key]))
115
+
116
+ elif item["type"] == "sparse":
117
+ data_vals = data[f"{item['key']}_data"]
118
+ row_vals = data[f"{item['key']}_row"]
119
+ col_vals = data[f"{item['key']}_col"]
120
+ sparse_matrix = scipy.sparse.coo_matrix(
121
+ (data_vals, (row_vals, col_vals)), shape=item["shape"]
122
+ )
123
+ result.append(sparse_matrix)
124
+ elif item["type"] == "group":
125
+ sublist = _recursive_deserialize(item["subcomponents"], data)
126
+ result.append(sublist)
127
+
128
+ return result
129
+
130
+
131
def _deserialize_nested_data(
    metadata: List[Dict[str, Any]], data_dict: Dict[str, Any]
) -> Dict[str, Dict[str, List["FeatureArray"]]]:
    """Rebuild the two-level dictionary of FeatureArray lists.

    Args:
        metadata: One descriptor per outer key, each listing its components
            (inner key, number of dimensions, serialized features).
        data_dict: Mapping from tensor key to stored tensor.

    Returns:
        Outer key -> inner key -> list of FeatureArrays.
    """
    restored: Dict[str, Dict[str, List["FeatureArray"]]] = {}

    for outer_entry in metadata:
        per_inner_key: Dict[str, List["FeatureArray"]] = {}
        restored[outer_entry["key"]] = per_inner_key

        for component in outer_entry["components"]:
            inner_key = component["key"]
            rebuilt = []

            for descriptor in component["features"]:
                # A single descriptor yields a one-element list.
                values = _recursive_deserialize([descriptor], data_dict)
                # dtype=object so that nested/ragged content is accepted
                # by np.array before it is handed to FeatureArray.
                as_array = np.array(values[0], dtype=object)
                rebuilt.append(
                    FeatureArray(as_array, component["number_of_dimensions"])
                )

            per_inner_key[inner_key] = rebuilt

    return restored
160
+
161
+
162
def deserialize_nested_feature_arrays(
    data_filename: str, metadata_filename: str
) -> Dict[str, Dict[str, List["FeatureArray"]]]:
    """Load nested feature arrays from a safetensors file and its JSON metadata.

    Args:
        data_filename: Path of the safetensors file holding the tensors.
        metadata_filename: Path of the JSON file describing the structure.

    Returns:
        Outer key -> inner key -> list of FeatureArrays.
    """
    structure = rasa.shared.utils.io.read_json_file(metadata_filename)
    tensors = load_file(data_filename)

    return _deserialize_nested_data(structure, tensors)
169
+
170
+
171
class FeatureArray(np.ndarray):
    """Stores any kind of features ready to be used by a RasaModel.

    Next to the input numpy array of features, it also receives the number of
    dimensions of the features.
    As our features can have 1 to 4 dimensions we might have different number of numpy
    arrays stacked. The number of dimensions helps us to figure out how to handle this
    particular feature array. Also, it is automatically determined whether the feature
    array is sparse or not and the number of units is determined as well.

    Attributes set on every instance created through the constructor:
        units: size of the last axis of the innermost feature matrix
        is_sparse: whether the innermost entries are scipy sparse matrices
        number_of_dimensions: the number of dimensions passed by the caller

    Subclassing np.array: https://numpy.org/doc/stable/user/basics.subclassing.html
    """

    def __new__(
        cls, input_array: np.ndarray, number_of_dimensions: int
    ) -> "FeatureArray":
        """Create and return a new object. See help(type) for accurate signature.

        Args:
            input_array: the array that contains features
            number_of_dimensions: number of dimensions in input_array

        Raises:
            ValueError: if the number of dimensions is not supported or does
                not match the input array.
        """
        FeatureArray._validate_number_of_dimensions(number_of_dimensions, input_array)

        feature_array = np.asarray(input_array).view(cls)

        # The innermost feature matrix sits one level deeper for every
        # dimension above two.
        if number_of_dimensions <= 2:
            feature_array.units = input_array.shape[-1]
            feature_array.is_sparse = isinstance(input_array[0], scipy.sparse.spmatrix)
        elif number_of_dimensions == 3:
            feature_array.units = input_array[0].shape[-1]
            feature_array.is_sparse = isinstance(input_array[0], scipy.sparse.spmatrix)
        elif number_of_dimensions == 4:
            feature_array.units = input_array[0][0].shape[-1]
            feature_array.is_sparse = isinstance(
                input_array[0][0], scipy.sparse.spmatrix
            )
        else:
            raise ValueError(
                f"Number of dimensions '{number_of_dimensions}' currently not "
                f"supported."
            )

        feature_array.number_of_dimensions = number_of_dimensions

        return feature_array

    def __init__(
        self, input_array: Any, number_of_dimensions: int, **kwargs: Any
    ) -> None:
        """Initialize FeatureArray.

        Needed in order to avoid 'Invalid keyword argument number_of_dimensions
        to function FeatureArray.__init__ '
        Args:
            input_array: the array that contains features
            number_of_dimensions: number of dimensions in input_array
        """
        super().__init__(**kwargs)
        self.number_of_dimensions = number_of_dimensions

    def __array_finalize__(self, obj: Optional[np.ndarray]) -> None:
        """This method is called when the system allocates a new array from obj.

        Copies `units`, `number_of_dimensions` and `is_sparse` from `obj`
        (`None` for each attribute `obj` does not have), so views and slices
        keep the attributes of the array they were taken from.

        Args:
            obj: A subclass (subtype) of ndarray.
        """
        if obj is None:
            return

        # Plain attribute assignment already stores the values in __dict__.
        # The previous extra `__dict__.update` only added a misspelled
        # "is_spare" entry and has been dropped.
        self.units = getattr(obj, "units", None)
        self.number_of_dimensions = getattr(obj, "number_of_dimensions", None)  # type: ignore[assignment]
        self.is_sparse = getattr(obj, "is_sparse", None)

    # pytype: disable=attribute-error
    def __array_ufunc__(
        self, ufunc: Any, method: str, *inputs: Any, **kwargs: Any
    ) -> Any:
        """Overwrite this method as we are subclassing numpy array.

        The ufunc is evaluated on plain `np.ndarray` views of all FeatureArray
        inputs/outputs (to prevent recursion into this method); array results
        are then viewed as FeatureArray again and receive a copy of this
        instance's attributes. Non-array results (scalars, `None` from
        `ufunc.at`) are returned unchanged.

        Args:
            ufunc: The ufunc object that was called.
            method: A string indicating which Ufunc method was called
                (one of "__call__", "reduce", "reduceat", "accumulate", "outer",
                "at").
            *inputs: A tuple of the input arguments to the ufunc.
            **kwargs: Any additional arguments. An optional
                `number_of_dimensions` entry overrides the value carried
                forward from this instance; it is not passed to the ufunc.

        Returns:
            The result of the operation.
        """
        # NumPy never supplies this key itself, so fall back to our own value
        # instead of failing with a KeyError on every ufunc call.
        number_of_dimensions = kwargs.pop(
            "number_of_dimensions", self.number_of_dimensions
        )

        def _plain(value: Any) -> Any:
            # Scalars and foreign objects have no meaningful `.view`; only
            # FeatureArrays need to be downcast to avoid re-entering here.
            if isinstance(value, FeatureArray):
                return value.view(np.ndarray)
            return value

        if kwargs.get("out") is not None:
            kwargs["out"] = tuple(_plain(target) for target in kwargs["out"])

        result = getattr(ufunc, method)(*(_plain(i) for i in inputs), **kwargs)

        def _wrap(value: Any) -> Any:
            if not isinstance(value, np.ndarray):
                return value
            wrapped = value.view(FeatureArray)
            # Copy (do not alias) the attribute dict so that later changes to
            # one array do not silently affect the other.
            wrapped.__dict__.update(self.__dict__)
            wrapped.number_of_dimensions = number_of_dimensions
            return wrapped

        if isinstance(result, tuple):
            # ufuncs with several outputs return a tuple of arrays
            return tuple(_wrap(value) for value in result)
        return _wrap(result)

    def __reduce__(self) -> Tuple[Any, Any, Any]:
        """Needed in order to pickle this object.

        Appends `number_of_dimensions`, `is_sparse` and `units` to the state
        produced by `np.ndarray.__reduce__`.

        Returns:
            A tuple.
        """
        pickled_state = super(FeatureArray, self).__reduce__()
        if isinstance(pickled_state, str):
            raise TypeError("np array __reduce__ returned string instead of tuple.")
        new_state = pickled_state[2] + (
            self.number_of_dimensions,
            self.is_sparse,
            self.units,
        )
        return pickled_state[0], pickled_state[1], new_state

    def __setstate__(self, state: Any, **kwargs: Any) -> None:
        """Sets the state.

        Args:
            state: The state argument must be a sequence that contains the following
                elements version, shape, dtype, isFortran, rawdata, followed by
                the three values appended in `__reduce__`.
            **kwargs: Any additional parameter
        """
        # Needed in order to load the object
        self.number_of_dimensions = state[-3]
        self.is_sparse = state[-2]
        self.units = state[-1]
        super(FeatureArray, self).__setstate__(state[0:-3], **kwargs)

    # pytype: enable=attribute-error

    @staticmethod
    def _validate_number_of_dimensions(
        number_of_dimensions: int, input_array: np.ndarray
    ) -> None:
        """Validates if the input array has given number of dimensions.

        Args:
            number_of_dimensions: number of dimensions
            input_array: input array

        Raises: ValueError in case the dimensions do not match
        """
        # when loading the feature arrays from disk, the shape represents
        # the correct number of dimensions
        if len(input_array.shape) == number_of_dimensions:
            return

        _sub_array = input_array
        dim = 0
        # Go number_of_dimensions into the given input_array
        for i in range(1, number_of_dimensions + 1):
            _sub_array = _sub_array[0]
            if isinstance(_sub_array, scipy.sparse.spmatrix):
                dim = i
                break
            if isinstance(_sub_array, np.ndarray) and _sub_array.shape[0] == 0:
                # sequence dimension is 0, we are dealing with "fake" features
                dim = i
                break

        # If the resulting sub_array is sparse, it must have been reached
        # within the first two levels (the matrix itself covers the rest)
        if isinstance(_sub_array, scipy.sparse.spmatrix):
            if dim > 2:
                raise ValueError(
                    f"Given number of dimensions '{number_of_dimensions}' does not "
                    f"match dimensions of given input array: {input_array}."
                )
        elif isinstance(_sub_array, np.ndarray) and _sub_array.shape[0] == 0:
            # sequence dimension is 0, we are dealing with "fake" features,
            # but they should be of dim 2
            if dim > 2:
                raise ValueError(
                    f"Given number of dimensions '{number_of_dimensions}' does not "
                    f"match dimensions of given input array: {input_array}."
                )
        # If the resulting sub_array is dense, the sub_array should be a single number
        elif not np.issubdtype(type(_sub_array), np.integer) and not isinstance(
            _sub_array, (np.float32, np.float64)
        ):
            raise ValueError(
                f"Given number of dimensions '{number_of_dimensions}' does not match "
                f"dimensions of given input array: {input_array}."
            )