rasa-pro 3.8.16__py3-none-any.whl → 3.8.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py CHANGED
@@ -1,30 +1,32 @@
  from __future__ import annotations
+
  import logging
  import re
+ from typing import Any, Dict, List, Optional, Text, Tuple, Set, Type, Union
+
+ import numpy as np
  import scipy.sparse
- from typing import Any, Dict, List, Optional, Text, Tuple, Set, Type
- from rasa.nlu.tokenizers.tokenizer import Tokenizer
+ from sklearn.exceptions import NotFittedError
+ from sklearn.feature_extraction.text import CountVectorizer

  import rasa.shared.utils.io
  from rasa.engine.graph import GraphComponent, ExecutionContext
  from rasa.engine.recipes.default_recipe import DefaultV1Recipe
  from rasa.engine.storage.resource import Resource
  from rasa.engine.storage.storage import ModelStorage
- from rasa.nlu.featurizers.sparse_featurizer.sparse_featurizer import SparseFeaturizer
- from rasa.nlu.utils.spacy_utils import SpacyModel
- from rasa.shared.constants import DOCS_URL_COMPONENTS
- import rasa.utils.io as io_utils
- from sklearn.exceptions import NotFittedError
- from sklearn.feature_extraction.text import CountVectorizer
- from rasa.shared.nlu.training_data.training_data import TrainingData
- from rasa.shared.nlu.training_data.message import Message
- from rasa.shared.exceptions import RasaException, FileIOException
  from rasa.nlu.constants import (
      TOKENS_NAMES,
      MESSAGE_ATTRIBUTES,
      DENSE_FEATURIZABLE_ATTRIBUTES,
  )
+ from rasa.nlu.featurizers.sparse_featurizer.sparse_featurizer import SparseFeaturizer
+ from rasa.nlu.tokenizers.tokenizer import Tokenizer
+ from rasa.nlu.utils.spacy_utils import SpacyModel
+ from rasa.shared.constants import DOCS_URL_COMPONENTS
+ from rasa.shared.exceptions import RasaException, FileIOException
  from rasa.shared.nlu.constants import TEXT, INTENT, INTENT_RESPONSE_KEY, ACTION_NAME
+ from rasa.shared.nlu.training_data.message import Message
+ from rasa.shared.nlu.training_data.training_data import TrainingData

  BUFFER_SLOTS_PREFIX = "buf_"

@@ -690,6 +692,31 @@ class CountVectorsFeaturizer(SparseFeaturizer, GraphComponent):
          """Check if any model got trained."""
          return any(value is not None for value in attribute_vocabularies.values())

+     @staticmethod
+     def convert_vocab(
+         vocab: Dict[str, Union[int, Optional[Dict[str, int]]]], to_int: bool
+     ) -> Dict[str, Union[None, int, np.int64, Dict[str, Union[int, np.int64]]]]:
+         """Converts numpy integers in the vocabulary to Python integers."""
+
+         def convert_value(value: int) -> Union[int, np.int64]:
+             """Helper function to convert a single value based on to_int flag."""
+             return int(value) if to_int else np.int64(value)
+
+         result_dict: Dict[
+             str, Union[None, int, np.int64, Dict[str, Union[int, np.int64]]]
+         ] = {}
+         for key, sub_dict in vocab.items():
+             if isinstance(sub_dict, int):
+                 result_dict[key] = convert_value(sub_dict)
+             elif not sub_dict:
+                 result_dict[key] = None
+             else:
+                 result_dict[key] = {
+                     sub_key: convert_value(value) for sub_key, value in sub_dict.items()
+                 }
+
+         return result_dict
+
      def persist(self) -> None:
          """Persist this model into the passed directory.

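Context for the new `convert_vocab` helper: `CountVectorizer` vocabularies map tokens to numpy integer indices, and the stdlib `json` module refuses to serialize those. A minimal sketch of the problem and the `to_int=True` direction of the conversion (not part of the diff; the toy vocabulary is illustrative):

```python
import json

import numpy as np

# CountVectorizer stores vocabulary indices as numpy integers
vocab = {"hello": np.int64(0), "world": np.int64(1)}

try:
    json.dumps(vocab)
except TypeError as err:
    print(err)  # Object of type int64 is not JSON serializable

# convert_vocab(..., to_int=True) applies int(...) to every value,
# after which the vocabulary serializes cleanly:
print(json.dumps({token: int(idx) for token, idx in vocab.items()}))
```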
@@ -703,17 +730,18 @@ class CountVectorsFeaturizer(SparseFeaturizer, GraphComponent):
          attribute_vocabularies = self._collect_vectorizer_vocabularies()
          if self._is_any_model_trained(attribute_vocabularies):
              # Definitely need to persist some vocabularies
-             featurizer_file = model_dir / "vocabularies.pkl"
+             featurizer_file = model_dir / "vocabularies.json"

              # Only persist vocabulary from one attribute if `use_shared_vocab`.
              # Can be loaded and distributed to all attributes.
-             vocab = (
+             loaded_vocab = (
                  attribute_vocabularies[TEXT]
                  if self.use_shared_vocab
                  else attribute_vocabularies
              )
+             vocab = self.convert_vocab(loaded_vocab, to_int=True)

-             io_utils.json_pickle(featurizer_file, vocab)
+             rasa.shared.utils.io.dump_obj_as_json_to_file(featurizer_file, vocab)

              # Dump OOV words separately as they might have been modified during
              # training
@@ -788,8 +816,9 @@ class CountVectorsFeaturizer(SparseFeaturizer, GraphComponent):
          """Loads trained component (see parent class for full docstring)."""
          try:
              with model_storage.read_from(resource) as model_dir:
-                 featurizer_file = model_dir / "vocabularies.pkl"
-                 vocabulary = io_utils.json_unpickle(featurizer_file)
+                 featurizer_file = model_dir / "vocabularies.json"
+                 vocabulary = rasa.shared.utils.io.read_json_file(featurizer_file)
+                 vocabulary = cls.convert_vocab(vocabulary, to_int=False)

              share_vocabulary = config["use_shared_vocab"]

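Taken together, `persist` and `load` now round-trip the vocabulary through plain JSON instead of `jsonpickle`. A rough stdlib-only equivalent of the new path (the file name mirrors the diff; attribute and vocabulary contents are illustrative):

```python
import json

import numpy as np

vocab = {"text": {"hello": np.int64(0), "world": np.int64(1)}}

# persist: convert_vocab(..., to_int=True) before dumping plain JSON
with open("vocabularies.json", "w") as f:
    json.dump(
        {attr: {t: int(i) for t, i in v.items()} for attr, v in vocab.items()}, f
    )

# load: convert_vocab(..., to_int=False) restores numpy integers afterwards
with open("vocabularies.json") as f:
    loaded = {
        attr: {t: np.int64(i) for t, i in v.items()}
        for attr, v in json.load(f).items()
    }

assert loaded == vocab
```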
rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py CHANGED
@@ -1,9 +1,7 @@
  from __future__ import annotations
+
  import logging
  from collections import OrderedDict
-
- import scipy.sparse
- import numpy as np
  from typing import (
      Any,
      Dict,
@@ -17,30 +15,34 @@ from typing import (
      Union,
  )

+ import numpy as np
+ import scipy.sparse
+
+ import rasa.shared.utils.io
+ import rasa.utils.io
  from rasa.engine.graph import ExecutionContext, GraphComponent
  from rasa.engine.recipes.default_recipe import DefaultV1Recipe
  from rasa.engine.storage.resource import Resource
  from rasa.engine.storage.storage import ModelStorage
+ from rasa.nlu.constants import TOKENS_NAMES
+ from rasa.nlu.featurizers.sparse_featurizer.sparse_featurizer import SparseFeaturizer
  from rasa.nlu.tokenizers.spacy_tokenizer import POS_TAG_KEY, SpacyTokenizer
  from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer
- from rasa.nlu.featurizers.sparse_featurizer.sparse_featurizer import SparseFeaturizer
- from rasa.nlu.constants import TOKENS_NAMES
  from rasa.shared.constants import DOCS_URL_COMPONENTS
- from rasa.shared.nlu.training_data.training_data import TrainingData
- from rasa.shared.nlu.training_data.message import Message
- from rasa.shared.nlu.constants import TEXT
  from rasa.shared.exceptions import InvalidConfigException
- import rasa.shared.utils.io
- import rasa.utils.io
+ from rasa.shared.nlu.constants import TEXT
+ from rasa.shared.nlu.training_data.message import Message
+ from rasa.shared.nlu.training_data.training_data import TrainingData

  logger = logging.getLogger(__name__)

-
  END_OF_SENTENCE = "EOS"
  BEGIN_OF_SENTENCE = "BOS"

  FEATURES = "features"

+ SEPERATOR = "###"
+

  @DefaultV1Recipe.register(
      DefaultV1Recipe.ComponentType.MESSAGE_FEATURIZER, is_trainable=True
@@ -72,7 +74,7 @@ class LexicalSyntacticFeaturizer(SparseFeaturizer, GraphComponent):
      of the token at position `t+1`.
      """

-     FILENAME_FEATURE_TO_IDX_DICT = "feature_to_idx_dict.pkl"
+     FILENAME_FEATURE_TO_IDX_DICT = "feature_to_idx_dict.json"

      # NOTE: "suffix5" of the token "is" will be "is". Hence, when combining multiple
      # prefixes, short words will be represented/encoded repeatedly.
@@ -491,6 +493,32 @@ class LexicalSyntacticFeaturizer(SparseFeaturizer, GraphComponent):
          """Creates a new untrained component (see parent class for full docstring)."""
          return cls(config, model_storage, resource, execution_context)

+     @staticmethod
+     def _restructure_feature_to_idx_dict(
+         loaded_data: Dict[str, Dict[str, int]],
+     ) -> Dict[Tuple[int, str], Dict[str, int]]:
+         """Reconstructs the feature to idx dict.
+
+         When storing the feature_to_idx_dict to disk, we need to convert the
+         tuple (key) into a string to be able to store it via json. When loading
+         the data we need to reconstruct the tuple from the stored string.
+
+         Args:
+             loaded_data: The loaded feature to idx dict from file.
+
+         Returns:
+             The reconstructed feature_to_idx_dict.
+         """
+         feature_to_idx_dict = {}
+         for tuple_string, feature_value in loaded_data.items():
+             # Example of tuple_string: "1###low"
+             index, feature_name = tuple_string.split(SEPERATOR)
+
+             feature_key = (int(index), feature_name)
+             feature_to_idx_dict[feature_key] = feature_value
+
+         return feature_to_idx_dict
+
      @classmethod
      def load(
          cls,
@@ -503,10 +531,13 @@ class LexicalSyntacticFeaturizer(SparseFeaturizer, GraphComponent):
          """Loads trained component (see parent class for full docstring)."""
          try:
              with model_storage.read_from(resource) as model_path:
-                 feature_to_idx_dict = rasa.utils.io.json_unpickle(
+                 loaded_data = rasa.shared.utils.io.read_json_file(
                      model_path / cls.FILENAME_FEATURE_TO_IDX_DICT,
-                     encode_non_string_keys=True,
                  )
+
+                 # convert the key back into tuple
+                 feature_to_idx_dict = cls._restructure_feature_to_idx_dict(loaded_data)
+
              return cls(
                  config=config,
                  model_storage=model_storage,
@@ -531,9 +562,13 @@ class LexicalSyntacticFeaturizer(SparseFeaturizer, GraphComponent):
          if not self._feature_to_idx_dict:
              return None

+         # as we cannot dump tuples, convert the tuple into a string
+         restructured_feature_dict = {
+             f"{k[0]}{SEPERATOR}{k[1]}": v for k, v in self._feature_to_idx_dict.items()
+         }
+
          with self._model_storage.write_to(self._resource) as model_path:
-             rasa.utils.io.json_pickle(
+             rasa.shared.utils.io.dump_obj_as_json_to_file(
                  model_path / self.FILENAME_FEATURE_TO_IDX_DICT,
-                 self._feature_to_idx_dict,
-                 encode_non_string_keys=True,
+                 restructured_feature_dict,
              )
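The `SEPERATOR` constant (the misspelling is in the source) exists because JSON object keys must be strings, while `_feature_to_idx_dict` is keyed by `(position, feature_name)` tuples. A standalone sketch of the round trip that the persist path and `_restructure_feature_to_idx_dict` implement (toy dict for illustration):

```python
SEPERATOR = "###"  # spelled as in the source

feature_to_idx_dict = {(1, "low"): {"true": 0, "false": 1}}

# persist: flatten the tuple key into "1###low" so it survives JSON
serializable = {
    f"{k[0]}{SEPERATOR}{k[1]}": v for k, v in feature_to_idx_dict.items()
}

# load: split the string and rebuild the tuple key
restored = {}
for tuple_string, feature_value in serializable.items():
    index, feature_name = tuple_string.split(SEPERATOR)
    restored[(int(index), feature_name)] = feature_value

assert restored == feature_to_idx_dict
```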
rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py CHANGED
@@ -1,11 +1,13 @@
  from __future__ import annotations
+
  import logging
  import re
  from typing import Any, Dict, List, Optional, Text, Tuple, Type
+
  import numpy as np
  import scipy.sparse
- from rasa.nlu.tokenizers.tokenizer import Tokenizer

+ from rasa.nlu.tokenizers.tokenizer import Tokenizer
  import rasa.shared.utils.io
  import rasa.utils.io
  import rasa.nlu.utils.pattern_utils as pattern_utils
@@ -240,7 +242,7 @@ class RegexFeaturizer(SparseFeaturizer, GraphComponent):

          try:
              with model_storage.read_from(resource) as model_dir:
-                 patterns_file_name = model_dir / "patterns.pkl"
+                 patterns_file_name = model_dir / "patterns.json"
                  known_patterns = rasa.shared.utils.io.read_json_file(patterns_file_name)
          except (ValueError, FileNotFoundError):
              logger.warning(
@@ -258,7 +260,7 @@ class RegexFeaturizer(SparseFeaturizer, GraphComponent):

      def _persist(self) -> None:
          with self._model_storage.write_to(self._resource) as model_dir:
-             regex_file = model_dir / "patterns.pkl"
+             regex_file = model_dir / "patterns.json"
              rasa.shared.utils.io.dump_obj_as_json_to_file(
                  regex_file, self.known_patterns
              )
rasa/shared/nlu/training_data/features.py CHANGED
@@ -1,15 +1,133 @@
  from __future__ import annotations
- from typing import Iterable, Union, Text, Optional, List, Any, Tuple, Dict, Set
+
  import itertools
+ from dataclasses import dataclass
+ from typing import Iterable, Union, Text, Optional, List, Any, Tuple, Dict, Set

  import numpy as np
  import scipy.sparse
+ from safetensors.numpy import save_file, load_file

- import rasa.shared.utils.io
  import rasa.shared.nlu.training_data.util
+ import rasa.shared.utils.io
  from rasa.shared.nlu.constants import FEATURE_TYPE_SEQUENCE, FEATURE_TYPE_SENTENCE


+ @dataclass
+ class FeatureMetadata:
+     data_type: str
+     attribute: str
+     origin: Union[str, List[str]]
+     is_sparse: bool
+     shape: tuple
+     safetensors_key: str
+
+
+ def save_features(
+     features_dict: Dict[Text, List[Features]], file_name: str
+ ) -> Dict[str, Any]:
+     """Save a dictionary of Features lists to disk using safetensors.
+
+     Args:
+         features_dict: Dictionary mapping strings to lists of Features objects
+         file_name: File to save the features to
+
+     Returns:
+         The metadata to reconstruct the features.
+     """
+     # All tensors are stored in a single safetensors file
+     tensors_to_save = {}
+     # Metadata will be stored separately
+     metadata = {}
+
+     for key, features_list in features_dict.items():
+         feature_metadata_list = []
+
+         for idx, feature in enumerate(features_list):
+             # Create a unique key for this tensor in the safetensors file
+             safetensors_key = f"{key}_{idx}"
+
+             # Convert sparse matrices to dense if needed
+             if feature.is_sparse():
+                 # For sparse matrices, use the COO format
+                 coo = feature.features.tocoo()  # type:ignore[union-attr]
+                 # Save data, row indices and col indices separately
+                 tensors_to_save[f"{safetensors_key}_data"] = coo.data
+                 tensors_to_save[f"{safetensors_key}_row"] = coo.row
+                 tensors_to_save[f"{safetensors_key}_col"] = coo.col
+             else:
+                 tensors_to_save[safetensors_key] = feature.features
+
+             # Store metadata
+             metadata_item = FeatureMetadata(
+                 data_type=feature.type,
+                 attribute=feature.attribute,
+                 origin=feature.origin,
+                 is_sparse=feature.is_sparse(),
+                 shape=feature.features.shape,
+                 safetensors_key=safetensors_key,
+             )
+             feature_metadata_list.append(vars(metadata_item))
+
+         metadata[key] = feature_metadata_list
+
+     # Save tensors
+     save_file(tensors_to_save, file_name)
+
+     return metadata
+
+
+ def load_features(
+     filename: str, metadata: Dict[str, Any]
+ ) -> Dict[Text, List[Features]]:
+     """Load Features dictionary from disk.
+
+     Args:
+         filename: File name of the safetensors file.
+         metadata: Metadata to reconstruct the features.
+
+     Returns:
+         Dictionary mapping strings to lists of Features objects
+     """
+     # Load tensors
+     tensors = load_file(filename)
+
+     # Reconstruct the features dictionary
+     features_dict: Dict[Text, List[Features]] = {}
+
+     for key, feature_metadata_list in metadata.items():
+         features_list = []
+
+         for meta in feature_metadata_list:
+             safetensors_key = meta["safetensors_key"]
+
+             if meta["is_sparse"]:
+                 # Reconstruct sparse matrix from COO format
+                 data = tensors[f"{safetensors_key}_data"]
+                 row = tensors[f"{safetensors_key}_row"]
+                 col = tensors[f"{safetensors_key}_col"]
+
+                 features_matrix = scipy.sparse.coo_matrix(
+                     (data, (row, col)), shape=tuple(meta["shape"])
+                 ).tocsr()  # Convert back to CSR format
+             else:
+                 features_matrix = tensors[safetensors_key]
+
+             # Reconstruct Features object
+             features = Features(
+                 features=features_matrix,
+                 feature_type=meta["data_type"],
+                 attribute=meta["attribute"],
+                 origin=meta["origin"],
+             )
+
+             features_list.append(features)
+
+         features_dict[key] = features_list
+
+     return features_dict
+
+
  class Features:
      """Stores the features produced by any featurizer."""

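The `save_features`/`load_features` pair replaces pickled feature caches with safetensors, which only stores flat numpy arrays; sparse matrices are therefore split into their COO `data`/`row`/`col` components, with the shape carried in the metadata dict. A self-contained sketch of that sparse round trip (requires `scipy` and `safetensors`; the key names are illustrative):

```python
import numpy as np
import scipy.sparse
from safetensors.numpy import load_file, save_file

matrix = scipy.sparse.csr_matrix(np.array([[0.0, 1.0], [2.0, 0.0]]))

# save: decompose into COO component arrays, as save_features does
coo = matrix.tocoo()
save_file(
    {"feat_0_data": coo.data, "feat_0_row": coo.row, "feat_0_col": coo.col},
    "features.safetensors",
)

# load: rebuild the matrix; the real code reads the shape from the metadata
tensors = load_file("features.safetensors")
restored = scipy.sparse.coo_matrix(
    (tensors["feat_0_data"], (tensors["feat_0_row"], tensors["feat_0_col"])),
    shape=matrix.shape,
).tocsr()

assert (restored - matrix).nnz == 0  # round trip is lossless
```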
rasa/shared/utils/io.py CHANGED
@@ -10,6 +10,7 @@ from typing import Any, Dict, List, Optional, Text, Type, Union
  import warnings
  import random
  import string
+
  import portalocker

  from rasa.shared.constants import (
rasa/utils/io.py CHANGED
@@ -2,7 +2,6 @@ import asyncio
  import filecmp
  import logging
  import os
- import pickle
  import tempfile
  import warnings
  import re
@@ -98,29 +97,6 @@ def enable_async_loop_debugging(
      return event_loop


- def pickle_dump(filename: Union[Text, Path], obj: Any) -> None:
-     """Saves object to file.
-
-     Args:
-         filename: the filename to save the object to
-         obj: the object to store
-     """
-     with open(filename, "wb") as f:
-         pickle.dump(obj, f)
-
-
- def pickle_load(filename: Union[Text, Path]) -> Any:
-     """Loads an object from a file.
-
-     Args:
-         filename: the filename to load the object from
-
-     Returns: the loaded object
-     """
-     with open(filename, "rb") as f:
-         return pickle.load(f)
-
-
  def create_temporary_file(data: Any, suffix: Text = "", mode: Text = "w+") -> Text:
      """Creates a tempfile.NamedTemporaryFile object for data."""
      encoding = None if "b" in mode else rasa.shared.utils.io.DEFAULT_ENCODING
@@ -191,48 +167,6 @@ def create_validator(
      return FunctionValidator


- def json_unpickle(
-     file_name: Union[Text, Path], encode_non_string_keys: bool = False
- ) -> Any:
-     """Unpickle an object from file using json.
-
-     Args:
-         file_name: the file to load the object from
-         encode_non_string_keys: If set to `True` then jsonpickle will encode non-string
-             dictionary keys instead of coercing them into strings via `repr()`.
-
-     Returns: the object
-     """
-     import jsonpickle.ext.numpy as jsonpickle_numpy
-     import jsonpickle
-
-     jsonpickle_numpy.register_handlers()
-
-     file_content = rasa.shared.utils.io.read_file(file_name)
-     return jsonpickle.loads(file_content, keys=encode_non_string_keys)
-
-
- def json_pickle(
-     file_name: Union[Text, Path], obj: Any, encode_non_string_keys: bool = False
- ) -> None:
-     """Pickle an object to a file using json.
-
-     Args:
-         file_name: the file to store the object to
-         obj: the object to store
-         encode_non_string_keys: If set to `True` then jsonpickle will encode non-string
-             dictionary keys instead of coercing them into strings via `repr()`.
-     """
-     import jsonpickle.ext.numpy as jsonpickle_numpy
-     import jsonpickle
-
-     jsonpickle_numpy.register_handlers()
-
-     rasa.shared.utils.io.write_text_file(
-         jsonpickle.dumps(obj, keys=encode_non_string_keys), file_name
-     )
-
-
  def get_emoji_regex() -> Pattern:
      """Returns regex to identify emojis."""
      return re.compile(
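With `pickle_dump`/`pickle_load` and `json_pickle`/`json_unpickle` removed, every persistence path touched above now goes through the JSON helpers in `rasa.shared.utils.io` (or safetensors for feature arrays). A hypothetical before/after for a caller of the removed helpers (paths and contents are illustrative only):

```python
from pathlib import Path

import rasa.shared.utils.io

model_dir = Path("model")
vocab = {"hello": 0, "world": 1}  # values must already be JSON-safe

# before (removed in this release):
#   rasa.utils.io.json_pickle(model_dir / "vocabularies.pkl", vocab)
#   vocab = rasa.utils.io.json_unpickle(model_dir / "vocabularies.pkl")

# after: plain JSON via the shared io helpers used throughout the diff
rasa.shared.utils.io.dump_obj_as_json_to_file(model_dir / "vocabularies.json", vocab)
vocab = rasa.shared.utils.io.read_json_file(model_dir / "vocabularies.json")
```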