rasa-pro 3.10.8__py3-none-any.whl → 3.10.9.dev1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only.

rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py CHANGED
@@ -1,30 +1,32 @@
 from __future__ import annotations
+
 import logging
 import re
+from typing import Any, Dict, List, Optional, Text, Tuple, Set, Type, Union
+
+import numpy as np
 import scipy.sparse
-from typing import Any, Dict, List, Optional, Text, Tuple, Set, Type
-from rasa.nlu.tokenizers.tokenizer import Tokenizer
+from sklearn.exceptions import NotFittedError
+from sklearn.feature_extraction.text import CountVectorizer
 
 import rasa.shared.utils.io
 from rasa.engine.graph import GraphComponent, ExecutionContext
 from rasa.engine.recipes.default_recipe import DefaultV1Recipe
 from rasa.engine.storage.resource import Resource
 from rasa.engine.storage.storage import ModelStorage
-from rasa.nlu.featurizers.sparse_featurizer.sparse_featurizer import SparseFeaturizer
-from rasa.nlu.utils.spacy_utils import SpacyModel
-from rasa.shared.constants import DOCS_URL_COMPONENTS
-import rasa.utils.io as io_utils
-from sklearn.exceptions import NotFittedError
-from sklearn.feature_extraction.text import CountVectorizer
-from rasa.shared.nlu.training_data.training_data import TrainingData
-from rasa.shared.nlu.training_data.message import Message
-from rasa.shared.exceptions import RasaException, FileIOException
 from rasa.nlu.constants import (
     TOKENS_NAMES,
     MESSAGE_ATTRIBUTES,
     DENSE_FEATURIZABLE_ATTRIBUTES,
 )
+from rasa.nlu.featurizers.sparse_featurizer.sparse_featurizer import SparseFeaturizer
+from rasa.nlu.tokenizers.tokenizer import Tokenizer
+from rasa.nlu.utils.spacy_utils import SpacyModel
+from rasa.shared.constants import DOCS_URL_COMPONENTS
+from rasa.shared.exceptions import RasaException, FileIOException
 from rasa.shared.nlu.constants import TEXT, INTENT, INTENT_RESPONSE_KEY, ACTION_NAME
+from rasa.shared.nlu.training_data.message import Message
+from rasa.shared.nlu.training_data.training_data import TrainingData
 
 BUFFER_SLOTS_PREFIX = "buf_"
 
@@ -688,6 +690,31 @@ class CountVectorsFeaturizer(SparseFeaturizer, GraphComponent):
         """Check if any model got trained."""
         return any(value is not None for value in attribute_vocabularies.values())
 
+    @staticmethod
+    def convert_vocab(
+        vocab: Dict[str, Union[int, Optional[Dict[str, int]]]], to_int: bool
+    ) -> Dict[str, Union[None, int, np.int64, Dict[str, Union[int, np.int64]]]]:
+        """Converts numpy integers in the vocabulary to Python integers."""
+
+        def convert_value(value: int) -> Union[int, np.int64]:
+            """Helper function to convert a single value based on to_int flag."""
+            return int(value) if to_int else np.int64(value)
+
+        result_dict: Dict[
+            str, Union[None, int, np.int64, Dict[str, Union[int, np.int64]]]
+        ] = {}
+        for key, sub_dict in vocab.items():
+            if isinstance(sub_dict, int):
+                result_dict[key] = convert_value(sub_dict)
+            elif not sub_dict:
+                result_dict[key] = None
+            else:
+                result_dict[key] = {
+                    sub_key: convert_value(value) for sub_key, value in sub_dict.items()
+                }
+
+        return result_dict
+
     def persist(self) -> None:
         """Persist this model into the passed directory.
 
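A note on the `convert_vocab` helper added above: scikit-learn's `CountVectorizer` stores vocabulary indices as numpy integers, which the stdlib `json` encoder rejects, so the featurizer now converts values to plain `int` before dumping (`to_int=True`) and back to `np.int64` after loading (`to_int=False`). A minimal standalone sketch of that round trip, using only `json` and `numpy` rather than the Rasa classes; `demo_vocab` is a hypothetical vocabulary in the `{attribute: {token: index}}` layout the featurizer persists:

import json

import numpy as np

demo_vocab = {"text": {"hello": np.int64(0), "world": np.int64(1)}, "intent": None}

try:
    json.dumps(demo_vocab)
except TypeError as error:
    # "Object of type int64 is not JSON serializable"
    print(f"stdlib json rejects numpy integers: {error}")

# Persist side (what convert_vocab with to_int=True achieves).
serializable = {
    attribute: None if not tokens else {t: int(i) for t, i in tokens.items()}
    for attribute, tokens in demo_vocab.items()
}
payload = json.dumps(serializable)  # succeeds now

# Load side (to_int=False): restore numpy integers for the vectorizers.
restored = {
    attribute: None if not tokens else {t: np.int64(i) for t, i in tokens.items()}
    for attribute, tokens in json.loads(payload).items()
}
assert restored["text"]["world"] == np.int64(1)
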
@@ -701,17 +728,18 @@ class CountVectorsFeaturizer(SparseFeaturizer, GraphComponent):
         attribute_vocabularies = self._collect_vectorizer_vocabularies()
         if self._is_any_model_trained(attribute_vocabularies):
             # Definitely need to persist some vocabularies
-            featurizer_file = model_dir / "vocabularies.pkl"
+            featurizer_file = model_dir / "vocabularies.json"
 
             # Only persist vocabulary from one attribute if `use_shared_vocab`.
             # Can be loaded and distributed to all attributes.
-            vocab = (
+            loaded_vocab = (
                 attribute_vocabularies[TEXT]
                 if self.use_shared_vocab
                 else attribute_vocabularies
             )
+            vocab = self.convert_vocab(loaded_vocab, to_int=True)
 
-            io_utils.json_pickle(featurizer_file, vocab)
+            rasa.shared.utils.io.dump_obj_as_json_to_file(featurizer_file, vocab)
 
             # Dump OOV words separately as they might have been modified during
             # training
@@ -786,8 +814,9 @@ class CountVectorsFeaturizer(SparseFeaturizer, GraphComponent):
         """Loads trained component (see parent class for full docstring)."""
         try:
             with model_storage.read_from(resource) as model_dir:
-                featurizer_file = model_dir / "vocabularies.pkl"
-                vocabulary = io_utils.json_unpickle(featurizer_file)
+                featurizer_file = model_dir / "vocabularies.json"
+                vocabulary = rasa.shared.utils.io.read_json_file(featurizer_file)
+                vocabulary = cls.convert_vocab(vocabulary, to_int=False)
 
                 share_vocabulary = config["use_shared_vocab"]
 
rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py CHANGED
@@ -1,9 +1,7 @@
 from __future__ import annotations
+
 import logging
 from collections import OrderedDict
-
-import scipy.sparse
-import numpy as np
 from typing import (
     Any,
     Dict,
@@ -17,30 +15,34 @@ from typing import (
     Union,
 )
 
+import numpy as np
+import scipy.sparse
+
+import rasa.shared.utils.io
+import rasa.utils.io
 from rasa.engine.graph import ExecutionContext, GraphComponent
 from rasa.engine.recipes.default_recipe import DefaultV1Recipe
 from rasa.engine.storage.resource import Resource
 from rasa.engine.storage.storage import ModelStorage
+from rasa.nlu.constants import TOKENS_NAMES
+from rasa.nlu.featurizers.sparse_featurizer.sparse_featurizer import SparseFeaturizer
 from rasa.nlu.tokenizers.spacy_tokenizer import POS_TAG_KEY, SpacyTokenizer
 from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer
-from rasa.nlu.featurizers.sparse_featurizer.sparse_featurizer import SparseFeaturizer
-from rasa.nlu.constants import TOKENS_NAMES
 from rasa.shared.constants import DOCS_URL_COMPONENTS
-from rasa.shared.nlu.training_data.training_data import TrainingData
-from rasa.shared.nlu.training_data.message import Message
-from rasa.shared.nlu.constants import TEXT
 from rasa.shared.exceptions import InvalidConfigException
-import rasa.shared.utils.io
-import rasa.utils.io
+from rasa.shared.nlu.constants import TEXT
+from rasa.shared.nlu.training_data.message import Message
+from rasa.shared.nlu.training_data.training_data import TrainingData
 
 logger = logging.getLogger(__name__)
 
-
 END_OF_SENTENCE = "EOS"
 BEGIN_OF_SENTENCE = "BOS"
 
 FEATURES = "features"
 
+SEPERATOR = "###"
+
 
 @DefaultV1Recipe.register(
     DefaultV1Recipe.ComponentType.MESSAGE_FEATURIZER, is_trainable=True
@@ -72,7 +74,7 @@ class LexicalSyntacticFeaturizer(SparseFeaturizer, GraphComponent):
     of the token at position `t+1`.
     """
 
-    FILENAME_FEATURE_TO_IDX_DICT = "feature_to_idx_dict.pkl"
+    FILENAME_FEATURE_TO_IDX_DICT = "feature_to_idx_dict.json"
 
     # NOTE: "suffix5" of the token "is" will be "is". Hence, when combining multiple
     # prefixes, short words will be represented/encoded repeatedly.
@@ -488,6 +490,32 @@ class LexicalSyntacticFeaturizer(SparseFeaturizer, GraphComponent):
         """Creates a new untrained component (see parent class for full docstring)."""
         return cls(config, model_storage, resource, execution_context)
 
+    @staticmethod
+    def _restructure_feature_to_idx_dict(
+        loaded_data: Dict[str, Dict[str, int]],
+    ) -> Dict[Tuple[int, str], Dict[str, int]]:
+        """Reconstructs the feature to idx dict.
+
+        When storing the feature_to_idx_dict to disk, we need to convert the tuple (key)
+        into a string to be able to store it via json. When loading the data
+        we need to reconstruct the tuple from the stored string.
+
+        Args:
+            loaded_data: The loaded feature to idx dict from file.
+
+        Returns:
+            The reconstructed feature_to_idx_dict
+        """
+        feature_to_idx_dict = {}
+        for tuple_string, feature_value in loaded_data.items():
+            # Example of tuple_string: "1###low"
+            index, feature_name = tuple_string.split(SEPERATOR)
+
+            feature_key = (int(index), feature_name)
+            feature_to_idx_dict[feature_key] = feature_value
+
+        return feature_to_idx_dict
+
     @classmethod
     def load(
         cls,
@@ -500,10 +528,13 @@ class LexicalSyntacticFeaturizer(SparseFeaturizer, GraphComponent):
        """Loads trained component (see parent class for full docstring)."""
         try:
             with model_storage.read_from(resource) as model_path:
-                feature_to_idx_dict = rasa.utils.io.json_unpickle(
+                loaded_data = rasa.shared.utils.io.read_json_file(
                     model_path / cls.FILENAME_FEATURE_TO_IDX_DICT,
-                    encode_non_string_keys=True,
                 )
+
+                # convert the key back into tuple
+                feature_to_idx_dict = cls._restructure_feature_to_idx_dict(loaded_data)
+
             return cls(
                 config=config,
                 model_storage=model_storage,
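The keys of `_feature_to_idx_dict` are `(window index, feature name)` tuples, which JSON object keys cannot represent, so `persist` now joins each tuple into a single string using the `SEPERATOR` constant ("###") and `load` splits it back apart via `_restructure_feature_to_idx_dict`. A standalone sketch of the round trip; the sample dict is hypothetical but follows the featurizer's `{(index, feature_name): {feature_value: column}}` layout:

SEPERATOR = "###"  # mirrors the module-level constant above

feature_to_idx_dict = {(1, "low"): {"True": 0, "False": 1}, (-1, "prefix2"): {"he": 2}}

# Persist side: encode each tuple key as "index###feature_name".
stored = {f"{k[0]}{SEPERATOR}{k[1]}": v for k, v in feature_to_idx_dict.items()}
assert "1###low" in stored  # JSON-safe string keys

# Load side: split on the separator and rebuild the tuple, as
# _restructure_feature_to_idx_dict does.
restored = {}
for tuple_string, feature_value in stored.items():
    index, feature_name = tuple_string.split(SEPERATOR)
    restored[(int(index), feature_name)] = feature_value

assert restored == feature_to_idx_dict
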
@@ -528,9 +559,13 @@ class LexicalSyntacticFeaturizer(SparseFeaturizer, GraphComponent):
         if not self._feature_to_idx_dict:
             return None
 
+        # as we cannot dump tuples, convert the tuple into a string
+        restructured_feature_dict = {
+            f"{k[0]}{SEPERATOR}{k[1]}": v for k, v in self._feature_to_idx_dict.items()
+        }
+
         with self._model_storage.write_to(self._resource) as model_path:
-            rasa.utils.io.json_pickle(
+            rasa.shared.utils.io.dump_obj_as_json_to_file(
                 model_path / self.FILENAME_FEATURE_TO_IDX_DICT,
-                self._feature_to_idx_dict,
-                encode_non_string_keys=True,
+                restructured_feature_dict,
             )
rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py CHANGED
@@ -1,11 +1,13 @@
 from __future__ import annotations
+
 import logging
 import re
 from typing import Any, Dict, List, Optional, Text, Tuple, Type
+
 import numpy as np
 import scipy.sparse
-from rasa.nlu.tokenizers.tokenizer import Tokenizer
 
+from rasa.nlu.tokenizers.tokenizer import Tokenizer
 import rasa.shared.utils.io
 import rasa.utils.io
 import rasa.nlu.utils.pattern_utils as pattern_utils
@@ -240,7 +242,7 @@ class RegexFeaturizer(SparseFeaturizer, GraphComponent):
 
         try:
             with model_storage.read_from(resource) as model_dir:
-                patterns_file_name = model_dir / "patterns.pkl"
+                patterns_file_name = model_dir / "patterns.json"
                 known_patterns = rasa.shared.utils.io.read_json_file(patterns_file_name)
         except (ValueError, FileNotFoundError):
             logger.warning(
@@ -258,7 +260,7 @@ class RegexFeaturizer(SparseFeaturizer, GraphComponent):
 
     def _persist(self) -> None:
         with self._model_storage.write_to(self._resource) as model_dir:
-            regex_file = model_dir / "patterns.pkl"
+            regex_file = model_dir / "patterns.json"
             rasa.shared.utils.io.dump_obj_as_json_to_file(
                 regex_file, self.known_patterns
             )
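For `RegexFeaturizer` only the file name changes: `known_patterns` was already written with `dump_obj_as_json_to_file` and read back with `read_json_file`, so the old `patterns.pkl` contained plain JSON under a misleading extension. A hypothetical illustration of the payload (in the Rasa code base, `known_patterns` is a list of name/pattern dicts):

import json

known_patterns = [
    {"name": "zipcode", "pattern": "[0-9]{5}"},
    {"name": "greet", "pattern": "hey|hello|hi"},
]
# What _persist writes to patterns.json:
print(json.dumps(known_patterns, indent=2))
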
rasa/shared/nlu/training_data/features.py CHANGED
@@ -1,15 +1,133 @@
 from __future__ import annotations
-from typing import Iterable, Union, Text, Optional, List, Any, Tuple, Dict, Set
+
 import itertools
+from dataclasses import dataclass
+from typing import Iterable, Union, Text, Optional, List, Any, Tuple, Dict, Set
 
 import numpy as np
 import scipy.sparse
+from safetensors.numpy import save_file, load_file
 
-import rasa.shared.utils.io
 import rasa.shared.nlu.training_data.util
+import rasa.shared.utils.io
 from rasa.shared.nlu.constants import FEATURE_TYPE_SEQUENCE, FEATURE_TYPE_SENTENCE
 
 
+@dataclass
+class FeatureMetadata:
+    data_type: str
+    attribute: str
+    origin: Union[str, List[str]]
+    is_sparse: bool
+    shape: tuple
+    safetensors_key: str
+
+
+def save_features(
+    features_dict: Dict[Text, List[Features]], file_name: str
+) -> Dict[str, Any]:
+    """Save a dictionary of Features lists to disk using safetensors.
+
+    Args:
+        features_dict: Dictionary mapping strings to lists of Features objects
+        file_name: File to save the features to
+
+    Returns:
+        The metadata to reconstruct the features.
+    """
+    # All tensors are stored in a single safetensors file
+    tensors_to_save = {}
+    # Metadata will be stored separately
+    metadata = {}
+
+    for key, features_list in features_dict.items():
+        feature_metadata_list = []
+
+        for idx, feature in enumerate(features_list):
+            # Create a unique key for this tensor in the safetensors file
+            safetensors_key = f"{key}_{idx}"
+
+            # Convert sparse matrices to dense if needed
+            if feature.is_sparse():
+                # For sparse matrices, use the COO format
+                coo = feature.features.tocoo()  # type:ignore[union-attr]
+                # Save data, row indices and col indices separately
+                tensors_to_save[f"{safetensors_key}_data"] = coo.data
+                tensors_to_save[f"{safetensors_key}_row"] = coo.row
+                tensors_to_save[f"{safetensors_key}_col"] = coo.col
+            else:
+                tensors_to_save[safetensors_key] = feature.features
+
+            # Store metadata
+            metadata_item = FeatureMetadata(
+                data_type=feature.type,
+                attribute=feature.attribute,
+                origin=feature.origin,
+                is_sparse=feature.is_sparse(),
+                shape=feature.features.shape,
+                safetensors_key=safetensors_key,
+            )
+            feature_metadata_list.append(vars(metadata_item))
+
+        metadata[key] = feature_metadata_list
+
+    # Save tensors
+    save_file(tensors_to_save, file_name)
+
+    return metadata
+
+
+def load_features(
+    filename: str, metadata: Dict[str, Any]
+) -> Dict[Text, List[Features]]:
+    """Load Features dictionary from disk.
+
+    Args:
+        filename: File name of the safetensors file.
+        metadata: Metadata to reconstruct the features.
+
+    Returns:
+        Dictionary mapping strings to lists of Features objects
+    """
+    # Load tensors
+    tensors = load_file(filename)
+
+    # Reconstruct the features dictionary
+    features_dict: Dict[Text, List[Features]] = {}
+
+    for key, feature_metadata_list in metadata.items():
+        features_list = []
+
+        for meta in feature_metadata_list:
+            safetensors_key = meta["safetensors_key"]
+
+            if meta["is_sparse"]:
+                # Reconstruct sparse matrix from COO format
+                data = tensors[f"{safetensors_key}_data"]
+                row = tensors[f"{safetensors_key}_row"]
+                col = tensors[f"{safetensors_key}_col"]
+
+                features_matrix = scipy.sparse.coo_matrix(
+                    (data, (row, col)), shape=tuple(meta["shape"])
+                ).tocsr()  # Convert back to CSR format
+            else:
+                features_matrix = tensors[safetensors_key]
+
+            # Reconstruct Features object
+            features = Features(
+                features=features_matrix,
+                feature_type=meta["data_type"],
+                attribute=meta["attribute"],
+                origin=meta["origin"],
+            )
+
+            features_list.append(features)
+
+        features_dict[key] = features_list
+
+    return features_dict
+
+
 class Features:
     """Stores the features produced by any featurizer."""
 
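The new `save_features`/`load_features` pair replaces pickled feature caches: raw arrays go into a single safetensors file, while everything needed to rebuild each `Features` object travels in a JSON-friendly metadata dict, with sparse matrices stored as their COO `data`/`row`/`col` triplets. A usage sketch, assuming rasa-pro and `safetensors` are installed; the import path for the two helpers is inferred from this diff:

import numpy as np
import scipy.sparse

from rasa.shared.nlu.constants import FEATURE_TYPE_SENTENCE, FEATURE_TYPE_SEQUENCE, TEXT
from rasa.shared.nlu.training_data.features import Features, load_features, save_features

features_dict = {
    "example": [
        # One dense and one sparse feature; the last argument is the origin
        # (the name of the component that produced the feature).
        Features(np.ones((1, 10), dtype=np.float32), FEATURE_TYPE_SENTENCE, TEXT, "demo"),
        Features(scipy.sparse.eye(3, format="csr"), FEATURE_TYPE_SEQUENCE, TEXT, "demo"),
    ]
}

# Arrays land in the safetensors file; the returned metadata is plain
# dicts and lists and must be persisted separately, e.g. as JSON.
metadata = save_features(features_dict, "features.safetensors")

restored = load_features("features.safetensors", metadata)
assert restored["example"][1].is_sparse()
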
rasa/shared/utils/io.py CHANGED
@@ -13,6 +13,7 @@ from typing import Any, cast, Callable, Dict, List, Optional, Text, Type, TypeVa
 import warnings
 import random
 import string
+
 import portalocker
 
 from rasa.shared.constants import (
rasa/utils/io.py CHANGED
@@ -2,7 +2,6 @@ import asyncio
 import filecmp
 import logging
 import os
-import pickle
 import tempfile
 import warnings
 import re
@@ -98,29 +97,6 @@ def enable_async_loop_debugging(
     return event_loop
 
 
-def pickle_dump(filename: Union[Text, Path], obj: Any) -> None:
-    """Saves object to file.
-
-    Args:
-        filename: the filename to save the object to
-        obj: the object to store
-    """
-    with open(filename, "wb") as f:
-        pickle.dump(obj, f)
-
-
-def pickle_load(filename: Union[Text, Path]) -> Any:
-    """Loads an object from a file.
-
-    Args:
-        filename: the filename to load the object from
-
-    Returns: the loaded object
-    """
-    with open(filename, "rb") as f:
-        return pickle.load(f)
-
-
 def create_temporary_file(data: Any, suffix: Text = "", mode: Text = "w+") -> Text:
     """Creates a tempfile.NamedTemporaryFile object for data."""
     encoding = None if "b" in mode else rasa.shared.utils.io.DEFAULT_ENCODING
@@ -191,48 +167,6 @@ def create_validator(
     return FunctionValidator
 
 
-def json_unpickle(
-    file_name: Union[Text, Path], encode_non_string_keys: bool = False
-) -> Any:
-    """Unpickle an object from file using json.
-
-    Args:
-        file_name: the file to load the object from
-        encode_non_string_keys: If set to `True` then jsonpickle will encode non-string
-            dictionary keys instead of coercing them into strings via `repr()`.
-
-    Returns: the object
-    """
-    import jsonpickle.ext.numpy as jsonpickle_numpy
-    import jsonpickle
-
-    jsonpickle_numpy.register_handlers()
-
-    file_content = rasa.shared.utils.io.read_file(file_name)
-    return jsonpickle.loads(file_content, keys=encode_non_string_keys)
-
-
-def json_pickle(
-    file_name: Union[Text, Path], obj: Any, encode_non_string_keys: bool = False
-) -> None:
-    """Pickle an object to a file using json.
-
-    Args:
-        file_name: the file to store the object to
-        obj: the object to store
-        encode_non_string_keys: If set to `True` then jsonpickle will encode non-string
-            dictionary keys instead of coercing them into strings via `repr()`.
-    """
-    import jsonpickle.ext.numpy as jsonpickle_numpy
-    import jsonpickle
-
-    jsonpickle_numpy.register_handlers()
-
-    rasa.shared.utils.io.write_text_file(
-        jsonpickle.dumps(obj, keys=encode_non_string_keys), file_name
-    )
-
-
 def get_emoji_regex() -> Pattern:
     """Returns regex to identify emojis."""
     return re.compile(
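`pickle_dump`/`pickle_load` and `json_pickle`/`json_unpickle` are removed without a replacement in this module; as the featurizer changes above show, callers move to the plain-JSON helpers in `rasa.shared.utils.io` and handle non-string keys and numpy values explicitly. A hedged migration sketch for downstream code that used the removed helpers (the file name is hypothetical):

from pathlib import Path

import rasa.shared.utils.io

model_file = Path("vocab_demo.json")

# Before: rasa.utils.io.json_pickle(model_file, obj)
rasa.shared.utils.io.dump_obj_as_json_to_file(model_file, {"hello": 0, "world": 1})

# Before: obj = rasa.utils.io.json_unpickle(model_file)
vocab = rasa.shared.utils.io.read_json_file(model_file)
assert vocab["world"] == 1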