rasa-pro 3.10.8__py3-none-any.whl → 3.10.9.dev1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only.

rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py CHANGED
@@ -1,30 +1,32 @@
 from __future__ import annotations
+
 import logging
 import re
+from typing import Any, Dict, List, Optional, Text, Tuple, Set, Type, Union
+
+import numpy as np
 import scipy.sparse
-from typing import Any, Dict, List, Optional, Text, Tuple, Set, Type
-from rasa.nlu.tokenizers.tokenizer import Tokenizer
+from sklearn.exceptions import NotFittedError
+from sklearn.feature_extraction.text import CountVectorizer
 
 import rasa.shared.utils.io
 from rasa.engine.graph import GraphComponent, ExecutionContext
 from rasa.engine.recipes.default_recipe import DefaultV1Recipe
 from rasa.engine.storage.resource import Resource
 from rasa.engine.storage.storage import ModelStorage
-from rasa.nlu.featurizers.sparse_featurizer.sparse_featurizer import SparseFeaturizer
-from rasa.nlu.utils.spacy_utils import SpacyModel
-from rasa.shared.constants import DOCS_URL_COMPONENTS
-import rasa.utils.io as io_utils
-from sklearn.exceptions import NotFittedError
-from sklearn.feature_extraction.text import CountVectorizer
-from rasa.shared.nlu.training_data.training_data import TrainingData
-from rasa.shared.nlu.training_data.message import Message
-from rasa.shared.exceptions import RasaException, FileIOException
 from rasa.nlu.constants import (
     TOKENS_NAMES,
     MESSAGE_ATTRIBUTES,
     DENSE_FEATURIZABLE_ATTRIBUTES,
 )
+from rasa.nlu.featurizers.sparse_featurizer.sparse_featurizer import SparseFeaturizer
+from rasa.nlu.tokenizers.tokenizer import Tokenizer
+from rasa.nlu.utils.spacy_utils import SpacyModel
+from rasa.shared.constants import DOCS_URL_COMPONENTS
+from rasa.shared.exceptions import RasaException, FileIOException
 from rasa.shared.nlu.constants import TEXT, INTENT, INTENT_RESPONSE_KEY, ACTION_NAME
+from rasa.shared.nlu.training_data.message import Message
+from rasa.shared.nlu.training_data.training_data import TrainingData
 
 BUFFER_SLOTS_PREFIX = "buf_"
 
@@ -688,6 +690,31 @@ class CountVectorsFeaturizer(SparseFeaturizer, GraphComponent):
         """Check if any model got trained."""
         return any(value is not None for value in attribute_vocabularies.values())
 
+    @staticmethod
+    def convert_vocab(
+        vocab: Dict[str, Union[int, Optional[Dict[str, int]]]], to_int: bool
+    ) -> Dict[str, Union[None, int, np.int64, Dict[str, Union[int, np.int64]]]]:
+        """Converts numpy integers in the vocabulary to Python integers."""
+
+        def convert_value(value: int) -> Union[int, np.int64]:
+            """Helper function to convert a single value based on to_int flag."""
+            return int(value) if to_int else np.int64(value)
+
+        result_dict: Dict[
+            str, Union[None, int, np.int64, Dict[str, Union[int, np.int64]]]
+        ] = {}
+        for key, sub_dict in vocab.items():
+            if isinstance(sub_dict, int):
+                result_dict[key] = convert_value(sub_dict)
+            elif not sub_dict:
+                result_dict[key] = None
+            else:
+                result_dict[key] = {
+                    sub_key: convert_value(value) for sub_key, value in sub_dict.items()
+                }
+
+        return result_dict
+
     def persist(self) -> None:
         """Persist this model into the passed directory.
 
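A note on the `convert_vocab` helper added above: scikit-learn's `CountVectorizer` stores vocabulary indices as numpy integers, which the stdlib `json` encoder rejects, so the featurizer now converts values to plain `int` before dumping (`to_int=True`) and back to `np.int64` after loading (`to_int=False`). A minimal standalone sketch of that round trip, using only `json` and `numpy` rather than the Rasa classes; `demo_vocab` is a hypothetical vocabulary in the `{attribute: {token: index}}` layout the featurizer persists:

import json

import numpy as np

demo_vocab = {"text": {"hello": np.int64(0), "world": np.int64(1)}, "intent": None}

try:
    json.dumps(demo_vocab)
except TypeError as error:
    # "Object of type int64 is not JSON serializable"
    print(f"stdlib json rejects numpy integers: {error}")

# Persist side (what convert_vocab with to_int=True achieves).
serializable = {
    attribute: None if not tokens else {t: int(i) for t, i in tokens.items()}
    for attribute, tokens in demo_vocab.items()
}
payload = json.dumps(serializable)  # succeeds now

# Load side (to_int=False): restore numpy integers for the vectorizers.
restored = {
    attribute: None if not tokens else {t: np.int64(i) for t, i in tokens.items()}
    for attribute, tokens in json.loads(payload).items()
}
assert restored["text"]["world"] == np.int64(1)
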
@@ -701,17 +728,18 @@ class CountVectorsFeaturizer(SparseFeaturizer, GraphComponent):
         attribute_vocabularies = self._collect_vectorizer_vocabularies()
         if self._is_any_model_trained(attribute_vocabularies):
             # Definitely need to persist some vocabularies
-            featurizer_file = model_dir / "vocabularies.pkl"
+            featurizer_file = model_dir / "vocabularies.json"
 
             # Only persist vocabulary from one attribute if `use_shared_vocab`.
             # Can be loaded and distributed to all attributes.
-            vocab = (
+            loaded_vocab = (
                 attribute_vocabularies[TEXT]
                 if self.use_shared_vocab
                 else attribute_vocabularies
             )
+            vocab = self.convert_vocab(loaded_vocab, to_int=True)
 
-            io_utils.json_pickle(featurizer_file, vocab)
+            rasa.shared.utils.io.dump_obj_as_json_to_file(featurizer_file, vocab)
 
             # Dump OOV words separately as they might have been modified during
             # training
@@ -786,8 +814,9 @@ class CountVectorsFeaturizer(SparseFeaturizer, GraphComponent):
         """Loads trained component (see parent class for full docstring)."""
         try:
             with model_storage.read_from(resource) as model_dir:
-                featurizer_file = model_dir / "vocabularies.pkl"
-                vocabulary = io_utils.json_unpickle(featurizer_file)
+                featurizer_file = model_dir / "vocabularies.json"
+                vocabulary = rasa.shared.utils.io.read_json_file(featurizer_file)
+                vocabulary = cls.convert_vocab(vocabulary, to_int=False)
 
                 share_vocabulary = config["use_shared_vocab"]
 
rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py CHANGED
@@ -1,9 +1,7 @@
 from __future__ import annotations
+
 import logging
 from collections import OrderedDict
-
-import scipy.sparse
-import numpy as np
 from typing import (
     Any,
     Dict,
@@ -17,30 +15,34 @@ from typing import (
     Union,
 )
 
+import numpy as np
+import scipy.sparse
+
+import rasa.shared.utils.io
+import rasa.utils.io
 from rasa.engine.graph import ExecutionContext, GraphComponent
 from rasa.engine.recipes.default_recipe import DefaultV1Recipe
 from rasa.engine.storage.resource import Resource
 from rasa.engine.storage.storage import ModelStorage
+from rasa.nlu.constants import TOKENS_NAMES
+from rasa.nlu.featurizers.sparse_featurizer.sparse_featurizer import SparseFeaturizer
 from rasa.nlu.tokenizers.spacy_tokenizer import POS_TAG_KEY, SpacyTokenizer
 from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer
-from rasa.nlu.featurizers.sparse_featurizer.sparse_featurizer import SparseFeaturizer
-from rasa.nlu.constants import TOKENS_NAMES
 from rasa.shared.constants import DOCS_URL_COMPONENTS
-from rasa.shared.nlu.training_data.training_data import TrainingData
-from rasa.shared.nlu.training_data.message import Message
-from rasa.shared.nlu.constants import TEXT
 from rasa.shared.exceptions import InvalidConfigException
-import rasa.shared.utils.io
-import rasa.utils.io
+from rasa.shared.nlu.constants import TEXT
+from rasa.shared.nlu.training_data.message import Message
+from rasa.shared.nlu.training_data.training_data import TrainingData
 
 logger = logging.getLogger(__name__)
 
-
 END_OF_SENTENCE = "EOS"
 BEGIN_OF_SENTENCE = "BOS"
 
 FEATURES = "features"
 
+SEPERATOR = "###"
+
 
 @DefaultV1Recipe.register(
     DefaultV1Recipe.ComponentType.MESSAGE_FEATURIZER, is_trainable=True
@@ -72,7 +74,7 @@ class LexicalSyntacticFeaturizer(SparseFeaturizer, GraphComponent):
     of the token at position `t+1`.
     """
 
-    FILENAME_FEATURE_TO_IDX_DICT = "feature_to_idx_dict.pkl"
+    FILENAME_FEATURE_TO_IDX_DICT = "feature_to_idx_dict.json"
 
     # NOTE: "suffix5" of the token "is" will be "is". Hence, when combining multiple
     # prefixes, short words will be represented/encoded repeatedly.
@@ -488,6 +490,32 @@ class LexicalSyntacticFeaturizer(SparseFeaturizer, GraphComponent):
         """Creates a new untrained component (see parent class for full docstring)."""
         return cls(config, model_storage, resource, execution_context)
 
+    @staticmethod
+    def _restructure_feature_to_idx_dict(
+        loaded_data: Dict[str, Dict[str, int]],
+    ) -> Dict[Tuple[int, str], Dict[str, int]]:
+        """Reconstructs the feature to idx dict.
+
+        When storing the feature_to_idx_dict to disk, we need to convert the tuple (key)
+        into a string to be able to store it via json. When loading the data
+        we need to reconstruct the tuple from the stored string.
+
+        Args:
+            loaded_data: The loaded feature to idx dict from file.
+
+        Returns:
+            The reconstructed feature_to_idx_dict
+        """
+        feature_to_idx_dict = {}
+        for tuple_string, feature_value in loaded_data.items():
+            # Example of tuple_string: "1###low"
+            index, feature_name = tuple_string.split(SEPERATOR)
+
+            feature_key = (int(index), feature_name)
+            feature_to_idx_dict[feature_key] = feature_value
+
+        return feature_to_idx_dict
+
     @classmethod
     def load(
         cls,
@@ -500,10 +528,13 @@ class LexicalSyntacticFeaturizer(SparseFeaturizer, GraphComponent):
        """Loads trained component (see parent class for full docstring)."""
         try:
             with model_storage.read_from(resource) as model_path:
-                feature_to_idx_dict = rasa.utils.io.json_unpickle(
+                loaded_data = rasa.shared.utils.io.read_json_file(
                     model_path / cls.FILENAME_FEATURE_TO_IDX_DICT,
-                    encode_non_string_keys=True,
                 )
+
+                # convert the key back into tuple
+                feature_to_idx_dict = cls._restructure_feature_to_idx_dict(loaded_data)
+
             return cls(
                 config=config,
                 model_storage=model_storage,
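The keys of `_feature_to_idx_dict` are `(window index, feature name)` tuples, which JSON object keys cannot represent, so `persist` now joins each tuple into a single string using the `SEPERATOR` constant ("###") and `load` splits it back apart via `_restructure_feature_to_idx_dict`. A standalone sketch of the round trip; the sample dict is hypothetical but follows the featurizer's `{(index, feature_name): {feature_value: column}}` layout:

SEPERATOR = "###"  # mirrors the module-level constant above

feature_to_idx_dict = {(1, "low"): {"True": 0, "False": 1}, (-1, "prefix2"): {"he": 2}}

# Persist side: encode each tuple key as "index###feature_name".
stored = {f"{k[0]}{SEPERATOR}{k[1]}": v for k, v in feature_to_idx_dict.items()}
assert "1###low" in stored  # JSON-safe string keys

# Load side: split on the separator and rebuild the tuple, as
# _restructure_feature_to_idx_dict does.
restored = {}
for tuple_string, feature_value in stored.items():
    index, feature_name = tuple_string.split(SEPERATOR)
    restored[(int(index), feature_name)] = feature_value

assert restored == feature_to_idx_dict
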
@@ -528,9 +559,13 @@ class LexicalSyntacticFeaturizer(SparseFeaturizer, GraphComponent):
         if not self._feature_to_idx_dict:
             return None
 
+        # as we cannot dump tuples, convert the tuple into a string
+        restructured_feature_dict = {
+            f"{k[0]}{SEPERATOR}{k[1]}": v for k, v in self._feature_to_idx_dict.items()
+        }
+
         with self._model_storage.write_to(self._resource) as model_path:
-            rasa.utils.io.json_pickle(
+            rasa.shared.utils.io.dump_obj_as_json_to_file(
                 model_path / self.FILENAME_FEATURE_TO_IDX_DICT,
-                self._feature_to_idx_dict,
-                encode_non_string_keys=True,
+                restructured_feature_dict,
             )
rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py CHANGED
@@ -1,11 +1,13 @@
 from __future__ import annotations
+
 import logging
 import re
 from typing import Any, Dict, List, Optional, Text, Tuple, Type
+
 import numpy as np
 import scipy.sparse
-from rasa.nlu.tokenizers.tokenizer import Tokenizer
 
+from rasa.nlu.tokenizers.tokenizer import Tokenizer
 import rasa.shared.utils.io
 import rasa.utils.io
 import rasa.nlu.utils.pattern_utils as pattern_utils
@@ -240,7 +242,7 @@ class RegexFeaturizer(SparseFeaturizer, GraphComponent):
 
         try:
             with model_storage.read_from(resource) as model_dir:
-                patterns_file_name = model_dir / "patterns.pkl"
+                patterns_file_name = model_dir / "patterns.json"
                 known_patterns = rasa.shared.utils.io.read_json_file(patterns_file_name)
         except (ValueError, FileNotFoundError):
             logger.warning(
@@ -258,7 +260,7 @@ class RegexFeaturizer(SparseFeaturizer, GraphComponent):
 
     def _persist(self) -> None:
         with self._model_storage.write_to(self._resource) as model_dir:
-            regex_file = model_dir / "patterns.pkl"
+            regex_file = model_dir / "patterns.json"
             rasa.shared.utils.io.dump_obj_as_json_to_file(
                 regex_file, self.known_patterns
             )
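For `RegexFeaturizer` only the file name changes: `known_patterns` was already written with `dump_obj_as_json_to_file` and read back with `read_json_file`, so the old `patterns.pkl` contained plain JSON under a misleading extension. A hypothetical illustration of the payload (in the Rasa code base, `known_patterns` is a list of name/pattern dicts):

import json

known_patterns = [
    {"name": "zipcode", "pattern": "[0-9]{5}"},
    {"name": "greet", "pattern": "hey|hello|hi"},
]
# What _persist writes to patterns.json:
print(json.dumps(known_patterns, indent=2))
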
rasa/shared/nlu/training_data/features.py CHANGED
@@ -1,15 +1,133 @@
 from __future__ import annotations
-from typing import Iterable, Union, Text, Optional, List, Any, Tuple, Dict, Set
+
 import itertools
+from dataclasses import dataclass
+from typing import Iterable, Union, Text, Optional, List, Any, Tuple, Dict, Set
 
 import numpy as np
 import scipy.sparse
+from safetensors.numpy import save_file, load_file
 
-import rasa.shared.utils.io
 import rasa.shared.nlu.training_data.util
+import rasa.shared.utils.io
 from rasa.shared.nlu.constants import FEATURE_TYPE_SEQUENCE, FEATURE_TYPE_SENTENCE
 
 
+@dataclass
+class FeatureMetadata:
+    data_type: str
+    attribute: str
+    origin: Union[str, List[str]]
+    is_sparse: bool
+    shape: tuple
+    safetensors_key: str
+
+
+def save_features(
+    features_dict: Dict[Text, List[Features]], file_name: str
+) -> Dict[str, Any]:
+    """Save a dictionary of Features lists to disk using safetensors.
+
+    Args:
+        features_dict: Dictionary mapping strings to lists of Features objects
+        file_name: File to save the features to
+
+    Returns:
+        The metadata to reconstruct the features.
+    """
+    # All tensors are stored in a single safetensors file
+    tensors_to_save = {}
+    # Metadata will be stored separately
+    metadata = {}
+
+    for key, features_list in features_dict.items():
+        feature_metadata_list = []
+
+        for idx, feature in enumerate(features_list):
+            # Create a unique key for this tensor in the safetensors file
+            safetensors_key = f"{key}_{idx}"
+
+            # Convert sparse matrices to dense if needed
+            if feature.is_sparse():
+                # For sparse matrices, use the COO format
+                coo = feature.features.tocoo()  # type:ignore[union-attr]
+                # Save data, row indices and col indices separately
+                tensors_to_save[f"{safetensors_key}_data"] = coo.data
+                tensors_to_save[f"{safetensors_key}_row"] = coo.row
+                tensors_to_save[f"{safetensors_key}_col"] = coo.col
+            else:
+                tensors_to_save[safetensors_key] = feature.features
+
+            # Store metadata
+            metadata_item = FeatureMetadata(
+                data_type=feature.type,
+                attribute=feature.attribute,
+                origin=feature.origin,
+                is_sparse=feature.is_sparse(),
+                shape=feature.features.shape,
+                safetensors_key=safetensors_key,
+            )
+            feature_metadata_list.append(vars(metadata_item))
+
+        metadata[key] = feature_metadata_list
+
+    # Save tensors
+    save_file(tensors_to_save, file_name)
+
+    return metadata
+
+
+def load_features(
+    filename: str, metadata: Dict[str, Any]
+) -> Dict[Text, List[Features]]:
+    """Load Features dictionary from disk.
+
+    Args:
+        filename: File name of the safetensors file.
+        metadata: Metadata to reconstruct the features.
+
+    Returns:
+        Dictionary mapping strings to lists of Features objects
+    """
+    # Load tensors
+    tensors = load_file(filename)
+
+    # Reconstruct the features dictionary
+    features_dict: Dict[Text, List[Features]] = {}
+
+    for key, feature_metadata_list in metadata.items():
+        features_list = []
+
+        for meta in feature_metadata_list:
+            safetensors_key = meta["safetensors_key"]
+
+            if meta["is_sparse"]:
+                # Reconstruct sparse matrix from COO format
+                data = tensors[f"{safetensors_key}_data"]
+                row = tensors[f"{safetensors_key}_row"]
+                col = tensors[f"{safetensors_key}_col"]
+
+                features_matrix = scipy.sparse.coo_matrix(
+                    (data, (row, col)), shape=tuple(meta["shape"])
+                ).tocsr()  # Convert back to CSR format
+            else:
+                features_matrix = tensors[safetensors_key]
+
+            # Reconstruct Features object
+            features = Features(
+                features=features_matrix,
+                feature_type=meta["data_type"],
+                attribute=meta["attribute"],
+                origin=meta["origin"],
+            )
+
+            features_list.append(features)
+
+        features_dict[key] = features_list
+
+    return features_dict
+
+
 class Features:
     """Stores the features produced by any featurizer."""
 
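The new `save_features`/`load_features` pair replaces pickled feature caches: raw arrays go into a single safetensors file, while everything needed to rebuild each `Features` object travels in a JSON-friendly metadata dict, with sparse matrices stored as their COO `data`/`row`/`col` triplets. A usage sketch, assuming rasa-pro and `safetensors` are installed; the import path for the two helpers is inferred from this diff:

import numpy as np
import scipy.sparse

from rasa.shared.nlu.constants import FEATURE_TYPE_SENTENCE, FEATURE_TYPE_SEQUENCE, TEXT
from rasa.shared.nlu.training_data.features import Features, load_features, save_features

features_dict = {
    "example": [
        # One dense and one sparse feature; the last argument is the origin
        # (the name of the component that produced the feature).
        Features(np.ones((1, 10), dtype=np.float32), FEATURE_TYPE_SENTENCE, TEXT, "demo"),
        Features(scipy.sparse.eye(3, format="csr"), FEATURE_TYPE_SEQUENCE, TEXT, "demo"),
    ]
}

# Arrays land in the safetensors file; the returned metadata is plain
# dicts and lists and must be persisted separately, e.g. as JSON.
metadata = save_features(features_dict, "features.safetensors")

restored = load_features("features.safetensors", metadata)
assert restored["example"][1].is_sparse()
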
rasa/shared/utils/io.py CHANGED
@@ -13,6 +13,7 @@ from typing import Any, cast, Callable, Dict, List, Optional, Text, Type, TypeVa
 import warnings
 import random
 import string
+
 import portalocker
 
 from rasa.shared.constants import (
rasa/utils/io.py CHANGED
@@ -2,7 +2,6 @@ import asyncio
 import filecmp
 import logging
 import os
-import pickle
 import tempfile
 import warnings
 import re
@@ -98,29 +97,6 @@ def enable_async_loop_debugging(
     return event_loop
 
 
-def pickle_dump(filename: Union[Text, Path], obj: Any) -> None:
-    """Saves object to file.
-
-    Args:
-        filename: the filename to save the object to
-        obj: the object to store
-    """
-    with open(filename, "wb") as f:
-        pickle.dump(obj, f)
-
-
-def pickle_load(filename: Union[Text, Path]) -> Any:
-    """Loads an object from a file.
-
-    Args:
-        filename: the filename to load the object from
-
-    Returns: the loaded object
-    """
-    with open(filename, "rb") as f:
-        return pickle.load(f)
-
-
 def create_temporary_file(data: Any, suffix: Text = "", mode: Text = "w+") -> Text:
     """Creates a tempfile.NamedTemporaryFile object for data."""
     encoding = None if "b" in mode else rasa.shared.utils.io.DEFAULT_ENCODING
@@ -191,48 +167,6 @@ def create_validator(
     return FunctionValidator
 
 
-def json_unpickle(
-    file_name: Union[Text, Path], encode_non_string_keys: bool = False
-) -> Any:
-    """Unpickle an object from file using json.
-
-    Args:
-        file_name: the file to load the object from
-        encode_non_string_keys: If set to `True` then jsonpickle will encode non-string
-            dictionary keys instead of coercing them into strings via `repr()`.
-
-    Returns: the object
-    """
-    import jsonpickle.ext.numpy as jsonpickle_numpy
-    import jsonpickle
-
-    jsonpickle_numpy.register_handlers()
-
-    file_content = rasa.shared.utils.io.read_file(file_name)
-    return jsonpickle.loads(file_content, keys=encode_non_string_keys)
-
-
-def json_pickle(
-    file_name: Union[Text, Path], obj: Any, encode_non_string_keys: bool = False
-) -> None:
-    """Pickle an object to a file using json.
-
-    Args:
-        file_name: the file to store the object to
-        obj: the object to store
-        encode_non_string_keys: If set to `True` then jsonpickle will encode non-string
-            dictionary keys instead of coercing them into strings via `repr()`.
-    """
-    import jsonpickle.ext.numpy as jsonpickle_numpy
-    import jsonpickle
-
-    jsonpickle_numpy.register_handlers()
-
-    rasa.shared.utils.io.write_text_file(
-        jsonpickle.dumps(obj, keys=encode_non_string_keys), file_name
-    )
-
-
 def get_emoji_regex() -> Pattern:
     """Returns regex to identify emojis."""
     return re.compile(
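`pickle_dump`/`pickle_load` and `json_pickle`/`json_unpickle` are removed without a replacement in this module; as the featurizer changes above show, callers move to the plain-JSON helpers in `rasa.shared.utils.io` and handle non-string keys and numpy values explicitly. A hedged migration sketch for downstream code that used the removed helpers (the file name is hypothetical):

from pathlib import Path

import rasa.shared.utils.io

model_file = Path("vocab_demo.json")

# Before: rasa.utils.io.json_pickle(model_file, obj)
rasa.shared.utils.io.dump_obj_as_json_to_file(model_file, {"hello": 0, "world": 1})

# Before: obj = rasa.utils.io.json_unpickle(model_file)
vocab = rasa.shared.utils.io.read_json_file(model_file)
assert vocab["world"] == 1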