PyPI - rasa-pro - Versions diffs - 3.10.10__py3-none-any.whl → 3.10.12__py3-none-any.whl - Mend

rasa-pro 3.10.10__py3-none-any.whl → 3.10.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of rasa-pro might be problematic. Click here for more details.

Files changed (38) hide show

README.md +17 -396
rasa/cli/arguments/train.py +9 -3
rasa/cli/train.py +40 -2
rasa/cli/utils.py +7 -5
rasa/constants.py +1 -1
rasa/core/featurizers/single_state_featurizer.py +22 -1
rasa/core/featurizers/tracker_featurizers.py +115 -18
rasa/core/policies/ted_policy.py +58 -33
rasa/core/policies/unexpected_intent_policy.py +15 -7
rasa/dialogue_understanding/commands/change_flow_command.py +6 -0
rasa/dialogue_understanding/generator/multi_step/multi_step_llm_command_generator.py +20 -3
rasa/dialogue_understanding/generator/single_step/single_step_llm_command_generator.py +29 -4
rasa/e2e_test/e2e_test_runner.py +2 -2
rasa/engine/storage/local_model_storage.py +41 -12
rasa/model_training.py +10 -3
rasa/nlu/classifiers/diet_classifier.py +38 -25
rasa/nlu/classifiers/logistic_regression_classifier.py +22 -9
rasa/nlu/classifiers/sklearn_intent_classifier.py +37 -16
rasa/nlu/extractors/crf_entity_extractor.py +93 -50
rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +45 -16
rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py +52 -17
rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +5 -3
rasa/nlu/persistor.py +37 -15
rasa/shared/constants.py +4 -1
rasa/shared/importers/importer.py +7 -8
rasa/shared/nlu/training_data/features.py +120 -2
rasa/shared/utils/io.py +1 -0
rasa/utils/io.py +0 -66
rasa/utils/tensorflow/feature_array.py +366 -0
rasa/utils/tensorflow/model_data.py +2 -193
rasa/version.py +1 -1
rasa_pro-3.10.12.dist-info/METADATA +196 -0
{rasa_pro-3.10.10.dist-info → rasa_pro-3.10.12.dist-info}/RECORD +36 -36
rasa/shared/importers/remote_importer.py +0 -196
rasa_pro-3.10.10.dist-info/METADATA +0 -575
{rasa_pro-3.10.10.dist-info → rasa_pro-3.10.12.dist-info}/NOTICE +0 -0
{rasa_pro-3.10.10.dist-info → rasa_pro-3.10.12.dist-info}/WHEEL +0 -0
{rasa_pro-3.10.10.dist-info → rasa_pro-3.10.12.dist-info}/entry_points.txt +0 -0

rasa/nlu/extractors/crf_entity_extractor.py CHANGED Viewed

@@ -4,9 +4,9 @@ from collections import OrderedDict
 from enum import Enum
 import logging
 import typing
+from typing import Any, Dict, List, Optional, Text, Tuple, Callable, Type
 import numpy as np
-from typing import Any, Dict, List, Optional, Text, Tuple, Callable, Type
 import rasa.nlu.utils.bilou_utils as bilou_utils
 import rasa.shared.utils.io
@@ -41,6 +41,9 @@ if typing.TYPE_CHECKING:
     from sklearn_crfsuite import CRF
+CONFIG_FEATURES = "features"
 class CRFToken:
     def __init__(
         self,
@@ -60,6 +63,29 @@ class CRFToken:
         self.entity_role_tag = entity_role_tag
         self.entity_group_tag = entity_group_tag
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            "text": self.text,
+            "pos_tag": self.pos_tag,
+            "pattern": self.pattern,
+            "dense_features": [str(x) for x in list(self.dense_features)],
+            "entity_tag": self.entity_tag,
+            "entity_role_tag": self.entity_role_tag,
+            "entity_group_tag": self.entity_group_tag,
+        }
+    @classmethod
+    def create_from_dict(cls, data: Dict[str, Any]) -> "CRFToken":
+        return cls(
+            data["text"],
+            data["pos_tag"],
+            data["pattern"],
+            np.array([float(x) for x in data["dense_features"]]),
+            data["entity_tag"],
+            data["entity_role_tag"],
+            data["entity_group_tag"],
+        )
 class CRFEntityExtractorOptions(str, Enum):
     """Features that can be used for the 'CRFEntityExtractor'."""
@@ -88,8 +114,6 @@ class CRFEntityExtractorOptions(str, Enum):
 class CRFEntityExtractor(GraphComponent, EntityExtractorMixin):
     """Implements conditional random fields (CRF) to do named entity recognition."""
-    CONFIG_FEATURES = "features"
     function_dict: Dict[Text, Callable[[CRFToken], Any]] = {  # noqa: RUF012
         CRFEntityExtractorOptions.LOW: lambda crf_token: crf_token.text.lower(),
         CRFEntityExtractorOptions.TITLE: lambda crf_token: crf_token.text.istitle(),
@@ -137,7 +161,7 @@ class CRFEntityExtractor(GraphComponent, EntityExtractorMixin):
             # "is the preceding token in title case?"
             # POS features require SpacyTokenizer
             # pattern feature require RegexFeaturizer
-            CRFEntityExtractor.CONFIG_FEATURES: [
+            CONFIG_FEATURES: [
                 [
                     CRFEntityExtractorOptions.LOW,
                     CRFEntityExtractorOptions.TITLE,
@@ -200,7 +224,7 @@ class CRFEntityExtractor(GraphComponent, EntityExtractorMixin):
         )
     def _validate_configuration(self) -> None:
-        if len(self.component_config.get(self.CONFIG_FEATURES, [])) % 2 != 1:
+        if len(self.component_config.get(CONFIG_FEATURES, [])) % 2 != 1:
             raise ValueError(
                 "Need an odd number of crf feature lists to have a center word."
             )
@@ -251,9 +275,11 @@ class CRFEntityExtractor(GraphComponent, EntityExtractorMixin):
         ]
         dataset = [self._convert_to_crf_tokens(example) for example in entity_examples]
-        self._train_model(dataset)
+        self.entity_taggers = self.train_model(
+            dataset, self.component_config, self.crf_order
+        )
-        self.persist()
+        self.persist(dataset)
         return self._resource
@@ -299,7 +325,9 @@ class CRFEntityExtractor(GraphComponent, EntityExtractorMixin):
             if include_tag_features:
                 self._add_tag_to_crf_token(crf_tokens, predictions)
-            features = self._crf_tokens_to_features(crf_tokens, include_tag_features)
+            features = self._crf_tokens_to_features(
+                crf_tokens, self.component_config, include_tag_features
+            )
             predictions[tag_name] = entity_tagger.predict_marginals_single(features)
         # convert predictions into a list of tags and a list of confidences
@@ -389,27 +417,25 @@ class CRFEntityExtractor(GraphComponent, EntityExtractorMixin):
         **kwargs: Any,
     ) -> CRFEntityExtractor:
         """Loads trained component (see parent class for full docstring)."""
-        import joblib
         try:
-            entity_taggers = OrderedDict()
             with model_storage.read_from(resource) as model_dir:
-                # We have to load in the same order as we persisted things as otherwise
-                # the predictions might be off
-                file_names = sorted(model_dir.glob("**/*.pkl"))
-                if not file_names:
-                    logger.debug(
-                        "Failed to load model for 'CRFEntityExtractor'. "
-                        "Maybe you did not provide enough training data and "
-                        "no model was trained."
-                    )
-                    return cls(config, model_storage, resource)
+                dataset = rasa.shared.utils.io.read_json_file(
+                    model_dir / "crf_dataset.json"
+                )
+                crf_order = rasa.shared.utils.io.read_json_file(
+                    model_dir / "crf_order.json"
+                )
-                for file_name in file_names:
-                    name = file_name.stem[1:]
-                    entity_taggers[name] = joblib.load(file_name)
+                dataset = [
+                    [CRFToken.create_from_dict(token_data) for token_data in sub_list]
+                    for sub_list in dataset
+                ]
+                entity_taggers = cls.train_model(dataset, config, crf_order)
-                return cls(config, model_storage, resource, entity_taggers)
+                entity_extractor = cls(config, model_storage, resource, entity_taggers)
+                entity_extractor.crf_order = crf_order
+                return entity_extractor
         except ValueError:
             logger.warning(
                 f"Failed to load {cls.__name__} from model storage. Resource "
@@ -417,23 +443,29 @@ class CRFEntityExtractor(GraphComponent, EntityExtractorMixin):
             )
             return cls(config, model_storage, resource)
-    def persist(self) -> None:
+    def persist(self, dataset: List[List[CRFToken]]) -> None:
         """Persist this model into the passed directory."""
-        import joblib
         with self._model_storage.write_to(self._resource) as model_dir:
-            if self.entity_taggers:
-                for idx, (name, entity_tagger) in enumerate(
-                    self.entity_taggers.items()
-                ):
-                    model_file_name = model_dir / f"{idx}{name}.pkl"
-                    joblib.dump(entity_tagger, model_file_name)
+            data_to_store = [
+                [token.to_dict() for token in sub_list] for sub_list in dataset
+            ]
+            rasa.shared.utils.io.dump_obj_as_json_to_file(
+                model_dir / "crf_dataset.json", data_to_store
+            )
+            rasa.shared.utils.io.dump_obj_as_json_to_file(
+                model_dir / "crf_order.json", self.crf_order
+            )
+    @classmethod
     def _crf_tokens_to_features(
-        self, crf_tokens: List[CRFToken], include_tag_features: bool = False
+        cls,
+        crf_tokens: List[CRFToken],
+        config: Dict[str, Any],
+        include_tag_features: bool = False,
     ) -> List[Dict[Text, Any]]:
         """Convert the list of tokens into discrete features."""
-        configured_features = self.component_config[self.CONFIG_FEATURES]
+        configured_features = config[CONFIG_FEATURES]
         sentence_features = []
         for token_idx in range(len(crf_tokens)):
@@ -444,28 +476,31 @@ class CRFEntityExtractor(GraphComponent, EntityExtractorMixin):
             half_window_size = window_size // 2
             window_range = range(-half_window_size, half_window_size + 1)
-            token_features = self._create_features_for_token(
+            token_features = cls._create_features_for_token(
                 crf_tokens,
                 token_idx,
                 half_window_size,
                 window_range,
                 include_tag_features,
+                config,
             )
             sentence_features.append(token_features)
         return sentence_features
+    @classmethod
     def _create_features_for_token(
-        self,
+        cls,
         crf_tokens: List[CRFToken],
         token_idx: int,
         half_window_size: int,
         window_range: range,
         include_tag_features: bool,
+        config: Dict[str, Any],
     ) -> Dict[Text, Any]:
         """Convert a token into discrete features including words before and after."""
-        configured_features = self.component_config[self.CONFIG_FEATURES]
+        configured_features = config[CONFIG_FEATURES]
         prefixes = [str(i) for i in window_range]
         token_features = {}
@@ -505,13 +540,13 @@ class CRFEntityExtractor(GraphComponent, EntityExtractorMixin):
                         # set in the training data, 'matched' is either 'True' or
                         # 'False' depending on whether the token actually matches the
                         # pattern or not
-                        regex_patterns = self.function_dict[feature](token)
+                        regex_patterns = cls.function_dict[feature](token)
                         for pattern_name, matched in regex_patterns.items():
                             token_features[f"{prefix}:{feature}:{pattern_name}"] = (
                                 matched
                             )
                     else:
-                        value = self.function_dict[feature](token)
+                        value = cls.function_dict[feature](token)
                         token_features[f"{prefix}:{feature}"] = value
         return token_features
@@ -635,38 +670,46 @@ class CRFEntityExtractor(GraphComponent, EntityExtractorMixin):
         return tags
-    def _train_model(self, df_train: List[List[CRFToken]]) -> None:
+    @classmethod
+    def train_model(
+        cls,
+        df_train: List[List[CRFToken]],
+        config: Dict[str, Any],
+        crf_order: List[str],
+    ) -> OrderedDict[str, CRF]:
         """Train the crf tagger based on the training data."""
         import sklearn_crfsuite
-        self.entity_taggers = OrderedDict()
+        entity_taggers = OrderedDict()
-        for tag_name in self.crf_order:
+        for tag_name in crf_order:
             logger.debug(f"Training CRF for '{tag_name}'.")
             # add entity tag features for second level CRFs
             include_tag_features = tag_name != ENTITY_ATTRIBUTE_TYPE
             X_train = (
-                self._crf_tokens_to_features(sentence, include_tag_features)
+                cls._crf_tokens_to_features(sentence, config, include_tag_features)
                 for sentence in df_train
             )
             y_train = (
-                self._crf_tokens_to_tags(sentence, tag_name) for sentence in df_train
+                cls._crf_tokens_to_tags(sentence, tag_name) for sentence in df_train
             )
             entity_tagger = sklearn_crfsuite.CRF(
                 algorithm="lbfgs",
                 # coefficient for L1 penalty
-                c1=self.component_config["L1_c"],
+                c1=config["L1_c"],
                 # coefficient for L2 penalty
-                c2=self.component_config["L2_c"],
+                c2=config["L2_c"],
                 # stop earlier
-                max_iterations=self.component_config["max_iterations"],
+                max_iterations=config["max_iterations"],
                 # include transitions that are possible, but not observed
                 all_possible_transitions=True,
             )
             entity_tagger.fit(X_train, y_train)
-            self.entity_taggers[tag_name] = entity_tagger
+            entity_taggers[tag_name] = entity_tagger
             logger.debug("Training finished.")
+        return entity_taggers

rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py CHANGED Viewed

@@ -1,30 +1,32 @@
 from __future__ import annotations
 import logging
 import re
+from typing import Any, Dict, List, Optional, Text, Tuple, Set, Type, Union
+import numpy as np
 import scipy.sparse
-from typing import Any, Dict, List, Optional, Text, Tuple, Set, Type
-from rasa.nlu.tokenizers.tokenizer import Tokenizer
+from sklearn.exceptions import NotFittedError
+from sklearn.feature_extraction.text import CountVectorizer
 import rasa.shared.utils.io
 from rasa.engine.graph import GraphComponent, ExecutionContext
 from rasa.engine.recipes.default_recipe import DefaultV1Recipe
 from rasa.engine.storage.resource import Resource
 from rasa.engine.storage.storage import ModelStorage
-from rasa.nlu.featurizers.sparse_featurizer.sparse_featurizer import SparseFeaturizer
-from rasa.nlu.utils.spacy_utils import SpacyModel
-from rasa.shared.constants import DOCS_URL_COMPONENTS
-import rasa.utils.io as io_utils
-from sklearn.exceptions import NotFittedError
-from sklearn.feature_extraction.text import CountVectorizer
-from rasa.shared.nlu.training_data.training_data import TrainingData
-from rasa.shared.nlu.training_data.message import Message
-from rasa.shared.exceptions import RasaException, FileIOException
 from rasa.nlu.constants import (
     TOKENS_NAMES,
     MESSAGE_ATTRIBUTES,
     DENSE_FEATURIZABLE_ATTRIBUTES,
 )
+from rasa.nlu.featurizers.sparse_featurizer.sparse_featurizer import SparseFeaturizer
+from rasa.nlu.tokenizers.tokenizer import Tokenizer
+from rasa.nlu.utils.spacy_utils import SpacyModel
+from rasa.shared.constants import DOCS_URL_COMPONENTS
+from rasa.shared.exceptions import RasaException, FileIOException
 from rasa.shared.nlu.constants import TEXT, INTENT, INTENT_RESPONSE_KEY, ACTION_NAME
+from rasa.shared.nlu.training_data.message import Message
+from rasa.shared.nlu.training_data.training_data import TrainingData
 BUFFER_SLOTS_PREFIX = "buf_"
@@ -688,6 +690,31 @@ class CountVectorsFeaturizer(SparseFeaturizer, GraphComponent):
         """Check if any model got trained."""
         return any(value is not None for value in attribute_vocabularies.values())
+    @staticmethod
+    def convert_vocab(
+        vocab: Dict[str, Union[int, Optional[Dict[str, int]]]], to_int: bool
+    ) -> Dict[str, Union[None, int, np.int64, Dict[str, Union[int, np.int64]]]]:
+        """Converts numpy integers in the vocabulary to Python integers."""
+        def convert_value(value: int) -> Union[int, np.int64]:
+            """Helper function to convert a single value based on to_int flag."""
+            return int(value) if to_int else np.int64(value)
+        result_dict: Dict[
+            str, Union[None, int, np.int64, Dict[str, Union[int, np.int64]]]
+        ] = {}
+        for key, sub_dict in vocab.items():
+            if isinstance(sub_dict, int):
+                result_dict[key] = convert_value(sub_dict)
+            elif not sub_dict:
+                result_dict[key] = None
+            else:
+                result_dict[key] = {
+                    sub_key: convert_value(value) for sub_key, value in sub_dict.items()
+                }
+        return result_dict
     def persist(self) -> None:
         """Persist this model into the passed directory.
@@ -701,17 +728,18 @@ class CountVectorsFeaturizer(SparseFeaturizer, GraphComponent):
             attribute_vocabularies = self._collect_vectorizer_vocabularies()
             if self._is_any_model_trained(attribute_vocabularies):
                 # Definitely need to persist some vocabularies
-                featurizer_file = model_dir / "vocabularies.pkl"
+                featurizer_file = model_dir / "vocabularies.json"
                 # Only persist vocabulary from one attribute if `use_shared_vocab`.
                 # Can be loaded and distributed to all attributes.
-                vocab = (
+                loaded_vocab = (
                     attribute_vocabularies[TEXT]
                     if self.use_shared_vocab
                     else attribute_vocabularies
                 )
+                vocab = self.convert_vocab(loaded_vocab, to_int=True)
-                io_utils.json_pickle(featurizer_file, vocab)
+                rasa.shared.utils.io.dump_obj_as_json_to_file(featurizer_file, vocab)
                 # Dump OOV words separately as they might have been modified during
                 # training
@@ -786,8 +814,9 @@ class CountVectorsFeaturizer(SparseFeaturizer, GraphComponent):
         """Loads trained component (see parent class for full docstring)."""
         try:
             with model_storage.read_from(resource) as model_dir:
-                featurizer_file = model_dir / "vocabularies.pkl"
-                vocabulary = io_utils.json_unpickle(featurizer_file)
+                featurizer_file = model_dir / "vocabularies.json"
+                vocabulary = rasa.shared.utils.io.read_json_file(featurizer_file)
+                vocabulary = cls.convert_vocab(vocabulary, to_int=False)
                 share_vocabulary = config["use_shared_vocab"]

rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py CHANGED Viewed

@@ -1,9 +1,7 @@
 from __future__ import annotations
 import logging
 from collections import OrderedDict
-import scipy.sparse
-import numpy as np
 from typing import (
     Any,
     Dict,
@@ -17,30 +15,34 @@ from typing import (
     Union,
 )
+import numpy as np
+import scipy.sparse
+import rasa.shared.utils.io
+import rasa.utils.io
 from rasa.engine.graph import ExecutionContext, GraphComponent
 from rasa.engine.recipes.default_recipe import DefaultV1Recipe
 from rasa.engine.storage.resource import Resource
 from rasa.engine.storage.storage import ModelStorage
+from rasa.nlu.constants import TOKENS_NAMES
+from rasa.nlu.featurizers.sparse_featurizer.sparse_featurizer import SparseFeaturizer
 from rasa.nlu.tokenizers.spacy_tokenizer import POS_TAG_KEY, SpacyTokenizer
 from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer
-from rasa.nlu.featurizers.sparse_featurizer.sparse_featurizer import SparseFeaturizer
-from rasa.nlu.constants import TOKENS_NAMES
 from rasa.shared.constants import DOCS_URL_COMPONENTS
-from rasa.shared.nlu.training_data.training_data import TrainingData
-from rasa.shared.nlu.training_data.message import Message
-from rasa.shared.nlu.constants import TEXT
 from rasa.shared.exceptions import InvalidConfigException
-import rasa.shared.utils.io
-import rasa.utils.io
+from rasa.shared.nlu.constants import TEXT
+from rasa.shared.nlu.training_data.message import Message
+from rasa.shared.nlu.training_data.training_data import TrainingData
 logger = logging.getLogger(__name__)
 END_OF_SENTENCE = "EOS"
 BEGIN_OF_SENTENCE = "BOS"
 FEATURES = "features"
+SEPERATOR = "###"
 @DefaultV1Recipe.register(
     DefaultV1Recipe.ComponentType.MESSAGE_FEATURIZER, is_trainable=True
@@ -72,7 +74,7 @@ class LexicalSyntacticFeaturizer(SparseFeaturizer, GraphComponent):
       of the token at position `t+1`.
     """
-    FILENAME_FEATURE_TO_IDX_DICT = "feature_to_idx_dict.pkl"
+    FILENAME_FEATURE_TO_IDX_DICT = "feature_to_idx_dict.json"
     # NOTE: "suffix5" of the token "is" will be "is". Hence, when combining multiple
     # prefixes, short words will be represented/encoded repeatedly.
@@ -488,6 +490,32 @@ class LexicalSyntacticFeaturizer(SparseFeaturizer, GraphComponent):
         """Creates a new untrained component (see parent class for full docstring)."""
         return cls(config, model_storage, resource, execution_context)
+    @staticmethod
+    def _restructure_feature_to_idx_dict(
+        loaded_data: Dict[str, Dict[str, int]],
+    ) -> Dict[Tuple[int, str], Dict[str, int]]:
+        """Reconstructs the feature to idx dict.
+        When storing the feature_to_idx_dict to disk, we need to convert the tuple (key)
+        into a string to be able to store it via json. When loading the data
+        we need to reconstruct the tuple from the stored string.
+        Args:
+            loaded_data: The loaded feature to idx dict from file.
+        Returns:
+            The reconstructed feature_to_idx_dict
+        """
+        feature_to_idx_dict = {}
+        for tuple_string, feature_value in loaded_data.items():
+            # Example of tuple_string: "1###low"
+            index, feature_name = tuple_string.split(SEPERATOR)
+            feature_key = (int(index), feature_name)
+            feature_to_idx_dict[feature_key] = feature_value
+        return feature_to_idx_dict
     @classmethod
     def load(
         cls,
@@ -500,10 +528,13 @@ class LexicalSyntacticFeaturizer(SparseFeaturizer, GraphComponent):
         """Loads trained component (see parent class for full docstring)."""
         try:
             with model_storage.read_from(resource) as model_path:
-                feature_to_idx_dict = rasa.utils.io.json_unpickle(
+                loaded_data = rasa.shared.utils.io.read_json_file(
                     model_path / cls.FILENAME_FEATURE_TO_IDX_DICT,
-                    encode_non_string_keys=True,
                 )
+                # convert the key back into tuple
+                feature_to_idx_dict = cls._restructure_feature_to_idx_dict(loaded_data)
                 return cls(
                     config=config,
                     model_storage=model_storage,
@@ -528,9 +559,13 @@ class LexicalSyntacticFeaturizer(SparseFeaturizer, GraphComponent):
         if not self._feature_to_idx_dict:
             return None
+        # as we cannot dump tuples, convert the tuple into a string
+        restructured_feature_dict = {
+            f"{k[0]}{SEPERATOR}{k[1]}": v for k, v in self._feature_to_idx_dict.items()
+        }
         with self._model_storage.write_to(self._resource) as model_path:
-            rasa.utils.io.json_pickle(
+            rasa.shared.utils.io.dump_obj_as_json_to_file(
                 model_path / self.FILENAME_FEATURE_TO_IDX_DICT,
-                self._feature_to_idx_dict,
-                encode_non_string_keys=True,
+                restructured_feature_dict,
             )

rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py CHANGED Viewed

@@ -1,11 +1,13 @@
 from __future__ import annotations
 import logging
 import re
 from typing import Any, Dict, List, Optional, Text, Tuple, Type
 import numpy as np
 import scipy.sparse
-from rasa.nlu.tokenizers.tokenizer import Tokenizer
+from rasa.nlu.tokenizers.tokenizer import Tokenizer
 import rasa.shared.utils.io
 import rasa.utils.io
 import rasa.nlu.utils.pattern_utils as pattern_utils
@@ -240,7 +242,7 @@ class RegexFeaturizer(SparseFeaturizer, GraphComponent):
         try:
             with model_storage.read_from(resource) as model_dir:
-                patterns_file_name = model_dir / "patterns.pkl"
+                patterns_file_name = model_dir / "patterns.json"
                 known_patterns = rasa.shared.utils.io.read_json_file(patterns_file_name)
         except (ValueError, FileNotFoundError):
             logger.warning(
@@ -258,7 +260,7 @@ class RegexFeaturizer(SparseFeaturizer, GraphComponent):
     def _persist(self) -> None:
         with self._model_storage.write_to(self._resource) as model_dir:
-            regex_file = model_dir / "patterns.pkl"
+            regex_file = model_dir / "patterns.json"
             rasa.shared.utils.io.dump_obj_as_json_to_file(
                 regex_file, self.known_patterns
             )

rasa/nlu/persistor.py CHANGED Viewed

@@ -82,16 +82,14 @@ def get_persistor(storage: StorageType) -> Optional[Persistor]:
     Currently, `aws`, `gcs`, `azure` and providing module paths are supported remote
     storages.
     """
-    storage = storage.value if isinstance(storage, RemoteStorageType) else storage
-    if storage == RemoteStorageType.AWS.value:
+    if storage == RemoteStorageType.AWS:
         return AWSPersistor(
             os.environ.get(BUCKET_NAME_ENV), os.environ.get(AWS_ENDPOINT_URL_ENV)
         )
-    if storage == RemoteStorageType.GCS.value:
+    if storage == RemoteStorageType.GCS:
         return GCSPersistor(os.environ.get(BUCKET_NAME_ENV))
-    if storage == RemoteStorageType.AZURE.value:
+    if storage == RemoteStorageType.AZURE:
         return AzurePersistor(
             os.environ.get(AZURE_CONTAINER_ENV),
             os.environ.get(AZURE_ACCOUNT_NAME_ENV),
@@ -125,24 +123,36 @@ class Persistor(abc.ABC):
         """Downloads a model that has been persisted to cloud storage.
         Downloaded model will be saved to the `target_path`.
-        If `target_path` is a directory, the model will be saved to that directory.
-        If `target_path` is a file, the model will be saved to that file.
+        If `target_path` is a directory, the model will be downloaded to that directory.
+        If `target_path` is a file, the model will be downloaded to that file.
         Args:
             model_name: The name of the model to retrieve.
-            target_path: The path to which the model should be saved.
+            target_path: The path to which the model should be downloaded.
         """
         tar_name = model_name
         if not model_name.endswith(MODEL_ARCHIVE_EXTENSION):
             # ensure backward compatibility
             tar_name = self._tar_name(model_name)
-        tar_name = self._create_file_key(tar_name)
-        self._retrieve_tar(tar_name)
-        self._copy(os.path.basename(tar_name), target_path)
+        remote_object_path = self._create_file_key(tar_name)
+        self._retrieve_tar(remote_object_path)
+        target_tar_file_name = os.path.basename(tar_name)
         if os.path.isdir(target_path):
-            return os.path.join(target_path, model_name)
+            target_path = os.path.join(target_path, target_tar_file_name)
+        if not os.path.exists(target_path):
+            structlogger.debug(
+                "persistor.retrieve.copy_model",
+                event_info=f"Copying model '{target_tar_file_name}' to "
+                f"'{target_path}'.",
+            )
+            self._copy(target_tar_file_name, target_path)
+        structlogger.debug(
+            "persistor.retrieve.model_retrieved",
+            event_info=f"Model retrieved and saved to '{target_path}'.",
+        )
         return target_path
     @abc.abstractmethod
@@ -262,6 +272,11 @@ class AWSPersistor(Persistor):
     def _persist_tar(self, file_key: Text, tar_path: Text) -> None:
         """Uploads a model persisted in the `target_dir` to s3."""
+        structlogger.debug(
+            "aws_persistor.persist_tar.uploading_model",
+            event_info=f"Uploading tar archive {file_key} to "
+            f"s3 bucket '{self.bucket_name}'.",
+        )
         with open(tar_path, "rb") as f:
             self.s3.Object(self.bucket_name, file_key).put(Body=f)
@@ -329,7 +344,7 @@ class GCSPersistor(Persistor):
     def _retrieve_tar(self, target_filename: Text) -> None:
         """Downloads a model that has previously been persisted to GCS."""
         blob = self.bucket.blob(target_filename)
-        blob.download_to_filename(target_filename)
+        blob.download_to_filename(os.path.basename(target_filename))
 class AzurePersistor(Persistor):
@@ -370,12 +385,19 @@ class AzurePersistor(Persistor):
     def _persist_tar(self, file_key: Text, tar_path: Text) -> None:
         """Uploads a model persisted in the `target_dir` to Azure."""
         with open(tar_path, "rb") as data:
-            self._container_client().upload_blob(name=file_key, data=data)
+            self._container_client().upload_blob(
+                name=file_key,
+                data=data,
+                # overwrite is set to True to keep in line with
+                # how GCS and AWS APIs work this enables easy
+                # updating of models in the cloud
+                overwrite=True,
+            )
     def _retrieve_tar(self, target_filename: Text) -> None:
         """Downloads a model that has previously been persisted to Azure."""
         blob_client = self._container_client().get_blob_client(target_filename)
-        with open(target_filename, "wb") as blob:
+        with open(os.path.basename(target_filename), "wb") as blob:
             download_stream = blob_client.download_blob()
             blob.write(download_stream.readall())

rasa-pro 3.10.10__py3-none-any.whl → 3.10.12__py3-none-any.whl

Potentially problematic release.

rasa-pro 3.10.10__py3-none-any.whl → 3.10.12__py3-none-any.whl