ChatterBot 1.2.2 → 1.2.4 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatterbot/__init__.py +1 -1
- chatterbot/__main__.py +15 -0
- chatterbot/chatterbot.py +52 -6
- chatterbot/comparisons.py +3 -13
- chatterbot/ext/sqlalchemy_app/models.py +2 -2
- chatterbot/logic/specific_response.py +48 -9
- chatterbot/logic/time_adapter.py +3 -7
- chatterbot/logic/unit_conversion.py +4 -3
- chatterbot/storage/__init__.py +2 -0
- chatterbot/storage/redis.py +390 -0
- chatterbot/storage/sql_storage.py +1 -1
- chatterbot/tagging.py +3 -7
- chatterbot/trainers.py +297 -109
- chatterbot/utils.py +16 -25
- chatterbot/vectorstores.py +74 -0
- {chatterbot-1.2.2.dist-info → chatterbot-1.2.4.dist-info}/METADATA +16 -20
- {chatterbot-1.2.2.dist-info → chatterbot-1.2.4.dist-info}/RECORD +20 -18
- {chatterbot-1.2.2.dist-info → chatterbot-1.2.4.dist-info}/WHEEL +1 -1
- {chatterbot-1.2.2.dist-info → chatterbot-1.2.4.dist-info/licenses}/LICENSE +0 -0
- {chatterbot-1.2.2.dist-info → chatterbot-1.2.4.dist-info}/top_level.txt +0 -0
chatterbot/storage/sql_storage.py
CHANGED

```diff
@@ -326,7 +326,7 @@ class SQLStorageAdapter(StorageAdapter):
         record = None

         if hasattr(statement, 'id') and statement.id is not None:
-            record = session.
+            record = session.get(Statement, statement.id)
         else:
             record = session.query(Statement).filter(
                 Statement.text == statement.text,
```
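For context on the storage change: this tracks SQLAlchemy's 2.0-style API, where `Session.get()` performs the primary-key lookup previously handled by the legacy `Query.get()`. A minimal sketch of the new call, assuming the `Statement` model from `chatterbot.ext.sqlalchemy_app.models` and an illustrative database URL:

```python
from sqlalchemy import create_engine
from sqlalchemy.orm import Session

from chatterbot.ext.sqlalchemy_app.models import Statement

engine = create_engine('sqlite:///db.sqlite3')  # illustrative database URL

with Session(engine) as session:
    # 2.0-style primary-key lookup: returns the matching row or None.
    # Replaces the legacy session.query(Statement).get(statement_id) form,
    # which is deprecated in SQLAlchemy 2.0.
    record = session.get(Statement, 1)
```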
chatterbot/tagging.py
CHANGED

```diff
@@ -1,4 +1,5 @@
-from chatterbot import languages
+from chatterbot import languages
+from chatterbot.utils import get_model_for_language
 import spacy


@@ -42,12 +43,7 @@ class PosLemmaTagger(object):

         self.language = language or languages.ENG

-
-            model = constants.DEFAULT_LANGUAGE_TO_SPACY_MODEL_MAP[self.language]
-        except KeyError as e:
-            raise KeyError(
-                f'Spacy model is not available for language {self.language}'
-            ) from e
+        model = get_model_for_language(self.language)

         # Disable the Named Entity Recognition (NER) component because it is not necessary
         self.nlp = spacy.load(model, exclude=['ner'])
```
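Separate from the lookup refactor, note the `exclude=['ner']` argument retained in the last line: spaCy skips loading excluded pipeline components entirely. A small sketch of the effect, using `en_core_web_sm` as an assumed example model:

```python
import spacy

# Excluded components are never loaded, which saves startup time and
# memory; this tagger only needs part-of-speech tags and lemmas.
nlp = spacy.load('en_core_web_sm', exclude=['ner'])

doc = nlp('ChatterBot tags statements before storing them.')
print([(token.text, token.pos_, token.lemma_) for token in doc])
```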
chatterbot/trainers.py
CHANGED

```diff
@@ -25,7 +25,7 @@ class Trainer(object):

         environment_default = bool(int(os.environ.get('CHATTERBOT_SHOW_TRAINING_PROGRESS', True)))

-        self.
+        self.disable_progress = not kwargs.get(
            'show_training_progress',
            environment_default
        )
@@ -54,7 +54,7 @@ class Trainer(object):
     def __init__(self, message=None):
         default = (
             'A training class must be specified before calling train(). '
-            'See https://docs.chatterbot.us/training
+            'See https://docs.chatterbot.us/training/'
         )
         super().__init__(message or default)

@@ -82,7 +82,7 @@ class ListTrainer(Trainer):
     where the list represents a conversation.
     """

-    def train(self, conversation):
+    def train(self, conversation: list):
         """
         Train the chat bot based on the provided list of
         statements that represents a single conversation.
@@ -96,7 +96,7 @@ class ListTrainer(Trainer):
         documents = self.chatbot.tagger.as_nlp_pipeline(conversation)

         # for text in enumerate(conversation):
-        for document in tqdm(documents, desc='List Trainer', disable=
+        for document in tqdm(documents, desc='List Trainer', disable=self.disable_progress):
             statement_search_text = document._.search_index

             statement = self.get_preprocessed_statement(
@@ -135,7 +135,7 @@ class ChatterBotCorpusTrainer(Trainer):
         for corpus, categories, _file_path in tqdm(
             load_corpus(*data_file_paths),
             desc='ChatterBot Corpus Trainer',
-            disable=
+            disable=self.disable_progress
         ):
             statements_to_create = []

```
```diff
@@ -172,32 +172,259 @@ class ChatterBotCorpusTrainer(Trainer):
         self.chatbot.storage.create_many(statements_to_create)


-class
+class GenericFileTrainer(Trainer):
     """
+    Allows the chat bot to be trained using data from a CSV or JSON file,
+    or directory of those file types.
+    """
+
+    def __init__(self, chatbot, **kwargs):
+        """
+        data_path: str The path to the data file or directory.
+        field_map: dict A dictionary containing the column name to header mapping.
+        """
+        super().__init__(chatbot, **kwargs)
+
+        self.file_extension = None
+
+        # NOTE: If the key is an integer, this be the
+        # column index instead of the key or header
+        DEFAULT_STATEMENT_TO_HEADER_MAPPING = {
+            'text': 'text',
+            'conversation': 'conversation',
+            'created_at': 'created_at',
+            'persona': 'persona',
+            'tags': 'tags'
+        }
+
+        self.field_map = kwargs.get(
+            'field_map',
+            DEFAULT_STATEMENT_TO_HEADER_MAPPING
+        )
+
+    def _get_file_list(self, data_path, limit):
+        """
+        Get a list of files to read from the data set.
+        """
+
+        if self.file_extension is None:
+            raise self.TrainerInitializationException(
+                'The file_extension attribute must be set before calling train().'
+            )
+
+        # List all csv or json files in the specified directory
+        if os.path.isdir(data_path):
+            glob_path = os.path.join(data_path, '**', f'*.{self.file_extension}')
+
+            # Use iglob instead of glob for better performance with
+            # large directories because it returns an iterator
+            data_files = glob.iglob(glob_path, recursive=True)
+
+            for index, file_path in enumerate(data_files):
+                if limit is not None and index >= limit:
+                    break
+
+                yield file_path
+        else:
+            return [data_path]
+
+    def train(self, data_path: str, limit=None):
+        """
+        Train a chatbot with data from the data file.
+
+        :param str data_path: The path to the data file or directory.
+        :param int limit: The maximum number of files to train from.
+        """
+
+        if data_path is None:
+            raise self.TrainerInitializationException(
+                'The data_path argument must be set to the path of a file or directory.'
+            )
+
+        data_files = self._get_file_list(data_path, limit)
+
+        files_processed = 0
+
+        for data_file in tqdm(data_files, desc='Training', disable=self.disable_progress):
+
+            previous_statement_text = None
+            previous_statement_search_text = ''
+
+            file_extension = data_file.split('.')[-1].lower()
+
+            statements_to_create = []
+
+            with open(data_file, 'r', encoding='utf-8') as file:
+
+                if self.file_extension == 'json':
+                    data = json.load(file)
+                    data = data['conversation']
+                elif file_extension == 'csv':
+                    use_header = bool(isinstance(next(iter(self.field_map.values())), str))
+
+                    if use_header:
+                        data = csv.DictReader(file)
+                    else:
+                        data = csv.reader(file)
+                elif file_extension == 'tsv':
+                    use_header = bool(isinstance(next(iter(self.field_map.values())), str))
+
+                    if use_header:
+                        data = csv.DictReader(file, delimiter='\t')
+                    else:
+                        data = csv.reader(file, delimiter='\t')
+                else:
+                    self.logger.warning(f'Skipping unsupported file type: {file_extension}')
+                    continue
+
+                files_processed += 1
+
+                text_row = self.field_map['text']
+
+                documents = self.chatbot.tagger.as_nlp_pipeline([
+                    (
+                        row[text_row],
+                        {
+                            # Include any defined metadata columns
+                            key: row[value]
+                            for key, value in self.field_map.items()
+                            if key != text_row
+                        }
+                    ) for row in data if len(row) > 0
+                ])
+
+                for document, context in documents:
+                    statement = Statement(
+                        text=document.text,
+                        conversation=context.get('conversation', 'training'),
+                        persona=context.get('persona', None),
+                        tags=context.get('tags', [])
+                    )
+
+                    if 'created_at' in context:
+                        statement.created_at = date_parser.parse(context['created_at'])
+
+                    statement.search_text = document._.search_index
+                    statement.search_in_response_to = previous_statement_search_text
+
+                    # Use the in_response_to attribute for the previous statement if
+                    # one is defined, otherwise use the last statement which was created
+                    if 'in_response_to' in self.field_map.keys():
+                        statement.in_response_to = context.get(self.field_map['in_response_to'], None)
+                    else:
+                        statement.in_response_to = previous_statement_text
+
+                    for preprocessor in self.chatbot.preprocessors:
+                        statement = preprocessor(statement)
+
+                    previous_statement_text = statement.text
+                    previous_statement_search_text = statement.search_text
+
+                    statements_to_create.append(statement)
+
+            self.chatbot.storage.create_many(statements_to_create)
+
+        if files_processed:
+            self.chatbot.logger.info(
+                'Training completed. {} files were read.'.format(files_processed)
+            )
+        else:
+            self.chatbot.logger.warning(
+                'No [{}] files were detected at: {}'.format(
+                    self.file_extension,
+                    data_path
+                )
+            )
+
+
+class CsvFileTrainer(GenericFileTrainer):
+    """
+    .. note::
+        Added in version 1.2.4
+
+    Allow chatbots to be trained with data from a CSV file or
+    directory of CSV files.
+
+    TSV files are also supported, as long as the file_extension
+    parameter is set to 'tsv'.
+
+    :param str file_extension: The file extension to look for when searching for files (defaults to 'csv').
+    :param str field_map: A dictionary containing the database column name to header mapping.
+        Values can be either the header name (str) or the column index (int).
+    """
+
+    def __init__(self, chatbot, **kwargs):
+        super().__init__(chatbot, **kwargs)
+
+        self.file_extension = kwargs.get('file_extension', 'csv')
+
+
+class JsonFileTrainer(GenericFileTrainer):
+    """
+    .. note::
+        Added in version 1.2.4
+
+    Allow chatbots to be trained with data from a JSON file or
+    directory of JSON files.
+
+    :param str field_map: A dictionary containing the database column name to header mapping.
+    """
+
+    def __init__(self, chatbot, **kwargs):
+        super().__init__(chatbot, **kwargs)
+
+        self.file_extension = 'json'
+
+        DEFAULT_STATEMENT_TO_KEY_MAPPING = {
+            'text': 'text',
+            'conversation': 'conversation',
+            'created_at': 'created_at',
+            'in_response_to': 'in_response_to',
+            'persona': 'persona',
+            'tags': 'tags'
+        }
+
+        self.field_map = kwargs.get(
+            'field_map',
+            DEFAULT_STATEMENT_TO_KEY_MAPPING
+        )
+
+
+class UbuntuCorpusTrainer(CsvFileTrainer):
+    """
+    .. note::
+        PENDING DEPRECATION: Please use the ``CsvFileTrainer`` for data formats similar to this one.
+
     Allow chatbots to be trained with the data from the Ubuntu Dialog Corpus.

     For more information about the Ubuntu Dialog Corpus visit:
     https://dataset.cs.mcgill.ca/ubuntu-corpus-1.0/
+
+    :param str ubuntu_corpus_data_directory: The directory where the Ubuntu corpus data is already located, or where it should be downloaded and extracted.
     """

     def __init__(self, chatbot, **kwargs):
         super().__init__(chatbot, **kwargs)
         home_directory = os.path.expanduser('~')

-        self.data_download_url =
-            'ubuntu_corpus_data_download_url',
-            'http://cs.mcgill.ca/~jpineau/datasets/ubuntu-corpus-1.0/ubuntu_dialogs.tgz'
-        )
+        self.data_download_url = None

         self.data_directory = kwargs.get(
             'ubuntu_corpus_data_directory',
             os.path.join(home_directory, 'ubuntu_data')
         )

-
+        # Directory containing extracted data
+        self.data_path = os.path.join(
             self.data_directory, 'ubuntu_dialogs'
         )

+        self.field_map = {
+            'text': 3,
+            'created_at': 0,
+            'persona': 1,
+        }
+
     def is_downloaded(self, file_path):
         """
         Check if the data file is already downloaded.
```
```diff
@@ -222,7 +449,6 @@ class UbuntuCorpusTrainer(Trainer):
         """
         Download a file from the given url.
         Show a progress indicator for the download status.
-        Based on: http://stackoverflow.com/a/15645088/1547223
         """
         import requests

```
```diff
@@ -238,7 +464,8 @@ class UbuntuCorpusTrainer(Trainer):
             return file_path

         with open(file_path, 'wb') as open_file:
-
+            if show_status:
+                print('Downloading %s' % url)
             response = requests.get(url, stream=True)
             total_length = response.headers.get('content-length')

```
```diff
@@ -246,136 +473,97 @@ class UbuntuCorpusTrainer(Trainer):
                 # No content length header
                 open_file.write(response.content)
             else:
-
-
-
-
+                for data in tqdm(
+                    response.iter_content(chunk_size=4096),
+                    desc='Downloading',
+                    disable=not show_status
+                ):
                     open_file.write(data)
-                    if show_status:
-                        done = int(50 * download / total_length)
-                        sys.stdout.write('\r[%s%s]' % ('=' * done, ' ' * (50 - done)))
-                        sys.stdout.flush()

-
-
-
-        print('Download location: %s' % file_path)
+        if show_status:
+            print('Download location: %s' % file_path)
         return file_path

     def extract(self, file_path):
         """
         Extract a tar file at the specified file path.
         """
-
-
-        if not os.path.exists(self.extracted_data_directory):
-            os.makedirs(self.extracted_data_directory)
+        if not self.disable_progress:
+            print('Extracting {}'.format(file_path))

-
-
-        for member in members:
-            # This will be the current file being extracted
-            yield member
+        if not os.path.exists(self.data_path):
+            os.makedirs(self.data_path)

-
-        def is_within_directory(directory, target):
+        def is_within_directory(directory, target):

-
-
+            abs_directory = os.path.abspath(directory)
+            abs_target = os.path.abspath(target)

-
+            prefix = os.path.commonprefix([abs_directory, abs_target])

-
+            return prefix == abs_directory

-
+        def safe_extract(tar, path='.', members=None, *, numeric_owner=False):

-
-
-
-
+            for member in tar.getmembers():
+                member_path = os.path.join(path, member.name)
+                if not is_within_directory(path, member_path):
+                    raise Exception('Attempted Path Traversal in Tar File')

-
+            tar.extractall(path, members, numeric_owner=numeric_owner)

-
+        try:
+            with tarfile.open(file_path, 'r') as tar:
+                safe_extract(tar, path=self.data_path, members=tqdm(tar, disable=self.disable_progress))
+        except tarfile.ReadError as e:
+            raise self.TrainerInitializationException(
+                f'The provided data file is not a valid tar file: {file_path}'
+            ) from e

-        self.chatbot.logger.info('File extracted to {}'.format(self.
+        self.chatbot.logger.info('File extracted to {}'.format(self.data_path))

         return True
-
-    def
+
+    def _get_file_list(self, data_path, limit):
         """
-
+        Get a list of files to read from the data set.
         """
+
+        if self.data_download_url is None:
+            raise self.TrainerInitializationException(
+                'The data_download_url attribute must be set before calling train().'
+            )
+
         # Download and extract the Ubuntu dialog corpus if needed
         corpus_download_path = self.download(self.data_download_url)

         # Extract if the directory does not already exist
-        if not self.is_extracted(
+        if not self.is_extracted(data_path):
             self.extract(corpus_download_path)

         extracted_corpus_path = os.path.join(
-
-            '**', '**', '*.tsv'
+            data_path, '**', '**', '*.tsv'
         )

-
-
-
-            yield items[start_index:end_index]
+        # Use iglob instead of glob for better performance with
+        # large directories because it returns an iterator
+        data_files = glob.iglob(extracted_corpus_path)

-
+        for index, file_path in enumerate(data_files):
+            if limit is not None and index >= limit:
+                break

-
-        if limit is not None:
-            file_list = file_list[:limit]
+            yield file_path

-
+    def train(self, data_download_url, limit=None):
+        """
+        :param str data_download_url: The URL to download the Ubuntu dialog corpus from.
+        :param int limit: The maximum number of files to train from.
+        """
+        self.data_download_url = data_download_url

         start_time = time.time()
+        super().train(self.data_path, limit=limit)

-
-
-        statements_from_file = []
-
-        for tsv_file in tqdm(tsv_files, desc=f'Training with batch {batch_number} of {len(file_groups)}'):
-            with open(tsv_file, 'r', encoding='utf-8') as tsv:
-                reader = csv.reader(tsv, delimiter='\t')
-
-                previous_statement_text = None
-                previous_statement_search_text = ''
-
-                documents = self.chatbot.tagger.as_nlp_pipeline([
-                    (
-                        row[3],
-                        {
-                            'persona': row[1],
-                            'created_at': row[0],
-                        }
-                    ) for row in reader if len(row) > 0
-                ])
-
-                for document, context in documents:
-
-                    statement_search_text = document._.search_index
-
-                    statement = Statement(
-                        text=document.text,
-                        in_response_to=previous_statement_text,
-                        conversation='training',
-                        created_at=date_parser.parse(context['created_at']),
-                        persona=context['persona'],
-                        search_text=statement_search_text,
-                        search_in_response_to=previous_statement_search_text
-                    )
-
-                    for preprocessor in self.chatbot.preprocessors:
-                        statement = preprocessor(statement)
-
-                    previous_statement_text = statement.text
-                    previous_statement_search_text = statement_search_text
-
-                    statements_from_file.append(statement)
-
-            self.chatbot.storage.create_many(statements_from_file)
-
-        print('Training took', time.time() - start_time, 'seconds.')
+        if not self.disable_progress:
+            print('Training took', time.time() - start_time, 'seconds.')
```
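Putting the new trainer classes together, usage presumably looks something like the following sketch; the bot name and file paths are placeholders, and the keyword arguments mirror the docstrings in the diff above:

```python
from chatterbot import ChatBot
from chatterbot.trainers import CsvFileTrainer, JsonFileTrainer

chatbot = ChatBot('Example Bot')  # placeholder bot

# CSV training: field_map values may be header names (str) or
# column indexes (int), per the GenericFileTrainer docstring.
csv_trainer = CsvFileTrainer(chatbot, field_map={
    'text': 'text',
    'conversation': 'conversation',
})
csv_trainer.train('./data/conversations.csv')  # placeholder path

# JSON training: each file is expected to hold a top-level
# 'conversation' key, as read by GenericFileTrainer.train().
json_trainer = JsonFileTrainer(chatbot)
json_trainer.train('./data/', limit=10)  # directory input, capped at 10 files
```

Note that `UbuntuCorpusTrainer` now routes through the same code path: `train()` takes the download URL directly, rather than reading it from the old `ubuntu_corpus_data_download_url` constructor keyword.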
chatterbot/utils.py
CHANGED

```diff
@@ -89,30 +89,21 @@ def get_response_time(chatbot, statement='Hello'):
     return time.time() - start_time


-def
+def get_model_for_language(language):
     """
-
-    :param description: Training description
-    :type description: str
-
-    :param iteration_counter: Incremental counter
-    :type iteration_counter: int
-
-    :param total_items: total number items
-    :type total_items: int
-
-    :param progress_bar_length: Progress bar length
-    :type progress_bar_length: int
-
-    :returns: void
-    :rtype: void
-
-    DEPRECTTED: use `tqdm` instead
+    Returns the spacy model for the specified language.
     """
-
-
-
-
-
-
-
+    from chatterbot import constants
+
+    try:
+        model = constants.DEFAULT_LANGUAGE_TO_SPACY_MODEL_MAP[language]
+    except KeyError as e:
+        if hasattr(language, 'ENGLISH_NAME'):
+            language_name = language.ENGLISH_NAME
+        else:
+            language_name = language
+        raise KeyError(
+            f'A corresponding spacy model for "{language_name}" could not be found.'
+        ) from e
+
+    return model
```
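The new helper centralizes the model lookup that tagging.py previously performed inline; a brief usage sketch, with a deliberately invalid value to show the error path:

```python
from chatterbot import languages
from chatterbot.utils import get_model_for_language

# Resolves a language constant to its spaCy model name via
# constants.DEFAULT_LANGUAGE_TO_SPACY_MODEL_MAP.
model = get_model_for_language(languages.ENG)

# Unmapped languages raise a KeyError that names the language
# (using ENGLISH_NAME when the constant provides it).
try:
    get_model_for_language('xx-unmapped')  # placeholder, not a real constant
except KeyError as error:
    print(error)
```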
chatterbot/vectorstores.py
ADDED

```diff
@@ -0,0 +1,74 @@
+"""
+Redis vector store.
+"""
+from __future__ import annotations
+
+from typing import Any, List, Sequence
+
+from langchain_core.documents import Document
+from redisvl.redis.utils import convert_bytes
+from redisvl.query import FilterQuery
+
+from langchain_core.documents import Document
+from langchain_redis.vectorstores import RedisVectorStore as LangChainRedisVectorStore
+
+
+class RedisVectorStore(LangChainRedisVectorStore):
+    """
+    Redis vector store integration.
+    """
+
+    def query_search(
+        self,
+        k=4,
+        filter=None,
+        sort_by=None,
+    ) -> List[Document]:
+        """
+        Return docs based on the provided query.
+
+        k: int, default=4
+            Number of documents to return.
+        filter: str, default=None
+            A filter expression to apply to the query.
+        sort_by: str, default=None
+            A field to sort the results by.
+
+        returns:
+            A list of Documents most matching the query.
+        """
+        from chatterbot import ChatBot
+
+        return_fields = [
+            self.config.content_field
+        ]
+        return_fields += [
+            field.name
+            for field in self._index.schema.fields.values()
+            if field.name
+            not in [self.config.embedding_field, self.config.content_field]
+        ]
+
+        query = FilterQuery(
+            return_fields=return_fields,
+            num_results=k,
+            filter_expression=filter,
+            sort_by=sort_by,
+        )
+
+        try:
+            results = self._index.query(query)
+        except Exception as e:
+            raise ChatBot.ChatBotException(f'Error querying index: {query}') from e
+
+        if results:
+            with self._index.client.pipeline(transaction=False) as pipe:
+                for document in results:
+                    pipe.hgetall(document['id'])
+                full_documents = convert_bytes(pipe.execute())
+        else:
+            full_documents = []
+
+        return self._prepare_docs_full(
+            True, results, full_documents, True
+        )
```