PyPI - ChatterBot - Versions diffs - 1.2.3__py3-none-any.whl → 1.2.4__py3-none-any.whl - Mend

ChatterBot 1.2.3 (py3-none-any.whl) → 1.2.4 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

chatterbot/__init__.py +1 -1
chatterbot/__main__.py +15 -0
chatterbot/chatterbot.py +52 -6
chatterbot/comparisons.py +3 -13
chatterbot/ext/sqlalchemy_app/models.py +2 -2
chatterbot/logic/specific_response.py +3 -7
chatterbot/logic/time_adapter.py +3 -7
chatterbot/search.py +0 -55
chatterbot/storage/sql_storage.py +1 -1
chatterbot/tagging.py +3 -7
chatterbot/trainers.py +297 -109
chatterbot/utils.py +16 -25
{chatterbot-1.2.3.dist-info → chatterbot-1.2.4.dist-info}/METADATA +8 -18
{chatterbot-1.2.3.dist-info → chatterbot-1.2.4.dist-info}/RECORD +17 -17
{chatterbot-1.2.3.dist-info → chatterbot-1.2.4.dist-info}/WHEEL +1 -1
{chatterbot-1.2.3.dist-info → chatterbot-1.2.4.dist-info/licenses}/LICENSE +0 -0
{chatterbot-1.2.3.dist-info → chatterbot-1.2.4.dist-info}/top_level.txt +0 -0

chatterbot/__init__.py CHANGED Viewed

@@ -4,7 +4,7 @@ ChatterBot is a machine learning, conversational dialog engine.
 from .chatterbot import ChatBot
-__version__ = '1.2.3'
+__version__ = '1.2.4'
 __all__ = (
     'ChatBot',

chatterbot/__main__.py CHANGED Viewed

@@ -1,7 +1,16 @@
+"""
+Example usage for ChatterBot command line arguments:
+python -m chatterbot --help
+"""
 import sys
 def get_chatterbot_version():
+    """
+    Return the version of the current package.
+    """
     from chatterbot import __version__
     return __version__
@@ -10,3 +19,9 @@ def get_chatterbot_version():
 if __name__ == '__main__':
     if '--version' in sys.argv:
         print(get_chatterbot_version())
+    elif '--help' in sys.argv:
+        print('usage: chatterbot [--version, --help]')
+        print('  --version: Print the version of ChatterBot')
+        print('  --help: Print this help message')
+        print()
+        print('Documentation at https://docs.chatterbot.us')

chatterbot/chatterbot.py CHANGED Viewed

@@ -11,11 +11,41 @@ import spacy
 class ChatBot(object):
     """
     A conversational dialog chat bot.
+    :param name: A name is the only required parameter for the ChatBot class.
+    :type name: str
+    :keyword storage_adapter: The dot-notated import path to a storage adapter class.
+                              Defaults to ``"chatterbot.storage.SQLStorageAdapter"``.
+    :type storage_adapter: str
+    :param logic_adapters: A list of dot-notated import paths to each logic adapter the bot uses.
+                           Defaults to ``["chatterbot.logic.BestMatch"]``.
+    :type logic_adapters: list
+    :param tagger: The tagger to use for the chat bot.
+                   Defaults to :class:`~chatterbot.tagging.PosLemmaTagger`
+    :type tagger: object
+    :param tagger_language: The language to use for the tagger.
+                            Defaults to :class:`~chatterbot.languages.ENG`.
+    :type tagger_language: object
+    :param preprocessors: A list of preprocessor functions to use for the chat bot.
+    :type preprocessors: list
+    :param read_only: If True, the chat bot will not save any input it receives, defaults to False.
+    :type read_only: bool
+    :param logger: A ``Logger`` object.
+    :type logger: logging.Logger
     """
     def __init__(self, name, **kwargs):
         self.name = name
+        self.logger = kwargs.get('logger', logging.getLogger(__name__))
         storage_adapter = kwargs.get('storage_adapter', 'chatterbot.storage.SQLStorageAdapter')
         logic_adapters = kwargs.get('logic_adapters', [
@@ -30,11 +60,29 @@ class ChatBot(object):
         self.storage = utils.initialize_class(storage_adapter, **kwargs)
-        Tagger = kwargs.get('tagger', PosLemmaTagger)
+        tagger_language = kwargs.get('tagger_language', languages.ENG)
-        self.tagger = Tagger(language=kwargs.get(
-            'tagger_language', languages.ENG
-        ))
+        try:
+            Tagger = kwargs.get('tagger', PosLemmaTagger)
+            self.tagger = Tagger(language=tagger_language)
+        except IOError as io_error:
+            # Return a more helpful error message if possible
+            if "Can't find model" in str(io_error):
+                model_name = utils.get_model_for_language(tagger_language)
+                if hasattr(tagger_language, 'ENGLISH_NAME'):
+                    language_name = tagger_language.ENGLISH_NAME
+                else:
+                    language_name = tagger_language
+                raise self.ChatBotException(
+                    'Setup error:\n'
+                    f'The Spacy model for "{language_name}" language is missing.\n'
+                    'Please install the model using the command:\n\n'
+                    f'python -m spacy download {model_name}\n\n'
+                    'See https://spacy.io/usage/models for more information about available models.'
+                ) from io_error
+            else:
+                raise io_error
         primary_search_algorithm = IndexedTextSearch(self, **kwargs)
         text_search_algorithm = TextSearch(self, **kwargs)
@@ -63,8 +111,6 @@ class ChatBot(object):
         # NOTE: 'xx' is the language code for a multi-language model
         self.nlp = spacy.blank(self.tagger.language.ISO_639_1)
-        self.logger = kwargs.get('logger', logging.getLogger(__name__))
         # Allow the bot to save input it receives so that it can learn
         self.read_only = kwargs.get('read_only', False)

chatterbot/comparisons.py CHANGED Viewed

@@ -2,7 +2,7 @@
 This module contains various text-comparison algorithms
 designed to compare one statement to another.
 """
-from chatterbot import constants
+from chatterbot.utils import get_model_for_language
 from difflib import SequenceMatcher
 import spacy
@@ -100,12 +100,7 @@ class SpacySimilarity(Comparator):
     def __init__(self, language):
         super().__init__(language)
-        try:
-            model = constants.DEFAULT_LANGUAGE_TO_SPACY_MODEL_MAP[self.language]
-        except KeyError as e:
-            raise KeyError(
-                f'Spacy model is not available for language {self.language}'
-            ) from e
+        model = get_model_for_language(language)
         # Disable the Named Entity Recognition (NER) component because it is not necessary
         self.nlp = spacy.load(model, exclude=['ner'])
@@ -157,12 +152,7 @@ class JaccardSimilarity(Comparator):
     def __init__(self, language):
         super().__init__(language)
-        try:
-            model = constants.DEFAULT_LANGUAGE_TO_SPACY_MODEL_MAP[self.language]
-        except KeyError as e:
-            raise KeyError(
-                f'Spacy model is not available for language {self.language}'
-            ) from e
+        model = get_model_for_language(language)
         # Disable the Named Entity Recognition (NER) component because it is not necessary
         self.nlp = spacy.load(model, exclude=['ner'])

chatterbot/ext/sqlalchemy_app/models.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from sqlalchemy import Table, Column, Integer, String, DateTime, ForeignKey
-from sqlalchemy.orm import relationship
+from sqlalchemy.orm import relationship, declarative_base
 from sqlalchemy.sql import func
-from sqlalchemy.ext.declarative import declared_attr, declarative_base
+from sqlalchemy.ext.declarative import declared_attr
 from chatterbot.conversation import StatementMixin
 from chatterbot import constants

chatterbot/logic/specific_response.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from chatterbot.logic import LogicAdapter
 from chatterbot.conversation import Statement
-from chatterbot import constants, languages
+from chatterbot import languages
+from chatterbot.utils import get_model_for_language
 import spacy
@@ -35,12 +36,7 @@ class SpecificResponseAdapter(LogicAdapter):
         self._output_text = kwargs.get('output_text')
     def _initialize_nlp(self, language):
-        try:
-            model = constants.DEFAULT_LANGUAGE_TO_SPACY_MODEL_MAP[language]
-        except KeyError as e:
-            raise KeyError(
-                f'Spacy model is not available for language {language}'
-            ) from e
+        model = get_model_for_language(language)
         return spacy.load(model)

chatterbot/logic/time_adapter.py CHANGED Viewed

@@ -1,7 +1,8 @@
 from datetime import datetime
-from chatterbot import constants, languages
+from chatterbot import languages
 from chatterbot.logic import LogicAdapter
 from chatterbot.conversation import Statement
+from chatterbot.utils import get_model_for_language
 import spacy
@@ -36,12 +37,7 @@ class TimeLogicAdapter(LogicAdapter):
         language = kwargs.get('language', languages.ENG)
-        try:
-            model = constants.DEFAULT_LANGUAGE_TO_SPACY_MODEL_MAP[language]
-        except KeyError as e:
-            raise KeyError(
-                f'Spacy model is not available for language {language}'
-            ) from e
+        model = get_model_for_language(language)
         self.nlp = spacy.load(model)

chatterbot/search.py CHANGED Viewed

@@ -149,58 +149,3 @@ class TextSearch:
                 ))
                 yield statement
-class VectorSearch:
-    """
-    .. note:: BETA feature: this search method is new and experimental.
-    Search for similar text based on a :term:`vector database`.
-    """
-    name = 'vector_search'
-    def __init__(self, chatbot, **kwargs):
-        from chatterbot.storage import RedisVectorStorageAdapter
-        # Good documentation:
-        # https://python.langchain.com/docs/integrations/vectorstores/redis/
-        #
-        # https://hub.docker.com/r/redis/redis-stack
-        # Mondodb:
-        # > Vector Search is only supported on Atlas Clusters
-        # https://www.mongodb.com/community/forums/t/can-a-local-mongodb-instance-be-used-when-working-with-langchain-mongodbatlasvectorsearch/265356
-        # FAISS:
-        # https://python.langchain.com/docs/integrations/vectorstores/faiss/
-        print("Starting Redis Vector Store")
-        # TODO: look into:
-        # https://python.langchain.com/api_reference/redis/chat_message_history/langchain_redis.chat_message_history.RedisChatMessageHistory.html
-        # The VectorSearch class is only compatible with the RedisVectorStorageAdapter
-        if not isinstance(chatbot.storage, RedisVectorStorageAdapter):
-            raise Exception(
-                'The VectorSearch search method requires the RedisVectorStorageAdapter storage adapter.'
-            )
-    def search(self, input_statement, **additional_parameters):
-        print("Querying Vector Store")
-        # Similarity search with score and filter
-        # NOTE: It looks like `return_all` is needed to return the full document
-        # specifically what we need here is the ID
-        scored_results = self.storage.vector_store.similarity_search_with_score(
-            input_statement.text, k=2, return_all=True
-        )
-        # sort_by="score", filter={"category": "likes"})
-        print("Similarity Search with Score Results:\n")
-        for doc, score in scored_results:
-            print(f"Content: {doc.page_content[:150]}...")
-            print(f"ID: {doc.id}")
-            print(f"Metadata: {doc.metadata}")
-            print(f"Score: {score}")
-            print()

chatterbot/storage/sql_storage.py CHANGED Viewed

@@ -326,7 +326,7 @@ class SQLStorageAdapter(StorageAdapter):
         record = None
         if hasattr(statement, 'id') and statement.id is not None:
-            record = session.query(Statement).get(statement.id)
+            record = session.get(Statement, statement.id)
         else:
             record = session.query(Statement).filter(
                 Statement.text == statement.text,

chatterbot/tagging.py CHANGED Viewed

@@ -1,4 +1,5 @@
-from chatterbot import languages, constants
+from chatterbot import languages
+from chatterbot.utils import get_model_for_language
 import spacy
@@ -42,12 +43,7 @@ class PosLemmaTagger(object):
         self.language = language or languages.ENG
-        try:
-            model = constants.DEFAULT_LANGUAGE_TO_SPACY_MODEL_MAP[self.language]
-        except KeyError as e:
-            raise KeyError(
-                f'Spacy model is not available for language {self.language}'
-            ) from e
+        model = get_model_for_language(self.language)
         # Disable the Named Entity Recognition (NER) component because it is not necessary
         self.nlp = spacy.load(model, exclude=['ner'])

chatterbot/trainers.py CHANGED Viewed

@@ -25,7 +25,7 @@ class Trainer(object):
         environment_default = bool(int(os.environ.get('CHATTERBOT_SHOW_TRAINING_PROGRESS', True)))
-        self.show_training_progress = kwargs.get(
+        self.disable_progress = not kwargs.get(
             'show_training_progress',
             environment_default
         )
@@ -54,7 +54,7 @@ class Trainer(object):
         def __init__(self, message=None):
             default = (
                 'A training class must be specified before calling train(). '
-                'See https://docs.chatterbot.us/training.html'
+                'See https://docs.chatterbot.us/training/'
             )
             super().__init__(message or default)
@@ -82,7 +82,7 @@ class ListTrainer(Trainer):
     where the list represents a conversation.
     """
-    def train(self, conversation):
+    def train(self, conversation: list):
         """
         Train the chat bot based on the provided list of
         statements that represents a single conversation.
@@ -96,7 +96,7 @@ class ListTrainer(Trainer):
         documents = self.chatbot.tagger.as_nlp_pipeline(conversation)
         # for text in enumerate(conversation):
-        for document in tqdm(documents, desc='List Trainer', disable=not self.show_training_progress):
+        for document in tqdm(documents, desc='List Trainer', disable=self.disable_progress):
             statement_search_text = document._.search_index
             statement = self.get_preprocessed_statement(
@@ -135,7 +135,7 @@ class ChatterBotCorpusTrainer(Trainer):
         for corpus, categories, _file_path in tqdm(
             load_corpus(*data_file_paths),
             desc='ChatterBot Corpus Trainer',
-            disable=not self.show_training_progress
+            disable=self.disable_progress
         ):
             statements_to_create = []
@@ -172,32 +172,259 @@ class ChatterBotCorpusTrainer(Trainer):
                 self.chatbot.storage.create_many(statements_to_create)
-class UbuntuCorpusTrainer(Trainer):
+class GenericFileTrainer(Trainer):
     """
+    Allows the chat bot to be trained using data from a CSV or JSON file,
+    or directory of those file types.
+    """
+    def __init__(self, chatbot, **kwargs):
+        """
+        data_path: str The path to the data file or directory.
+        field_map: dict A dictionary containing the column name to header mapping.
+        """
+        super().__init__(chatbot, **kwargs)
+        self.file_extension = None
+        # NOTE: If the key is an integer, this be the
+        # column index instead of the key or header
+        DEFAULT_STATEMENT_TO_HEADER_MAPPING = {
+            'text': 'text',
+            'conversation': 'conversation',
+            'created_at': 'created_at',
+            'persona': 'persona',
+            'tags': 'tags'
+        }
+        self.field_map = kwargs.get(
+            'field_map',
+            DEFAULT_STATEMENT_TO_HEADER_MAPPING
+        )
+    def _get_file_list(self, data_path, limit):
+        """
+        Get a list of files to read from the data set.
+        """
+        if self.file_extension is None:
+            raise self.TrainerInitializationException(
+                'The file_extension attribute must be set before calling train().'
+            )
+        # List all csv or json files in the specified directory
+        if os.path.isdir(data_path):
+            glob_path = os.path.join(data_path, '**', f'*.{self.file_extension}')
+            # Use iglob instead of glob for better performance with
+            # large directories because it returns an iterator
+            data_files = glob.iglob(glob_path, recursive=True)
+            for index, file_path in enumerate(data_files):
+                if limit is not None and index >= limit:
+                    break
+                yield file_path
+        else:
+            return [data_path]
+    def train(self, data_path: str, limit=None):
+        """
+        Train a chatbot with data from the data file.
+        :param str data_path: The path to the data file or directory.
+        :param int limit: The maximum number of files to train from.
+        """
+        if data_path is None:
+            raise self.TrainerInitializationException(
+                'The data_path argument must be set to the path of a file or directory.'
+            )
+        data_files = self._get_file_list(data_path, limit)
+        files_processed = 0
+        for data_file in tqdm(data_files, desc='Training', disable=self.disable_progress):
+            previous_statement_text = None
+            previous_statement_search_text = ''
+            file_extension = data_file.split('.')[-1].lower()
+            statements_to_create = []
+            with open(data_file, 'r', encoding='utf-8') as file:
+                if self.file_extension == 'json':
+                    data = json.load(file)
+                    data = data['conversation']
+                elif file_extension == 'csv':
+                    use_header = bool(isinstance(next(iter(self.field_map.values())), str))
+                    if use_header:
+                        data = csv.DictReader(file)
+                    else:
+                        data = csv.reader(file)
+                elif file_extension == 'tsv':
+                    use_header = bool(isinstance(next(iter(self.field_map.values())), str))
+                    if use_header:
+                        data = csv.DictReader(file, delimiter='\t')
+                    else:
+                        data = csv.reader(file, delimiter='\t')
+                else:
+                    self.logger.warning(f'Skipping unsupported file type: {file_extension}')
+                    continue
+                files_processed += 1
+                text_row = self.field_map['text']
+                documents = self.chatbot.tagger.as_nlp_pipeline([
+                    (
+                        row[text_row],
+                        {
+                            # Include any defined metadata columns
+                            key: row[value]
+                            for key, value in self.field_map.items()
+                            if key != text_row
+                        }
+                    ) for row in data if len(row) > 0
+                ])
+            for document, context in documents:
+                statement = Statement(
+                    text=document.text,
+                    conversation=context.get('conversation', 'training'),
+                    persona=context.get('persona', None),
+                    tags=context.get('tags', [])
+                )
+                if 'created_at' in context:
+                    statement.created_at = date_parser.parse(context['created_at'])
+                statement.search_text = document._.search_index
+                statement.search_in_response_to = previous_statement_search_text
+                # Use the in_response_to attribute for the previous statement if
+                # one is defined, otherwise use the last statement which was created
+                if 'in_response_to' in self.field_map.keys():
+                    statement.in_response_to = context.get(self.field_map['in_response_to'], None)
+                else:
+                    statement.in_response_to = previous_statement_text
+                for preprocessor in self.chatbot.preprocessors:
+                    statement = preprocessor(statement)
+                previous_statement_text = statement.text
+                previous_statement_search_text = statement.search_text
+                statements_to_create.append(statement)
+            self.chatbot.storage.create_many(statements_to_create)
+        if files_processed:
+            self.chatbot.logger.info(
+                'Training completed. {} files were read.'.format(files_processed)
+            )
+        else:
+            self.chatbot.logger.warning(
+                'No [{}] files were detected at: {}'.format(
+                    self.file_extension,
+                    data_path
+                )
+            )
+class CsvFileTrainer(GenericFileTrainer):
+    """
+    .. note::
+        Added in version 1.2.4
+    Allow chatbots to be trained with data from a CSV file or
+    directory of CSV files.
+    TSV files are also supported, as long as the file_extension
+    parameter is set to 'tsv'.
+    :param str file_extension: The file extension to look for when searching for files (defaults to 'csv').
+    :param str field_map: A dictionary containing the database column name to header mapping.
+                          Values can be either the header name (str) or the column index (int).
+    """
+    def __init__(self, chatbot, **kwargs):
+        super().__init__(chatbot, **kwargs)
+        self.file_extension = kwargs.get('file_extension', 'csv')
+class JsonFileTrainer(GenericFileTrainer):
+    """
+    .. note::
+        Added in version 1.2.4
+    Allow chatbots to be trained with data from a JSON file or
+    directory of JSON files.
+    :param str field_map: A dictionary containing the database column name to header mapping.
+    """
+    def __init__(self, chatbot, **kwargs):
+        super().__init__(chatbot, **kwargs)
+        self.file_extension = 'json'
+        DEFAULT_STATEMENT_TO_KEY_MAPPING = {
+            'text': 'text',
+            'conversation': 'conversation',
+            'created_at': 'created_at',
+            'in_response_to': 'in_response_to',
+            'persona': 'persona',
+            'tags': 'tags'
+        }
+        self.field_map = kwargs.get(
+            'field_map',
+            DEFAULT_STATEMENT_TO_KEY_MAPPING
+        )
+class UbuntuCorpusTrainer(CsvFileTrainer):
+    """
+    .. note::
+        PENDING DEPRECATION: Please use the ``CsvFileTrainer`` for data formats similar to this one.
     Allow chatbots to be trained with the data from the Ubuntu Dialog Corpus.
     For more information about the Ubuntu Dialog Corpus visit:
     https://dataset.cs.mcgill.ca/ubuntu-corpus-1.0/
+    :param str ubuntu_corpus_data_directory: The directory where the Ubuntu corpus data is already located, or where it should be downloaded and extracted.
     """
     def __init__(self, chatbot, **kwargs):
         super().__init__(chatbot, **kwargs)
         home_directory = os.path.expanduser('~')
-        self.data_download_url = kwargs.get(
-            'ubuntu_corpus_data_download_url',
-            'http://cs.mcgill.ca/~jpineau/datasets/ubuntu-corpus-1.0/ubuntu_dialogs.tgz'
-        )
+        self.data_download_url = None
         self.data_directory = kwargs.get(
             'ubuntu_corpus_data_directory',
             os.path.join(home_directory, 'ubuntu_data')
         )
-        self.extracted_data_directory = os.path.join(
+        # Directory containing extracted data
+        self.data_path = os.path.join(
             self.data_directory, 'ubuntu_dialogs'
         )
+        self.field_map = {
+            'text': 3,
+            'created_at': 0,
+            'persona': 1,
+        }
     def is_downloaded(self, file_path):
         """
         Check if the data file is already downloaded.
@@ -222,7 +449,6 @@ class UbuntuCorpusTrainer(Trainer):
         """
         Download a file from the given url.
         Show a progress indicator for the download status.
-        Based on: http://stackoverflow.com/a/15645088/1547223
         """
         import requests
@@ -238,7 +464,8 @@ class UbuntuCorpusTrainer(Trainer):
             return file_path
         with open(file_path, 'wb') as open_file:
-            print('Downloading %s' % url)
+            if show_status:
+                print('Downloading %s' % url)
             response = requests.get(url, stream=True)
             total_length = response.headers.get('content-length')
@@ -246,136 +473,97 @@ class UbuntuCorpusTrainer(Trainer):
                 # No content length header
                 open_file.write(response.content)
             else:
-                download = 0
-                total_length = int(total_length)
-                for data in response.iter_content(chunk_size=4096):
-                    download += len(data)
+                for data in tqdm(
+                    response.iter_content(chunk_size=4096),
+                    desc='Downloading',
+                    disable=not show_status
+                ):
                     open_file.write(data)
-                    if show_status:
-                        done = int(50 * download / total_length)
-                        sys.stdout.write('\r[%s%s]' % ('=' * done, ' ' * (50 - done)))
-                        sys.stdout.flush()
-            # Add a new line after the download bar
-            sys.stdout.write('\n')
-        print('Download location: %s' % file_path)
+        if show_status:
+            print('Download location: %s' % file_path)
         return file_path
     def extract(self, file_path):
         """
         Extract a tar file at the specified file path.
         """
-        print('Extracting {}'.format(file_path))
-        if not os.path.exists(self.extracted_data_directory):
-            os.makedirs(self.extracted_data_directory)
+        if not self.disable_progress:
+            print('Extracting {}'.format(file_path))
-        def track_progress(members):
-            sys.stdout.write('.')
-            for member in members:
-                # This will be the current file being extracted
-                yield member
+        if not os.path.exists(self.data_path):
+            os.makedirs(self.data_path)
-        with tarfile.open(file_path) as tar:
-            def is_within_directory(directory, target):
+        def is_within_directory(directory, target):
-                abs_directory = os.path.abspath(directory)
-                abs_target = os.path.abspath(target)
+            abs_directory = os.path.abspath(directory)
+            abs_target = os.path.abspath(target)
-                prefix = os.path.commonprefix([abs_directory, abs_target])
+            prefix = os.path.commonprefix([abs_directory, abs_target])
-                return prefix == abs_directory
+            return prefix == abs_directory
-            def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
+        def safe_extract(tar, path='.', members=None, *, numeric_owner=False):
-                for member in tar.getmembers():
-                    member_path = os.path.join(path, member.name)
-                    if not is_within_directory(path, member_path):
-                        raise Exception("Attempted Path Traversal in Tar File")
+            for member in tar.getmembers():
+                member_path = os.path.join(path, member.name)
+                if not is_within_directory(path, member_path):
+                    raise Exception('Attempted Path Traversal in Tar File')
-                tar.extractall(path, members, numeric_owner=numeric_owner)
+            tar.extractall(path, members, numeric_owner=numeric_owner)
-            safe_extract(tar, path=self.extracted_data_directory, members=track_progress(tar))
+        try:
+            with tarfile.open(file_path, 'r') as tar:
+                safe_extract(tar, path=self.data_path, members=tqdm(tar, disable=self.disable_progress))
+        except tarfile.ReadError as e:
+            raise self.TrainerInitializationException(
+                f'The provided data file is not a valid tar file: {file_path}'
+            ) from e
-        self.chatbot.logger.info('File extracted to {}'.format(self.extracted_data_directory))
+        self.chatbot.logger.info('File extracted to {}'.format(self.data_path))
         return True
-    def train(self, limit=None):
+    def _get_file_list(self, data_path, limit):
         """
-        limit: int If defined, the number of files to read from the data set.
+        Get a list of files to read from the data set.
         """
+        if self.data_download_url is None:
+            raise self.TrainerInitializationException(
+                'The data_download_url attribute must be set before calling train().'
+            )
         # Download and extract the Ubuntu dialog corpus if needed
         corpus_download_path = self.download(self.data_download_url)
         # Extract if the directory does not already exist
-        if not self.is_extracted(self.extracted_data_directory):
+        if not self.is_extracted(data_path):
             self.extract(corpus_download_path)
         extracted_corpus_path = os.path.join(
-            self.extracted_data_directory,
-            '**', '**', '*.tsv'
+            data_path, '**', '**', '*.tsv'
         )
-        def chunks(items, items_per_chunk):
-            for start_index in range(0, len(items), items_per_chunk):
-                end_index = start_index + items_per_chunk
-                yield items[start_index:end_index]
+        # Use iglob instead of glob for better performance with
+        # large directories because it returns an iterator
+        data_files = glob.iglob(extracted_corpus_path)
-        file_list = glob.glob(extracted_corpus_path)
+        for index, file_path in enumerate(data_files):
+            if limit is not None and index >= limit:
+                break
-        # Limit the number of files used if a limit is defined
-        if limit is not None:
-            file_list = file_list[:limit]
+            yield file_path
-        file_groups = tuple(chunks(file_list, 5000))
+    def train(self, data_download_url, limit=None):
+        """
+        :param str data_download_url: The URL to download the Ubuntu dialog corpus from.
+        :param int limit: The maximum number of files to train from.
+        """
+        self.data_download_url = data_download_url
         start_time = time.time()
+        super().train(self.data_path, limit=limit)
-        for batch_number, tsv_files in enumerate(file_groups):
-            statements_from_file = []
-            for tsv_file in tqdm(tsv_files, desc=f'Training with batch {batch_number} of {len(file_groups)}'):
-                with open(tsv_file, 'r', encoding='utf-8') as tsv:
-                    reader = csv.reader(tsv, delimiter='\t')
-                    previous_statement_text = None
-                    previous_statement_search_text = ''
-                    documents = self.chatbot.tagger.as_nlp_pipeline([
-                        (
-                            row[3],
-                            {
-                                'persona': row[1],
-                                'created_at': row[0],
-                            }
-                         ) for row in reader if len(row) > 0
-                    ])
-                    for document, context in documents:
-                        statement_search_text = document._.search_index
-                        statement = Statement(
-                            text=document.text,
-                            in_response_to=previous_statement_text,
-                            conversation='training',
-                            created_at=date_parser.parse(context['created_at']),
-                            persona=context['persona'],
-                            search_text=statement_search_text,
-                            search_in_response_to=previous_statement_search_text
-                        )
-                        for preprocessor in self.chatbot.preprocessors:
-                            statement = preprocessor(statement)
-                        previous_statement_text = statement.text
-                        previous_statement_search_text = statement_search_text
-                        statements_from_file.append(statement)
-            self.chatbot.storage.create_many(statements_from_file)
-        print('Training took', time.time() - start_time, 'seconds.')
+        if not self.disable_progress:
+            print('Training took', time.time() - start_time, 'seconds.')

chatterbot/utils.py CHANGED Viewed

@@ -89,30 +89,21 @@ def get_response_time(chatbot, statement='Hello'):
     return time.time() - start_time
-def print_progress_bar(description, iteration_counter, total_items, progress_bar_length=20):
+def get_model_for_language(language):
     """
-    Print progress bar
-    :param description: Training description
-    :type description: str
-    :param iteration_counter: Incremental counter
-    :type iteration_counter: int
-    :param total_items: total number items
-    :type total_items: int
-    :param progress_bar_length: Progress bar length
-    :type progress_bar_length: int
-    :returns: void
-    :rtype: void
-    DEPRECTTED: use `tqdm` instead
+    Returns the spacy model for the specified language.
     """
-    percent = float(iteration_counter) / total_items
-    hashes = '#' * int(round(percent * progress_bar_length))
-    spaces = ' ' * (progress_bar_length - len(hashes))
-    sys.stdout.write('\r{0}: [{1}] {2}%'.format(description, hashes + spaces, int(round(percent * 100))))
-    sys.stdout.flush()
-    if total_items == iteration_counter:
-        print('\r')
+    from chatterbot import constants
+    try:
+        model = constants.DEFAULT_LANGUAGE_TO_SPACY_MODEL_MAP[language]
+    except KeyError as e:
+        if hasattr(language, 'ENGLISH_NAME'):
+            language_name = language.ENGLISH_NAME
+        else:
+            language_name = language
+        raise KeyError(
+            f'A corresponding spacy model for "{language_name}" could not be found.'
+        ) from e
+    return model

{chatterbot-1.2.3.dist-info → chatterbot-1.2.4.dist-info}/METADATA RENAMED Viewed

@@ -1,27 +1,15 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: ChatterBot
-Version: 1.2.3
+Version: 1.2.4
 Summary: ChatterBot is a machine learning, conversational dialog engine
 Author: Gunther Cox
-License: Copyright (c) 2016 - 2025, Gunther Cox
-        All rights reserved.
-        Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
-        * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
-        * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-        * Neither the name of ChatterBot nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-        THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+License-Expression: BSD-3-Clause
 Project-URL: Documentation, https://docs.chatterbot.us
 Project-URL: Repository, https://github.com/gunthercox/ChatterBot
+Project-URL: Changelog, https://github.com/gunthercox/ChatterBot/releases
 Keywords: ChatterBot,chatbot,chat,bot,natural language processing,nlp,artificial intelligence,ai
 Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
-Classifier: License :: OSI Approved :: BSD License
 Classifier: Operating System :: OS Independent
 Classifier: Environment :: Console
 Classifier: Environment :: Web Environment
@@ -40,7 +28,7 @@ Classifier: Programming Language :: Python
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3 :: Only
-Requires-Python: ~=3.9
+Requires-Python: <3.13,>=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: mathparse<0.2,>=0.1
@@ -54,6 +42,7 @@ Requires-Dist: coverage; extra == "test"
 Requires-Dist: nose; extra == "test"
 Requires-Dist: sphinx<8.2,>=5.3; extra == "test"
 Requires-Dist: sphinx-sitemap>=2.6.0; extra == "test"
+Requires-Dist: huggingface_hub; extra == "test"
 Provides-Extra: dev
 Requires-Dist: pint>=0.8.1; extra == "dev"
 Requires-Dist: pyyaml<7.0,>=6.0; extra == "dev"
@@ -65,6 +54,7 @@ Requires-Dist: langchain-huggingface; extra == "redis"
 Requires-Dist: sentence-transformers; extra == "redis"
 Provides-Extra: mongodb
 Requires-Dist: pymongo<4.12,>=4.11; extra == "mongodb"
+Dynamic: license-file
 ![ChatterBot: Machine learning in Python](https://i.imgur.com/b3SCmGT.png)
@@ -167,7 +157,7 @@ See release notes for changes https://github.com/gunthercox/ChatterBot/releases
    a new branch `my-pull-request`.
 3. [Create a pull request](https://help.github.com/articles/creating-a-pull-request/).
 4. Please follow the [Python style guide for PEP-8](https://www.python.org/dev/peps/pep-0008/).
-5. Use the projects [built-in automated testing](https://docs.chatterbot.us/testing.html).
+5. Use the projects [built-in automated testing](https://docs.chatterbot.us/testing/).
    to help make sure that your contribution is free from errors.
 # License

{chatterbot-1.2.3.dist-info → chatterbot-1.2.4.dist-info}/RECORD RENAMED Viewed

@@ -1,8 +1,8 @@
-chatterbot/__init__.py,sha256=PhC2oXazQN3HNYXvWb33IAqEzwyt5QtqcfESq8eg3sg,158
-chatterbot/__main__.py,sha256=nk19D56TlPT9Zdqkq4qZZrOnLKEc4YTwUVWmXYwSyHg,207
+chatterbot/__init__.py,sha256=Woq2bFnaAs8yTE2HVPsxXEyzFrXs1njsGJnJVgbYGvI,158
+chatterbot/__main__.py,sha256=zvH4uxtGlGrP-ht_LkhX29duzjm3hRH800SDCq4YOwg,637
 chatterbot/adapters.py,sha256=LJ_KqLpHKPdYAFpMGK63RVH4weV5X0Zh5uGyan6qdVU,878
-chatterbot/chatterbot.py,sha256=YLKLkQ-XI4Unr3rbzjpGIupOqenuevm21tAnx-yFFgQ,10400
-chatterbot/comparisons.py,sha256=8-qLFWC1Z7tZ3iPUpyY6AD9l-whSo3QE1Rno_SzIp-I,6570
+chatterbot/chatterbot.py,sha256=BW_XQK78iOvc0fZ8EsEglNUdjyRE2lxUI_sP-fa4gCc,12505
+chatterbot/comparisons.py,sha256=8kkjW-lhS-57XSUlQI5B-dAdJO-CvkIirWLBKtbe4gw,6187
 chatterbot/components.py,sha256=ld3Xam8olBClvE5QqcFYggE7Q7tODCFek7BO7lhfyeU,1782
 chatterbot/constants.py,sha256=c_KPQKc82CHX6H3maeyTYqWatx6j-N-8HJhmejoVi60,1875
 chatterbot/conversation.py,sha256=Y-WOxPN7I3igRyAEe5py1sfS6JIYPdbwjVlY3kM8Ys8,3175
@@ -13,10 +13,10 @@ chatterbot/languages.py,sha256=XSenfc5FxHk_JWG5gGHsZvjvrPBbCaVCm_OU-BeER_M,32784
 chatterbot/parsing.py,sha256=vS-w70cMkjq4YEpDOv_pXWhAI6Zj06WYDAcMDhYDj0M,23174
 chatterbot/preprocessors.py,sha256=aI4v987dZc7GOKhO43i0i73EX748hehYSpzikFHpEXs,1271
 chatterbot/response_selection.py,sha256=aYeZ54jpGIcQnI-1-TDcua_f1p3PiM5_iMg4hF5ZaIU,2951
-chatterbot/search.py,sha256=S4MFL1JzPqT-pv7tCgd-MIqf0T9Ia_KOLoNgzdoCP4Y,7035
-chatterbot/tagging.py,sha256=GLY9wg_rvn6pSYVML-HcxkIo_3BZ3SAyj-q1oNZY8pI,2584
-chatterbot/trainers.py,sha256=U1yh0_V7FFL51MeQe1P1Q59weceDbkHh_2kDiDYpSEc,13315
-chatterbot/utils.py,sha256=ckQXvsjp2FO9GcWxziY67JovN7mShnE4RlzdYarQY5k,3277
+chatterbot/search.py,sha256=FTwwON2eKPWqoc5uoKh4AUmuXDCqyfMcMcXB4wijpxg,4910
+chatterbot/tagging.py,sha256=czcI2g18vILujphkjvobRyEewJU8-QjS7QRzY-hCZ4o,2429
+chatterbot/trainers.py,sha256=9mxi1_UmtiuXuEzpn4uztnV8PObD0Xt0PrAbTZ6oyt0,19294
+chatterbot/utils.py,sha256=tGmUt-KYYylD2fiG_oq_XxhGbAHukzwudZ_6hNuraIA,2944
 chatterbot/vectorstores.py,sha256=-S1NB8PrZzoFIu95n2W7N4UaXuCUpyDUXIGYFebjv08,2056
 chatterbot/ext/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 chatterbot/ext/django_chatterbot/__init__.py,sha256=iWzmBzpAsYwkwi1faxAPFY9L1bbL97RgVXK2uqULIMc,92
@@ -47,22 +47,22 @@ chatterbot/ext/django_chatterbot/migrations/0018_text_max_length.py,sha256=508Tx
 chatterbot/ext/django_chatterbot/migrations/0019_alter_statement_id_alter_tag_id_and_more.py,sha256=rsVxwDFMQ-cU1KMhjDq9Wcl_6gTPKc_dc3p-gv_R7v8,999
 chatterbot/ext/django_chatterbot/migrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 chatterbot/ext/sqlalchemy_app/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-chatterbot/ext/sqlalchemy_app/models.py,sha256=pjU4e2BUSitw_IAkrk4iFQ9pZRU35y5MomvX7aiBFCw,2492
+chatterbot/ext/sqlalchemy_app/models.py,sha256=ZQ-R_5rA-f1agaqYGUQhuuO7zx__BvTDUvJo5R7ZrDY,2492
 chatterbot/logic/__init__.py,sha256=28-5swBCPfSVMl8xB5C8frOKZ2oj28rQfenbd9E4r-4,531
 chatterbot/logic/best_match.py,sha256=8TNW0uZ_Uq-XPfaZUMUZDVH6KzDT65j59xblxQBv-dQ,4820
 chatterbot/logic/logic_adapter.py,sha256=5kNEirh5fiF5hhSMFXD7bIkKwXHmrSsSS4qDm-6xry0,4694
 chatterbot/logic/mathematical_evaluation.py,sha256=GPDKUwNFajERof2R-MkPGi2jJRP-rKAGm_f0V9JHDHE,2282
-chatterbot/logic/specific_response.py,sha256=o17YIeu9DzucO8MXMP3kwNIBb1b8br60bbAhSE7AZWc,2386
-chatterbot/logic/time_adapter.py,sha256=mxdoQGeC5IjREH4PU5iHYOIPEvnYnzgysocR8xMYWXc,2406
+chatterbot/logic/specific_response.py,sha256=akWHkfe0AjzlCUvjs_PbKFNkX4SZhu_tzY45xCRXoo0,2236
+chatterbot/logic/time_adapter.py,sha256=1PT6tWtGauZLRH02-Xlh2LublDpu_3hnCqHBqNGM9yg,2256
 chatterbot/logic/unit_conversion.py,sha256=-ENMLqZqtZx0riUi0guda2oJECST0M7pZG4cSIv3ieM,5898
 chatterbot/storage/__init__.py,sha256=ADw0WQe0YKr1UIDQLaxwf0mHDnuKW_CSzgz11K4TM-4,465
 chatterbot/storage/django_storage.py,sha256=S5S4GipD7FyNJy4RWu5-S8sLPuSJIObwTtqTpnJu-ok,6159
 chatterbot/storage/mongodb.py,sha256=Ozvdvcjb3LGZxcvbSQGzwP9VloYQbmsa2FaKunFpMyU,7934
 chatterbot/storage/redis.py,sha256=FKROrzZ-7WXZ8ZoK0dKmTDdS45TxL04XOSeu0p3Jrak,12675
-chatterbot/storage/sql_storage.py,sha256=VVYZvclG_74IN-MrG0edc-RQ2gUO6gRQyCWWSO0MmCk,13082
+chatterbot/storage/sql_storage.py,sha256=dAMLByFKQgbiTFoBUtKDeqadYRdwVO5fz1OONTcVCH4,13076
 chatterbot/storage/storage_adapter.py,sha256=fvyb-qNiB0HMJ0siVMCWUIY--6d-C47N1_kKZVFZAv4,6110
-chatterbot-1.2.3.dist-info/LICENSE,sha256=5b04U8mi0wp5gJMYlKi49EalnD9Q2nwY_6UEI_Avgu4,1476
-chatterbot-1.2.3.dist-info/METADATA,sha256=xnofLrmf6knmhcwBVcodzvxpZQ-eb4tbLB970dXQG8I,8503
-chatterbot-1.2.3.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
-chatterbot-1.2.3.dist-info/top_level.txt,sha256=W2TzAbAJ-eBXTIKZZhVlkrh87msJNmBQpyhkrHqjSrE,11
-chatterbot-1.2.3.dist-info/RECORD,,
+chatterbot-1.2.4.dist-info/licenses/LICENSE,sha256=5b04U8mi0wp5gJMYlKi49EalnD9Q2nwY_6UEI_Avgu4,1476
+chatterbot-1.2.4.dist-info/METADATA,sha256=lrGa5gxvPrNRh6fCKqr7zPvRp_qmY293ijj0ODW4uZM,7049
+chatterbot-1.2.4.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+chatterbot-1.2.4.dist-info/top_level.txt,sha256=W2TzAbAJ-eBXTIKZZhVlkrh87msJNmBQpyhkrHqjSrE,11
+chatterbot-1.2.4.dist-info/RECORD,,

{chatterbot-1.2.3.dist-info → chatterbot-1.2.4.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (76.0.0)
+Generator: setuptools (78.1.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

{chatterbot-1.2.3.dist-info → chatterbot-1.2.4.dist-info/licenses}/LICENSE RENAMED Viewed

File without changes

{chatterbot-1.2.3.dist-info → chatterbot-1.2.4.dist-info}/top_level.txt RENAMED Viewed

File without changes

ChatterBot 1.2.3__py3-none-any.whl → 1.2.4__py3-none-any.whl

ChatterBot 1.2.3__py3-none-any.whl → 1.2.4__py3-none-any.whl