camel-ai 0.1.6.2__py3-none-any.whl → 0.1.6.4__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their public registry; it is provided for informational purposes only.
Potentially problematic release.
This version of camel-ai might be problematic.
- camel/__init__.py +1 -1
- camel/configs/mistral_config.py +13 -9
- camel/embeddings/mistral_embedding.py +5 -5
- camel/interpreters/docker_interpreter.py +1 -1
- camel/loaders/__init__.py +1 -2
- camel/loaders/base_io.py +118 -52
- camel/loaders/jina_url_reader.py +6 -6
- camel/loaders/unstructured_io.py +24 -286
- camel/models/__init__.py +2 -0
- camel/models/mistral_model.py +120 -26
- camel/models/model_factory.py +3 -3
- camel/models/openai_compatibility_model.py +105 -0
- camel/retrievers/auto_retriever.py +25 -35
- camel/retrievers/vector_retriever.py +20 -18
- camel/storages/object_storages/__init__.py +22 -0
- camel/storages/object_storages/amazon_s3.py +205 -0
- camel/storages/object_storages/azure_blob.py +166 -0
- camel/storages/object_storages/base.py +115 -0
- camel/storages/object_storages/google_cloud.py +152 -0
- camel/toolkits/retrieval_toolkit.py +5 -5
- camel/toolkits/search_toolkit.py +4 -4
- camel/types/enums.py +7 -0
- camel/utils/token_counting.py +7 -3
- {camel_ai-0.1.6.2.dist-info → camel_ai-0.1.6.4.dist-info}/METADATA +9 -5
- {camel_ai-0.1.6.2.dist-info → camel_ai-0.1.6.4.dist-info}/RECORD +26 -20
- {camel_ai-0.1.6.2.dist-info → camel_ai-0.1.6.4.dist-info}/WHEEL +0 -0
camel/models/openai_compatibility_model.py (new file)

@@ -0,0 +1,105 @@
+# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
+
+from typing import Any, Dict, List, Optional, Union
+
+from openai import OpenAI, Stream
+
+from camel.messages import OpenAIMessage
+from camel.types import ChatCompletion, ChatCompletionChunk, ModelType
+from camel.utils import (
+    BaseTokenCounter,
+    OpenAITokenCounter,
+)
+
+
+class OpenAICompatibilityModel:
+    r"""Constructor for model backend supporting OpenAI compatibility."""
+
+    def __init__(
+        self,
+        model_type: str,
+        model_config_dict: Dict[str, Any],
+        api_key: str,
+        url: str,
+        token_counter: Optional[BaseTokenCounter] = None,
+    ) -> None:
+        r"""Constructor for model backend.
+
+        Args:
+            model_type (ModelType): Model for which a backend is created.
+            model_config_dict (Dict[str, Any]): A dictionary that will
+                be fed into openai.ChatCompletion.create().
+            api_key (str): The API key for authenticating with the
+                model service. (default: :obj:`None`)
+            url (str): The url to the model service. (default:
+                :obj:`None`)
+            token_counter (Optional[BaseTokenCounter]): Token counter to use
+                for the model. If not provided, `OpenAITokenCounter(ModelType.
+                GPT_3_5_TURBO)` will be used.
+        """
+        self.model_type = model_type
+        self.model_config_dict = model_config_dict
+        self._token_counter = token_counter
+        self._client = OpenAI(
+            timeout=60,
+            max_retries=3,
+            api_key=api_key,
+            base_url=url,
+        )
+
+    def run(
+        self,
+        messages: List[OpenAIMessage],
+    ) -> Union[ChatCompletion, Stream[ChatCompletionChunk]]:
+        r"""Runs inference of OpenAI chat completion.
+
+        Args:
+            messages (List[OpenAIMessage]): Message list with the chat history
+                in OpenAI API format.
+
+        Returns:
+            Union[ChatCompletion, Stream[ChatCompletionChunk]]:
+                `ChatCompletion` in the non-stream mode, or
+                `Stream[ChatCompletionChunk]` in the stream mode.
+        """
+        response = self._client.chat.completions.create(
+            messages=messages,
+            model=self.model_type,
+            **self.model_config_dict,
+        )
+        return response
+
+    @property
+    def token_counter(self) -> BaseTokenCounter:
+        r"""Initialize the token counter for the model backend.
+
+        Returns:
+            OpenAITokenCounter: The token counter following the model's
+                tokenization style.
+        """
+        if not self._token_counter:
+            self._token_counter = OpenAITokenCounter(ModelType.GPT_3_5_TURBO)
+        return self._token_counter
+
+    @property
+    def stream(self) -> bool:
+        r"""Returns whether the model is in stream mode, which sends partial
+        results each time.
+
+        Returns:
+            bool: Whether the model is in stream mode.
+        """
+        return self.model_config_dict.get('stream', False)
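The new backend is a thin wrapper over the `openai` client, so any OpenAI-compatible endpoint can be targeted by passing its base URL. A minimal usage sketch follows; the endpoint URL, API key, and model name are placeholders, and the direct module import path is an assumption (the release also touches `camel/models/__init__.py` and `model_factory.py`, which may expose a different entry point):

from camel.models.openai_compatibility_model import OpenAICompatibilityModel

# Placeholder endpoint and credentials, for illustration only.
model = OpenAICompatibilityModel(
    model_type="my-served-model",
    model_config_dict={"temperature": 0.2, "stream": False},
    api_key="sk-placeholder",
    url="http://localhost:8000/v1",
)

# Messages use the standard OpenAI chat format.
response = model.run(messages=[{"role": "user", "content": "Hello!"}])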
camel/retrievers/auto_retriever.py

@@ -97,36 +97,36 @@ class AutoRetriever:
                 f"Unsupported vector storage type: {self.storage_type}"
             )
 
-    def _collection_name_generator(self,
+    def _collection_name_generator(self, content: str) -> str:
         r"""Generates a valid collection name from a given file path or URL.
 
         Args:
-
-                generate the collection name.
+            contents (str): Local file path, remote URL or string content.
 
         Returns:
             str: A sanitized, valid collection name suitable for use.
         """
-        # Check
-        parsed_url = urlparse(
-
+        # Check if the content is URL
+        parsed_url = urlparse(content)
+        is_url = all([parsed_url.scheme, parsed_url.netloc])
 
         # Convert given path into a collection name, ensuring it only
         # contains numbers, letters, and underscores
-        if
+        if is_url:
             # For URLs, remove https://, replace /, and any characters not
             # allowed by Milvus with _
             collection_name = re.sub(
                 r'[^0-9a-zA-Z]+',
                 '_',
-
+                content.replace("https://", ""),
             )
-
+        elif os.path.exists(content):
             # For file paths, get the stem and replace spaces with _, also
             # ensuring only allowed characters are present
-            collection_name = re.sub(
-
-
+            collection_name = re.sub(r'[^0-9a-zA-Z]+', '_', Path(content).stem)
+        else:
+            # the content is string input
+            collection_name = content[:10]
 
         # Ensure the collection name does not start or end with underscore
         collection_name = collection_name.strip("_")

@@ -193,7 +193,7 @@ class AutoRetriever:
     def run_vector_retriever(
         self,
         query: str,
-
+        contents: Union[str, List[str]],
         top_k: int = DEFAULT_TOP_K_RESULTS,
         similarity_threshold: float = DEFAULT_SIMILARITY_THRESHOLD,
         return_detailed_info: bool = False,

@@ -203,8 +203,8 @@ class AutoRetriever:
 
         Args:
             query (str): Query string for information retriever.
-
-
+            contents (Union[str, List[str]]): Local file paths, remote URLs or
+                string contents.
             top_k (int, optional): The number of top results to return during
                 retrieve. Must be a positive integer. Defaults to
                 `DEFAULT_TOP_K_RESULTS`.

@@ -223,24 +223,18 @@ class AutoRetriever:
         Raises:
             ValueError: If there's an vector storage existing with content
                 name in the vector path but the payload is None. If
-                `
+                `contents` is empty.
             RuntimeError: If any errors occur during the retrieve process.
         """
-        if not
-            raise ValueError("
+        if not contents:
+            raise ValueError("content cannot be empty.")
 
-
-            [content_input_paths]
-            if isinstance(content_input_paths, str)
-            else content_input_paths
-        )
+        contents = [contents] if isinstance(contents, str) else contents
 
         all_retrieved_info = []
-        for
+        for content in contents:
             # Generate a valid collection name
-            collection_name = self._collection_name_generator(
-                content_input_path
-            )
+            collection_name = self._collection_name_generator(content)
             try:
                 vector_storage_instance = self._initialize_vector_storage(
                     collection_name

@@ -251,13 +245,11 @@ class AutoRetriever:
                 file_is_modified = False  # initialize with a default value
                 if (
                     vector_storage_instance.status().vector_count != 0
-                    and
+                    and os.path.exists(content)
                 ):
                     # Get original modified date from file
                     modified_date_from_file = (
-                        self._get_file_modified_date_from_file(
-                            content_input_path
-                        )
+                        self._get_file_modified_date_from_file(content)
                     )
                     # Get modified date from vector storage
                     modified_date_from_storage = (

@@ -280,18 +272,16 @@ class AutoRetriever:
                     # Process and store the content to the vector storage
                     vr = VectorRetriever(
                         storage=vector_storage_instance,
-                        similarity_threshold=similarity_threshold,
                         embedding_model=self.embedding_model,
                     )
-                    vr.process(
+                    vr.process(content)
                 else:
                     vr = VectorRetriever(
                         storage=vector_storage_instance,
-                        similarity_threshold=similarity_threshold,
                         embedding_model=self.embedding_model,
                     )
                 # Retrieve info by given query from the vector storage
-                retrieved_info = vr.query(query, top_k)
+                retrieved_info = vr.query(query, top_k, similarity_threshold)
                 all_retrieved_info.extend(retrieved_info)
             except Exception as e:
                 raise RuntimeError(
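With these changes, `run_vector_retriever` takes a single `contents` argument (a string or list of strings) that may mix local file paths, remote URLs, and raw text, and the similarity threshold is applied at query time rather than baked into each `VectorRetriever`. A rough sketch of a call against the new signature; the constructor arguments and input values are illustrative and not taken from this diff:

from camel.retrievers import AutoRetriever

retriever = AutoRetriever()  # construction details unchanged by this diff
results = retriever.run_vector_retriever(
    query="What is CAMEL-AI?",
    contents=[
        "https://www.camel-ai.org/",          # remote URL
        "local_data/camel_paper.pdf",         # local file path
        "CAMEL is a multi-agent framework.",  # raw string content
    ],
    top_k=1,
    similarity_threshold=0.75,
)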
camel/retrievers/vector_retriever.py

@@ -11,7 +11,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
+import os
 from typing import Any, Dict, List, Optional
+from urllib.parse import urlparse
 
 from camel.embeddings import BaseEmbedding, OpenAIEmbedding
 from camel.loaders import UnstructuredIO

@@ -38,24 +40,18 @@ class VectorRetriever(BaseRetriever):
         embedding_model (BaseEmbedding): Embedding model used to generate
             vector embeddings.
         storage (BaseVectorStorage): Vector storage to query.
-        similarity_threshold (float, optional): The similarity threshold
-            for filtering results. Defaults to `DEFAULT_SIMILARITY_THRESHOLD`.
         unstructured_modules (UnstructuredIO): A module for parsing files and
             URLs and chunking content based on specified parameters.
     """
 
     def __init__(
         self,
-        similarity_threshold: float = DEFAULT_SIMILARITY_THRESHOLD,
         embedding_model: Optional[BaseEmbedding] = None,
         storage: Optional[BaseVectorStorage] = None,
     ) -> None:
         r"""Initializes the retriever class with an optional embedding model.
 
         Args:
-            similarity_threshold (float, optional): The similarity threshold
-                for filtering results. Defaults to
-                `DEFAULT_SIMILARITY_THRESHOLD`.
             embedding_model (Optional[BaseEmbedding]): The embedding model
                 instance. Defaults to `OpenAIEmbedding` if not provided.
             storage (BaseVectorStorage): Vector storage to query.

@@ -68,12 +64,11 @@ class VectorRetriever(BaseRetriever):
                 vector_dim=self.embedding_model.get_output_dim()
             )
         )
-        self.
-        self.unstructured_modules: UnstructuredIO = UnstructuredIO()
+        self.uio: UnstructuredIO = UnstructuredIO()
 
     def process(
         self,
-
+        content: str,
         chunk_type: str = "chunk_by_title",
         **kwargs: Any,
     ) -> None:

@@ -82,16 +77,19 @@ class VectorRetriever(BaseRetriever):
         vector storage.
 
         Args:
-
-                processed.
+            contents (str): Local file path, remote URL or string content.
             chunk_type (str): Type of chunking going to apply. Defaults to
                 "chunk_by_title".
             **kwargs (Any): Additional keyword arguments for content parsing.
         """
-
-
-            )
-
+        # Check if the content is URL
+        parsed_url = urlparse(content)
+        is_url = all([parsed_url.scheme, parsed_url.netloc])
+        if is_url or os.path.exists(content):
+            elements = self.uio.parse_file_or_url(content, **kwargs)
+        else:
+            elements = [self.uio.create_element_from_text(text=content)]
+        chunks = self.uio.chunk_elements(
             chunk_type=chunk_type, elements=elements
         )
         # Iterate to process and store embeddings, set batch of 50

@@ -105,7 +103,7 @@ class VectorRetriever(BaseRetriever):
             # Prepare the payload for each vector record, includes the content
             # path, chunk metadata, and chunk text
             for vector, chunk in zip(batch_vectors, batch_chunks):
-                content_path_info = {"content path":
+                content_path_info = {"content path": content}
                 chunk_metadata = {"metadata": chunk.metadata.to_dict()}
                 chunk_text = {"text": str(chunk)}
                 combined_dict = {

@@ -124,12 +122,16 @@ class VectorRetriever(BaseRetriever):
         self,
         query: str,
         top_k: int = DEFAULT_TOP_K_RESULTS,
+        similarity_threshold: float = DEFAULT_SIMILARITY_THRESHOLD,
     ) -> List[Dict[str, Any]]:
         r"""Executes a query in vector storage and compiles the retrieved
         results into a dictionary.
 
         Args:
             query (str): Query string for information retriever.
+            similarity_threshold (float, optional): The similarity threshold
+                for filtering results. Defaults to
+                `DEFAULT_SIMILARITY_THRESHOLD`.
             top_k (int, optional): The number of top results to return during
                 retriever. Must be a positive integer. Defaults to 1.
 

@@ -161,7 +163,7 @@ class VectorRetriever(BaseRetriever):
         formatted_results = []
         for result in query_results:
             if (
-                result.similarity >=
+                result.similarity >= similarity_threshold
                 and result.record.payload is not None
             ):
                 result_dict = {

@@ -182,7 +184,7 @@ class VectorRetriever(BaseRetriever):
                     'text': (
                         f"No suitable information retrieved "
                         f"from {content_path} with similarity_threshold"
-                        f" = {
+                        f" = {similarity_threshold}."
                     )
                 }
             ]
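The corresponding `VectorRetriever` change moves `similarity_threshold` from the constructor to `query()`, and `process()` now accepts raw text in addition to file paths and URLs. A hedged sketch of the new call pattern; the import path and input values are illustrative:

from camel.retrievers import VectorRetriever

# Defaults to OpenAIEmbedding and a default vector storage when none is given.
vr = VectorRetriever()
vr.process(content="https://www.camel-ai.org/")  # URL, file path, or raw text
hits = vr.query(query="What is CAMEL?", top_k=3, similarity_threshold=0.8)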
camel/storages/object_storages/__init__.py (new file)

@@ -0,0 +1,22 @@
+# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
+from .amazon_s3 import AmazonS3Storage
+from .azure_blob import AzureBlobStorage
+from .google_cloud import GoogleCloudStorage
+
+__all__ = [
+    "AmazonS3Storage",
+    "AzureBlobStorage",
+    "GoogleCloudStorage",
+]
camel/storages/object_storages/amazon_s3.py (new file)

@@ -0,0 +1,205 @@
+# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
+
+import os
+from pathlib import Path, PurePath
+from typing import Optional, Tuple
+from warnings import warn
+
+from camel.loaders import File
+from camel.storages.object_storages.base import BaseObjectStorage
+
+
+class AmazonS3Storage(BaseObjectStorage):
+    r"""A class to connect with AWS S3 object storage to put and get objects
+    from one S3 bucket. The class will first try to use the credentials passed
+    as arguments, if not provided, it will look for the environment variables
+    `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`. If none of these are
+    provided, it will try to use the local credentials (will be created if
+    logged in with AWS CLI).
+
+    Args:
+        bucket_name (str): The name of the S3 bucket.
+        create_if_not_exists (bool, optional): Whether to create the bucket if
+            it does not exist. Defaults to True.
+        access_key_id (Optional[str], optional): The AWS access key ID.
+            Defaults to None.
+        secret_access_key (Optional[str], optional): The AWS secret access key.
+            Defaults to None.
+        anonymous (bool, optional): Whether to use anonymous access. Defaults
+            to False.
+
+    References:
+        https://aws.amazon.com/pm/serv-s3/
+
+        https://aws.amazon.com/cli/
+    """
+
+    def __init__(
+        self,
+        bucket_name: str,
+        create_if_not_exists: bool = True,
+        access_key_id: Optional[str] = None,
+        secret_access_key: Optional[str] = None,
+        anonymous: bool = False,
+    ) -> None:
+        self._bucket_name = bucket_name
+        self._create_if_not_exists = create_if_not_exists
+
+        aws_key_id = access_key_id or os.getenv("AWS_ACCESS_KEY_ID")
+        aws_secret_key = secret_access_key or os.getenv(
+            "AWS_SECRET_ACCESS_KEY"
+        )
+        if not all([aws_key_id, aws_secret_key]) and not anonymous:
+            warn(
+                "AWS access key not configured. Local credentials will be "
+                "used."
+            )
+            # Make all the empty values None
+            aws_key_id = None
+            aws_secret_key = None
+
+        import boto3
+        from botocore import UNSIGNED
+        from botocore.config import Config
+
+        if not anonymous:
+            self._client = boto3.client(
+                "s3",
+                aws_access_key_id=aws_key_id,
+                aws_secret_access_key=aws_secret_key,
+            )
+        else:
+            self._client = boto3.client(
+                "s3", config=Config(signature_version=UNSIGNED)
+            )
+
+        self._prepare_and_check()
+
+    def _prepare_and_check(self) -> None:
+        r"""Check privileges and existence of the bucket."""
+        from botocore.exceptions import ClientError, NoCredentialsError
+
+        try:
+            self._client.head_bucket(Bucket=self._bucket_name)
+        except ClientError as e:
+            error_code = e.response['Error']['Code']
+            if error_code == '403':
+                raise PermissionError(
+                    f"Failed to access bucket {self._bucket_name}: "
+                    f"No permission."
+                )
+            elif error_code == '404':
+                if self._create_if_not_exists:
+                    self._client.create_bucket(Bucket=self._bucket_name)
+                    warn(
+                        f"Bucket {self._bucket_name} not found. Automatically "
+                        f"created."
+                    )
+                else:
+                    raise FileNotFoundError(
+                        f"Failed to access bucket {self._bucket_name}: Not "
+                        f"found."
+                    )
+            else:
+                raise e
+        except NoCredentialsError as e:
+            raise PermissionError("No AWS credentials found.") from e
+
+    @staticmethod
+    def canonicalize_path(file_path: PurePath) -> Tuple[str, str]:
+        r"""Canonicalize file path for Amazon S3.
+
+        Args:
+            file_path (PurePath): The path to be canonicalized.
+
+        Returns:
+            Tuple[str, str]: The canonicalized file key and file name.
+        """
+        return file_path.as_posix(), file_path.name
+
+    def _put_file(self, file_key: str, file: File) -> None:
+        r"""Put a file to the Amazon S3 bucket.
+
+        Args:
+            file_key (str): The path to the object in the bucket.
+            file (File): The file to be uploaded.
+        """
+        self._client.put_object(
+            Bucket=self._bucket_name, Key=file_key, Body=file.raw_bytes
+        )
+
+    def _get_file(self, file_key: str, filename: str) -> File:
+        r"""Get a file from the Amazon S3 bucket.
+
+        Args:
+            file_key (str): The path to the object in the bucket.
+            filename (str): The name of the file.
+
+        Returns:
+            File: The object from the S3 bucket.
+        """
+        response = self._client.get_object(
+            Bucket=self._bucket_name, Key=file_key
+        )
+        raw_bytes = response["Body"].read()
+        return File.create_file_from_raw_bytes(raw_bytes, filename)
+
+    def _upload_file(
+        self, local_file_path: Path, remote_file_key: str
+    ) -> None:
+        r"""Upload a local file to the Amazon S3 bucket.
+
+        Args:
+            local_file_path (Path): The path to the local file to be uploaded.
+            remote_file_key (str): The path to the object in the bucket.
+        """
+        self._client.upload_file(
+            Bucket=self._bucket_name,
+            Key=remote_file_key,
+            Filename=local_file_path,
+        )
+
+    def _download_file(
+        self,
+        local_file_path: Path,
+        remote_file_key: str,
+    ) -> None:
+        r"""Download a file from the Amazon S3 bucket to the local system.
+
+        Args:
+            local_file_path (Path): The path to the local file to be saved.
+            remote_file_key (str): The key of the object in the bucket.
+        """
+        self._client.download_file(
+            Bucket=self._bucket_name,
+            Key=remote_file_key,
+            Filename=local_file_path,
+        )
+
+    def _object_exists(self, file_key: str) -> bool:
+        r"""
+        Check if the object exists in the Amazon S3 bucket.
+
+        Args:
+            file_key: The key of the object in the bucket.
+
+        Returns:
+            bool: Whether the object exists in the bucket.
+        """
+        try:
+            self._client.head_object(Bucket=self._bucket_name, Key=file_key)
+            return True
+        except self._client.exceptions.ClientError:
+            return False