camel-ai 0.1.6.2__py3-none-any.whl → 0.1.6.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of camel-ai might be problematic.
- camel/__init__.py +1 -1
- camel/interpreters/docker_interpreter.py +1 -1
- camel/loaders/__init__.py +1 -2
- camel/loaders/base_io.py +118 -52
- camel/loaders/jina_url_reader.py +6 -6
- camel/loaders/unstructured_io.py +24 -286
- camel/retrievers/auto_retriever.py +25 -35
- camel/retrievers/vector_retriever.py +20 -18
- camel/storages/object_storages/__init__.py +22 -0
- camel/storages/object_storages/amazon_s3.py +205 -0
- camel/storages/object_storages/azure_blob.py +166 -0
- camel/storages/object_storages/base.py +115 -0
- camel/storages/object_storages/google_cloud.py +152 -0
- camel/toolkits/retrieval_toolkit.py +5 -5
- camel/toolkits/search_toolkit.py +4 -4
- {camel_ai-0.1.6.2.dist-info → camel_ai-0.1.6.3.dist-info}/METADATA +7 -3
- {camel_ai-0.1.6.2.dist-info → camel_ai-0.1.6.3.dist-info}/RECORD +18 -13
- {camel_ai-0.1.6.2.dist-info → camel_ai-0.1.6.3.dist-info}/WHEEL +0 -0
camel/loaders/unstructured_io.py
CHANGED
@@ -12,12 +12,18 @@
 # limitations under the License.
 # =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
 import uuid
-from typing import
+from typing import (
+    Any,
+    Dict,
+    List,
+    Literal,
+    Optional,
+    Tuple,
+    Union,
+)
 
 from unstructured.documents.elements import Element
 
-from camel.utils import dependencies_required
-
 
 class UnstructuredIO:
     r"""A class to handle various functionalities provided by the
@@ -25,56 +31,12 @@ class UnstructuredIO:
     extracting, staging, chunking data, and integrating with cloud
     services like S3 and Azure for data connection.
 
-
-
-    the Unstructured library.
+    References:
+        https://docs.unstructured.io/
     """
 
-
-
-    def __init__(self):
-        r"""Initializes the UnstructuredIO class and ensures the
-        installed version of Unstructured library meets the minimum
-        requirements.
-        """
-        self._ensure_unstructured_version(self.UNSTRUCTURED_MIN_VERSION)
-
-    @dependencies_required('unstructured')
-    def _ensure_unstructured_version(self, min_version: str) -> None:
-        r"""Validates that the installed 'Unstructured' library version
-        satisfies the specified minimum version requirement. This function is
-        essential for ensuring compatibility with features that depend on a
-        certain version of the 'Unstructured' package.
-
-        Args:
-            min_version (str): The minimum version required, specified in
-                `'major.minor.patch'` format.
-
-        Raises:
-            ImportError: If the 'Unstructured' package is not available in the
-                environment.
-            ValueError: If the current `'Unstructured'` version is older than
-                the required minimum version.
-
-        Notes:
-            Uses the 'packaging.version' module to parse and compare version
-            strings.
-        """
-        from packaging import version
-        from unstructured.__version__ import __version__
-
-        # Use packaging.version to compare versions
-        min_ver = version.parse(min_version)
-        installed_ver = version.parse(__version__)
-
-        if installed_ver < min_ver:
-            raise ValueError(
-                f"Require `unstructured>={min_version}`, "
-                f"you have {__version__}."
-            )
-
+    @staticmethod
     def create_element_from_text(
-        self,
         text: str,
         element_id: Optional[Union[str, uuid.UUID]] = None,
         embeddings: Optional[List[float]] = None,
@@ -89,8 +51,8 @@ class UnstructuredIO:
 
         Args:
             text (str): The text content for the element.
-            element_id (Union[str, uuid.UUID], optional): Unique
-
+            element_id (Optional[Union[str, uuid.UUID]], optional): Unique
+                identifier for the element. Defaults to `None`.
             embeddings (Optional[List[float]], optional): A list of float
                 numbers representing the text embeddings. Defaults to `None`.
             filename (Optional[str], optional): The name of the file the
@@ -120,13 +82,13 @@ class UnstructuredIO:
 
         return Text(
             text=text,
-            element_id=element_id
+            element_id=element_id or uuid.uuid4(),
             metadata=metadata,
             embeddings=embeddings,
         )
 
+    @staticmethod
     def parse_file_or_url(
-        self,
         input_path: str,
         **kwargs: Any,
     ) -> List[Element]:
@@ -189,8 +151,8 @@ class UnstructuredIO:
                 "Failed to parse the unstructured file."
             ) from e
 
+    @staticmethod
     def clean_text_data(
-        self,
         text: str,
         clean_options: Optional[List[Tuple[str, Dict[str, Any]]]] = None,
     ) -> str:
@@ -253,7 +215,7 @@ class UnstructuredIO:
         )
         from unstructured.cleaners.translate import translate_text
 
-        cleaning_functions = {
+        cleaning_functions: Any = {
            "clean_extra_whitespace": clean_extra_whitespace,
            "clean_bullets": clean_bullets,
            "clean_ordered_bullets": clean_ordered_bullets,
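Since `clean_text_data` is now a static method, it can be called without instantiating `UnstructuredIO`. A minimal sketch of the `clean_options` dispatch shown above (the sample text is illustrative; each tuple names a key from `cleaning_functions` plus its keyword arguments):

from camel.loaders import UnstructuredIO

# Cleaners are resolved through the cleaning_functions lookup in the hunk above.
cleaned = UnstructuredIO.clean_text_data(
    text="●  Some   bulleted   text",
    clean_options=[
        ("clean_bullets", {}),
        ("clean_extra_whitespace", {}),
    ],
)
print(cleaned)  # expected: "Some bulleted text"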
@@ -291,8 +253,8 @@ class UnstructuredIO:
 
         return cleaned_text
 
+    @staticmethod
     def extract_data_from_text(
-        self,
         text: str,
         extract_type: Literal[
             'extract_datetimetz',
@@ -340,7 +302,7 @@ class UnstructuredIO:
             extract_us_phone_number,
         )
 
-        extraction_functions = {
+        extraction_functions: Any = {
            "extract_datetimetz": extract_datetimetz,
            "extract_email_address": extract_email_address,
            "extract_ip_address": extract_ip_address,
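`extract_data_from_text` follows the same lookup pattern via `extraction_functions`; a short sketch using the `extract_email_address` key visible above (the address is hypothetical):

from camel.loaders import UnstructuredIO

emails = UnstructuredIO.extract_data_from_text(
    text="Reach the team at hello@example.org for details.",
    extract_type="extract_email_address",
)
print(emails)  # expected: ['hello@example.org']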
@@ -357,8 +319,8 @@ class UnstructuredIO:
 
         return extraction_functions[extract_type](text, **kwargs)
 
+    @staticmethod
     def stage_elements(
-        self,
         elements: List[Any],
         stage_type: Literal[
             'convert_to_csv',
@@ -416,7 +378,7 @@ class UnstructuredIO:
            weaviate,
        )
 
-        staging_functions = {
+        staging_functions: Any = {
            "convert_to_csv": base.convert_to_csv,
            "convert_to_dataframe": base.convert_to_dataframe,
            "convert_to_dict": base.convert_to_dict,
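Likewise for `stage_elements`; a sketch staging a single element into plain dictionaries via the `convert_to_dict` key shown above:

from camel.loaders import UnstructuredIO

element = UnstructuredIO.create_element_from_text(text="Hello CAMEL")
records = UnstructuredIO.stage_elements(
    elements=[element], stage_type="convert_to_dict",
)
print(records)  # expected: a list with one dict describing the element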
@@ -441,8 +403,9 @@ class UnstructuredIO:
 
         return staging_functions[stage_type](elements, **kwargs)
 
+    @staticmethod
     def chunk_elements(
-        self, elements: List[Any], chunk_type: str, **kwargs
+        elements: List[Any], chunk_type: str, **kwargs
     ) -> List[Element]:
         r"""Chunks elements by titles.
 
@@ -470,228 +433,3 @@ class UnstructuredIO:
 
         # Format chunks into a list of dictionaries (or your preferred format)
         return chunking_functions[chunk_type](elements, **kwargs)
-
-    def run_s3_ingest(
-        self,
-        s3_url: str,
-        output_dir: str,
-        num_processes: int = 2,
-        anonymous: bool = True,
-    ) -> None:
-        r"""Processes documents from an S3 bucket and stores structured
-        outputs locally.
-
-        Args:
-            s3_url (str): The URL of the S3 bucket.
-            output_dir (str): Local directory to store the processed outputs.
-            num_processes (int, optional): Number of processes to use.
-                (default: :obj:`2`)
-            anonymous (bool, optional): Flag to run anonymously if
-                required. (default: :obj:`True`)
-
-        Notes:
-            You need to install the necessary extras by using:
-            `pip install "unstructured[s3]"`.
-
-        References:
-            https://unstructured-io.github.io/unstructured/
-        """
-
-        from unstructured.ingest.interfaces import (
-            FsspecConfig,
-            PartitionConfig,
-            ProcessorConfig,
-            ReadConfig,
-        )
-        from unstructured.ingest.runner import S3Runner
-
-        runner = S3Runner(
-            processor_config=ProcessorConfig(
-                verbose=True,
-                output_dir=output_dir,
-                num_processes=num_processes,
-            ),
-            read_config=ReadConfig(),
-            partition_config=PartitionConfig(),
-            fsspec_config=FsspecConfig(remote_url=s3_url),
-        )
-        runner.run(anonymous=anonymous)
-
-    def run_azure_ingest(
-        self,
-        azure_url: str,
-        output_dir: str,
-        account_name: str,
-        num_processes: int = 2,
-    ) -> None:
-        r"""Processes documents from an Azure storage container and stores
-        structured outputs locally.
-
-        Args:
-            azure_url (str): The URL of the Azure storage container.
-            output_dir (str): Local directory to store the processed outputs.
-            account_name (str): Azure account name for accessing the container.
-            num_processes (int, optional): Number of processes to use.
-                (default: :obj:`2`)
-
-        Notes:
-            You need to install the necessary extras by using:
-            `pip install "unstructured[azure]"`.
-
-        References:
-            https://unstructured-io.github.io/unstructured/
-        """
-        from unstructured.ingest.interfaces import (
-            FsspecConfig,
-            PartitionConfig,
-            ProcessorConfig,
-            ReadConfig,
-        )
-        from unstructured.ingest.runner import AzureRunner
-
-        runner = AzureRunner(
-            processor_config=ProcessorConfig(
-                verbose=True,
-                output_dir=output_dir,
-                num_processes=num_processes,
-            ),
-            read_config=ReadConfig(),
-            partition_config=PartitionConfig(),
-            fsspec_config=FsspecConfig(remote_url=azure_url),
-        )
-        runner.run(account_name=account_name)
-
-    def run_github_ingest(
-        self,
-        repo_url: str,
-        git_branch: str,
-        output_dir: str,
-        num_processes: int = 2,
-    ) -> None:
-        r"""Processes documents from a GitHub repository and stores
-        structured outputs locally.
-
-        Args:
-            repo_url (str): URL of the GitHub repository.
-            git_branch (str): Git branch name to process.
-            output_dir (str): Local directory to store the processed outputs.
-            num_processes (int, optional): Number of processes to use.
-                (default: :obj:`2`)
-
-        Notes:
-            You need to install the necessary extras by using:
-            `pip install "unstructured[github]"`.
-
-        References:
-            https://unstructured-io.github.io/unstructured/
-        """
-        from unstructured.ingest.interfaces import (
-            PartitionConfig,
-            ProcessorConfig,
-            ReadConfig,
-        )
-        from unstructured.ingest.runner import GithubRunner
-
-        runner = GithubRunner(
-            processor_config=ProcessorConfig(
-                verbose=True,
-                output_dir=output_dir,
-                num_processes=num_processes,
-            ),
-            read_config=ReadConfig(),
-            partition_config=PartitionConfig(),
-        )
-        runner.run(url=repo_url, git_branch=git_branch)
-
-    def run_slack_ingest(
-        self,
-        channels: List[str],
-        token: str,
-        start_date: str,
-        end_date: str,
-        output_dir: str,
-        num_processes: int = 2,
-    ) -> None:
-        r"""Processes documents from specified Slack channels and stores
-        structured outputs locally.
-
-        Args:
-            channels (List[str]): List of Slack channel IDs.
-            token (str): Slack API token.
-            start_date (str): Start date for fetching data.
-            end_date (str): End date for fetching data.
-            output_dir (str): Local directory to store the processed outputs.
-            num_processes (int, optional): Number of processes to use.
-                (default: :obj:`2`)
-
-        Notes:
-            You need to install the necessary extras by using:
-            `pip install "unstructured[slack]"`.
-
-        References:
-            https://unstructured-io.github.io/unstructured/
-        """
-        from unstructured.ingest.interfaces import (
-            PartitionConfig,
-            ProcessorConfig,
-            ReadConfig,
-        )
-        from unstructured.ingest.runner import SlackRunner
-
-        runner = SlackRunner(
-            processor_config=ProcessorConfig(
-                verbose=True,
-                output_dir=output_dir,
-                num_processes=num_processes,
-            ),
-            read_config=ReadConfig(),
-            partition_config=PartitionConfig(),
-        )
-        runner.run(
-            channels=channels,
-            token=token,
-            start_date=start_date,
-            end_date=end_date,
-        )
-
-    def run_discord_ingest(
-        self,
-        channels: List[str],
-        token: str,
-        output_dir: str,
-        num_processes: int = 2,
-    ) -> None:
-        r"""Processes messages from specified Discord channels and stores
-        structured outputs locally.
-
-        Args:
-            channels (List[str]): List of Discord channel IDs.
-            token (str): Discord bot token.
-            output_dir (str): Local directory to store the processed outputs.
-            num_processes (int, optional): Number of processes to use.
-                (default: :obj:`2`)
-
-        Notes:
-            You need to install the necessary extras by using:
-            `pip install "unstructured[discord]"`.
-
-        References:
-            https://unstructured-io.github.io/unstructured/
-        """
-        from unstructured.ingest.interfaces import (
-            PartitionConfig,
-            ProcessorConfig,
-            ReadConfig,
-        )
-        from unstructured.ingest.runner import DiscordRunner
-
-        runner = DiscordRunner(
-            processor_config=ProcessorConfig(
-                verbose=True,
-                output_dir=output_dir,
-                num_processes=num_processes,
-            ),
-            read_config=ReadConfig(),
-            partition_config=PartitionConfig(),
-        )
-        runner.run(channels=channels, token=token)
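Taken together, this file drops the constructor-time version check and the five run_*_ingest wrappers, and every remaining helper becomes a static method. A minimal sketch of the new call style (the sample text is illustrative):

from camel.loaders import UnstructuredIO

# No instantiation needed anymore; element_id now defaults to a fresh uuid4.
element = UnstructuredIO.create_element_from_text(
    text="CAMEL is a multi-agent framework.",
)
chunks = UnstructuredIO.chunk_elements(
    elements=[element], chunk_type="chunk_by_title",
)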
camel/retrievers/auto_retriever.py
CHANGED

@@ -97,36 +97,36 @@ class AutoRetriever:
             f"Unsupported vector storage type: {self.storage_type}"
         )
 
-    def _collection_name_generator(self, content_input_path: str) -> str:
+    def _collection_name_generator(self, content: str) -> str:
         r"""Generates a valid collection name from a given file path or URL.
 
         Args:
-
-                generate the collection name.
+            contents (str): Local file path, remote URL or string content.
 
         Returns:
             str: A sanitized, valid collection name suitable for use.
         """
-        # Check
-        parsed_url = urlparse(content_input_path)
-
+        # Check if the content is URL
+        parsed_url = urlparse(content)
+        is_url = all([parsed_url.scheme, parsed_url.netloc])
 
         # Convert given path into a collection name, ensuring it only
         # contains numbers, letters, and underscores
-        if
+        if is_url:
             # For URLs, remove https://, replace /, and any characters not
             # allowed by Milvus with _
             collection_name = re.sub(
                 r'[^0-9a-zA-Z]+',
                 '_',
-
+                content.replace("https://", ""),
             )
-
+        elif os.path.exists(content):
             # For file paths, get the stem and replace spaces with _, also
             # ensuring only allowed characters are present
-            collection_name = re.sub(
-
-
+            collection_name = re.sub(r'[^0-9a-zA-Z]+', '_', Path(content).stem)
+        else:
+            # the content is string input
+            collection_name = content[:10]
 
         # Ensure the collection name does not start or end with underscore
         collection_name = collection_name.strip("_")
@@ -193,7 +193,7 @@ class AutoRetriever:
     def run_vector_retriever(
         self,
         query: str,
-        content_input_paths: Union[str, List[str]],
+        contents: Union[str, List[str]],
         top_k: int = DEFAULT_TOP_K_RESULTS,
         similarity_threshold: float = DEFAULT_SIMILARITY_THRESHOLD,
         return_detailed_info: bool = False,
@@ -203,8 +203,8 @@ class AutoRetriever:
 
         Args:
             query (str): Query string for information retriever.
-
-
+            contents (Union[str, List[str]]): Local file paths, remote URLs or
+                string contents.
             top_k (int, optional): The number of top results to return during
                 retrieve. Must be a positive integer. Defaults to
                 `DEFAULT_TOP_K_RESULTS`.
@@ -223,24 +223,18 @@ class AutoRetriever:
         Raises:
             ValueError: If there's an vector storage existing with content
                 name in the vector path but the payload is None. If
-                `
+                `contents` is empty.
             RuntimeError: If any errors occur during the retrieve process.
         """
-        if not content_input_paths:
-            raise ValueError("
+        if not contents:
+            raise ValueError("content cannot be empty.")
 
-        content_input_paths = (
-            [content_input_paths]
-            if isinstance(content_input_paths, str)
-            else content_input_paths
-        )
+        contents = [contents] if isinstance(contents, str) else contents
 
         all_retrieved_info = []
-        for content_input_path in content_input_paths:
+        for content in contents:
             # Generate a valid collection name
-            collection_name = self._collection_name_generator(
-                content_input_path
-            )
+            collection_name = self._collection_name_generator(content)
             try:
                 vector_storage_instance = self._initialize_vector_storage(
                     collection_name
@@ -251,13 +245,11 @@ class AutoRetriever:
                 file_is_modified = False  # initialize with a default value
                 if (
                     vector_storage_instance.status().vector_count != 0
-                    and
+                    and os.path.exists(content)
                 ):
                     # Get original modified date from file
                     modified_date_from_file = (
-                        self._get_file_modified_date_from_file(
-                            content_input_path
-                        )
+                        self._get_file_modified_date_from_file(content)
                     )
                     # Get modified date from vector storage
                     modified_date_from_storage = (
@@ -280,18 +272,16 @@ class AutoRetriever:
                     # Process and store the content to the vector storage
                     vr = VectorRetriever(
                         storage=vector_storage_instance,
-                        similarity_threshold=similarity_threshold,
                         embedding_model=self.embedding_model,
                     )
-                    vr.process(content_input_path)
+                    vr.process(content)
                 else:
                     vr = VectorRetriever(
                         storage=vector_storage_instance,
-                        similarity_threshold=similarity_threshold,
                         embedding_model=self.embedding_model,
                     )
                 # Retrieve info by given query from the vector storage
-                retrieved_info = vr.query(query, top_k)
+                retrieved_info = vr.query(query, top_k, similarity_threshold)
                 all_retrieved_info.extend(retrieved_info)
             except Exception as e:
                 raise RuntimeError(
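`run_vector_retriever` now takes `contents`, which may be a single string or a list mixing file paths, URLs, and raw text (raw text falls back to its first ten characters for the collection name, per `_collection_name_generator`). A hedged sketch; the `vector_storage_local_path` constructor argument is an assumption, since the constructor is not part of this diff:

from camel.retrievers import AutoRetriever

retriever = AutoRetriever(vector_storage_local_path="local_data/")
results = retriever.run_vector_retriever(
    query="What is CAMEL?",
    contents=[
        "https://www.camel-ai.org/",           # remote URL
        "CAMEL is a multi-agent framework.",   # raw string content
    ],
    top_k=1,
    similarity_threshold=0.75,
)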
camel/retrievers/vector_retriever.py
CHANGED

@@ -11,7 +11,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
+import os
 from typing import Any, Dict, List, Optional
+from urllib.parse import urlparse
 
 from camel.embeddings import BaseEmbedding, OpenAIEmbedding
 from camel.loaders import UnstructuredIO
@@ -38,24 +40,18 @@ class VectorRetriever(BaseRetriever):
         embedding_model (BaseEmbedding): Embedding model used to generate
             vector embeddings.
         storage (BaseVectorStorage): Vector storage to query.
-        similarity_threshold (float, optional): The similarity threshold
-            for filtering results. Defaults to `DEFAULT_SIMILARITY_THRESHOLD`.
         unstructured_modules (UnstructuredIO): A module for parsing files and
             URLs and chunking content based on specified parameters.
     """
 
     def __init__(
         self,
-        similarity_threshold: float = DEFAULT_SIMILARITY_THRESHOLD,
         embedding_model: Optional[BaseEmbedding] = None,
         storage: Optional[BaseVectorStorage] = None,
     ) -> None:
         r"""Initializes the retriever class with an optional embedding model.
 
         Args:
-            similarity_threshold (float, optional): The similarity threshold
-                for filtering results. Defaults to
-                `DEFAULT_SIMILARITY_THRESHOLD`.
             embedding_model (Optional[BaseEmbedding]): The embedding model
                 instance. Defaults to `OpenAIEmbedding` if not provided.
             storage (BaseVectorStorage): Vector storage to query.
@@ -68,12 +64,11 @@ class VectorRetriever(BaseRetriever):
                 vector_dim=self.embedding_model.get_output_dim()
             )
         )
-        self.similarity_threshold = similarity_threshold
-        self.unstructured_modules: UnstructuredIO = UnstructuredIO()
+        self.uio: UnstructuredIO = UnstructuredIO()
 
     def process(
         self,
-        content_input_path: str,
+        content: str,
         chunk_type: str = "chunk_by_title",
         **kwargs: Any,
     ) -> None:
@@ -82,16 +77,19 @@ class VectorRetriever(BaseRetriever):
         vector storage.
 
         Args:
-
-                processed.
+            contents (str): Local file path, remote URL or string content.
             chunk_type (str): Type of chunking going to apply. Defaults to
                 "chunk_by_title".
             **kwargs (Any): Additional keyword arguments for content parsing.
         """
-        elements = self.unstructured_modules.parse_file_or_url(
-            content_input_path, **kwargs
-        )
-        chunks = self.unstructured_modules.chunk_elements(
+        # Check if the content is URL
+        parsed_url = urlparse(content)
+        is_url = all([parsed_url.scheme, parsed_url.netloc])
+        if is_url or os.path.exists(content):
+            elements = self.uio.parse_file_or_url(content, **kwargs)
+        else:
+            elements = [self.uio.create_element_from_text(text=content)]
+        chunks = self.uio.chunk_elements(
             chunk_type=chunk_type, elements=elements
         )
         # Iterate to process and store embeddings, set batch of 50
@@ -105,7 +103,7 @@ class VectorRetriever(BaseRetriever):
             # Prepare the payload for each vector record, includes the content
             # path, chunk metadata, and chunk text
             for vector, chunk in zip(batch_vectors, batch_chunks):
-                content_path_info = {"content path": content_input_path}
+                content_path_info = {"content path": content}
                 chunk_metadata = {"metadata": chunk.metadata.to_dict()}
                 chunk_text = {"text": str(chunk)}
                 combined_dict = {
@@ -124,12 +122,16 @@ class VectorRetriever(BaseRetriever):
         self,
         query: str,
         top_k: int = DEFAULT_TOP_K_RESULTS,
+        similarity_threshold: float = DEFAULT_SIMILARITY_THRESHOLD,
     ) -> List[Dict[str, Any]]:
         r"""Executes a query in vector storage and compiles the retrieved
         results into a dictionary.
 
         Args:
             query (str): Query string for information retriever.
+            similarity_threshold (float, optional): The similarity threshold
+                for filtering results. Defaults to
+                `DEFAULT_SIMILARITY_THRESHOLD`.
             top_k (int, optional): The number of top results to return during
                 retriever. Must be a positive integer. Defaults to 1.
@@ -161,7 +163,7 @@ class VectorRetriever(BaseRetriever):
         formatted_results = []
         for result in query_results:
             if (
-                result.similarity >= self.similarity_threshold
+                result.similarity >= similarity_threshold
                 and result.record.payload is not None
             ):
                 result_dict = {
@@ -182,7 +184,7 @@ class VectorRetriever(BaseRetriever):
                     'text': (
                         f"No suitable information retrieved "
                         f"from {content_path} with similarity_threshold"
-                        f" = {self.similarity_threshold}."
+                        f" = {similarity_threshold}."
                     )
                 }
             ]
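The net effect: `similarity_threshold` moves from `__init__` to `query()`, so one `VectorRetriever` instance can serve different thresholds per call, and `process()` now accepts raw strings as well as paths and URLs. A minimal sketch, assuming the default `OpenAIEmbedding` backend and auto-created storage from the constructor defaults:

from camel.retrievers import VectorRetriever

vr = VectorRetriever()  # no similarity_threshold argument anymore
vr.process(content="https://www.camel-ai.org/")  # URL, file path, or raw text
hits = vr.query(
    query="What is CAMEL?",
    top_k=3,
    similarity_threshold=0.8,  # now passed per query
)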
camel/storages/object_storages/__init__.py
ADDED

@@ -0,0 +1,22 @@
+# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
+from .amazon_s3 import AmazonS3Storage
+from .azure_blob import AzureBlobStorage
+from .google_cloud import GoogleCloudStorage
+
+__all__ = [
+    "AmazonS3Storage",
+    "AzureBlobStorage",
+    "GoogleCloudStorage",
+]
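The new `camel.storages.object_storages` package groups three cloud backends behind the shared base class in `base.py`. A hedged sketch of the exports; the constructor arguments (`bucket_name`, `container_name`) are assumptions, since the backend modules' bodies are not shown in this diff:

from camel.storages.object_storages import (
    AmazonS3Storage,
    AzureBlobStorage,
    GoogleCloudStorage,
)

# Hypothetical bucket/container names; credential handling lives in the
# backend modules (amazon_s3.py, azure_blob.py, google_cloud.py).
s3 = AmazonS3Storage(bucket_name="my-bucket")
blob = AzureBlobStorage(container_name="my-container")
gcs = GoogleCloudStorage(bucket_name="my-bucket")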