PyPI - langroid - Versions diffs - 0.1.85__py3-none-any.whl → 0.1.219__py3-none-any.whl - Mend

langroid 0.1.85__py3-none-any.whl → 0.1.219__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (107)

langroid/__init__.py +95 -0
langroid/agent/__init__.py +40 -0
langroid/agent/base.py +222 -91
langroid/agent/batch.py +264 -0
langroid/agent/callbacks/chainlit.py +608 -0
langroid/agent/chat_agent.py +247 -101
langroid/agent/chat_document.py +41 -4
langroid/agent/openai_assistant.py +842 -0
langroid/agent/special/__init__.py +50 -0
langroid/agent/special/doc_chat_agent.py +837 -141
langroid/agent/special/lance_doc_chat_agent.py +258 -0
langroid/agent/special/lance_rag/__init__.py +9 -0
langroid/agent/special/lance_rag/critic_agent.py +136 -0
langroid/agent/special/lance_rag/lance_rag_task.py +80 -0
langroid/agent/special/lance_rag/query_planner_agent.py +180 -0
langroid/agent/special/lance_tools.py +44 -0
langroid/agent/special/neo4j/__init__.py +0 -0
langroid/agent/special/neo4j/csv_kg_chat.py +174 -0
langroid/agent/special/neo4j/neo4j_chat_agent.py +370 -0
langroid/agent/special/neo4j/utils/__init__.py +0 -0
langroid/agent/special/neo4j/utils/system_message.py +46 -0
langroid/agent/special/relevance_extractor_agent.py +127 -0
langroid/agent/special/retriever_agent.py +32 -198
langroid/agent/special/sql/__init__.py +11 -0
langroid/agent/special/sql/sql_chat_agent.py +47 -23
langroid/agent/special/sql/utils/__init__.py +22 -0
langroid/agent/special/sql/utils/description_extractors.py +95 -46
langroid/agent/special/sql/utils/populate_metadata.py +28 -21
langroid/agent/special/table_chat_agent.py +43 -9
langroid/agent/task.py +475 -122
langroid/agent/tool_message.py +75 -13
langroid/agent/tools/__init__.py +13 -0
langroid/agent/tools/duckduckgo_search_tool.py +66 -0
langroid/agent/tools/google_search_tool.py +11 -0
langroid/agent/tools/metaphor_search_tool.py +67 -0
langroid/agent/tools/recipient_tool.py +16 -29
langroid/agent/tools/run_python_code.py +60 -0
langroid/agent/tools/sciphi_search_rag_tool.py +79 -0
langroid/agent/tools/segment_extract_tool.py +36 -0
langroid/cachedb/__init__.py +9 -0
langroid/cachedb/base.py +22 -2
langroid/cachedb/momento_cachedb.py +26 -2
langroid/cachedb/redis_cachedb.py +78 -11
langroid/embedding_models/__init__.py +34 -0
langroid/embedding_models/base.py +21 -2
langroid/embedding_models/models.py +120 -18
langroid/embedding_models/protoc/embeddings.proto +19 -0
langroid/embedding_models/protoc/embeddings_pb2.py +33 -0
langroid/embedding_models/protoc/embeddings_pb2.pyi +50 -0
langroid/embedding_models/protoc/embeddings_pb2_grpc.py +79 -0
langroid/embedding_models/remote_embeds.py +153 -0
langroid/language_models/__init__.py +45 -0
langroid/language_models/azure_openai.py +80 -27
langroid/language_models/base.py +117 -12
langroid/language_models/config.py +5 -0
langroid/language_models/openai_assistants.py +3 -0
langroid/language_models/openai_gpt.py +558 -174
langroid/language_models/prompt_formatter/__init__.py +15 -0
langroid/language_models/prompt_formatter/base.py +4 -6
langroid/language_models/prompt_formatter/hf_formatter.py +135 -0
langroid/language_models/utils.py +18 -21
langroid/mytypes.py +25 -8
langroid/parsing/__init__.py +46 -0
langroid/parsing/document_parser.py +260 -63
langroid/parsing/image_text.py +32 -0
langroid/parsing/parse_json.py +143 -0
langroid/parsing/parser.py +122 -59
langroid/parsing/repo_loader.py +114 -52
langroid/parsing/search.py +68 -63
langroid/parsing/spider.py +3 -2
langroid/parsing/table_loader.py +44 -0
langroid/parsing/url_loader.py +59 -11
langroid/parsing/urls.py +85 -37
langroid/parsing/utils.py +298 -4
langroid/parsing/web_search.py +73 -0
langroid/prompts/__init__.py +11 -0
langroid/prompts/chat-gpt4-system-prompt.md +68 -0
langroid/prompts/prompts_config.py +1 -1
langroid/utils/__init__.py +17 -0
langroid/utils/algorithms/__init__.py +3 -0
langroid/utils/algorithms/graph.py +103 -0
langroid/utils/configuration.py +36 -5
langroid/utils/constants.py +4 -0
langroid/utils/globals.py +2 -2
langroid/utils/logging.py +2 -5
langroid/utils/output/__init__.py +21 -0
langroid/utils/output/printing.py +47 -1
langroid/utils/output/status.py +33 -0
langroid/utils/pandas_utils.py +30 -0
langroid/utils/pydantic_utils.py +616 -2
langroid/utils/system.py +98 -0
langroid/vector_store/__init__.py +40 -0
langroid/vector_store/base.py +203 -6
langroid/vector_store/chromadb.py +59 -32
langroid/vector_store/lancedb.py +463 -0
langroid/vector_store/meilisearch.py +10 -7
langroid/vector_store/momento.py +262 -0
langroid/vector_store/qdrantdb.py +104 -22
{langroid-0.1.85.dist-info → langroid-0.1.219.dist-info}/METADATA +329 -149
langroid-0.1.219.dist-info/RECORD +127 -0
{langroid-0.1.85.dist-info → langroid-0.1.219.dist-info}/WHEEL +1 -1
langroid/agent/special/recipient_validator_agent.py +0 -157
langroid/parsing/json.py +0 -64
langroid/utils/web/selenium_login.py +0 -36
langroid-0.1.85.dist-info/RECORD +0 -94
/langroid/{scripts → agent/callbacks}/__init__.py +0 -0
{langroid-0.1.85.dist-info → langroid-0.1.219.dist-info}/LICENSE +0 -0

langroid/vector_store/momento.py ADDED Viewed

@@ -0,0 +1,262 @@
+"""
+Momento Vector Index.
+https://docs.momentohq.com/vector-index/develop/api-reference
+"""
+import logging
+import os
+from typing import List, Optional, Sequence, Tuple, no_type_check
+import momento.responses.vector_index as mvi_response
+from dotenv import load_dotenv
+from momento import (
+    # PreviewVectorIndexClientAsync,
+    CredentialProvider,
+    PreviewVectorIndexClient,
+    VectorIndexConfigurations,
+)
+from momento.requests.vector_index import (
+    ALL_METADATA,
+    Item,
+    SimilarityMetric,
+)
+from langroid.embedding_models.base import (
+    EmbeddingModel,
+    EmbeddingModelsConfig,
+)
+from langroid.embedding_models.models import OpenAIEmbeddingsConfig
+from langroid.mytypes import Document, EmbeddingFunction
+from langroid.utils.configuration import settings
+from langroid.utils.pydantic_utils import (
+    flatten_pydantic_instance,
+    nested_dict_from_flat,
+)
+from langroid.vector_store.base import VectorStore, VectorStoreConfig
+logger = logging.getLogger(__name__)
+class MomentoVIConfig(VectorStoreConfig):
+    cloud: bool = True
+    collection_name: str | None = "temp"
+    embedding: EmbeddingModelsConfig = OpenAIEmbeddingsConfig()
+    distance: SimilarityMetric = SimilarityMetric.COSINE_SIMILARITY
+class MomentoVI(VectorStore):
+    def __init__(self, config: MomentoVIConfig = MomentoVIConfig()):
+        super().__init__(config)
+        self.config: MomentoVIConfig = config
+        emb_model = EmbeddingModel.create(config.embedding)
+        self.embedding_fn: EmbeddingFunction = emb_model.embedding_fn()
+        self.embedding_dim = emb_model.embedding_dims
+        self.host = config.host
+        self.port = config.port
+        load_dotenv()
+        api_key = os.getenv("MOMENTO_API_KEY")
+        if config.cloud:
+            if api_key is None:
+                raise ValueError(
+                    """MOMENTO_API_KEY env variable must be set to
+                    MomentoVI hosted service. Please set this in your .env file.
+                    """
+                )
+            self.client = PreviewVectorIndexClient(
+                configuration=VectorIndexConfigurations.Default.latest(),
+                credential_provider=CredentialProvider.from_string(api_key),
+            )
+        else:
+            raise NotImplementedError("MomentoVI local not available yet")
+        # Note: Only create collection if a non-null collection name is provided.
+        # This is useful to delay creation of vecdb until we have a suitable
+        # collection name (e.g. we could get it from the url or folder path).
+        if config.collection_name is not None:
+            self.create_collection(
+                config.collection_name, replace=config.replace_collection
+            )
+    def clear_empty_collections(self) -> int:
+        logger.warning(
+            """
+            Momento VI does not yet have a way to easily get size of indices,
+            so clear_empty_collections is not deleting any indices.
+            """
+        )
+        return 0
+    def clear_all_collections(self, really: bool = False, prefix: str = "") -> int:
+        """Clear all collections with the given prefix."""
+        if not really:
+            logger.warning("Not deleting all collections, set really=True to confirm")
+            return 0
+        coll_names = self.list_collections(empty=False)
+        coll_names = [name for name in coll_names if name.startswith(prefix)]
+        if len(coll_names) == 0:
+            logger.warning(f"No collections found with prefix {prefix}")
+            return 0
+        for name in coll_names:
+            self.delete_collection(name)
+        logger.warning(
+            f"""
+            Deleted {len(coll_names)} indices from Momento VI
+            """
+        )
+        return len(coll_names)
+    def list_collections(self, empty: bool = False) -> List[str]:
+        """
+        Returns:
+            List of collection names that have at least one vector.
+        Args:
+            empty (bool, optional): Whether to include empty collections.
+        """
+        response = self.client.list_indexes()
+        if isinstance(response, mvi_response.ListIndexes.Success):
+            return [ind.name for ind in response.indexes]
+        elif isinstance(response, mvi_response.ListIndexes.Error):
+            raise ValueError(f"Error listing collections: {response.message}")
+        else:
+            raise ValueError(f"Unexpected response: {response}")
+    def create_collection(self, collection_name: str, replace: bool = False) -> None:
+        """
+        Create a collection with the given name, optionally replacing an existing
+            collection if `replace` is True.
+        Args:
+            collection_name (str): Name of the collection to create.
+            replace (bool): Whether to replace an existing collection
+                with the same name. Defaults to False.
+        """
+        self.config.collection_name = collection_name
+        response = self.client.create_index(
+            index_name=collection_name,
+            num_dimensions=self.embedding_dim,
+            similarity_metric=self.config.distance,
+        )
+        if isinstance(response, mvi_response.CreateIndex.Success):
+            logger.info(f"Created collection {collection_name}")
+        elif isinstance(response, mvi_response.CreateIndex.IndexAlreadyExists):
+            logger.warning(f"Collection {collection_name} already exists")
+        elif isinstance(response, mvi_response.CreateIndex.Error):
+            raise ValueError(
+                f"Error creating collection {collection_name}: {response.message}"
+            )
+        if settings.debug:
+            level = logger.getEffectiveLevel()
+            logger.setLevel(logging.INFO)
+            logger.info(f"Collection {collection_name} created")
+            logger.setLevel(level)
+    def add_documents(self, documents: Sequence[Document]) -> None:
+        super().maybe_add_ids(documents)
+        if len(documents) == 0:
+            return
+        embedding_vecs = self.embedding_fn([doc.content for doc in documents])
+        if self.config.collection_name is None:
+            raise ValueError("No collection name set, cannot ingest docs")
+        self.create_collection(self.config.collection_name, replace=True)
+        items = [
+            Item(
+                id=str(d.id()),
+                vector=embedding_vecs[i],
+                metadata=flatten_pydantic_instance(d, force_str=True),
+                # force all values to str since Momento requires it
+            )
+            for i, d in enumerate(documents)
+        ]
+        # don't insert all at once, batch in chunks of b,
+        # else we get an API error
+        b = self.config.batch_size
+        for i in range(0, len(documents), b):
+            response = self.client.upsert_item_batch(
+                index_name=self.config.collection_name,
+                items=items[i : i + b],
+            )
+            if isinstance(response, mvi_response.UpsertItemBatch.Success):
+                continue
+            elif isinstance(response, mvi_response.UpsertItemBatch.Error):
+                raise ValueError(f"Error adding documents: {response.message}")
+            else:
+                raise ValueError(f"Unexpected response: {response}")
+    def delete_collection(self, collection_name: str) -> None:
+        delete_response = self.client.delete_index(collection_name)
+        if isinstance(delete_response, mvi_response.DeleteIndex.Success):
+            logger.warning(f"Deleted index {collection_name}")
+        elif isinstance(delete_response, mvi_response.DeleteIndex.Error):
+            logger.error(
+                f"Error while deleting index {collection_name}: "
+                f" {delete_response.message}"
+            )
+    def _to_int_or_uuid(self, id: str) -> int | str:
+        try:
+            return int(id)
+        except ValueError:
+            return id
+    def get_all_documents(self, where: str = "") -> List[Document]:
+        raise NotImplementedError(
+            """
+            MomentoVI does not support get_all_documents().
+            Please use a different vector database, e.g. qdrant or chromadb.
+            """
+        )
+    def get_documents_by_ids(self, ids: List[str]) -> List[Document]:
+        raise NotImplementedError(
+            """
+            MomentoVI does not support get_documents_by_ids.
+            Please use a different vector database, e.g. qdrant or chromadb.
+            """
+        )
+    @no_type_check
+    def similar_texts_with_scores(
+        self,
+        text: str,
+        k: int = 1,
+        where: Optional[str] = None,
+        neighbors: int = 0,  # ignored
+    ) -> List[Tuple[Document, float]]:
+        if self.config.collection_name is None:
+            raise ValueError("No collection name set, cannot search")
+        embedding = self.embedding_fn([text])[0]
+        response = self.client.search(
+            index_name=self.config.collection_name,
+            query_vector=embedding,
+            top_k=k,
+            metadata_fields=ALL_METADATA,
+        )
+        if isinstance(response, mvi_response.Search.Error):
+            logger.warning(
+                f"Error while searching on index {self.config.collection_name}:"
+                f" {response.message}"
+            )
+            return []
+        elif not isinstance(response, mvi_response.Search.Success):
+            logger.warning(f"Unexpected response: {response}")
+            return []
+        scores = [match.metadata["distance"] for match in response.hits]
+        docs = [
+            Document.parse_obj(nested_dict_from_flat(match.metadata))
+            for match in response.hits
+            if match is not None
+        ]
+        if len(docs) == 0:
+            logger.warning(f"No matches found for {text}")
+            return []
+        if settings.debug:
+            logger.info(f"Found {len(docs)} matches, max score: {max(scores)}")
+        doc_score_pairs = list(zip(docs, scores))
+        self.show_if_debug(doc_score_pairs)
+        return doc_score_pairs

langroid/vector_store/qdrantdb.py CHANGED Viewed

@@ -1,8 +1,10 @@
+import hashlib
+import json
 import logging
 import os
-from typing import List, Optional, Sequence, Tuple
+import uuid
+from typing import List, Optional, Sequence, Tuple, TypeVar
-from chromadb.api.types import EmbeddingFunction
 from dotenv import load_dotenv
 from qdrant_client import QdrantClient
 from qdrant_client.conversions.common_types import ScoredPoint
@@ -20,23 +22,50 @@ from langroid.embedding_models.base import (
     EmbeddingModelsConfig,
 )
 from langroid.embedding_models.models import OpenAIEmbeddingsConfig
-from langroid.mytypes import Document
+from langroid.mytypes import Document, EmbeddingFunction
 from langroid.utils.configuration import settings
 from langroid.vector_store.base import VectorStore, VectorStoreConfig
 logger = logging.getLogger(__name__)
+T = TypeVar("T")
+def from_optional(x: Optional[T], default: T) -> T:
+    if x is None:
+        return default
+    return x
+def is_valid_uuid(uuid_to_test: str) -> bool:
+    """
+    Check if a given string is a valid UUID.
+    """
+    try:
+        uuid_obj = uuid.UUID(uuid_to_test)
+        return str(uuid_obj) == uuid_to_test
+    except Exception:
+        pass
+    # Check for valid unsigned 64-bit integer
+    try:
+        int_value = int(uuid_to_test)
+        return 0 <= int_value <= 18446744073709551615
+    except ValueError:
+        return False
 class QdrantDBConfig(VectorStoreConfig):
     cloud: bool = True
-    collection_name: str | None = None
+    collection_name: str | None = "temp"
     storage_path: str = ".qdrant/data"
     embedding: EmbeddingModelsConfig = OpenAIEmbeddingsConfig()
     distance: str = Distance.COSINE
 class QdrantDB(VectorStore):
-    def __init__(self, config: QdrantDBConfig):
+    def __init__(self, config: QdrantDBConfig = QdrantDBConfig()):
         super().__init__(config)
         self.config = config
         emb_model = EmbeddingModel.create(config.embedding)
@@ -113,8 +142,10 @@ class QdrantDB(VectorStore):
         n_non_empty_deletes = 0
         for name in coll_names:
             info = self.client.get_collection(collection_name=name)
-            n_empty_deletes += info.points_count == 0
-            n_non_empty_deletes += info.points_count > 0
+            points_count = from_optional(info.points_count, 0)
+            n_empty_deletes += points_count == 0
+            n_non_empty_deletes += points_count > 0
             self.client.delete_collection(collection_name=name)
         logger.warning(
             f"""
@@ -135,11 +166,21 @@ class QdrantDB(VectorStore):
         colls = list(self.client.get_collections())[0][1]
         if empty:
             return [coll.name for coll in colls]
-        counts = [
-            self.client.get_collection(collection_name=coll.name).points_count
-            for coll in colls
-        ]
-        return [coll.name for coll, count in zip(colls, counts) if count > 0]
+        counts = []
+        for coll in colls:
+            try:
+                counts.append(
+                    from_optional(
+                        self.client.get_collection(
+                            collection_name=coll.name
+                        ).points_count,
+                        0,
+                    )
+                )
+            except Exception:
+                logger.warning(f"Error getting collection {coll.name}")
+                counts.append(0)
+        return [coll.name for coll, count in zip(colls, counts) if (count or 0) > 0]
     def create_collection(self, collection_name: str, replace: bool = False) -> None:
         """
@@ -154,7 +195,10 @@ class QdrantDB(VectorStore):
         collections = self.list_collections()
         if collection_name in collections:
             coll = self.client.get_collection(collection_name=collection_name)
-            if coll.status == CollectionStatus.GREEN and coll.points_count > 0:
+            if (
+                coll.status == CollectionStatus.GREEN
+                and from_optional(coll.points_count, 0) > 0
+            ):
                 logger.warning(f"Non-empty Collection {collection_name} already exists")
                 if not replace:
                     logger.warning("Not replacing collection")
@@ -178,9 +222,15 @@ class QdrantDB(VectorStore):
             logger.setLevel(level)
     def add_documents(self, documents: Sequence[Document]) -> None:
+        # Add id to metadata if not already present
+        super().maybe_add_ids(documents)
+        # Fix the ids due to qdrant finickiness
+        for doc in documents:
+            doc.metadata.id = str(self._to_int_or_uuid(doc.metadata.id))
         colls = self.list_collections(empty=True)
         if len(documents) == 0:
             return
+        document_dicts = [doc.dict() for doc in documents]
         embedding_vecs = self.embedding_fn([doc.content for doc in documents])
         if self.config.collection_name is None:
             raise ValueError("No collection name set, cannot ingest docs")
@@ -196,7 +246,7 @@ class QdrantDB(VectorStore):
                 points=Batch(
                     ids=ids[i : i + b],
                     vectors=embedding_vecs[i : i + b],
-                    payloads=documents[i : i + b],
+                    payloads=document_dicts[i : i + b],
                 ),
             )
@@ -205,19 +255,42 @@ class QdrantDB(VectorStore):
     def _to_int_or_uuid(self, id: str) -> int | str:
         try:
-            return int(id)
+            int_val = int(id)
+            if is_valid_uuid(id):
+                return int_val
         except ValueError:
+            pass
+        # If doc_id is already a valid UUID, return it as is
+        if isinstance(id, str) and is_valid_uuid(id):
             return id
-    def get_all_documents(self) -> List[Document]:
+        # Otherwise, generate a UUID from the doc_id
+        # Convert doc_id to string if it's not already
+        id_str = str(id)
+        # Hash the document ID using SHA-1
+        hash_object = hashlib.sha1(id_str.encode())
+        hash_digest = hash_object.hexdigest()
+        # Truncate or manipulate the hash to fit into a UUID (128 bits)
+        uuid_str = hash_digest[:32]
+        # Format this string into a UUID format
+        formatted_uuid = uuid.UUID(uuid_str)
+        return str(formatted_uuid)
+    def get_all_documents(self, where: str = "") -> List[Document]:
         if self.config.collection_name is None:
             raise ValueError("No collection name set, cannot retrieve docs")
         docs = []
         offset = 0
+        filter = Filter() if where == "" else Filter.parse_obj(json.loads(where))
         while True:
             results, next_page_offset = self.client.scroll(
                 collection_name=self.config.collection_name,
-                scroll_filter=None,
+                scroll_filter=filter,
                 offset=offset,
                 limit=10_000,  # try getting all at once, if not we keep paging
                 with_payload=True,
@@ -239,7 +312,11 @@ class QdrantDB(VectorStore):
             with_vectors=False,
             with_payload=True,
         )
-        docs = [Document(**record.payload) for record in records]  # type: ignore
+        # Note the records may NOT be in the order of the ids,
+        # so we re-order them here.
+        id2payload = {record.id: record.payload for record in records}
+        ordered_payloads = [id2payload[id] for id in _ids]
+        docs = [Document(**payload) for payload in ordered_payloads]  # type: ignore
         return docs
     def similar_texts_with_scores(
@@ -247,10 +324,14 @@ class QdrantDB(VectorStore):
         text: str,
         k: int = 1,
         where: Optional[str] = None,
+        neighbors: int = 0,
     ) -> List[Tuple[Document, float]]:
         embedding = self.embedding_fn([text])[0]
         # TODO filter may not work yet
-        filter = Filter() if where is None else Filter.from_json(where)  # type: ignore
+        if where is None or where == "":
+            filter = Filter()
+        else:
+            filter = Filter.parse_obj(json.loads(where))
         if self.config.collection_name is None:
             raise ValueError("No collection name set, cannot search")
         search_result: List[ScoredPoint] = self.client.search(
@@ -263,7 +344,7 @@ class QdrantDB(VectorStore):
                 exact=False,  # use Apx NN, not exact NN
             ),
         )
-        scores = [match.score for match in search_result]
+        scores = [match.score for match in search_result if match is not None]
         docs = [
             Document(**(match.payload))  # type: ignore
             for match in search_result
@@ -272,8 +353,9 @@ class QdrantDB(VectorStore):
         if len(docs) == 0:
             logger.warning(f"No matches found for {text}")
             return []
-        if settings.debug:
-            logger.info(f"Found {len(docs)} matches, max score: {max(scores)}")
         doc_score_pairs = list(zip(docs, scores))
+        max_score = max(ds[1] for ds in doc_score_pairs)
+        if settings.debug:
+            logger.info(f"Found {len(doc_score_pairs)} matches, max score: {max_score}")
         self.show_if_debug(doc_score_pairs)
         return doc_score_pairs

langroid 0.1.85__py3-none-any.whl → 0.1.219__py3-none-any.whl

langroid 0.1.85__py3-none-any.whl → 0.1.219__py3-none-any.whl