letta-nightly 0.4.1.dev20241014104152__py3-none-any.whl → 0.5.0.dev20241015014828__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of letta-nightly might be problematic.

Files changed (32)
  1. letta/__init__.py +2 -2
  2. letta/agent_store/db.py +18 -7
  3. letta/agent_store/lancedb.py +2 -2
  4. letta/agent_store/milvus.py +1 -1
  5. letta/agent_store/qdrant.py +1 -1
  6. letta/agent_store/storage.py +12 -10
  7. letta/cli/cli_load.py +1 -1
  8. letta/client/client.py +51 -0
  9. letta/data_sources/connectors.py +124 -124
  10. letta/data_sources/connectors_helper.py +97 -0
  11. letta/llm_api/mistral.py +47 -0
  12. letta/metadata.py +58 -0
  13. letta/providers.py +44 -0
  14. letta/schemas/file.py +31 -0
  15. letta/schemas/job.py +1 -1
  16. letta/schemas/letta_request.py +3 -3
  17. letta/schemas/llm_config.py +1 -0
  18. letta/schemas/message.py +6 -2
  19. letta/schemas/passage.py +3 -3
  20. letta/schemas/source.py +2 -2
  21. letta/server/rest_api/routers/v1/agents.py +10 -16
  22. letta/server/rest_api/routers/v1/jobs.py +17 -1
  23. letta/server/rest_api/routers/v1/sources.py +7 -9
  24. letta/server/server.py +86 -13
  25. letta/server/static_files/assets/{index-9a9c449b.js → index-dc228d4a.js} +4 -4
  26. letta/server/static_files/index.html +1 -1
  27. {letta_nightly-0.4.1.dev20241014104152.dist-info → letta_nightly-0.5.0.dev20241015014828.dist-info}/METADATA +1 -1
  28. {letta_nightly-0.4.1.dev20241014104152.dist-info → letta_nightly-0.5.0.dev20241015014828.dist-info}/RECORD +31 -29
  29. letta/schemas/document.py +0 -21
  30. {letta_nightly-0.4.1.dev20241014104152.dist-info → letta_nightly-0.5.0.dev20241015014828.dist-info}/LICENSE +0 -0
  31. {letta_nightly-0.4.1.dev20241014104152.dist-info → letta_nightly-0.5.0.dev20241015014828.dist-info}/WHEEL +0 -0
  32. {letta_nightly-0.4.1.dev20241014104152.dist-info → letta_nightly-0.5.0.dev20241015014828.dist-info}/entry_points.txt +0 -0
letta/__init__.py CHANGED
@@ -1,4 +1,4 @@
- __version__ = "0.4.1"
+ __version__ = "0.5.0"
 
  # import clients
  from letta.client.admin import Admin
@@ -7,9 +7,9 @@ from letta.client.client import LocalClient, RESTClient, create_client
  # imports for easier access
  from letta.schemas.agent import AgentState
  from letta.schemas.block import Block
- from letta.schemas.document import Document
  from letta.schemas.embedding_config import EmbeddingConfig
  from letta.schemas.enums import JobStatus
+ from letta.schemas.file import FileMetadata
  from letta.schemas.job import Job
  from letta.schemas.letta_message import LettaMessage
  from letta.schemas.llm_config import LLMConfig
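
With this rename, the schema export at the package root changes as well. A minimal before/after import sketch (assuming the 0.5.0 wheel is installed):

# 0.4.1 exposed the removed schema:  from letta import Document
# 0.5.0 exposes the replacement schema instead:
from letta import FileMetadata
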
letta/agent_store/db.py CHANGED
@@ -28,7 +28,7 @@ from letta.agent_store.storage import StorageConnector, TableType
  from letta.base import Base
  from letta.config import LettaConfig
  from letta.constants import MAX_EMBEDDING_DIM
- from letta.metadata import EmbeddingConfigColumn, ToolCallColumn
+ from letta.metadata import EmbeddingConfigColumn, FileMetadataModel, ToolCallColumn
 
  # from letta.schemas.message import Message, Passage, Record, RecordType, ToolCall
  from letta.schemas.message import Message
@@ -141,7 +141,7 @@ class PassageModel(Base):
  id = Column(String, primary_key=True)
  user_id = Column(String, nullable=False)
  text = Column(String)
- doc_id = Column(String)
+ file_id = Column(String)
  agent_id = Column(String)
  source_id = Column(String)
 
@@ -160,7 +160,7 @@ class PassageModel(Base):
  # Add a datetime column, with default value as the current time
  created_at = Column(DateTime(timezone=True))
 
- Index("passage_idx_user", user_id, agent_id, doc_id),
+ Index("passage_idx_user", user_id, agent_id, file_id),
 
  def __repr__(self):
  return f"<Passage(passage_id='{self.id}', text='{self.text}', embedding='{self.embedding})>"
@@ -170,7 +170,7 @@ class PassageModel(Base):
  text=self.text,
  embedding=self.embedding,
  embedding_config=self.embedding_config,
- doc_id=self.doc_id,
+ file_id=self.file_id,
  user_id=self.user_id,
  id=self.id,
  source_id=self.source_id,
@@ -365,12 +365,17 @@ class PostgresStorageConnector(SQLStorageConnector):
  self.uri = self.config.archival_storage_uri
  self.db_model = PassageModel
  if self.config.archival_storage_uri is None:
- raise ValueError(f"Must specifiy archival_storage_uri in config {self.config.config_path}")
+ raise ValueError(f"Must specify archival_storage_uri in config {self.config.config_path}")
  elif table_type == TableType.RECALL_MEMORY:
  self.uri = self.config.recall_storage_uri
  self.db_model = MessageModel
  if self.config.recall_storage_uri is None:
- raise ValueError(f"Must specifiy recall_storage_uri in config {self.config.config_path}")
+ raise ValueError(f"Must specify recall_storage_uri in config {self.config.config_path}")
+ elif table_type == TableType.FILES:
+ self.uri = self.config.metadata_storage_uri
+ self.db_model = FileMetadataModel
+ if self.config.metadata_storage_uri is None:
+ raise ValueError(f"Must specify metadata_storage_uri in config {self.config.config_path}")
  else:
  raise ValueError(f"Table type {table_type} not implemented")
 
@@ -487,8 +492,14 @@ class SQLLiteStorageConnector(SQLStorageConnector):
  # TODO: eventually implement URI option
  self.path = self.config.recall_storage_path
  if self.path is None:
- raise ValueError(f"Must specifiy recall_storage_path in config {self.config.recall_storage_path}")
+ raise ValueError(f"Must specify recall_storage_path in config.")
  self.db_model = MessageModel
+ elif table_type == TableType.FILES:
+ self.path = self.config.metadata_storage_path
+ if self.path is None:
+ raise ValueError(f"Must specify metadata_storage_path in config.")
+ self.db_model = FileMetadataModel
+
  else:
  raise ValueError(f"Table type {table_type} not implemented")
 
letta/agent_store/lancedb.py CHANGED
@@ -24,7 +24,7 @@ def get_db_model(table_name: str, table_type: TableType):
  id: uuid.UUID
  user_id: str
  text: str
- doc_id: str
+ file_id: str
  agent_id: str
  data_source: str
  embedding: Vector(config.default_embedding_config.embedding_dim)
@@ -37,7 +37,7 @@ def get_db_model(table_name: str, table_type: TableType):
  return Passage(
  text=self.text,
  embedding=self.embedding,
- doc_id=self.doc_id,
+ file_id=self.file_id,
  user_id=self.user_id,
  id=self.id,
  data_source=self.data_source,
letta/agent_store/milvus.py CHANGED
@@ -26,7 +26,7 @@ class MilvusStorageConnector(StorageConnector):
  raise ValueError("Please set `archival_storage_uri` in the config file when using Milvus.")
 
  # need to be converted to strings
- self.uuid_fields = ["id", "user_id", "agent_id", "source_id", "doc_id"]
+ self.uuid_fields = ["id", "user_id", "agent_id", "source_id", "file_id"]
 
  def _create_collection(self):
  schema = MilvusClient.create_schema(
letta/agent_store/qdrant.py CHANGED
@@ -38,7 +38,7 @@ class QdrantStorageConnector(StorageConnector):
  distance=models.Distance.COSINE,
  ),
  )
- self.uuid_fields = ["id", "user_id", "agent_id", "source_id", "doc_id"]
+ self.uuid_fields = ["id", "user_id", "agent_id", "source_id", "file_id"]
 
  def get_all_paginated(self, filters: Optional[Dict] = {}, page_size: int = 10) -> Iterator[List[RecordType]]:
  from qdrant_client import grpc
letta/agent_store/storage.py CHANGED
@@ -10,7 +10,7 @@ from typing import Dict, List, Optional, Tuple, Type, Union
  from pydantic import BaseModel
 
  from letta.config import LettaConfig
- from letta.schemas.document import Document
+ from letta.schemas.file import FileMetadata
  from letta.schemas.message import Message
  from letta.schemas.passage import Passage
  from letta.utils import printd
@@ -22,7 +22,7 @@ class TableType:
  ARCHIVAL_MEMORY = "archival_memory" # recall memory table: letta_agent_{agent_id}
  RECALL_MEMORY = "recall_memory" # archival memory table: letta_agent_recall_{agent_id}
  PASSAGES = "passages" # TODO
- DOCUMENTS = "documents" # TODO
+ FILES = "files"
 
 
  # table names used by Letta
@@ -33,17 +33,17 @@ ARCHIVAL_TABLE_NAME = "letta_archival_memory_agent" # agent memory
 
  # external data source tables
  PASSAGE_TABLE_NAME = "letta_passages" # chunked/embedded passages (from source)
- DOCUMENT_TABLE_NAME = "letta_documents" # original documents (from source)
+ FILE_TABLE_NAME = "letta_files" # original files (from source)
 
 
  class StorageConnector:
- """Defines a DB connection that is user-specific to access data: Documents, Passages, Archival/Recall Memory"""
+ """Defines a DB connection that is user-specific to access data: files, Passages, Archival/Recall Memory"""
 
  type: Type[BaseModel]
 
  def __init__(
  self,
- table_type: Union[TableType.ARCHIVAL_MEMORY, TableType.RECALL_MEMORY, TableType.PASSAGES, TableType.DOCUMENTS],
+ table_type: Union[TableType.ARCHIVAL_MEMORY, TableType.RECALL_MEMORY, TableType.PASSAGES, TableType.FILES],
  config: LettaConfig,
  user_id,
  agent_id=None,
@@ -59,9 +59,9 @@ class StorageConnector:
  elif table_type == TableType.RECALL_MEMORY:
  self.type = Message
  self.table_name = RECALL_TABLE_NAME
- elif table_type == TableType.DOCUMENTS:
- self.type = Document
- self.table_name == DOCUMENT_TABLE_NAME
+ elif table_type == TableType.FILES:
+ self.type = FileMetadata
+ self.table_name = FILE_TABLE_NAME
  elif table_type == TableType.PASSAGES:
  self.type = Passage
  self.table_name = PASSAGE_TABLE_NAME
@@ -74,7 +74,7 @@ class StorageConnector:
  # agent-specific table
  assert agent_id is not None, "Agent ID must be provided for agent-specific tables"
  self.filters = {"user_id": self.user_id, "agent_id": self.agent_id}
- elif self.table_type == TableType.PASSAGES or self.table_type == TableType.DOCUMENTS:
+ elif self.table_type == TableType.PASSAGES or self.table_type == TableType.FILES:
  # setup base filters for user-specific tables
  assert agent_id is None, "Agent ID must not be provided for user-specific tables"
  self.filters = {"user_id": self.user_id}
@@ -83,7 +83,7 @@ class StorageConnector:
 
  @staticmethod
  def get_storage_connector(
- table_type: Union[TableType.ARCHIVAL_MEMORY, TableType.RECALL_MEMORY, TableType.PASSAGES, TableType.DOCUMENTS],
+ table_type: Union[TableType.ARCHIVAL_MEMORY, TableType.RECALL_MEMORY, TableType.PASSAGES, TableType.FILES],
  config: LettaConfig,
  user_id,
  agent_id=None,
@@ -92,6 +92,8 @@ class StorageConnector:
  storage_type = config.archival_storage_type
  elif table_type == TableType.RECALL_MEMORY:
  storage_type = config.recall_storage_type
+ elif table_type == TableType.FILES:
+ storage_type = config.metadata_storage_type
  else:
  raise ValueError(f"Table type {table_type} not implemented")
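
The net effect of the storage changes: FILES is a user-scoped table type whose backend is resolved from the metadata storage settings rather than the archival/recall ones. A minimal sketch of obtaining a connector for it (LettaConfig.load() and the user id value are illustrative assumptions, not shown in this diff):

from letta.agent_store.storage import StorageConnector, TableType
from letta.config import LettaConfig

config = LettaConfig.load()  # assumption: the usual on-disk Letta config loader
# TableType.FILES routes through config.metadata_storage_type and is user-specific (no agent_id)
file_store = StorageConnector.get_storage_connector(TableType.FILES, config, user_id="user-00000000")
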
letta/cli/cli_load.py CHANGED
@@ -106,7 +106,7 @@ def load_vector_database(
  # document_store=None,
  # passage_store=passage_storage,
  # )
- # print(f"Loaded {num_passages} passages and {num_documents} documents from {name}")
+ # print(f"Loaded {num_passages} passages and {num_documents} files from {name}")
  # except Exception as e:
  # typer.secho(f"Failed to load data from provided information.\n{e}", fg=typer.colors.RED)
  # ms.delete_source(source_id=source.id)
letta/client/client.py CHANGED
@@ -25,6 +25,7 @@ from letta.schemas.embedding_config import EmbeddingConfig
 
  # new schemas
  from letta.schemas.enums import JobStatus, MessageRole
+ from letta.schemas.file import FileMetadata
  from letta.schemas.job import Job
  from letta.schemas.letta_request import LettaRequest
  from letta.schemas.letta_response import LettaResponse, LettaStreamingResponse
@@ -232,6 +233,9 @@ class AbstractClient(object):
  def list_attached_sources(self, agent_id: str) -> List[Source]:
  raise NotImplementedError
 
+ def list_files_from_source(self, source_id: str, limit: int = 1000, cursor: Optional[str] = None) -> List[FileMetadata]:
+ raise NotImplementedError
+
  def update_source(self, source_id: str, name: Optional[str] = None) -> Source:
  raise NotImplementedError
 
@@ -1016,6 +1020,12 @@ class RESTClient(AbstractClient):
  raise ValueError(f"Failed to get job: {response.text}")
  return Job(**response.json())
 
+ def delete_job(self, job_id: str) -> Job:
+ response = requests.delete(f"{self.base_url}/{self.api_prefix}/jobs/{job_id}", headers=self.headers)
+ if response.status_code != 200:
+ raise ValueError(f"Failed to delete job: {response.text}")
+ return Job(**response.json())
+
  def list_jobs(self):
  response = requests.get(f"{self.base_url}/{self.api_prefix}/jobs", headers=self.headers)
  return [Job(**job) for job in response.json()]
@@ -1088,6 +1098,30 @@ class RESTClient(AbstractClient):
  raise ValueError(f"Failed to list attached sources: {response.text}")
  return [Source(**source) for source in response.json()]
 
+ def list_files_from_source(self, source_id: str, limit: int = 1000, cursor: Optional[str] = None) -> List[FileMetadata]:
+ """
+ List files from source with pagination support.
+
+ Args:
+ source_id (str): ID of the source
+ limit (int): Number of files to return
+ cursor (Optional[str]): Pagination cursor for fetching the next page
+
+ Returns:
+ List[FileMetadata]: List of files
+ """
+ # Prepare query parameters for pagination
+ params = {"limit": limit, "cursor": cursor}
+
+ # Make the request to the FastAPI endpoint
+ response = requests.get(f"{self.base_url}/{self.api_prefix}/sources/{source_id}/files", headers=self.headers, params=params)
+
+ if response.status_code != 200:
+ raise ValueError(f"Failed to list files with source id {source_id}: [{response.status_code}] {response.text}")
+
+ # Parse the JSON response
+ return [FileMetadata(**metadata) for metadata in response.json()]
+
  def update_source(self, source_id: str, name: Optional[str] = None) -> Source:
  """
  Update a source
@@ -2162,6 +2196,9 @@ class LocalClient(AbstractClient):
  def get_job(self, job_id: str):
  return self.server.get_job(job_id=job_id)
 
+ def delete_job(self, job_id: str):
+ return self.server.delete_job(job_id)
+
  def list_jobs(self):
  return self.server.list_jobs(user_id=self.user_id)
 
@@ -2261,6 +2298,20 @@ class LocalClient(AbstractClient):
  """
  return self.server.list_attached_sources(agent_id=agent_id)
 
+ def list_files_from_source(self, source_id: str, limit: int = 1000, cursor: Optional[str] = None) -> List[FileMetadata]:
+ """
+ List files from source.
+
+ Args:
+ source_id (str): ID of the source
+ limit (int): The # of items to return
+ cursor (str): The cursor for fetching the next page
+
+ Returns:
+ files (List[FileMetadata]): List of files
+ """
+ return self.server.list_files_from_source(source_id=source_id, limit=limit, cursor=cursor)
+
  def update_source(self, source_id: str, name: Optional[str] = None) -> Source:
  """
  Update a source
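
Together, the client changes add two calls to both RESTClient and LocalClient: delete_job and a paginated list_files_from_source. A rough usage sketch (the source and job ids are placeholders, and create_client() defaulting to a LocalClient is an assumption about the existing factory, not part of this diff):

from letta import create_client

client = create_client()  # assumption: no-arg call returns a LocalClient; pass base_url=... for the REST client

# page through files registered under an existing source
files = client.list_files_from_source(source_id="source-placeholder", limit=100)
for f in files:
    print(f.file_name, f.file_type, f.file_size)

# clean up a finished load job
client.delete_job("job-placeholder")
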
letta/data_sources/connectors.py CHANGED
@@ -1,11 +1,15 @@
- from typing import Dict, Iterator, List, Optional, Tuple
+ from typing import Dict, Iterator, List, Tuple
 
  import typer
- from llama_index.core import Document as LlamaIndexDocument
 
  from letta.agent_store.storage import StorageConnector
+ from letta.data_sources.connectors_helper import (
+ assert_all_files_exist_locally,
+ extract_metadata_from_files,
+ get_filenames_in_dir,
+ )
  from letta.embeddings import embedding_model
- from letta.schemas.document import Document
+ from letta.schemas.file import FileMetadata
  from letta.schemas.passage import Passage
  from letta.schemas.source import Source
  from letta.utils import create_uuid_from_string
@@ -13,23 +17,23 @@ from letta.utils import create_uuid_from_string
 
  class DataConnector:
  """
- Base class for data connectors that can be extended to generate documents and passages from a custom data source.
+ Base class for data connectors that can be extended to generate files and passages from a custom data source.
  """
 
- def generate_documents(self) -> Iterator[Tuple[str, Dict]]: # -> Iterator[Document]:
+ def find_files(self, source: Source) -> Iterator[FileMetadata]:
  """
- Generate document text and metadata from a data source.
+ Generate file metadata from a data source.
 
  Returns:
- documents (Iterator[Tuple[str, Dict]]): Generate a tuple of string text and metadata dictionary for each document.
+ files (Iterator[FileMetadata]): Generate file metadata for each file found.
  """
 
- def generate_passages(self, documents: List[Document], chunk_size: int = 1024) -> Iterator[Tuple[str, Dict]]: # -> Iterator[Passage]:
+ def generate_passages(self, file: FileMetadata, chunk_size: int = 1024) -> Iterator[Tuple[str, Dict]]: # -> Iterator[Passage]:
  """
- Generate passage text and metadata from a list of documents.
+ Generate passage text and metadata from a list of files.
 
  Args:
- documents (List[Document]): List of documents to generate passages from.
+ file (FileMetadata): The document to generate passages from.
  chunk_size (int, optional): Chunk size for splitting passages. Defaults to 1024.
 
  Returns:
@@ -41,33 +45,25 @@ def load_data(
  connector: DataConnector,
  source: Source,
  passage_store: StorageConnector,
- document_store: Optional[StorageConnector] = None,
+ file_metadata_store: StorageConnector,
  ):
- """Load data from a connector (generates documents and passages) into a specified source_id, associatedw with a user_id."""
+ """Load data from a connector (generates file and passages) into a specified source_id, associatedw with a user_id."""
  embedding_config = source.embedding_config
 
  # embedding model
  embed_model = embedding_model(embedding_config)
 
- # insert passages/documents
+ # insert passages/file
  passages = []
  embedding_to_document_name = {}
  passage_count = 0
- document_count = 0
- for document_text, document_metadata in connector.generate_documents():
- # insert document into storage
- document = Document(
- text=document_text,
- metadata_=document_metadata,
- source_id=source.id,
- user_id=source.user_id,
- )
- document_count += 1
- if document_store:
- document_store.insert(document)
+ file_count = 0
+ for file_metadata in connector.find_files(source):
+ file_count += 1
+ file_metadata_store.insert(file_metadata)
 
  # generate passages
- for passage_text, passage_metadata in connector.generate_passages([document], chunk_size=embedding_config.embedding_chunk_size):
+ for passage_text, passage_metadata in connector.generate_passages(file_metadata, chunk_size=embedding_config.embedding_chunk_size):
  # for some reason, llama index parsers sometimes return empty strings
  if len(passage_text) == 0:
  typer.secho(
@@ -89,7 +85,7 @@ def load_data(
  passage = Passage(
  id=create_uuid_from_string(f"{str(source.id)}_{passage_text}"),
  text=passage_text,
- doc_id=document.id,
+ file_id=file_metadata.id,
  source_id=source.id,
  metadata_=passage_metadata,
  user_id=source.user_id,
@@ -98,16 +94,16 @@ def load_data(
  )
 
  hashable_embedding = tuple(passage.embedding)
- document_name = document.metadata_.get("file_path", document.id)
+ file_name = file_metadata.file_name
  if hashable_embedding in embedding_to_document_name:
  typer.secho(
- f"Warning: Duplicate embedding found for passage in {document_name} (already exists in {embedding_to_document_name[hashable_embedding]}), skipping insert into VectorDB.",
+ f"Warning: Duplicate embedding found for passage in {file_name} (already exists in {embedding_to_document_name[hashable_embedding]}), skipping insert into VectorDB.",
  fg=typer.colors.YELLOW,
  )
  continue
 
  passages.append(passage)
- embedding_to_document_name[hashable_embedding] = document_name
+ embedding_to_document_name[hashable_embedding] = file_name
  if len(passages) >= 100:
  # insert passages into passage store
  passage_store.insert_many(passages)
@@ -120,7 +116,7 @@ def load_data(
  passage_store.insert_many(passages)
  passage_count += len(passages)
 
- return passage_count, document_count
+ return passage_count, file_count
 
 
  class DirectoryConnector(DataConnector):
@@ -143,105 +139,109 @@ class DirectoryConnector(DataConnector):
  if self.recursive == True:
  assert self.input_directory is not None, "Must provide input directory if recursive is True."
 
- def generate_documents(self) -> Iterator[Tuple[str, Dict]]: # -> Iterator[Document]:
- from llama_index.core import SimpleDirectoryReader
-
+ def find_files(self, source: Source) -> Iterator[FileMetadata]:
  if self.input_directory is not None:
- reader = SimpleDirectoryReader(
+ files = get_filenames_in_dir(
  input_dir=self.input_directory,
  recursive=self.recursive,
  required_exts=[ext.strip() for ext in str(self.extensions).split(",")],
+ exclude=["*png", "*jpg", "*jpeg"],
  )
  else:
- assert self.input_files is not None, "Must provide input files if input_dir is None"
- reader = SimpleDirectoryReader(input_files=[str(f) for f in self.input_files])
-
- llama_index_docs = reader.load_data(show_progress=True)
- for llama_index_doc in llama_index_docs:
- # TODO: add additional metadata?
- # doc = Document(text=llama_index_doc.text, metadata=llama_index_doc.metadata)
- # docs.append(doc)
- yield llama_index_doc.text, llama_index_doc.metadata
-
- def generate_passages(self, documents: List[Document], chunk_size: int = 1024) -> Iterator[Tuple[str, Dict]]: # -> Iterator[Passage]:
- # use llama index to run embeddings code
- # from llama_index.core.node_parser import SentenceSplitter
+ files = self.input_files
+
+ # Check that file paths are valid
+ assert_all_files_exist_locally(files)
+
+ for metadata in extract_metadata_from_files(files):
+ yield FileMetadata(
+ user_id=source.user_id,
+ source_id=source.id,
+ file_name=metadata.get("file_name"),
+ file_path=metadata.get("file_path"),
+ file_type=metadata.get("file_type"),
+ file_size=metadata.get("file_size"),
+ file_creation_date=metadata.get("file_creation_date"),
+ file_last_modified_date=metadata.get("file_last_modified_date"),
+ )
+
+ def generate_passages(self, file: FileMetadata, chunk_size: int = 1024) -> Iterator[Tuple[str, Dict]]:
+ from llama_index.core import SimpleDirectoryReader
  from llama_index.core.node_parser import TokenTextSplitter
 
  parser = TokenTextSplitter(chunk_size=chunk_size)
- for document in documents:
- llama_index_docs = [LlamaIndexDocument(text=document.text, metadata=document.metadata_)]
- nodes = parser.get_nodes_from_documents(llama_index_docs)
- for node in nodes:
- # passage = Passage(
- # text=node.text,
- # doc_id=document.id,
- # )
- yield node.text, None
-
-
- class WebConnector(DirectoryConnector):
- def __init__(self, urls: List[str] = None, html_to_text: bool = True):
- self.urls = urls
- self.html_to_text = html_to_text
-
- def generate_documents(self) -> Iterator[Tuple[str, Dict]]: # -> Iterator[Document]:
- from llama_index.readers.web import SimpleWebPageReader
-
- documents = SimpleWebPageReader(html_to_text=self.html_to_text).load_data(self.urls)
- for document in documents:
- yield document.text, {"url": document.id_}
-
-
- class VectorDBConnector(DataConnector):
- # NOTE: this class has not been properly tested, so is unlikely to work
- # TODO: allow loading multiple tables (1:1 mapping between Document and Table)
-
- def __init__(
- self,
- name: str,
- uri: str,
- table_name: str,
- text_column: str,
- embedding_column: str,
- embedding_dim: int,
- ):
- self.name = name
- self.uri = uri
- self.table_name = table_name
- self.text_column = text_column
- self.embedding_column = embedding_column
- self.embedding_dim = embedding_dim
-
- # connect to db table
- from sqlalchemy import create_engine
-
- self.engine = create_engine(uri)
-
- def generate_documents(self) -> Iterator[Tuple[str, Dict]]: # -> Iterator[Document]:
- yield self.table_name, None
-
- def generate_passages(self, documents: List[Document], chunk_size: int = 1024) -> Iterator[Tuple[str, Dict]]: # -> Iterator[Passage]:
- from pgvector.sqlalchemy import Vector
- from sqlalchemy import Inspector, MetaData, Table, select
-
- metadata = MetaData()
- # Create an inspector to inspect the database
- inspector = Inspector.from_engine(self.engine)
- table_names = inspector.get_table_names()
- assert self.table_name in table_names, f"Table {self.table_name} not found in database: tables that exist {table_names}."
-
- table = Table(self.table_name, metadata, autoload_with=self.engine)
-
- # Prepare a select statement
- select_statement = select(table.c[self.text_column], table.c[self.embedding_column].cast(Vector(self.embedding_dim)))
-
- # Execute the query and fetch the results
- # TODO: paginate results
- with self.engine.connect() as connection:
- result = connection.execute(select_statement).fetchall()
-
- for text, embedding in result:
- # assume that embeddings are the same model as in config
- # TODO: don't re-compute embedding
- yield text, {"embedding": embedding}
+ documents = SimpleDirectoryReader(input_files=[file.file_path]).load_data()
+ nodes = parser.get_nodes_from_documents(documents)
+ for node in nodes:
+ yield node.text, None
+
+
+ """
+ The below isn't used anywhere, it isn't tested, and pretty much should be deleted.
+ - Matt
+ """
+ # class WebConnector(DirectoryConnector):
+ # def __init__(self, urls: List[str] = None, html_to_text: bool = True):
+ # self.urls = urls
+ # self.html_to_text = html_to_text
+ #
+ # def generate_files(self) -> Iterator[Tuple[str, Dict]]: # -> Iterator[Document]:
+ # from llama_index.readers.web import SimpleWebPageReader
+ #
+ # files = SimpleWebPageReader(html_to_text=self.html_to_text).load_data(self.urls)
+ # for document in files:
+ # yield document.text, {"url": document.id_}
+ #
+ #
+ # class VectorDBConnector(DataConnector):
+ # # NOTE: this class has not been properly tested, so is unlikely to work
+ # # TODO: allow loading multiple tables (1:1 mapping between FileMetadata and Table)
+ #
+ # def __init__(
+ # self,
+ # name: str,
+ # uri: str,
+ # table_name: str,
+ # text_column: str,
+ # embedding_column: str,
+ # embedding_dim: int,
+ # ):
+ # self.name = name
+ # self.uri = uri
+ # self.table_name = table_name
+ # self.text_column = text_column
+ # self.embedding_column = embedding_column
+ # self.embedding_dim = embedding_dim
+ #
+ # # connect to db table
+ # from sqlalchemy import create_engine
+ #
+ # self.engine = create_engine(uri)
+ #
+ # def generate_files(self) -> Iterator[Tuple[str, Dict]]: # -> Iterator[Document]:
+ # yield self.table_name, None
+ #
+ # def generate_passages(self, file_text: str, file: FileMetadata, chunk_size: int = 1024) -> Iterator[Tuple[str, Dict]]: # -> Iterator[Passage]:
+ # from pgvector.sqlalchemy import Vector
+ # from sqlalchemy import Inspector, MetaData, Table, select
+ #
+ # metadata = MetaData()
+ # # Create an inspector to inspect the database
+ # inspector = Inspector.from_engine(self.engine)
+ # table_names = inspector.get_table_names()
+ # assert self.table_name in table_names, f"Table {self.table_name} not found in database: tables that exist {table_names}."
+ #
+ # table = Table(self.table_name, metadata, autoload_with=self.engine)
+ #
+ # # Prepare a select statement
+ # select_statement = select(table.c[self.text_column], table.c[self.embedding_column].cast(Vector(self.embedding_dim)))
+ #
+ # # Execute the query and fetch the results
+ # # TODO: paginate results
+ # with self.engine.connect() as connection:
+ # result = connection.execute(select_statement).fetchall()
+ #
+ # for text, embedding in result:
+ # # assume that embeddings are the same model as in config
+ # # TODO: don't re-compute embedding
+ # yield text, {"embedding": embedding}
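
The DataConnector interface is the main extension point touched by this release: find_files() now yields FileMetadata objects (instead of generate_documents() yielding (text, metadata) tuples), and generate_passages() receives a single FileMetadata. A rough sketch of a custom connector against the new interface (the in-memory data is illustrative, and it assumes FileMetadata's file_* fields other than file_name are optional):

from typing import Dict, Iterator, Tuple

from letta.data_sources.connectors import DataConnector
from letta.schemas.file import FileMetadata
from letta.schemas.source import Source


class InMemoryConnector(DataConnector):
    """Illustrative connector that serves a dict of {name: text} as its 'files'."""

    def __init__(self, texts: Dict[str, str]):
        self.texts = texts

    def find_files(self, source: Source) -> Iterator[FileMetadata]:
        # one FileMetadata per logical file, tied to the source's user and source ids
        for name in self.texts:
            yield FileMetadata(user_id=source.user_id, source_id=source.id, file_name=name)

    def generate_passages(self, file: FileMetadata, chunk_size: int = 1024) -> Iterator[Tuple[str, Dict]]:
        # naive fixed-size chunking; a real connector can use any splitter
        text = self.texts[file.file_name]
        for start in range(0, len(text), chunk_size):
            yield text[start : start + chunk_size], {"file_name": file.file_name}

load_data(connector, source, passage_store, file_metadata_store) then drives both methods exactly as in the hunks above.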