langroid 0.1.85__py3-none-any.whl → 0.1.219__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
- langroid/__init__.py +95 -0
- langroid/agent/__init__.py +40 -0
- langroid/agent/base.py +222 -91
- langroid/agent/batch.py +264 -0
- langroid/agent/callbacks/chainlit.py +608 -0
- langroid/agent/chat_agent.py +247 -101
- langroid/agent/chat_document.py +41 -4
- langroid/agent/openai_assistant.py +842 -0
- langroid/agent/special/__init__.py +50 -0
- langroid/agent/special/doc_chat_agent.py +837 -141
- langroid/agent/special/lance_doc_chat_agent.py +258 -0
- langroid/agent/special/lance_rag/__init__.py +9 -0
- langroid/agent/special/lance_rag/critic_agent.py +136 -0
- langroid/agent/special/lance_rag/lance_rag_task.py +80 -0
- langroid/agent/special/lance_rag/query_planner_agent.py +180 -0
- langroid/agent/special/lance_tools.py +44 -0
- langroid/agent/special/neo4j/__init__.py +0 -0
- langroid/agent/special/neo4j/csv_kg_chat.py +174 -0
- langroid/agent/special/neo4j/neo4j_chat_agent.py +370 -0
- langroid/agent/special/neo4j/utils/__init__.py +0 -0
- langroid/agent/special/neo4j/utils/system_message.py +46 -0
- langroid/agent/special/relevance_extractor_agent.py +127 -0
- langroid/agent/special/retriever_agent.py +32 -198
- langroid/agent/special/sql/__init__.py +11 -0
- langroid/agent/special/sql/sql_chat_agent.py +47 -23
- langroid/agent/special/sql/utils/__init__.py +22 -0
- langroid/agent/special/sql/utils/description_extractors.py +95 -46
- langroid/agent/special/sql/utils/populate_metadata.py +28 -21
- langroid/agent/special/table_chat_agent.py +43 -9
- langroid/agent/task.py +475 -122
- langroid/agent/tool_message.py +75 -13
- langroid/agent/tools/__init__.py +13 -0
- langroid/agent/tools/duckduckgo_search_tool.py +66 -0
- langroid/agent/tools/google_search_tool.py +11 -0
- langroid/agent/tools/metaphor_search_tool.py +67 -0
- langroid/agent/tools/recipient_tool.py +16 -29
- langroid/agent/tools/run_python_code.py +60 -0
- langroid/agent/tools/sciphi_search_rag_tool.py +79 -0
- langroid/agent/tools/segment_extract_tool.py +36 -0
- langroid/cachedb/__init__.py +9 -0
- langroid/cachedb/base.py +22 -2
- langroid/cachedb/momento_cachedb.py +26 -2
- langroid/cachedb/redis_cachedb.py +78 -11
- langroid/embedding_models/__init__.py +34 -0
- langroid/embedding_models/base.py +21 -2
- langroid/embedding_models/models.py +120 -18
- langroid/embedding_models/protoc/embeddings.proto +19 -0
- langroid/embedding_models/protoc/embeddings_pb2.py +33 -0
- langroid/embedding_models/protoc/embeddings_pb2.pyi +50 -0
- langroid/embedding_models/protoc/embeddings_pb2_grpc.py +79 -0
- langroid/embedding_models/remote_embeds.py +153 -0
- langroid/language_models/__init__.py +45 -0
- langroid/language_models/azure_openai.py +80 -27
- langroid/language_models/base.py +117 -12
- langroid/language_models/config.py +5 -0
- langroid/language_models/openai_assistants.py +3 -0
- langroid/language_models/openai_gpt.py +558 -174
- langroid/language_models/prompt_formatter/__init__.py +15 -0
- langroid/language_models/prompt_formatter/base.py +4 -6
- langroid/language_models/prompt_formatter/hf_formatter.py +135 -0
- langroid/language_models/utils.py +18 -21
- langroid/mytypes.py +25 -8
- langroid/parsing/__init__.py +46 -0
- langroid/parsing/document_parser.py +260 -63
- langroid/parsing/image_text.py +32 -0
- langroid/parsing/parse_json.py +143 -0
- langroid/parsing/parser.py +122 -59
- langroid/parsing/repo_loader.py +114 -52
- langroid/parsing/search.py +68 -63
- langroid/parsing/spider.py +3 -2
- langroid/parsing/table_loader.py +44 -0
- langroid/parsing/url_loader.py +59 -11
- langroid/parsing/urls.py +85 -37
- langroid/parsing/utils.py +298 -4
- langroid/parsing/web_search.py +73 -0
- langroid/prompts/__init__.py +11 -0
- langroid/prompts/chat-gpt4-system-prompt.md +68 -0
- langroid/prompts/prompts_config.py +1 -1
- langroid/utils/__init__.py +17 -0
- langroid/utils/algorithms/__init__.py +3 -0
- langroid/utils/algorithms/graph.py +103 -0
- langroid/utils/configuration.py +36 -5
- langroid/utils/constants.py +4 -0
- langroid/utils/globals.py +2 -2
- langroid/utils/logging.py +2 -5
- langroid/utils/output/__init__.py +21 -0
- langroid/utils/output/printing.py +47 -1
- langroid/utils/output/status.py +33 -0
- langroid/utils/pandas_utils.py +30 -0
- langroid/utils/pydantic_utils.py +616 -2
- langroid/utils/system.py +98 -0
- langroid/vector_store/__init__.py +40 -0
- langroid/vector_store/base.py +203 -6
- langroid/vector_store/chromadb.py +59 -32
- langroid/vector_store/lancedb.py +463 -0
- langroid/vector_store/meilisearch.py +10 -7
- langroid/vector_store/momento.py +262 -0
- langroid/vector_store/qdrantdb.py +104 -22
- {langroid-0.1.85.dist-info → langroid-0.1.219.dist-info}/METADATA +329 -149
- langroid-0.1.219.dist-info/RECORD +127 -0
- {langroid-0.1.85.dist-info → langroid-0.1.219.dist-info}/WHEEL +1 -1
- langroid/agent/special/recipient_validator_agent.py +0 -157
- langroid/parsing/json.py +0 -64
- langroid/utils/web/selenium_login.py +0 -36
- langroid-0.1.85.dist-info/RECORD +0 -94
- /langroid/{scripts → agent/callbacks}/__init__.py +0 -0
- {langroid-0.1.85.dist-info → langroid-0.1.219.dist-info}/LICENSE +0 -0
langroid/parsing/parser.py
CHANGED
@@ -1,7 +1,6 @@
 import logging
 from enum import Enum
-from
-from typing import List
+from typing import Dict, List, Literal

 import tiktoken
 from pydantic import BaseSettings
@@ -20,11 +19,21 @@ class Splitter(str, Enum):


 class PdfParsingConfig(BaseSettings):
-    library:
+    library: Literal[
+        "fitz",
+        "pdfplumber",
+        "pypdf",
+        "unstructured",
+        "pdf2image",
+    ] = "pdfplumber"


 class DocxParsingConfig(BaseSettings):
-    library:
+    library: Literal["python-docx", "unstructured"] = "unstructured"
+
+
+class DocParsingConfig(BaseSettings):
+    library: Literal["unstructured"] = "unstructured"


 class ParsingConfig(BaseSettings):
@@ -36,10 +45,12 @@ class ParsingConfig(BaseSettings):
     min_chunk_chars: int = 350
     discard_chunk_chars: int = 5  # discard chunks with fewer than this many chars
     n_similar_docs: int = 4
+    n_neighbor_ids: int = 5  # window size to store around each chunk
     separators: List[str] = ["\n\n", "\n", " ", ""]
     token_encoding_model: str = "text-embedding-ada-002"
     pdf: PdfParsingConfig = PdfParsingConfig()
     docx: DocxParsingConfig = DocxParsingConfig()
+    doc: DocParsingConfig = DocParsingConfig()


 class Parser:
@@ -51,72 +62,122 @@ class Parser:
         tokens = self.tokenizer.encode(text)
         return len(tokens)

+    def add_window_ids(self, chunks: List[Document]) -> None:
+        """Chunks may belong to multiple docs, but for each doc,
+        they appear consecutively. Add window_ids in metadata"""
+
+        # discard empty chunks
+        chunks = [c for c in chunks if c.content.strip() != ""]
+        if len(chunks) == 0:
+            return
+        # The original metadata.id (if any) is ignored since it will be same for all
+        # chunks and is useless. We want a distinct id for each chunk.
+        orig_ids = [c.metadata.id for c in chunks]
+        ids = [Document.hash_id(str(c)) for c in chunks]
+        id2chunk = {id: c for id, c in zip(ids, chunks)}
+
+        # group the ids by orig_id
+        orig_id_to_ids: Dict[str, List[str]] = {}
+        for orig_id, id in zip(orig_ids, ids):
+            if orig_id not in orig_id_to_ids:
+                orig_id_to_ids[orig_id] = []
+            orig_id_to_ids[orig_id].append(id)
+
+        # now each orig_id maps to a sequence of ids within a single doc
+
+        k = self.config.n_neighbor_ids
+        for orig, ids in orig_id_to_ids.items():
+            # ids are consecutive chunks in a single doc
+            n = len(ids)
+            window_ids = [ids[max(0, i - k) : min(n, i + k + 1)] for i in range(n)]
+            for i, _ in enumerate(ids):
+                c = id2chunk[ids[i]]
+                c.metadata.window_ids = window_ids[i]
+                c.metadata.id = ids[i]
+                c.metadata.is_chunk = True
+
     def split_simple(self, docs: List[Document]) -> List[Document]:
         if len(self.config.separators) == 0:
             raise ValueError("Must have at least one separator")
-
-
-
-
-
-        )
-
-
+        final_docs = []
+
+        for d in docs:
+            if d.content.strip() == "":
+                continue
+            chunks = remove_extra_whitespace(d.content).split(self.config.separators[0])
+            chunk_docs = [
+                Document(
+                    content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
+                )
+                for c in chunks
+                if c.strip() != ""
+            ]
+            self.add_window_ids(chunk_docs)
+            final_docs += chunk_docs
+        return final_docs

     def split_para_sentence(self, docs: List[Document]) -> List[Document]:
-        final_chunks = []
         chunks = docs
         while True:
-
-
-
-            if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            )
+            un_splittables = 0
+            split_chunks = []
+            for c in chunks:
+                if c.content.strip() == "":
+                    continue
+                if self.num_tokens(c.content) <= 1.3 * self.config.chunk_size:
+                    # small chunk: no need to split
+                    split_chunks.append(c)
+                    continue
+                splits = self._split_para_sentence_once([c])
+                un_splittables += len(splits) == 1
+                split_chunks += splits
+            if len(split_chunks) == len(chunks):
+                if un_splittables > 0:
+                    max_len = max([self.num_tokens(p.content) for p in chunks])
+                    logger.warning(
+                        f"""
+                        Unable to split {un_splittables} chunks
+                        using chunk_size = {self.config.chunk_size}.
+                        Max chunk size is {max_len} tokens.
+                        """
+                    )
                 break  # we won't be able to shorten them with current settings
+            chunks = split_chunks.copy()

-
+        self.add_window_ids(chunks)
+        return chunks

     def _split_para_sentence_once(self, docs: List[Document]) -> List[Document]:
-
-
-
-
-
+        final_chunks = []
+        for d in docs:
+            if d.content.strip() == "":
+                continue
+            chunks = create_chunks(d.content, self.config.chunk_size, self.num_tokens)
+            chunk_docs = [
+                Document(
+                    content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
                 )
-
+                for c in chunks
+                if c.strip() != ""
             ]
-
-
-        return
+            final_chunks += chunk_docs
+
+        return final_chunks

     def split_chunk_tokens(self, docs: List[Document]) -> List[Document]:
-
-
-
-
-
+        final_docs = []
+        for d in docs:
+            chunks = self.chunk_tokens(d.content)
+            chunk_docs = [
+                Document(
+                    content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
+                )
+                for c in chunks
+                if c.strip() != ""
             ]
-
-
-        return
+            self.add_window_ids(chunk_docs)
+            final_docs += chunk_docs
+        return final_docs

     def chunk_tokens(
         self,
@@ -198,17 +259,19 @@ class Parser:
             # Increment the number of chunks
             num_chunks += 1

-        #
-
-        remaining_text = self.tokenizer.decode(tokens).replace("\n", " ").strip()
-        if len(remaining_text) > self.config.discard_chunk_chars:
-            chunks.append(remaining_text)
+        # There may be remaining tokens, but we discard them
+        # since we have already reached the maximum number of chunks

         return chunks

     def split(self, docs: List[Document]) -> List[Document]:
         if len(docs) == 0:
             return []
+        # create ids in metadata of docs if absent:
+        # we need this to distinguish docs later in add_window_ids
+        for d in docs:
+            if d.metadata.id in [None, ""]:
+                d.metadata.id = d._unique_hash_id()
         # some docs are already splits, so don't split them further!
         chunked_docs = [d for d in docs if d.metadata.is_chunk]
         big_docs = [d for d in docs if not d.metadata.is_chunk]
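The parser changes above add a neighbor-window mechanism: every chunk produced by split_simple, split_para_sentence, or split_chunk_tokens now gets a distinct metadata.id plus window_ids covering up to n_neighbor_ids neighboring chunks from the same source document. A minimal sketch of how the new config fields might be exercised; the field names and Parser.split are taken from the hunks above, while the concrete values and the sample Document are hypothetical:

from langroid.mytypes import DocMetaData, Document
from langroid.parsing.parser import Parser, ParsingConfig, PdfParsingConfig

# Hypothetical values; field names come from the diff above.
cfg = ParsingConfig(
    chunk_size=200,                    # tokens per chunk
    n_neighbor_ids=5,                  # NEW: neighbor-window size stored per chunk
    pdf=PdfParsingConfig(library="pdfplumber"),  # NEW: library is a Literal choice
)
parser = Parser(cfg)

doc = Document(content="some long text ...", metadata=DocMetaData(source="example.txt"))
for chunk in parser.split([doc]):
    # add_window_ids() stamped each chunk with its own id and a window of
    # neighboring chunk ids from the same source document.
    print(chunk.metadata.id, chunk.metadata.window_ids)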
langroid/parsing/repo_loader.py
CHANGED
@@ -10,15 +10,15 @@ from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union
 from urllib.parse import urlparse

-from bs4 import BeautifulSoup
 from dotenv import load_dotenv
 from github import Github
 from github.ContentFile import ContentFile
+from github.Label import Label
 from github.Repository import Repository
-from pydantic import BaseSettings
+from pydantic import BaseModel, BaseSettings, Field

 from langroid.mytypes import DocMetaData, Document
-from langroid.parsing.document_parser import DocumentParser
+from langroid.parsing.document_parser import DocumentParser, DocumentType
 from langroid.parsing.parser import Parser, ParsingConfig

 logger = logging.getLogger(__name__)
@@ -43,6 +43,22 @@ def _has_files(directory: str) -> bool:
     return False


+# Pydantic model for GitHub issue data
+class IssueData(BaseModel):
+    state: str = Field(..., description="State of issue e.g. open or closed")
+    year: int = Field(..., description="Year issue was created")
+    month: int = Field(..., description="Month issue was created")
+    day: int = Field(..., description="Day issue was created")
+    assignee: Optional[str] = Field(..., description="Assignee of issue")
+    size: Optional[str] = Field(..., description="Size of issue, e.g. XS, S, M, L, XXL")
+    text: str = Field(..., description="Text of issue, i.e. description body")
+
+
+def get_issue_size(labels: List[Label]) -> str | None:
+    sizes = ["XS", "S", "M", "L", "XL", "XXL"]
+    return next((label.name for label in labels if label.name in sizes), None)
+
+
 class RepoLoaderConfig(BaseSettings):
     """
     Configuration for RepoLoader.
@@ -155,6 +171,27 @@ class RepoLoader:
     def _get_dir_name(self) -> str:
         return urlparse(self.url).path.replace("/", "_")

+    def get_issues(self, k: int | None = 100) -> List[IssueData]:
+        """Get up to k issues from the GitHub repo."""
+        if k is None:
+            issues = self.repo.get_issues(state="all")
+        else:
+            issues = self.repo.get_issues(state="all")[:k]
+        issue_data_list = []
+        for issue in issues:
+            issue_data = IssueData(
+                state=issue.state,
+                year=issue.created_at.year,
+                month=issue.created_at.month,
+                day=issue.created_at.day,
+                assignee=issue.assignee.login if issue.assignee else None,
+                size=get_issue_size(issue.labels),
+                text=issue.body or "No issue description body.",
+            )
+            issue_data_list.append(issue_data)
+
+        return issue_data_list
+
     @staticmethod
     def _file_type(name: str) -> str:
         """
@@ -336,8 +373,8 @@ class RepoLoader:

         Returns:
             Tuple of (dict, List_of_Documents):
-
-
+                A dictionary containing file and directory names, with file
+                contents, and a list of Document objects for each file.
         """
         if path is None:
             if self.clone_path is None or not _has_files(self.clone_path):
@@ -382,8 +419,8 @@ class RepoLoader:

         Returns:
             Tuple of (dict, List_of_Documents):
-
-
+                A dictionary containing file and directory names, with file contents.
+                A list of Document objects for each file.
         """

         folder_structure = {
@@ -453,18 +490,25 @@ class RepoLoader:

     @staticmethod
     def get_documents(
-        path: str,
+        path: str | bytes,
         parser: Parser = Parser(ParsingConfig()),
         file_types: Optional[List[str]] = None,
         exclude_dirs: Optional[List[str]] = None,
         depth: int = -1,
         lines: Optional[int] = None,
+        doc_type: str | DocumentType | None = None,
     ) -> List[Document]:
         """
         Recursively get all files under a path as Document objects.

         Args:
-            path (str): The path to the directory or file.
+            path (str|bytes): The path to the directory or file, or bytes content.
+                The bytes option is meant to support the case where the content
+                has already been read from a file in an upstream process
+                (e.g. from an API or a database), and we want to avoid having to
+                write it to a temporary file just to read it again.
+                (which can be very slow for large files,
+                especially in a docker container)
             parser (Parser): Parser to use to parse files.
             file_types (List[str], optional): List of file extensions OR
                 filenames OR file_path_names to include.
@@ -475,6 +519,7 @@ class RepoLoader:
                 which includes all depths.
             lines (int, optional): Number of lines to read from each file.
                 Defaults to None, which reads all lines.
+            doc_type (str|DocumentType, optional): The type of document to parse.

         Returns:
             List[Document]: List of Document objects representing files.
@@ -482,52 +527,69 @@ class RepoLoader:
         """
         docs = []
         file_paths = []
-
-
-        if path_obj.is_file():
-            file_paths.append(str(path_obj))
+        if isinstance(path, bytes):
+            file_paths.append(path)
         else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            path_obj = Path(path).resolve()
+
+            if path_obj.is_file():
+                file_paths.append(str(path_obj))
+            else:
+                path_depth = len(path_obj.parts)
+                for root, dirs, files in os.walk(path):
+                    # Exclude directories if needed
+                    if exclude_dirs:
+                        dirs[:] = [d for d in dirs if d not in exclude_dirs]
+
+                    current_depth = len(Path(root).resolve().parts) - path_depth
+                    if depth == -1 or current_depth <= depth:
+                        for file in files:
+                            file_path = str(Path(root) / file)
+                            if (
+                                file_types is None
+                                or RepoLoader._file_type(file_path) in file_types
+                                or os.path.basename(file_path) in file_types
+                                or file_path in file_types
+                            ):
+                                file_paths.append(file_path)

         for file_path in file_paths:
-
-
-            doc_parser = DocumentParser.create(
+            docs.extend(
+                DocumentParser.chunks_from_path_or_bytes(
                     file_path,
-                parser
-
-
-            else:
-                with open(file_path, "r") as f:
-                    if lines is not None:
-                        file_lines = list(itertools.islice(f, lines))
-                        content = "\n".join(line.strip() for line in file_lines)
-                    else:
-                        content = f.read()
-                soup = BeautifulSoup(content, "html.parser")
-                text = soup.get_text()
-                docs.append(
-                    Document(
-                        content=text,
-                        metadata=DocMetaData(source=str(file_path)),
-                    )
+                    parser,
+                    doc_type=doc_type,
+                    lines=lines,
                 )
+            )
+            # dtype: DocumentType = DocumentParser._document_type(file_path, doc_type)
+            # if dtype in [DocumentType.PDF, DocumentType.DOC, DocumentType.DOCX]:
+            #     doc_parser = DocumentParser.create(
+            #         file_path,
+            #         parser.config,
+            #         doc_type=doc_type,
+            #     )
+            #     new_chunks = doc_parser.get_doc_chunks()
+            #     if len(new_chunks) == 0 and file_extension.lower() == ".pdf":
+            #         doc_parser = ImagePdfParser(file_path, parser.config)
+            #         new_chunks = doc_parser.get_doc_chunks()
+            #     docs.extend(new_chunks)
+            # else:
+            #     # try getting as plain text; these will be chunked downstream
+            #     with open(file_path, "r") as f:
+            #         if lines is not None:
+            #             file_lines = list(itertools.islice(f, lines))
+            #             content = "\n".join(line.strip() for line in file_lines)
+            #         else:
+            #             content = f.read()
+            #     soup = BeautifulSoup(content, "html.parser")
+            #     text = soup.get_text()
+            #     docs.append(
+            #         Document(
+            #             content=text,
+            #             metadata=DocMetaData(source=str(file_path)),
+            #         )
+            #     )

         return docs

@@ -543,8 +605,8 @@ class RepoLoader:
             of lines per file (if any of these are specified).

         Args:
-            k(int): max number of files to load, or None for all files
-            depth(int): max depth to recurse, or None for infinite depth
+            k (int): max number of files to load, or None for all files
+            depth (int): max depth to recurse, or None for infinite depth
             lines (int): max number of lines to get, from a file, or None for all lines

         Returns:
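The repo_loader changes add a typed IssueData model with RepoLoader.get_issues, and get_documents now accepts raw bytes and an explicit doc_type, delegating parsing to DocumentParser.chunks_from_path_or_bytes. A rough usage sketch under stated assumptions: the RepoLoader constructor is not shown in this diff and is assumed here to take a repo URL, and the issue count, file name, and the "pdf" string passed as doc_type are hypothetical:

from langroid.parsing.parser import Parser, ParsingConfig
from langroid.parsing.repo_loader import RepoLoader

loader = RepoLoader("https://github.com/langroid/langroid")  # assumed constructor

# NEW: fetch up to 20 issues as structured IssueData records.
issues = loader.get_issues(k=20)
open_issues = [i for i in issues if i.state == "open"]

# NEW: pass bytes that were already read upstream, avoiding a temporary file.
with open("report.pdf", "rb") as f:  # hypothetical file
    pdf_bytes = f.read()
docs = RepoLoader.get_documents(
    pdf_bytes,
    parser=Parser(ParsingConfig()),
    doc_type="pdf",  # assumed string alternative to a DocumentType value
)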