langroid 0.1.218__py3-none-any.whl → 0.1.219__py3-none-any.whl
This diff shows the changes between two publicly released versions of the langroid package, as they appear in their public registry. It is provided for informational purposes only.
- langroid/agent/special/doc_chat_agent.py +54 -25
- langroid/parsing/document_parser.py +145 -22
- langroid/parsing/repo_loader.py +69 -49
- langroid/parsing/urls.py +18 -9
- langroid/parsing/utils.py +27 -9
- langroid/utils/system.py +1 -1
- {langroid-0.1.218.dist-info → langroid-0.1.219.dist-info}/METADATA +2 -2
- {langroid-0.1.218.dist-info → langroid-0.1.219.dist-info}/RECORD +10 -10
- {langroid-0.1.218.dist-info → langroid-0.1.219.dist-info}/LICENSE +0 -0
- {langroid-0.1.218.dist-info → langroid-0.1.219.dist-info}/WHEEL +0 -0
langroid/agent/special/doc_chat_agent.py
CHANGED
@@ -35,6 +35,7 @@ from langroid.embedding_models.models import OpenAIEmbeddingsConfig
 from langroid.language_models.base import StreamingIfAllowed
 from langroid.language_models.openai_gpt import OpenAIChatModel, OpenAIGPTConfig
 from langroid.mytypes import DocMetaData, Document, Entity
+from langroid.parsing.document_parser import DocumentType
 from langroid.parsing.parser import Parser, ParsingConfig, PdfParsingConfig, Splitter
 from langroid.parsing.repo_loader import RepoLoader
 from langroid.parsing.search import (
@@ -44,7 +45,7 @@ from langroid.parsing.search import (
 )
 from langroid.parsing.table_loader import describe_dataframe
 from langroid.parsing.url_loader import URLLoader
-from langroid.parsing.urls import get_list_from_user,
+from langroid.parsing.urls import get_list_from_user, get_urls_paths_bytes_indices
 from langroid.parsing.utils import batched
 from langroid.prompts.prompts_config import PromptsConfig
 from langroid.prompts.templates import SUMMARY_ANSWER_PROMPT_GPT4
@@ -126,7 +127,7 @@ class DocChatAgentConfig(ChatAgentConfig):
             llm=None  # use the parent's llm unless explicitly set here
         )
     )
-    doc_paths: List[str] = []
+    doc_paths: List[str | bytes] = []
     default_paths: List[str] = [
         "https://news.ycombinator.com/item?id=35629033",
         "https://www.newyorker.com/tech/annals-of-technology/chatgpt-is-a-blurry-jpeg-of-the-web",
@@ -248,62 +249,84 @@ class DocChatAgent(ChatAgent):
                 raise ValueError("VecDB not set")
             self.setup_documents(filter=self.config.filter)
             return
-        self.ingest_doc_paths(self.config.doc_paths)
+        self.ingest_doc_paths(self.config.doc_paths)  # type: ignore

     def ingest_doc_paths(
         self,
-        paths: List[str],
+        paths: str | bytes | List[str | bytes],
         metadata: (
             List[Dict[str, Any]] | Dict[str, Any] | DocMetaData | List[DocMetaData]
         ) = [],
+        doc_type: str | DocumentType | None = None,
     ) -> List[Document]:
         """Split, ingest docs from specified paths,
         do not add these to config.doc_paths.

         Args:
-            paths:
+            paths: document paths, urls or byte-content of docs.
+                The bytes option is intended to support cases where a document
+                has already been read in as bytes (e.g. from an API or a database),
+                and we want to avoid having to write it to a temporary file
+                just to read it back in.
             metadata: List of metadata dicts, one for each path.
                 If a single dict is passed in, it is used for all paths.
+            doc_type: DocumentType to use for parsing, if known.
+                MUST apply to all docs if specified.
+                This is especially useful when the `paths` are of bytes type,
+                to help with document type detection.
         Returns:
             List of Document objects
         """
+        if isinstance(paths, str) or isinstance(paths, bytes):
+            paths = [paths]
         all_paths = paths
-        paths_meta: Dict[
-        urls_meta: Dict[
-
+        paths_meta: Dict[int, Any] = {}
+        urls_meta: Dict[int, Any] = {}
+        idxs = range(len(all_paths))
+        url_idxs, path_idxs, bytes_idxs = get_urls_paths_bytes_indices(all_paths)
+        urls = [all_paths[i] for i in url_idxs]
+        paths = [all_paths[i] for i in path_idxs]
+        bytes_list = [all_paths[i] for i in bytes_idxs]
+        path_idxs.extend(bytes_idxs)
+        paths.extend(bytes_list)
         if (isinstance(metadata, list) and len(metadata) > 0) or not isinstance(
             metadata, list
         ):
             if isinstance(metadata, list):
-
+                idx2meta = {
                     p: (
                         m
                         if isinstance(m, dict)
                         else (isinstance(m, DocMetaData) and m.dict())
                     )  # appease mypy
-                    for p, m in zip(
+                    for p, m in zip(idxs, metadata)
                 }
             elif isinstance(metadata, dict):
-
+                idx2meta = {p: metadata for p in idxs}
             else:
-
-            urls_meta = {u:
-            paths_meta = {p:
+                idx2meta = {p: metadata.dict() for p in idxs}
+            urls_meta = {u: idx2meta[u] for u in url_idxs}
+            paths_meta = {p: idx2meta[p] for p in path_idxs}
         docs: List[Document] = []
         parser = Parser(self.config.parsing)
         if len(urls) > 0:
-            for
-            meta = urls_meta.get(
-            loader = URLLoader(urls=[
+            for ui in url_idxs:
+                meta = urls_meta.get(ui, {})
+                loader = URLLoader(urls=[all_paths[ui]], parser=parser)  # type: ignore
                 url_docs = loader.load()
                 # update metadata of each doc with meta
                 for d in url_docs:
                     d.metadata = d.metadata.copy(update=meta)
                 docs.extend(url_docs)
-        if len(paths) > 0:
-            for
-            meta = paths_meta.get(
-
+        if len(paths) > 0:  # paths OR bytes are handled similarly
+            for pi in path_idxs:
+                meta = paths_meta.get(pi, {})
+                p = all_paths[pi]
+                path_docs = RepoLoader.get_documents(
+                    p,
+                    parser=parser,
+                    doc_type=doc_type,
+                )
                 # update metadata of each doc with meta
                 for d in path_docs:
                     d.metadata = d.metadata.copy(update=meta)
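The net effect of this hunk is that a DocChatAgent can now ingest raw bytes alongside URLs and file paths. A minimal usage sketch, not part of the diff; the file names are hypothetical, the agent config is assumed to be otherwise set up (vector DB, LLM keys), and doc_type is left unset because a hint must apply to every doc in the call:

    from langroid.agent.special.doc_chat_agent import DocChatAgent, DocChatAgentConfig

    agent = DocChatAgent(DocChatAgentConfig())
    pdf_bytes = open("report.pdf", "rb").read()  # stands in for bytes from an API or DB
    agent.ingest_doc_paths(
        ["https://example.com/notes.html", "local/notes.txt", pdf_bytes],
        metadata={"group": "q1-reports"},  # per the docstring, one dict applies to all paths
    )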
@@ -317,11 +340,12 @@ class DocChatAgent(ChatAgent):
         print(
             f"""
         [green]I have processed the following {n_urls} URLs
-        and {n_paths}
+        and {n_paths} docs into {n_splits} parts:
         """.strip()
         )
-
-        print("\n".join(
+        path_reps = [p if isinstance(p, str) else "bytes" for p in paths]
+        print("\n".join([u for u in urls if isinstance(u, str)]))  # appease mypy
+        print("\n".join(path_reps))
         return docs

     def ingest_docs(
@@ -388,6 +412,7 @@ class DocChatAgent(ChatAgent):
                 + ",content="
                 + d.content
             )
+        docs = docs[: self.config.parsing.max_chunks]
         # add embeddings in batches, to stay under limit of embeddings API
         batches = list(batched(docs, self.config.embed_batch_size))
         for batch in batches:
@@ -463,6 +488,10 @@ class DocChatAgent(ChatAgent):
             d.metadata.is_chunk = True
         return self.ingest_docs(docs)

+    def set_filter(self, filter: str) -> None:
+        self.config.filter = filter
+        self.setup_documents(filter=filter)
+
     def setup_documents(
         self,
         docs: List[Document] = [],
@@ -609,7 +638,7 @@ class DocChatAgent(ChatAgent):
         if len(inputs) == 0:
             if is_new_collection:
                 inputs = self.config.default_paths
-        self.config.doc_paths = inputs
+        self.config.doc_paths = inputs  # type: ignore
         self.ingest()

     def llm_response(
langroid/parsing/document_parser.py
CHANGED
@@ -1,3 +1,4 @@
+import itertools
 import logging
 import re
 from enum import Enum
@@ -8,6 +9,7 @@ import fitz
 import pdfplumber
 import pypdf
 import requests
+from bs4 import BeautifulSoup
 from PIL import Image

 from langroid.mytypes import DocMetaData, Document
@@ -20,6 +22,29 @@ class DocumentType(str, Enum):
     PDF = "pdf"
     DOCX = "docx"
     DOC = "doc"
+    TXT = "txt"
+
+
+def is_plain_text(path_or_bytes: str | bytes) -> bool:
+    if isinstance(path_or_bytes, str):
+        if path_or_bytes.startswith(("http://", "https://")):
+            response = requests.get(path_or_bytes)
+            response.raise_for_status()
+            content = response.content[:1024]
+        else:
+            with open(path_or_bytes, "rb") as f:
+                content = f.read(1024)
+    else:
+        content = path_or_bytes[:1024]
+    try:
+        # Attempt to decode the content as UTF-8
+        _ = content.decode("utf-8")
+        # Additional checks can go here, e.g., to verify that the content
+        # doesn't contain too many unusual characters for it to be considered text
+        return True
+    except UnicodeDecodeError:
+        # If decoding fails, it's likely not plain text (or not encoded in UTF-8)
+        return False


 class DocumentParser(Parser):
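Note that the heuristic above samples only the first 1024 bytes and calls anything that decodes as UTF-8 "plain text", so purely ASCII-headed binary content could in principle slip through; this is part of why the doc_type hint exists elsewhere in this diff. A quick sketch of the behavior on in-memory bytes (no file or network access needed):

    from langroid.parsing.document_parser import is_plain_text

    print(is_plain_text(b"hello, world"))       # True: valid UTF-8
    print(is_plain_text(b"\x89PNG\r\n\x1a\n"))  # False: PNG magic bytes fail UTF-8 decode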
@@ -33,19 +58,26 @@ class DocumentParser(Parser):
     """

     @classmethod
-    def create(
+    def create(
+        cls,
+        source: str | bytes,
+        config: ParsingConfig,
+        doc_type: str | DocumentType | None = None,
+    ) -> "DocumentParser":
         """
         Create a DocumentParser instance based on source type
         and config.<source_type>.library specified.

         Args:
-            source (str): The source
+            source (str|bytes): The source, could be a URL, file path,
+                or bytes object.
             config (ParserConfig): The parser configuration.
+            doc_type (str|None): The type of document, if known

         Returns:
             DocumentParser: An instance of a DocumentParser subclass.
         """
-        if DocumentParser._document_type(source) == DocumentType.PDF:
+        if DocumentParser._document_type(source, doc_type) == DocumentType.PDF:
             if config.pdf.library == "fitz":
                 return FitzPDFParser(source, config)
             elif config.pdf.library == "pypdf":
@@ -60,7 +92,7 @@ class DocumentParser(Parser):
             raise ValueError(
                 f"Unsupported PDF library specified: {config.pdf.library}"
             )
-        elif DocumentParser._document_type(source) == DocumentType.DOCX:
+        elif DocumentParser._document_type(source, doc_type) == DocumentType.DOCX:
             if config.docx.library == "unstructured":
                 return UnstructuredDocxParser(source, config)
             elif config.docx.library == "python-docx":
@@ -69,42 +101,78 @@ class DocumentParser(Parser):
             raise ValueError(
                 f"Unsupported DOCX library specified: {config.docx.library}"
             )
-        elif DocumentParser._document_type(source) == DocumentType.DOC:
+        elif DocumentParser._document_type(source, doc_type) == DocumentType.DOC:
             return UnstructuredDocParser(source, config)
         else:
-
+            source_name = source if isinstance(source, str) else "bytes"
+            raise ValueError(f"Unsupported document type: {source_name}")

-    def __init__(self, source: str, config: ParsingConfig):
+    def __init__(self, source: str | bytes, config: ParsingConfig):
         """
-        Initialize the PDFParser.
-
         Args:
-            source (str): The source
+            source (str|bytes): The source, which could be
+                a path, a URL or a bytes object.
         """
         super().__init__(config)
-        self.source = source
         self.config = config
-
+        if isinstance(source, bytes):
+            self.source = "bytes"
+            self.doc_bytes = BytesIO(source)
+        else:
+            self.source = source
+            self.doc_bytes = self._load_doc_as_bytesio()

     @staticmethod
-    def _document_type(
+    def _document_type(
+        source: str | bytes, doc_type: str | DocumentType | None = None
+    ) -> DocumentType:
         """
         Determine the type of document based on the source.

         Args:
-            source (str): The source
+            source (str|bytes): The source, which could be a URL,
+                a file path, or a bytes object.
+            doc_type (str|DocumentType|None): The type of document, if known.

         Returns:
             str: The document type.
         """
-        if
-            return
-
-            return DocumentType.
-
-            return DocumentType.
+        if isinstance(doc_type, DocumentType):
+            return doc_type
+        if doc_type:
+            return DocumentType(doc_type.lower())
+        if is_plain_text(source):
+            return DocumentType.TXT
+        if isinstance(source, str):
+            # detect file type from path extension
+            if source.lower().endswith(".pdf"):
+                return DocumentType.PDF
+            elif source.lower().endswith(".docx"):
+                return DocumentType.DOCX
+            elif source.lower().endswith(".doc"):
+                return DocumentType.DOC
+            else:
+                raise ValueError(f"Unsupported document type: {source}")
         else:
-
+            # must be bytes: attempt to detect type from content
+            # using magic mime type detection
+            import magic
+
+            mime_type = magic.from_buffer(source, mime=True)
+            if mime_type == "application/pdf":
+                return DocumentType.PDF
+            elif mime_type in [
+                "application/vnd.openxmlformats-officedocument"
+                ".wordprocessingml.document",
+                "application/zip",
+            ]:
+                # DOCX files are essentially ZIP files,
+                # but this might catch other ZIP-based formats too!
+                return DocumentType.DOCX
+            elif mime_type == "application/msword":
+                return DocumentType.DOC
+            else:
+                raise ValueError("Unsupported document type from bytes")

     def _load_doc_as_bytesio(self) -> BytesIO:
         """
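The detection order, as the code above reads: an explicit DocumentType or string hint wins outright, the UTF-8 plain-text probe comes next, and only then extension matching (for paths) or libmagic MIME sniffing (for bytes). A sketch, assuming the python-magic package and its libmagic dependency are installed, and a hypothetical local PDF:

    from langroid.parsing.document_parser import DocumentParser, DocumentType

    data = open("report.pdf", "rb").read()  # e.g. bytes already fetched upstream
    assert DocumentParser._document_type(data, "pdf") == DocumentType.PDF  # hint wins
    # Without a hint, real PDF bytes normally fail the UTF-8 probe and are
    # sniffed by magic.from_buffer as "application/pdf":
    print(DocumentParser._document_type(data))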
@@ -121,6 +189,61 @@ class DocumentParser(Parser):
         with open(self.source, "rb") as f:
             return BytesIO(f.read())

+    @staticmethod
+    def chunks_from_path_or_bytes(
+        source: str | bytes,
+        parser: Parser,
+        doc_type: str | DocumentType | None = None,
+        lines: int | None = None,
+    ) -> List[Document]:
+        """
+        Get document chunks from a file path or bytes object.
+        Args:
+            source (str|bytes): The source, which could be a URL, path or bytes object.
+            parser (Parser): The parser instance (for splitting the document).
+            doc_type (str|DocumentType|None): The type of document, if known.
+            lines (int|None): The number of lines to read from a plain text file.
+        Returns:
+            List[Document]: A list of `Document` objects,
+                each containing a chunk of text, determined by the
+                chunking and splitting settings in the parser config.
+        """
+        dtype: DocumentType = DocumentParser._document_type(source, doc_type)
+        if dtype in [DocumentType.PDF, DocumentType.DOC, DocumentType.DOCX]:
+            doc_parser = DocumentParser.create(
+                source,
+                parser.config,
+                doc_type=doc_type,
+            )
+            chunks = doc_parser.get_doc_chunks()
+            if len(chunks) == 0 and dtype == DocumentType.PDF:
+                doc_parser = ImagePdfParser(source, parser.config)
+                chunks = doc_parser.get_doc_chunks()
+            return chunks
+        else:
+            # try getting as plain text; these will be chunked downstream
+            # -- could be a bytes object or a path
+            if isinstance(source, bytes):
+                content = source.decode()
+                if lines is not None:
+                    file_lines = content.splitlines()[:lines]
+                    content = "\n".join(line.strip() for line in file_lines)
+            else:
+                with open(source, "r") as f:
+                    if lines is not None:
+                        file_lines = list(itertools.islice(f, lines))
+                        content = "\n".join(line.strip() for line in file_lines)
+                    else:
+                        content = f.read()
+            soup = BeautifulSoup(content, "html.parser")
+            text = soup.get_text()
+            source_name = source if isinstance(source, str) else "bytes"
+            doc = Document(
+                content=text,
+                metadata=DocMetaData(source=str(source_name)),
+            )
+            return parser.split([doc])
+
     def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
         """Yield each page in the PDF."""
         raise NotImplementedError
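This static method becomes the single per-file entry point that RepoLoader uses further down: structured types (PDF/DOC/DOCX) go through a DocumentParser subclass, and everything else is treated as possibly-HTML text, stripped with BeautifulSoup, and split by the parser. A usage sketch under default configs; the file names are hypothetical:

    from langroid.parsing.document_parser import DocumentParser
    from langroid.parsing.parser import Parser, ParsingConfig

    parser = Parser(ParsingConfig())
    # From a path: read at most 100 lines of text, strip markup, then chunk.
    chunks = DocumentParser.chunks_from_path_or_bytes("notes.txt", parser, lines=100)
    # From bytes: the doc_type hint matters, since bytes carry no file extension.
    pdf_bytes = open("report.pdf", "rb").read()
    pdf_chunks = DocumentParser.chunks_from_path_or_bytes(pdf_bytes, parser, doc_type="pdf")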
@@ -145,7 +268,7 @@ class DocumentParser(Parser):

     def get_doc(self) -> Document:
         """
-        Get entire text from
+        Get entire text from source as a single document.

         Returns:
             a `Document` object containing the content of the pdf file,
langroid/parsing/repo_loader.py
CHANGED
@@ -10,7 +10,6 @@ from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union
 from urllib.parse import urlparse

-from bs4 import BeautifulSoup
 from dotenv import load_dotenv
 from github import Github
 from github.ContentFile import ContentFile
@@ -19,7 +18,7 @@ from github.Repository import Repository
 from pydantic import BaseModel, BaseSettings, Field

 from langroid.mytypes import DocMetaData, Document
-from langroid.parsing.document_parser import DocumentParser,
+from langroid.parsing.document_parser import DocumentParser, DocumentType
 from langroid.parsing.parser import Parser, ParsingConfig

 logger = logging.getLogger(__name__)
@@ -491,18 +490,25 @@ class RepoLoader:

     @staticmethod
     def get_documents(
-        path: str,
+        path: str | bytes,
         parser: Parser = Parser(ParsingConfig()),
         file_types: Optional[List[str]] = None,
         exclude_dirs: Optional[List[str]] = None,
         depth: int = -1,
         lines: Optional[int] = None,
+        doc_type: str | DocumentType | None = None,
     ) -> List[Document]:
         """
         Recursively get all files under a path as Document objects.

         Args:
-            path (str): The path to the directory or file.
+            path (str|bytes): The path to the directory or file, or bytes content.
+                The bytes option is meant to support the case where the content
+                has already been read from a file in an upstream process
+                (e.g. from an API or a database), and we want to avoid having to
+                write it to a temporary file just to read it again.
+                (which can be very slow for large files,
+                especially in a docker container)
             parser (Parser): Parser to use to parse files.
             file_types (List[str], optional): List of file extensions OR
                 filenames OR file_path_names to include.
@@ -513,6 +519,7 @@ class RepoLoader:
                 which includes all depths.
             lines (int, optional): Number of lines to read from each file.
                 Defaults to None, which reads all lines.
+            doc_type (str|DocumentType, optional): The type of document to parse.

         Returns:
             List[Document]: List of Document objects representing files.
@@ -520,56 +527,69 @@ class RepoLoader:
         """
         docs = []
         file_paths = []
-
-
-        if path_obj.is_file():
-            file_paths.append(str(path_obj))
+        if isinstance(path, bytes):
+            file_paths.append(path)
         else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            path_obj = Path(path).resolve()
+
+            if path_obj.is_file():
+                file_paths.append(str(path_obj))
+            else:
+                path_depth = len(path_obj.parts)
+                for root, dirs, files in os.walk(path):
+                    # Exclude directories if needed
+                    if exclude_dirs:
+                        dirs[:] = [d for d in dirs if d not in exclude_dirs]
+
+                    current_depth = len(Path(root).resolve().parts) - path_depth
+                    if depth == -1 or current_depth <= depth:
+                        for file in files:
+                            file_path = str(Path(root) / file)
+                            if (
+                                file_types is None
+                                or RepoLoader._file_type(file_path) in file_types
+                                or os.path.basename(file_path) in file_types
+                                or file_path in file_types
+                            ):
+                                file_paths.append(file_path)

         for file_path in file_paths:
-
-
-            doc_parser = DocumentParser.create(
+            docs.extend(
+                DocumentParser.chunks_from_path_or_bytes(
                     file_path,
-                parser
-
-
-            if len(new_chunks) == 0 and file_extension.lower() == ".pdf":
-                doc_parser = ImagePdfParser(file_path, parser.config)
-                new_chunks = doc_parser.get_doc_chunks()
-            docs.extend(new_chunks)
-            else:
-                with open(file_path, "r") as f:
-                    if lines is not None:
-                        file_lines = list(itertools.islice(f, lines))
-                        content = "\n".join(line.strip() for line in file_lines)
-                    else:
-                        content = f.read()
-                soup = BeautifulSoup(content, "html.parser")
-                text = soup.get_text()
-                docs.append(
-                    Document(
-                        content=text,
-                        metadata=DocMetaData(source=str(file_path)),
-                    )
+                    parser,
+                    doc_type=doc_type,
+                    lines=lines,
                 )
+            )
+            # dtype: DocumentType = DocumentParser._document_type(file_path, doc_type)
+            # if dtype in [DocumentType.PDF, DocumentType.DOC, DocumentType.DOCX]:
+            #     doc_parser = DocumentParser.create(
+            #         file_path,
+            #         parser.config,
+            #         doc_type=doc_type,
+            #     )
+            #     new_chunks = doc_parser.get_doc_chunks()
+            #     if len(new_chunks) == 0 and file_extension.lower() == ".pdf":
+            #         doc_parser = ImagePdfParser(file_path, parser.config)
+            #         new_chunks = doc_parser.get_doc_chunks()
+            #     docs.extend(new_chunks)
+            # else:
+            #     # try getting as plain text; these will be chunked downstream
+            #     with open(file_path, "r") as f:
+            #         if lines is not None:
+            #             file_lines = list(itertools.islice(f, lines))
+            #             content = "\n".join(line.strip() for line in file_lines)
+            #         else:
+            #             content = f.read()
+            #     soup = BeautifulSoup(content, "html.parser")
+            #     text = soup.get_text()
+            #     docs.append(
+            #         Document(
+            #             content=text,
+            #             metadata=DocMetaData(source=str(file_path)),
+            #         )
+            #     )

         return docs
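With chunks_from_path_or_bytes doing the per-file work (the old inline logic survives above only as comments), get_documents reduces to collecting file paths under a directory, or passing a single bytes payload straight through. A sketch with hypothetical inputs and default parser config:

    from langroid.parsing.parser import Parser, ParsingConfig
    from langroid.parsing.repo_loader import RepoLoader

    parser = Parser(ParsingConfig())
    # Directory walk, honoring exclude_dirs / file_types / depth:
    docs = RepoLoader.get_documents("docs/", parser=parser, exclude_dirs=[".git"])
    # Bytes payload, e.g. already fetched from an API, with no temp file written:
    raw = open("manual.docx", "rb").read()
    more_docs = RepoLoader.get_documents(raw, parser=parser, doc_type="docx")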
langroid/parsing/urls.py
CHANGED
@@ -112,26 +112,35 @@ def is_url(s: str) -> bool:
     return False


-def
+def get_urls_paths_bytes_indices(
+    inputs: List[str | bytes],
+) -> Tuple[List[int], List[int], List[int]]:
     """
-    Given a list of inputs, return a
+    Given a list of inputs, return a
+    list of indices of URLs, list of indices of paths, list of indices of byte-contents.
     Args:
-        inputs: list of strings
+        inputs: list of strings or bytes
     Returns:
-        list of
+        list of Indices of URLs,
+        list of indices of paths,
+        list of indices of byte-contents
     """
     urls = []
     paths = []
-
+    byte_list = []
+    for i, item in enumerate(inputs):
+        if isinstance(item, bytes):
+            byte_list.append(i)
+            continue
         try:
-
-            urls.append(
+            Url(url=parse_obj_as(HttpUrl, item))
+            urls.append(i)
         except ValidationError:
             if os.path.exists(item):
-                paths.append(
+                paths.append(i)
             else:
                 logger.warning(f"{item} is neither a URL nor a path.")
-    return urls, paths
+    return urls, paths, byte_list


 def crawl_url(url: str, max_urls: int = 1) -> List[str]:
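Note the shape of the return value: the new helper returns three lists of indices into the input rather than the items themselves, which lets callers keep positional metadata aligned with each input (as ingest_doc_paths does above). A small sketch; whether an item lands in the path list depends on it actually existing locally:

    from langroid.parsing.urls import get_urls_paths_bytes_indices

    items = ["https://example.com/a.html", "/etc/hosts", b"raw bytes content"]
    url_idxs, path_idxs, byte_idxs = get_urls_paths_bytes_indices(items)
    # -> [0], [1], [2]  (assuming /etc/hosts exists on this machine)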
langroid/parsing/utils.py
CHANGED
@@ -10,10 +10,11 @@ import nltk
 from faker import Faker

 from langroid.mytypes import Document
+from langroid.parsing.document_parser import DocumentType
 from langroid.parsing.parser import Parser, ParsingConfig
 from langroid.parsing.repo_loader import RepoLoader
 from langroid.parsing.url_loader import URLLoader
-from langroid.parsing.urls import
+from langroid.parsing.urls import get_urls_paths_bytes_indices

 Faker.seed(23)
 random.seed(43)
@@ -314,37 +315,54 @@ def extract_numbered_segments(s: str, specs: str) -> str:


 def extract_content_from_path(
-    path: str | List[str],
+    path: bytes | str | List[bytes | str],
+    parsing: ParsingConfig,
+    doc_type: str | DocumentType | None = None,
 ) -> str | List[str]:
     """
     Extract the content from a file path or URL, or a list of file paths or URLs.

     Args:
-        path (str | List[str]): The file path or URL, or a list of file paths or
+        path (bytes | str | List[str]): The file path or URL, or a list of file paths or
+            URLs, or bytes content. The bytes option is meant to support cases
+            where upstream code may have already loaded the content (e.g., from a
+            database or API) and we want to avoid having to copy the content to a
+            temporary file.
         parsing (ParsingConfig): The parsing configuration.
+        doc_type (str | DocumentType | None): The document type if known.
+            If multiple paths are given, this MUST apply to ALL docs.

     Returns:
         str | List[str]: The extracted content if a single file path or URL is provided,
             or a list of extracted contents if a
             list of file paths or URLs is provided.
     """
-    if isinstance(path, str):
-
+    if isinstance(path, str) or isinstance(path, bytes):
+        paths = [path]
     elif isinstance(path, list) and len(path) == 0:
         return ""
-
+    else:
+        paths = path
+
+    url_idxs, path_idxs, byte_idxs = get_urls_paths_bytes_indices(paths)
+    urls = [paths[i] for i in url_idxs]
+    path_list = [paths[i] for i in path_idxs]
+    byte_list = [paths[i] for i in byte_idxs]
+    path_list.extend(byte_list)
     parser = Parser(parsing)
     docs: List[Document] = []
     try:
         if len(urls) > 0:
-            loader = URLLoader(urls=urls, parser=parser)
+            loader = URLLoader(urls=urls, parser=parser)  # type: ignore
             docs = loader.load()
         if len(path_list) > 0:
             for p in path_list:
-                path_docs = RepoLoader.get_documents(
+                path_docs = RepoLoader.get_documents(
+                    p, parser=parser, doc_type=doc_type
+                )
                 docs.extend(path_docs)
     except Exception as e:
-        logger.warning(f"Error loading path {
+        logger.warning(f"Error loading path {paths}: {e}")
         return ""
     if len(docs) == 1:
         return docs[0].content
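A sketch of the widened signature: a single str or bytes input typically yields one string, a list yields a list, and a failed load logs a warning and returns "". File names are hypothetical and the parsing config is left at defaults:

    from langroid.parsing.parser import ParsingConfig
    from langroid.parsing.utils import extract_content_from_path

    text = extract_content_from_path("notes.txt", ParsingConfig())
    texts = extract_content_from_path(
        ["https://example.com/a.html", open("report.pdf", "rb").read()],
        ParsingConfig(),
    )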
langroid/utils/system.py
CHANGED
@@ -131,7 +131,7 @@ def generate_user_id(org: str = "") -> str:
 def update_hash(hash: str | None = None, s: str = "") -> str:
     """
     Takes a SHA256 hash string and a new string, updates the hash with the new string,
-    and returns the updated hash string
+    and returns the updated hash string.

     Args:
         hash (str): A SHA256 hash string.
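Going by the docstring alone, the helper threads a SHA256 hex string through successive updates; a minimal sketch of the intended call pattern:

    from langroid.utils.system import update_hash

    h = update_hash(None, "first chunk")  # start a fresh SHA256 chain
    h = update_hash(h, "second chunk")    # fold in more text, get the updated hash string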
{langroid-0.1.218.dist-info → langroid-0.1.219.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: langroid
-Version: 0.1.218
+Version: 0.1.219
 Summary: Harness LLMs with Multi-Agent Programming
 License: MIT
 Author: Prasad Chalasani
@@ -85,7 +85,7 @@ Requires-Dist: pytest-redis (>=3.0.2,<4.0.0)
 Requires-Dist: python-docx (>=1.1.0,<2.0.0)
 Requires-Dist: python-dotenv (>=1.0.0,<2.0.0)
 Requires-Dist: python-socketio (>=5.11.0,<6.0.0) ; extra == "chainlit"
-Requires-Dist: qdrant-client (>=1.
+Requires-Dist: qdrant-client (>=1.8.0,<2.0.0)
 Requires-Dist: rank-bm25 (>=0.2.2,<0.3.0)
 Requires-Dist: redis (>=5.0.1,<6.0.0)
 Requires-Dist: requests (>=2.31.0,<3.0.0)
{langroid-0.1.218.dist-info → langroid-0.1.219.dist-info}/RECORD
CHANGED
@@ -10,7 +10,7 @@ langroid/agent/helpers.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/agent/junk,sha256=LxfuuW7Cijsg0szAzT81OjWWv1PMNI-6w_-DspVIO2s,339
 langroid/agent/openai_assistant.py,sha256=QTLBgnH6Btf2GWzN-WApvra-vPQWvYcXcAOULuIy4Ig,32702
 langroid/agent/special/__init__.py,sha256=XPE076zD-roskxNBn-A1hnh4AHoMiQN9gk1UDjPaBaU,1201
-langroid/agent/special/doc_chat_agent.py,sha256
+langroid/agent/special/doc_chat_agent.py,sha256=-jMgaAvjMEIVL1iPpxhGYq3_YoIvSfic3em5FzoKtWQ,53342
 langroid/agent/special/lance_doc_chat_agent.py,sha256=USp0U3eTaJzwF_3bdqE7CedSLbaqAi2tm-VzygcyLaA,10175
 langroid/agent/special/lance_rag/__init__.py,sha256=QTbs0IVE2ZgDg8JJy1zN97rUUg4uEPH7SLGctFNumk4,174
 langroid/agent/special/lance_rag/critic_agent.py,sha256=pi_9eMBxEycbWTddtq_yz-mOb2V4SgGm3zfsOH1HU-Q,5775
@@ -75,19 +75,19 @@ langroid/parsing/agent_chats.py,sha256=sbZRV9ujdM5QXvvuHVjIi2ysYSYlap-uqfMMUKulr
 langroid/parsing/code-parsing.md,sha256=--cyyNiSZSDlIwcjAV4-shKrSiRe2ytF3AdSoS_hD2g,3294
 langroid/parsing/code_parser.py,sha256=BbDAzp35wkYQ9U1dpf1ARL0lVyi0tfqEc6_eox2C090,3727
 langroid/parsing/config.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-langroid/parsing/document_parser.py,sha256=
+langroid/parsing/document_parser.py,sha256=uf1YhpC8-Z1RF7R0Yfy39VOHGf4YWwJjnDRrDIl3Q3E,22307
 langroid/parsing/image_text.py,sha256=sbLIQ5nHe2UnYUksBaQsmZGaX-X0qgEpPd7CEzi_z5M,910
 langroid/parsing/para_sentence_split.py,sha256=AJBzZojP3zpB-_IMiiHismhqcvkrVBQ3ZINoQyx_bE4,2000
 langroid/parsing/parse_json.py,sha256=tgB_oatcrgt6L9ZplC-xBBXjLzL1gjSQf1L2_W5kwFA,4230
 langroid/parsing/parser.py,sha256=vE5j1LVDeFQPmLrXCWBfvuoPsjjvVIGHcsIWCBR8HDM,10617
-langroid/parsing/repo_loader.py,sha256=
+langroid/parsing/repo_loader.py,sha256=nyVBvkhh2nXTLFwMcnsayqMrjvtLKXXj89RTBzXBcng,30781
 langroid/parsing/search.py,sha256=plQtjarB9afGfJLB0CyPXPq3mM4m7kRsfd0_4brziEI,8846
 langroid/parsing/spider.py,sha256=w_mHR1B4KOmxsBLoVI8kMkMTEbwTzeK3ath9fOMJrTk,3043
 langroid/parsing/table_loader.py,sha256=qNM4obT_0Y4tjrxNBCNUYjKQ9oETCZ7FbolKBTcz-GM,3410
 langroid/parsing/url_loader.py,sha256=Na2TBlKuQkloZzkE2d7xl6mh9olS3CbpgCsJbJ-xhIA,4472
 langroid/parsing/url_loader_cookies.py,sha256=Lg4sNpRz9MByWq2mde6T0hKv68VZSV3mtMjNEHuFeSU,2327
-langroid/parsing/urls.py,sha256=
-langroid/parsing/utils.py,sha256=
+langroid/parsing/urls.py,sha256=5B0-2MM4LoFC7jHUJ0rft7Mx5GUrnmz8oFioO0iaMt8,7975
+langroid/parsing/utils.py,sha256=pbSAbfwA28EBNESpQRJee_Kp1b44qze-2_2b9qJOKfM,12646
 langroid/parsing/web_search.py,sha256=XSiSHB4c1Wa8RjWkC4Yh-ac8S7a2WPPYj0n-Ma716RY,4759
 langroid/prompts/__init__.py,sha256=B0vpJzIJlMR3mFRtoQwyALsFzBHvLp9f92acD8xJA_0,185
 langroid/prompts/chat-gpt4-system-prompt.md,sha256=Q3uLCJTPQvmUkZN2XDnkBC7M2K3X0F3C3GIQBaFvYvw,5329
@@ -110,7 +110,7 @@ langroid/utils/output/printing.py,sha256=5EsYB1O4qKhocW19aebOUzK82RD9U5nygbY21yo
 langroid/utils/output/status.py,sha256=VoSXmWDuddo1ipCzDAA6qlgffr5E4lSmBD0rIdNxxcs,774
 langroid/utils/pandas_utils.py,sha256=UctS986Jtl_MvU5rA7-GfrjEHXP7MNu8ePhepv0bTn0,755
 langroid/utils/pydantic_utils.py,sha256=yb-ghaQYL7EIYeiZ0tailvZvAuJZNF7UBXkd3z35OYc,21728
-langroid/utils/system.py,sha256=
+langroid/utils/system.py,sha256=tWoEbzHzJ6ywdsoa9EwsQrZfGk2t7q87_zKNwau2C8s,4546
 langroid/utils/web/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/utils/web/login.py,sha256=1iz9eUAHa87vpKIkzwkmFa00avwFWivDSAr7QUhK7U0,2528
 langroid/vector_store/__init__.py,sha256=D82ioqPWxKTTbN0qiPNB-I1GjovhLw1MgDuYhcB3hCs,831
@@ -121,7 +121,7 @@ langroid/vector_store/meilisearch.py,sha256=d2huA9P-NoYRuAQ9ZeXJmMKr7ry8u90RUSR2
 langroid/vector_store/momento.py,sha256=9cui31TTrILid2KIzUpBkN2Ey3g_CZWOQVdaFsA4Ors,10045
 langroid/vector_store/qdrant_cloud.py,sha256=3im4Mip0QXLkR6wiqVsjV1QvhSElfxdFSuDKddBDQ-4,188
 langroid/vector_store/qdrantdb.py,sha256=_egbsP9SWBwmI827EDYSSOqfIQSmwNsmJfFTxrLpWYE,13457
-langroid-0.1.
-langroid-0.1.
-langroid-0.1.
-langroid-0.1.
+langroid-0.1.219.dist-info/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
+langroid-0.1.219.dist-info/METADATA,sha256=hPGE8zril18HUqkbbqKiSsFGwyMyCr0232TvF1HZx0Q,47945
+langroid-0.1.219.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
+langroid-0.1.219.dist-info/RECORD,,
{langroid-0.1.218.dist-info → langroid-0.1.219.dist-info}/LICENSE
File without changes
{langroid-0.1.218.dist-info → langroid-0.1.219.dist-info}/WHEEL
File without changes