PyPI - langroid - Versions diffs - 0.1.217__tar.gz → 0.1.219__tar.gz - Mend

langroid 0.1.217__tar.gz → 0.1.219__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (127) hide show

{langroid-0.1.217 → langroid-0.1.219}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: langroid
-Version: 0.1.217
+Version: 0.1.219
 Summary: Harness LLMs with Multi-Agent Programming
 License: MIT
 Author: Prasad Chalasani
@@ -85,7 +85,7 @@ Requires-Dist: pytest-redis (>=3.0.2,<4.0.0)
 Requires-Dist: python-docx (>=1.1.0,<2.0.0)
 Requires-Dist: python-dotenv (>=1.0.0,<2.0.0)
 Requires-Dist: python-socketio (>=5.11.0,<6.0.0) ; extra == "chainlit"
-Requires-Dist: qdrant-client (>=1.7.0,<2.0.0)
+Requires-Dist: qdrant-client (>=1.8.0,<2.0.0)
 Requires-Dist: rank-bm25 (>=0.2.2,<0.3.0)
 Requires-Dist: redis (>=5.0.1,<6.0.0)
 Requires-Dist: requests (>=2.31.0,<3.0.0)
@@ -101,6 +101,7 @@ Requires-Dist: tiktoken (>=0.5.1,<0.6.0)
 Requires-Dist: torch (==2.0.0) ; extra == "hf-embeddings"
 Requires-Dist: trafilatura (>=1.5.0,<2.0.0)
 Requires-Dist: typer (>=0.9.0,<0.10.0)
+Requires-Dist: types-pyyaml (>=6.0.12.20240311,<7.0.0.0)
 Requires-Dist: types-redis (>=4.5.5.2,<5.0.0.0)
 Requires-Dist: types-requests (>=2.31.0.1,<3.0.0.0)
 Requires-Dist: unstructured[docx,pdf,pptx] (>=0.10.16,<0.10.18) ; extra == "unstructured"

{langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/doc_chat_agent.py RENAMED Viewed

@@ -35,6 +35,7 @@ from langroid.embedding_models.models import OpenAIEmbeddingsConfig
 from langroid.language_models.base import StreamingIfAllowed
 from langroid.language_models.openai_gpt import OpenAIChatModel, OpenAIGPTConfig
 from langroid.mytypes import DocMetaData, Document, Entity
+from langroid.parsing.document_parser import DocumentType
 from langroid.parsing.parser import Parser, ParsingConfig, PdfParsingConfig, Splitter
 from langroid.parsing.repo_loader import RepoLoader
 from langroid.parsing.search import (
@@ -44,7 +45,7 @@ from langroid.parsing.search import (
 )
 from langroid.parsing.table_loader import describe_dataframe
 from langroid.parsing.url_loader import URLLoader
-from langroid.parsing.urls import get_list_from_user, get_urls_and_paths
+from langroid.parsing.urls import get_list_from_user, get_urls_paths_bytes_indices
 from langroid.parsing.utils import batched
 from langroid.prompts.prompts_config import PromptsConfig
 from langroid.prompts.templates import SUMMARY_ANSWER_PROMPT_GPT4
@@ -126,7 +127,7 @@ class DocChatAgentConfig(ChatAgentConfig):
             llm=None  # use the parent's llm unless explicitly set here
         )
     )
-    doc_paths: List[str] = []
+    doc_paths: List[str | bytes] = []
     default_paths: List[str] = [
         "https://news.ycombinator.com/item?id=35629033",
         "https://www.newyorker.com/tech/annals-of-technology/chatgpt-is-a-blurry-jpeg-of-the-web",
@@ -248,62 +249,84 @@ class DocChatAgent(ChatAgent):
                 raise ValueError("VecDB not set")
             self.setup_documents(filter=self.config.filter)
             return
-        self.ingest_doc_paths(self.config.doc_paths)
+        self.ingest_doc_paths(self.config.doc_paths)  # type: ignore
     def ingest_doc_paths(
         self,
-        paths: List[str],
+        paths: str | bytes | List[str | bytes],
         metadata: (
             List[Dict[str, Any]] | Dict[str, Any] | DocMetaData | List[DocMetaData]
         ) = [],
+        doc_type: str | DocumentType | None = None,
     ) -> List[Document]:
         """Split, ingest docs from specified paths,
         do not add these to config.doc_paths.
         Args:
-            paths: List of file/folder paths or URLs
+            paths: document paths, urls or byte-content of docs.
+                The bytes option is intended to support cases where a document
+                has already been read in as bytes (e.g. from an API or a database),
+                and we want to avoid having to write it to a temporary file
+                just to read it back in.
             metadata: List of metadata dicts, one for each path.
                 If a single dict is passed in, it is used for all paths.
+            doc_type: DocumentType to use for parsing, if known.
+                MUST apply to all docs if specified.
+                This is especially useful when the `paths` are of bytes type,
+                to help with document type detection.
         Returns:
             List of Document objects
         """
+        if isinstance(paths, str) or isinstance(paths, bytes):
+            paths = [paths]
         all_paths = paths
-        paths_meta: Dict[str, Any] = {}
-        urls_meta: Dict[str, Any] = {}
-        urls, paths = get_urls_and_paths(paths)
+        paths_meta: Dict[int, Any] = {}
+        urls_meta: Dict[int, Any] = {}
+        idxs = range(len(all_paths))
+        url_idxs, path_idxs, bytes_idxs = get_urls_paths_bytes_indices(all_paths)
+        urls = [all_paths[i] for i in url_idxs]
+        paths = [all_paths[i] for i in path_idxs]
+        bytes_list = [all_paths[i] for i in bytes_idxs]
+        path_idxs.extend(bytes_idxs)
+        paths.extend(bytes_list)
         if (isinstance(metadata, list) and len(metadata) > 0) or not isinstance(
             metadata, list
         ):
             if isinstance(metadata, list):
-                path2meta = {
+                idx2meta = {
                     p: (
                         m
                         if isinstance(m, dict)
                         else (isinstance(m, DocMetaData) and m.dict())
                     )  # appease mypy
-                    for p, m in zip(all_paths, metadata)
+                    for p, m in zip(idxs, metadata)
                 }
             elif isinstance(metadata, dict):
-                path2meta = {p: metadata for p in all_paths}
+                idx2meta = {p: metadata for p in idxs}
             else:
-                path2meta = {p: metadata.dict() for p in all_paths}
-            urls_meta = {u: path2meta[u] for u in urls}
-            paths_meta = {p: path2meta[p] for p in paths}
+                idx2meta = {p: metadata.dict() for p in idxs}
+            urls_meta = {u: idx2meta[u] for u in url_idxs}
+            paths_meta = {p: idx2meta[p] for p in path_idxs}
         docs: List[Document] = []
         parser = Parser(self.config.parsing)
         if len(urls) > 0:
-            for u in urls:
-                meta = urls_meta.get(u, {})
-                loader = URLLoader(urls=[u], parser=parser)
+            for ui in url_idxs:
+                meta = urls_meta.get(ui, {})
+                loader = URLLoader(urls=[all_paths[ui]], parser=parser)  # type: ignore
                 url_docs = loader.load()
                 # update metadata of each doc with meta
                 for d in url_docs:
                     d.metadata = d.metadata.copy(update=meta)
                 docs.extend(url_docs)
-        if len(paths) > 0:
-            for p in paths:
-                meta = paths_meta.get(p, {})
-                path_docs = RepoLoader.get_documents(p, parser=parser)
+        if len(paths) > 0:  # paths OR bytes are handled similarly
+            for pi in path_idxs:
+                meta = paths_meta.get(pi, {})
+                p = all_paths[pi]
+                path_docs = RepoLoader.get_documents(
+                    p,
+                    parser=parser,
+                    doc_type=doc_type,
+                )
                 # update metadata of each doc with meta
                 for d in path_docs:
                     d.metadata = d.metadata.copy(update=meta)
@@ -317,11 +340,12 @@ class DocChatAgent(ChatAgent):
         print(
             f"""
         [green]I have processed the following {n_urls} URLs
-        and {n_paths} paths into {n_splits} parts:
+        and {n_paths} docs into {n_splits} parts:
         """.strip()
         )
-        print("\n".join(urls))
-        print("\n".join(paths))
+        path_reps = [p if isinstance(p, str) else "bytes" for p in paths]
+        print("\n".join([u for u in urls if isinstance(u, str)]))  # appease mypy
+        print("\n".join(path_reps))
         return docs
     def ingest_docs(
@@ -388,6 +412,7 @@ class DocChatAgent(ChatAgent):
                         + ",content="
                         + d.content
                     )
+        docs = docs[: self.config.parsing.max_chunks]
         # add embeddings in batches, to stay under limit of embeddings API
         batches = list(batched(docs, self.config.embed_batch_size))
         for batch in batches:
@@ -463,6 +488,10 @@ class DocChatAgent(ChatAgent):
             d.metadata.is_chunk = True
         return self.ingest_docs(docs)
+    def set_filter(self, filter: str) -> None:
+        self.config.filter = filter
+        self.setup_documents(filter=filter)
     def setup_documents(
         self,
         docs: List[Document] = [],
@@ -609,7 +638,7 @@ class DocChatAgent(ChatAgent):
         if len(inputs) == 0:
             if is_new_collection:
                 inputs = self.config.default_paths
-        self.config.doc_paths = inputs
+        self.config.doc_paths = inputs  # type: ignore
         self.ingest()
     def llm_response(

{langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/document_parser.py RENAMED Viewed

@@ -1,3 +1,4 @@
+import itertools
 import logging
 import re
 from enum import Enum
@@ -8,6 +9,7 @@ import fitz
 import pdfplumber
 import pypdf
 import requests
+from bs4 import BeautifulSoup
 from PIL import Image
 from langroid.mytypes import DocMetaData, Document
@@ -20,6 +22,29 @@ class DocumentType(str, Enum):
     PDF = "pdf"
     DOCX = "docx"
     DOC = "doc"
+    TXT = "txt"
+def is_plain_text(path_or_bytes: str | bytes) -> bool:
+    if isinstance(path_or_bytes, str):
+        if path_or_bytes.startswith(("http://", "https://")):
+            response = requests.get(path_or_bytes)
+            response.raise_for_status()
+            content = response.content[:1024]
+        else:
+            with open(path_or_bytes, "rb") as f:
+                content = f.read(1024)
+    else:
+        content = path_or_bytes[:1024]
+    try:
+        # Attempt to decode the content as UTF-8
+        _ = content.decode("utf-8")
+        # Additional checks can go here, e.g., to verify that the content
+        # doesn't contain too many unusual characters for it to be considered text
+        return True
+    except UnicodeDecodeError:
+        # If decoding fails, it's likely not plain text (or not encoded in UTF-8)
+        return False
 class DocumentParser(Parser):
@@ -33,19 +58,26 @@ class DocumentParser(Parser):
     """
     @classmethod
-    def create(cls, source: str, config: ParsingConfig) -> "DocumentParser":
+    def create(
+        cls,
+        source: str | bytes,
+        config: ParsingConfig,
+        doc_type: str | DocumentType | None = None,
+    ) -> "DocumentParser":
         """
         Create a DocumentParser instance based on source type
             and config.<source_type>.library specified.
         Args:
-            source (str): The source of the PDF, either a URL or a file path.
+            source (str|bytes): The source, could be a URL, file path,
+                or bytes object.
             config (ParserConfig): The parser configuration.
+            doc_type (str|None): The type of document, if known
         Returns:
             DocumentParser: An instance of a DocumentParser subclass.
         """
-        if DocumentParser._document_type(source) == DocumentType.PDF:
+        if DocumentParser._document_type(source, doc_type) == DocumentType.PDF:
             if config.pdf.library == "fitz":
                 return FitzPDFParser(source, config)
             elif config.pdf.library == "pypdf":
@@ -60,7 +92,7 @@ class DocumentParser(Parser):
                 raise ValueError(
                     f"Unsupported PDF library specified: {config.pdf.library}"
                 )
-        elif DocumentParser._document_type(source) == DocumentType.DOCX:
+        elif DocumentParser._document_type(source, doc_type) == DocumentType.DOCX:
             if config.docx.library == "unstructured":
                 return UnstructuredDocxParser(source, config)
             elif config.docx.library == "python-docx":
@@ -69,42 +101,78 @@ class DocumentParser(Parser):
                 raise ValueError(
                     f"Unsupported DOCX library specified: {config.docx.library}"
                 )
-        elif DocumentParser._document_type(source) == DocumentType.DOC:
+        elif DocumentParser._document_type(source, doc_type) == DocumentType.DOC:
             return UnstructuredDocParser(source, config)
         else:
-            raise ValueError(f"Unsupported document type: {source}")
+            source_name = source if isinstance(source, str) else "bytes"
+            raise ValueError(f"Unsupported document type: {source_name}")
-    def __init__(self, source: str, config: ParsingConfig):
+    def __init__(self, source: str | bytes, config: ParsingConfig):
         """
-        Initialize the PDFParser.
         Args:
-            source (str): The source of the PDF, either a URL or a file path.
+            source (str|bytes): The source, which could be
+            a path, a URL or a bytes object.
         """
         super().__init__(config)
-        self.source = source
         self.config = config
-        self.doc_bytes = self._load_doc_as_bytesio()
+        if isinstance(source, bytes):
+            self.source = "bytes"
+            self.doc_bytes = BytesIO(source)
+        else:
+            self.source = source
+            self.doc_bytes = self._load_doc_as_bytesio()
     @staticmethod
-    def _document_type(source: str) -> DocumentType:
+    def _document_type(
+        source: str | bytes, doc_type: str | DocumentType | None = None
+    ) -> DocumentType:
         """
         Determine the type of document based on the source.
         Args:
-            source (str): The source of the PDF, either a URL or a file path.
+            source (str|bytes): The source, which could be a URL,
+                a file path, or a bytes object.
+            doc_type (str|DocumentType|None): The type of document, if known.
         Returns:
             str: The document type.
         """
-        if source.lower().endswith(".pdf"):
-            return DocumentType.PDF
-        elif source.lower().endswith(".docx"):
-            return DocumentType.DOCX
-        elif source.lower().endswith(".doc"):
-            return DocumentType.DOC
+        if isinstance(doc_type, DocumentType):
+            return doc_type
+        if doc_type:
+            return DocumentType(doc_type.lower())
+        if is_plain_text(source):
+            return DocumentType.TXT
+        if isinstance(source, str):
+            # detect file type from path extension
+            if source.lower().endswith(".pdf"):
+                return DocumentType.PDF
+            elif source.lower().endswith(".docx"):
+                return DocumentType.DOCX
+            elif source.lower().endswith(".doc"):
+                return DocumentType.DOC
+            else:
+                raise ValueError(f"Unsupported document type: {source}")
         else:
-            raise ValueError(f"Unsupported document type: {source}")
+            # must be bytes: attempt to detect type from content
+            # using magic mime type detection
+            import magic
+            mime_type = magic.from_buffer(source, mime=True)
+            if mime_type == "application/pdf":
+                return DocumentType.PDF
+            elif mime_type in [
+                "application/vnd.openxmlformats-officedocument"
+                ".wordprocessingml.document",
+                "application/zip",
+            ]:
+                # DOCX files are essentially ZIP files,
+                # but this might catch other ZIP-based formats too!
+                return DocumentType.DOCX
+            elif mime_type == "application/msword":
+                return DocumentType.DOC
+            else:
+                raise ValueError("Unsupported document type from bytes")
     def _load_doc_as_bytesio(self) -> BytesIO:
         """
@@ -121,6 +189,61 @@ class DocumentParser(Parser):
             with open(self.source, "rb") as f:
                 return BytesIO(f.read())
+    @staticmethod
+    def chunks_from_path_or_bytes(
+        source: str | bytes,
+        parser: Parser,
+        doc_type: str | DocumentType | None = None,
+        lines: int | None = None,
+    ) -> List[Document]:
+        """
+        Get document chunks from a file path or bytes object.
+        Args:
+            source (str|bytes): The source, which could be a URL, path or bytes object.
+            parser (Parser): The parser instance (for splitting the document).
+            doc_type (str|DocumentType|None): The type of document, if known.
+            lines (int|None): The number of lines to read from a plain text file.
+        Returns:
+            List[Document]: A list of `Document` objects,
+                each containing a chunk of text, determined by the
+                chunking and splitting settings in the parser config.
+        """
+        dtype: DocumentType = DocumentParser._document_type(source, doc_type)
+        if dtype in [DocumentType.PDF, DocumentType.DOC, DocumentType.DOCX]:
+            doc_parser = DocumentParser.create(
+                source,
+                parser.config,
+                doc_type=doc_type,
+            )
+            chunks = doc_parser.get_doc_chunks()
+            if len(chunks) == 0 and dtype == DocumentType.PDF:
+                doc_parser = ImagePdfParser(source, parser.config)
+                chunks = doc_parser.get_doc_chunks()
+            return chunks
+        else:
+            # try getting as plain text; these will be chunked downstream
+            # -- could be a bytes object or a path
+            if isinstance(source, bytes):
+                content = source.decode()
+                if lines is not None:
+                    file_lines = content.splitlines()[:lines]
+                    content = "\n".join(line.strip() for line in file_lines)
+            else:
+                with open(source, "r") as f:
+                    if lines is not None:
+                        file_lines = list(itertools.islice(f, lines))
+                        content = "\n".join(line.strip() for line in file_lines)
+                    else:
+                        content = f.read()
+            soup = BeautifulSoup(content, "html.parser")
+            text = soup.get_text()
+            source_name = source if isinstance(source, str) else "bytes"
+            doc = Document(
+                content=text,
+                metadata=DocMetaData(source=str(source_name)),
+            )
+            return parser.split([doc])
     def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
         """Yield each page in the PDF."""
         raise NotImplementedError
@@ -145,7 +268,7 @@ class DocumentParser(Parser):
     def get_doc(self) -> Document:
         """
-        Get entire text from pdf source as a single document.
+        Get entire text from source as a single document.
         Returns:
             a `Document` object containing the content of the pdf file,

{langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/parse_json.py RENAMED Viewed

@@ -1,7 +1,7 @@
 import json
-import re
 from typing import Any, Iterator, List
+import yaml
 from pyparsing import nestedExpr, originalTextFor
@@ -45,37 +45,31 @@ def get_json_candidates(s: str) -> List[str]:
         return []
-def replace_undefined(s: str, undefined_placeholder: str = '"<undefined>"') -> str:
+def add_quotes(s: str) -> str:
     """
-    Replace undefined values in a potential json str with a placeholder.
+    Replace accidentally un-quoted string-like keys and values in a potential json str.
+    Intended to handle cases where a weak LLM may produce a JSON-like string
+    containing, e.g. "rent": DO-NOT-KNOW, where it "forgot" to put quotes on the value,
+    or city: "New York" where it "forgot" to put quotes on the key.
+    It will even handle cases like 'address: do not know'.
+    Got this fiendishly clever solution from
+    https://stackoverflow.com/a/66053900/10940584
+    Far better/safer than trying to do it with regexes.
     Args:
     - s (str): The potential JSON string to parse.
-    - undefined_placeholder (str): The placeholder or error message
-        for undefined values.
     Returns:
-    - str: The (potential) JSON string with undefined values
-        replaced by the placeholder.
+    - str: The (potential) JSON string with un-quoted string-like values
+        replaced by quoted values.
     """
-    # Preprocess the string to replace undefined values with the placeholder
-    # This regex looks for patterns like ": <identifier>" and replaces them
-    # with the placeholder.
-    # It's a simple approach and might need adjustments for complex cases
-    # This is an attempt to handle cases where a weak LLM may produce
-    # a JSON-like string without quotes around some values, e.g.
-    # {"rent": DO-NOT-KNOW }
-    preprocessed_s = re.sub(
-        r":\s*([a-zA-Z_][a-zA-Z_0-9\-]*)", f": {undefined_placeholder}", s
-    )
-    # Now, attempt to parse the preprocessed string as JSON
+    if is_valid_json(s):
+        return s
     try:
-        return preprocessed_s
+        dct = yaml.load(s, yaml.SafeLoader)
+        return json.dumps(dct)
     except Exception:
-        # If parsing fails, return an error message instead
-        # (this should be rare after preprocessing)
         return s
@@ -115,7 +109,7 @@ def extract_top_level_json(s: str) -> List[str]:
         candidate.replace("\\{", "{").replace("\\}", "}").replace("\\_", "_")
         for candidate in json_candidates
     ]
-    candidates = [replace_undefined(candidate) for candidate in normalized_candidates]
+    candidates = [add_quotes(candidate) for candidate in normalized_candidates]
     candidates = [repair_newlines(candidate) for candidate in candidates]
     top_level_jsons = [
         candidate for candidate in candidates if is_valid_json(candidate)

langroid 0.1.217__tar.gz → 0.1.219__tar.gz

langroid 0.1.217__tar.gz → 0.1.219__tar.gz