langroid 0.1.139__py3-none-any.whl → 0.1.219__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. langroid/__init__.py +70 -0
  2. langroid/agent/__init__.py +22 -0
  3. langroid/agent/base.py +120 -33
  4. langroid/agent/batch.py +134 -35
  5. langroid/agent/callbacks/__init__.py +0 -0
  6. langroid/agent/callbacks/chainlit.py +608 -0
  7. langroid/agent/chat_agent.py +164 -100
  8. langroid/agent/chat_document.py +19 -2
  9. langroid/agent/openai_assistant.py +20 -10
  10. langroid/agent/special/__init__.py +33 -10
  11. langroid/agent/special/doc_chat_agent.py +521 -108
  12. langroid/agent/special/lance_doc_chat_agent.py +258 -0
  13. langroid/agent/special/lance_rag/__init__.py +9 -0
  14. langroid/agent/special/lance_rag/critic_agent.py +136 -0
  15. langroid/agent/special/lance_rag/lance_rag_task.py +80 -0
  16. langroid/agent/special/lance_rag/query_planner_agent.py +180 -0
  17. langroid/agent/special/lance_tools.py +44 -0
  18. langroid/agent/special/neo4j/__init__.py +0 -0
  19. langroid/agent/special/neo4j/csv_kg_chat.py +174 -0
  20. langroid/agent/special/neo4j/neo4j_chat_agent.py +370 -0
  21. langroid/agent/special/neo4j/utils/__init__.py +0 -0
  22. langroid/agent/special/neo4j/utils/system_message.py +46 -0
  23. langroid/agent/special/relevance_extractor_agent.py +23 -7
  24. langroid/agent/special/retriever_agent.py +29 -174
  25. langroid/agent/special/sql/__init__.py +7 -0
  26. langroid/agent/special/sql/sql_chat_agent.py +47 -23
  27. langroid/agent/special/sql/utils/__init__.py +11 -0
  28. langroid/agent/special/sql/utils/description_extractors.py +95 -46
  29. langroid/agent/special/sql/utils/populate_metadata.py +28 -21
  30. langroid/agent/special/table_chat_agent.py +43 -9
  31. langroid/agent/task.py +423 -114
  32. langroid/agent/tool_message.py +67 -10
  33. langroid/agent/tools/__init__.py +8 -0
  34. langroid/agent/tools/duckduckgo_search_tool.py +66 -0
  35. langroid/agent/tools/google_search_tool.py +11 -0
  36. langroid/agent/tools/metaphor_search_tool.py +67 -0
  37. langroid/agent/tools/recipient_tool.py +6 -24
  38. langroid/agent/tools/sciphi_search_rag_tool.py +79 -0
  39. langroid/cachedb/__init__.py +6 -0
  40. langroid/embedding_models/__init__.py +24 -0
  41. langroid/embedding_models/base.py +9 -1
  42. langroid/embedding_models/models.py +117 -17
  43. langroid/embedding_models/protoc/embeddings.proto +19 -0
  44. langroid/embedding_models/protoc/embeddings_pb2.py +33 -0
  45. langroid/embedding_models/protoc/embeddings_pb2.pyi +50 -0
  46. langroid/embedding_models/protoc/embeddings_pb2_grpc.py +79 -0
  47. langroid/embedding_models/remote_embeds.py +153 -0
  48. langroid/language_models/__init__.py +22 -0
  49. langroid/language_models/azure_openai.py +47 -4
  50. langroid/language_models/base.py +26 -10
  51. langroid/language_models/config.py +5 -0
  52. langroid/language_models/openai_gpt.py +407 -121
  53. langroid/language_models/prompt_formatter/__init__.py +9 -0
  54. langroid/language_models/prompt_formatter/base.py +4 -6
  55. langroid/language_models/prompt_formatter/hf_formatter.py +135 -0
  56. langroid/language_models/utils.py +10 -9
  57. langroid/mytypes.py +10 -4
  58. langroid/parsing/__init__.py +33 -1
  59. langroid/parsing/document_parser.py +259 -63
  60. langroid/parsing/image_text.py +32 -0
  61. langroid/parsing/parse_json.py +143 -0
  62. langroid/parsing/parser.py +20 -7
  63. langroid/parsing/repo_loader.py +108 -46
  64. langroid/parsing/search.py +8 -0
  65. langroid/parsing/table_loader.py +44 -0
  66. langroid/parsing/url_loader.py +59 -13
  67. langroid/parsing/urls.py +18 -9
  68. langroid/parsing/utils.py +130 -9
  69. langroid/parsing/web_search.py +73 -0
  70. langroid/prompts/__init__.py +7 -0
  71. langroid/prompts/chat-gpt4-system-prompt.md +68 -0
  72. langroid/prompts/prompts_config.py +1 -1
  73. langroid/utils/__init__.py +10 -0
  74. langroid/utils/algorithms/__init__.py +3 -0
  75. langroid/utils/configuration.py +0 -1
  76. langroid/utils/constants.py +4 -0
  77. langroid/utils/logging.py +2 -5
  78. langroid/utils/output/__init__.py +15 -2
  79. langroid/utils/output/status.py +33 -0
  80. langroid/utils/pandas_utils.py +30 -0
  81. langroid/utils/pydantic_utils.py +446 -4
  82. langroid/utils/system.py +36 -1
  83. langroid/vector_store/__init__.py +34 -2
  84. langroid/vector_store/base.py +33 -2
  85. langroid/vector_store/chromadb.py +42 -13
  86. langroid/vector_store/lancedb.py +226 -60
  87. langroid/vector_store/meilisearch.py +7 -6
  88. langroid/vector_store/momento.py +3 -2
  89. langroid/vector_store/qdrantdb.py +82 -11
  90. {langroid-0.1.139.dist-info → langroid-0.1.219.dist-info}/METADATA +190 -129
  91. langroid-0.1.219.dist-info/RECORD +127 -0
  92. langroid/agent/special/recipient_validator_agent.py +0 -157
  93. langroid/parsing/json.py +0 -64
  94. langroid/utils/web/selenium_login.py +0 -36
  95. langroid-0.1.139.dist-info/RECORD +0 -103
  96. {langroid-0.1.139.dist-info → langroid-0.1.219.dist-info}/LICENSE +0 -0
  97. {langroid-0.1.139.dist-info → langroid-0.1.219.dist-info}/WHEEL +0 -0
langroid/parsing/document_parser.py
@@ -1,3 +1,4 @@
+ import itertools
  import logging
  import re
  from enum import Enum
@@ -8,10 +9,11 @@ import fitz
  import pdfplumber
  import pypdf
  import requests
+ from bs4 import BeautifulSoup
+ from PIL import Image

  from langroid.mytypes import DocMetaData, Document
  from langroid.parsing.parser import Parser, ParsingConfig
- from langroid.parsing.urls import url_to_tempfile

  logger = logging.getLogger(__name__)

@@ -19,6 +21,30 @@ logger = logging.getLogger(__name__)
  class DocumentType(str, Enum):
      PDF = "pdf"
      DOCX = "docx"
+     DOC = "doc"
+     TXT = "txt"
+
+
+ def is_plain_text(path_or_bytes: str | bytes) -> bool:
+     if isinstance(path_or_bytes, str):
+         if path_or_bytes.startswith(("http://", "https://")):
+             response = requests.get(path_or_bytes)
+             response.raise_for_status()
+             content = response.content[:1024]
+         else:
+             with open(path_or_bytes, "rb") as f:
+                 content = f.read(1024)
+     else:
+         content = path_or_bytes[:1024]
+     try:
+         # Attempt to decode the content as UTF-8
+         _ = content.decode("utf-8")
+         # Additional checks can go here, e.g., to verify that the content
+         # doesn't contain too many unusual characters for it to be considered text
+         return True
+     except UnicodeDecodeError:
+         # If decoding fails, it's likely not plain text (or not encoded in UTF-8)
+         return False


  class DocumentParser(Parser):
@@ -32,19 +58,26 @@ class DocumentParser(Parser):
      """

      @classmethod
-     def create(cls, source: str, config: ParsingConfig) -> "DocumentParser":
+     def create(
+         cls,
+         source: str | bytes,
+         config: ParsingConfig,
+         doc_type: str | DocumentType | None = None,
+     ) -> "DocumentParser":
          """
          Create a DocumentParser instance based on source type
          and config.<source_type>.library specified.

          Args:
-             source (str): The source of the PDF, either a URL or a file path.
+             source (str|bytes): The source, could be a URL, file path,
+                 or bytes object.
              config (ParserConfig): The parser configuration.
+             doc_type (str|None): The type of document, if known

          Returns:
              DocumentParser: An instance of a DocumentParser subclass.
          """
-         if DocumentParser._document_type(source) == DocumentType.PDF:
+         if DocumentParser._document_type(source, doc_type) == DocumentType.PDF:
              if config.pdf.library == "fitz":
                  return FitzPDFParser(source, config)
              elif config.pdf.library == "pypdf":
@@ -53,51 +86,93 @@ class DocumentParser(Parser):
                  return PDFPlumberParser(source, config)
              elif config.pdf.library == "unstructured":
                  return UnstructuredPDFParser(source, config)
-             elif config.pdf.library == "haystack":
-                 return HaystackPDFParser(source, config)
+             elif config.pdf.library == "pdf2image":
+                 return ImagePdfParser(source, config)
              else:
                  raise ValueError(
                      f"Unsupported PDF library specified: {config.pdf.library}"
                  )
-         elif DocumentParser._document_type(source) == DocumentType.DOCX:
+         elif DocumentParser._document_type(source, doc_type) == DocumentType.DOCX:
              if config.docx.library == "unstructured":
                  return UnstructuredDocxParser(source, config)
+             elif config.docx.library == "python-docx":
+                 return PythonDocxParser(source, config)
              else:
                  raise ValueError(
                      f"Unsupported DOCX library specified: {config.docx.library}"
                  )
+         elif DocumentParser._document_type(source, doc_type) == DocumentType.DOC:
+             return UnstructuredDocParser(source, config)
          else:
-             raise ValueError(f"Unsupported document type: {source}")
+             source_name = source if isinstance(source, str) else "bytes"
+             raise ValueError(f"Unsupported document type: {source_name}")

-     def __init__(self, source: str, config: ParsingConfig):
+     def __init__(self, source: str | bytes, config: ParsingConfig):
          """
-         Initialize the PDFParser.
-
          Args:
-             source (str): The source of the PDF, either a URL or a file path.
+             source (str|bytes): The source, which could be
+                 a path, a URL or a bytes object.
          """
          super().__init__(config)
-         self.source = source
          self.config = config
-         self.doc_bytes = self._load_doc_as_bytesio()
+         if isinstance(source, bytes):
+             self.source = "bytes"
+             self.doc_bytes = BytesIO(source)
+         else:
+             self.source = source
+             self.doc_bytes = self._load_doc_as_bytesio()

      @staticmethod
-     def _document_type(source: str) -> DocumentType:
+     def _document_type(
+         source: str | bytes, doc_type: str | DocumentType | None = None
+     ) -> DocumentType:
          """
          Determine the type of document based on the source.

          Args:
-             source (str): The source of the PDF, either a URL or a file path.
+             source (str|bytes): The source, which could be a URL,
+                 a file path, or a bytes object.
+             doc_type (str|DocumentType|None): The type of document, if known.

          Returns:
              str: The document type.
          """
-         if source.lower().endswith(".pdf"):
-             return DocumentType.PDF
-         elif source.lower().endswith(".docx"):
-             return DocumentType.DOCX
+         if isinstance(doc_type, DocumentType):
+             return doc_type
+         if doc_type:
+             return DocumentType(doc_type.lower())
+         if is_plain_text(source):
+             return DocumentType.TXT
+         if isinstance(source, str):
+             # detect file type from path extension
+             if source.lower().endswith(".pdf"):
+                 return DocumentType.PDF
+             elif source.lower().endswith(".docx"):
+                 return DocumentType.DOCX
+             elif source.lower().endswith(".doc"):
+                 return DocumentType.DOC
+             else:
+                 raise ValueError(f"Unsupported document type: {source}")
          else:
-             raise ValueError(f"Unsupported document type: {source}")
+             # must be bytes: attempt to detect type from content
+             # using magic mime type detection
+             import magic
+
+             mime_type = magic.from_buffer(source, mime=True)
+             if mime_type == "application/pdf":
+                 return DocumentType.PDF
+             elif mime_type in [
+                 "application/vnd.openxmlformats-officedocument"
+                 ".wordprocessingml.document",
+                 "application/zip",
+             ]:
+                 # DOCX files are essentially ZIP files,
+                 # but this might catch other ZIP-based formats too!
+                 return DocumentType.DOCX
+             elif mime_type == "application/msword":
+                 return DocumentType.DOC
+             else:
+                 raise ValueError("Unsupported document type from bytes")

      def _load_doc_as_bytesio(self) -> BytesIO:
          """
@@ -114,6 +189,61 @@ class DocumentParser(Parser):
              with open(self.source, "rb") as f:
                  return BytesIO(f.read())

+     @staticmethod
+     def chunks_from_path_or_bytes(
+         source: str | bytes,
+         parser: Parser,
+         doc_type: str | DocumentType | None = None,
+         lines: int | None = None,
+     ) -> List[Document]:
+         """
+         Get document chunks from a file path or bytes object.
+         Args:
+             source (str|bytes): The source, which could be a URL, path or bytes object.
+             parser (Parser): The parser instance (for splitting the document).
+             doc_type (str|DocumentType|None): The type of document, if known.
+             lines (int|None): The number of lines to read from a plain text file.
+         Returns:
+             List[Document]: A list of `Document` objects,
+                 each containing a chunk of text, determined by the
+                 chunking and splitting settings in the parser config.
+         """
+         dtype: DocumentType = DocumentParser._document_type(source, doc_type)
+         if dtype in [DocumentType.PDF, DocumentType.DOC, DocumentType.DOCX]:
+             doc_parser = DocumentParser.create(
+                 source,
+                 parser.config,
+                 doc_type=doc_type,
+             )
+             chunks = doc_parser.get_doc_chunks()
+             if len(chunks) == 0 and dtype == DocumentType.PDF:
+                 doc_parser = ImagePdfParser(source, parser.config)
+                 chunks = doc_parser.get_doc_chunks()
+             return chunks
+         else:
+             # try getting as plain text; these will be chunked downstream
+             # -- could be a bytes object or a path
+             if isinstance(source, bytes):
+                 content = source.decode()
+                 if lines is not None:
+                     file_lines = content.splitlines()[:lines]
+                     content = "\n".join(line.strip() for line in file_lines)
+             else:
+                 with open(source, "r") as f:
+                     if lines is not None:
+                         file_lines = list(itertools.islice(f, lines))
+                         content = "\n".join(line.strip() for line in file_lines)
+                     else:
+                         content = f.read()
+             soup = BeautifulSoup(content, "html.parser")
+             text = soup.get_text()
+             source_name = source if isinstance(source, str) else "bytes"
+             doc = Document(
+                 content=text,
+                 metadata=DocMetaData(source=str(source_name)),
+             )
+             return parser.split([doc])
+

      def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
          """Yield each page in the PDF."""
@@ -138,7 +268,7 @@ class DocumentParser(Parser):

      def get_doc(self) -> Document:
          """
-         Get entire text from pdf source as a single document.
+         Get entire text from source as a single document.

          Returns:
              a `Document` object containing the content of the pdf file,
@@ -294,50 +424,34 @@ class PDFPlumberParser(DocumentParser):
          return self.fix_text(page.extract_text())


- class HaystackPDFParser(DocumentParser):
+ class ImagePdfParser(DocumentParser):
      """
-     Parser for processing PDFs using the `haystack` library.
+     Parser for processing PDFs that are images, i.e. not "true" PDFs.
      """

-     def get_doc_chunks(self) -> List[Document]:
-         """
-         Overrides the base class method to use the `haystack` library.
-         See there for more details.
+     def iterate_pages(
+         self,
+     ) -> Generator[Tuple[int, Image], None, None]:
+         from pdf2image import convert_from_bytes
+
+         images = convert_from_bytes(self.doc_bytes.getvalue())
+         for i, image in enumerate(images):
+             yield i, image
+
+     def extract_text_from_page(self, page: Image) -> str:
          """
+         Extract text from a given `pdf2image` page.

-         from haystack.nodes import PDFToTextConverter, PreProcessor
+         Args:
+             page (Image): The PIL Image object.

-         converter = PDFToTextConverter(
-             remove_numeric_tables=True,
-         )
-         path = self.source
-         if path.startswith(("http://", "https://")):
-             path = url_to_tempfile(path)
-         doc = converter.convert(file_path=path, meta=None)
-         # note self.config.chunk_size is in token units,
-         # and we use an approximation of 75 words per 100 tokens
-         # to convert to word units
-         preprocessor = PreProcessor(
-             clean_empty_lines=True,
-             clean_whitespace=True,
-             clean_header_footer=False,
-             split_by="word",
-             split_length=int(0.75 * self.config.chunk_size),
-             split_overlap=int(0.75 * self.config.overlap),
-             split_respect_sentence_boundary=True,
-             add_page_number=True,
-         )
-         chunks = preprocessor.process(doc)
-         return [
-             Document(
-                 content=chunk.content,
-                 metadata=DocMetaData(
-                     source=f"{self.source} page {chunk.meta['page']}",
-                     is_chunk=True,
-                 ),
-             )
-             for chunk in chunks
-         ]
+         Returns:
+             str: Extracted text from the image.
+         """
+         import pytesseract
+
+         text = pytesseract.image_to_string(page)
+         return self.fix_text(text)


  class UnstructuredPDFParser(DocumentParser):
@@ -346,7 +460,17 @@ class UnstructuredPDFParser(DocumentParser):
      """

      def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:  # type: ignore
-         from unstructured.partition.pdf import partition_pdf
+         try:
+             from unstructured.partition.pdf import partition_pdf
+         except ImportError:
+             raise ImportError(
+                 """
+                 The `unstructured` library is not installed by default with langroid.
+                 To include this library, please install langroid with the
+                 `unstructured` extra by running `pip install "langroid[unstructured]"`
+                 or equivalent.
+                 """
+             )

          # from unstructured.chunking.title import chunk_by_title

@@ -360,7 +484,7 @@ class UnstructuredPDFParser(DocumentParser):
                  Please try a different library by setting the `library` field
                  in the `pdf` section of the `parsing` field in the config file.
                  Supported libraries are:
-                 fitz, pypdf, pdfplumber, unstructured, haystack
+                 fitz, pypdf, pdfplumber, unstructured
                  """
              )

@@ -399,7 +523,17 @@ class UnstructuredDocxParser(DocumentParser):
      """

      def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:  # type: ignore
-         from unstructured.partition.docx import partition_docx
+         try:
+             from unstructured.partition.docx import partition_docx
+         except ImportError:
+             raise ImportError(
+                 """
+                 The `unstructured` library is not installed by default with langroid.
+                 To include this library, please install langroid with the
+                 `unstructured` extra by running `pip install "langroid[unstructured]"`
+                 or equivalent.
+                 """
+             )

          elements = partition_docx(file=self.doc_bytes, include_page_breaks=True)

@@ -436,3 +570,65 @@ class UnstructuredDocxParser(DocumentParser):
          """
          text = " ".join(el.text for el in page)
          return self.fix_text(text)
+
+
+ class UnstructuredDocParser(UnstructuredDocxParser):
+     def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:  # type: ignore
+         try:
+             from unstructured.partition.doc import partition_doc
+         except ImportError:
+             raise ImportError(
+                 """
+                 The `unstructured` library is not installed by default with langroid.
+                 To include this library, please install langroid with the
+                 `unstructured` extra by running `pip install "langroid[unstructured]"`
+                 or equivalent.
+                 """
+             )
+
+         elements = partition_doc(filename=self.source, include_page_breaks=True)
+
+         page_number = 1
+         page_elements = []  # type: ignore
+         for el in elements:
+             if el.category == "PageBreak":
+                 if page_elements:  # Avoid yielding empty pages at the start
+                     yield page_number, page_elements
+                 page_number += 1
+                 page_elements = []
+             else:
+                 page_elements.append(el)
+         # Yield the last page if it's not empty
+         if page_elements:
+             yield page_number, page_elements
+
+
+ class PythonDocxParser(DocumentParser):
+     """
+     Parser for processing DOCX files using the `python-docx` library.
+     """
+
+     def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
+         """
+         Simulate iterating through pages.
+         In a DOCX file, pages are not explicitly defined,
+         so we consider each paragraph as a separate 'page' for simplicity.
+         """
+         import docx
+
+         doc = docx.Document(self.doc_bytes)
+         for i, para in enumerate(doc.paragraphs, start=1):
+             yield i, [para]
+
+     def extract_text_from_page(self, page: Any) -> str:
+         """
+         Extract text from a given 'page', which in this case is a single paragraph.
+
+         Args:
+             page (list): A list containing a single Paragraph object.
+
+         Returns:
+             str: Extracted text from the paragraph.
+         """
+         paragraph = page[0]
+         return self.fix_text(paragraph.text)
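For orientation, here is a minimal usage sketch of the parsing entry points introduced above (DocumentParser.create and the new chunks_from_path_or_bytes static method). The file path and config values are illustrative assumptions, not part of this diff; the imports follow the module paths shown in the file list.

    from langroid.parsing.document_parser import DocumentParser
    from langroid.parsing.parser import Parser, ParsingConfig, PdfParsingConfig

    # hypothetical config: use pdfplumber for "true" PDFs
    config = ParsingConfig(pdf=PdfParsingConfig(library="pdfplumber"))
    parser = Parser(config)

    # from a path or URL; the type is inferred from the extension
    chunks = DocumentParser.chunks_from_path_or_bytes("report.pdf", parser)

    # from raw bytes; the type is detected via MIME sniffing, with an OCR
    # fallback (ImagePdfParser) when a PDF yields no extractable text
    with open("report.pdf", "rb") as f:
        chunks = DocumentParser.chunks_from_path_or_bytes(f.read(), parser)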
langroid/parsing/image_text.py
@@ -0,0 +1,32 @@
+ from typing import Union
+
+ import pytesseract
+ from pdf2image import convert_from_bytes, convert_from_path
+
+
+ def pdf_image_to_text(input_data: Union[str, bytes]) -> str:
+     """
+     Converts a PDF that contains images to text using OCR.
+
+     Args:
+         input_data (Union[str, bytes]): The file path to the PDF or a bytes-like object
+             of the PDF content.
+
+     Returns:
+         str: The extracted text from the PDF.
+     """
+
+     # Check if the input is a file path (str) or bytes, and
+     # convert PDF to images accordingly
+     if isinstance(input_data, str):
+         images = convert_from_path(input_data)
+     elif isinstance(input_data, bytes):
+         images = convert_from_bytes(input_data)
+     else:
+         raise ValueError("input_data must be a file path (str) or bytes-like object")
+
+     text = ""
+     for image in images:
+         text += pytesseract.image_to_string(image)
+
+     return text
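A one-line sketch of calling the new OCR helper; the path is a placeholder, and pytesseract/pdf2image must be installed for it to run.

    from langroid.parsing.image_text import pdf_image_to_text

    # hypothetical scanned-PDF path; a bytes object is also accepted
    text = pdf_image_to_text("scanned.pdf")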
langroid/parsing/parse_json.py
@@ -0,0 +1,143 @@
+ import json
+ from typing import Any, Iterator, List
+
+ import yaml
+ from pyparsing import nestedExpr, originalTextFor
+
+
+ def is_valid_json(json_str: str) -> bool:
+     """Check if the input string is a valid JSON.
+
+     Args:
+         json_str (str): The input string to check.
+
+     Returns:
+         bool: True if the input string is a valid JSON, False otherwise.
+     """
+     try:
+         json.loads(json_str)
+         return True
+     except ValueError:
+         return False
+
+
+ def flatten(nested_list) -> Iterator[str]:  # type: ignore
+     """Flatten a nested list into a single list of strings"""
+     for item in nested_list:
+         if isinstance(item, (list, tuple)):
+             for subitem in flatten(item):
+                 yield subitem
+         else:
+             yield item
+
+
+ def get_json_candidates(s: str) -> List[str]:
+     """Get top-level JSON candidates, i.e. strings between curly braces."""
+     # Define the grammar for matching curly braces
+     curly_braces = originalTextFor(nestedExpr("{", "}"))
+
+     # Parse the string
+     try:
+         results = curly_braces.searchString(s)
+         # Properly convert nested lists to strings
+         return [r[0] for r in results]
+     except Exception:
+         return []
+
+
+ def add_quotes(s: str) -> str:
+     """
+     Replace accidentally un-quoted string-like keys and values in a potential json str.
+     Intended to handle cases where a weak LLM may produce a JSON-like string
+     containing, e.g. "rent": DO-NOT-KNOW, where it "forgot" to put quotes on the value,
+     or city: "New York" where it "forgot" to put quotes on the key.
+     It will even handle cases like 'address: do not know'.
+
+     Got this fiendishly clever solution from
+     https://stackoverflow.com/a/66053900/10940584
+     Far better/safer than trying to do it with regexes.
+
+     Args:
+     - s (str): The potential JSON string to parse.
+
+     Returns:
+     - str: The (potential) JSON string with un-quoted string-like values
+         replaced by quoted values.
+     """
+     if is_valid_json(s):
+         return s
+     try:
+         dct = yaml.load(s, yaml.SafeLoader)
+         return json.dumps(dct)
+     except Exception:
+         return s
+
+
+ def repair_newlines(s: str) -> str:
+     """
+     Attempt to load as json, and if it fails, try with newlines replaced by space.
+     Intended to handle cases where weak LLMs produce JSON-like strings where
+     some string-values contain explicit newlines, e.g.:
+     {"text": "This is a text\n with a newline"}
+     These would not be valid JSON, so we try to clean them up here.
+     """
+     try:
+         json.loads(s)
+         return s
+     except Exception:
+         try:
+             s = s.replace("\n", " ")
+             json.loads(s)
+             return s
+         except Exception:
+             return s
+
+
+ def extract_top_level_json(s: str) -> List[str]:
+     """Extract all top-level JSON-formatted substrings from a given string.
+
+     Args:
+         s (str): The input string to search for JSON substrings.
+
+     Returns:
+         List[str]: A list of top-level JSON-formatted substrings.
+     """
+     # Find JSON object and array candidates
+     json_candidates = get_json_candidates(s)
+
+     normalized_candidates = [
+         candidate.replace("\\{", "{").replace("\\}", "}").replace("\\_", "_")
+         for candidate in json_candidates
+     ]
+     candidates = [add_quotes(candidate) for candidate in normalized_candidates]
+     candidates = [repair_newlines(candidate) for candidate in candidates]
+     top_level_jsons = [
+         candidate for candidate in candidates if is_valid_json(candidate)
+     ]
+
+     return top_level_jsons
+
+
+ def top_level_json_field(s: str, f: str) -> Any:
+     """
+     Extract the value of a field f from a top-level JSON object.
+     If there are multiple, just return the first.
+
+     Args:
+         s (str): The input string to search for JSON substrings.
+         f (str): The field to extract from the JSON object.
+
+     Returns:
+         str: The value of the field f in the top-level JSON object, if any.
+             Otherwise, return an empty string.
+     """
+
+     jsons = extract_top_level_json(s)
+     if len(jsons) == 0:
+         return ""
+     for j in jsons:
+         json_data = json.loads(j)
+         if f in json_data:
+             return json_data[f]
+
+     return ""
langroid/parsing/parser.py
@@ -1,6 +1,6 @@
  import logging
  from enum import Enum
- from typing import Dict, List
+ from typing import Dict, List, Literal

  import tiktoken
  from pydantic import BaseSettings
@@ -19,11 +19,21 @@ class Splitter(str, Enum):


  class PdfParsingConfig(BaseSettings):
-     library: str = "pdfplumber"
+     library: Literal[
+         "fitz",
+         "pdfplumber",
+         "pypdf",
+         "unstructured",
+         "pdf2image",
+     ] = "pdfplumber"


  class DocxParsingConfig(BaseSettings):
-     library: str = "unstructured"
+     library: Literal["python-docx", "unstructured"] = "unstructured"
+
+
+ class DocParsingConfig(BaseSettings):
+     library: Literal["unstructured"] = "unstructured"


  class ParsingConfig(BaseSettings):
@@ -40,6 +50,7 @@ class ParsingConfig(BaseSettings):
      token_encoding_model: str = "text-embedding-ada-002"
      pdf: PdfParsingConfig = PdfParsingConfig()
      docx: DocxParsingConfig = DocxParsingConfig()
+     doc: DocParsingConfig = DocParsingConfig()


  class Parser:
@@ -55,6 +66,10 @@ class Parser:
          """Chunks may belong to multiple docs, but for each doc,
          they appear consecutively. Add window_ids in metadata"""

+         # discard empty chunks
+         chunks = [c for c in chunks if c.content.strip() != ""]
+         if len(chunks) == 0:
+             return
          # The original metadata.id (if any) is ignored since it will be same for all
          # chunks and is useless. We want a distinct id for each chunk.
          orig_ids = [c.metadata.id for c in chunks]
@@ -65,8 +80,8 @@ class Parser:
          orig_id_to_ids: Dict[str, List[str]] = {}
          for orig_id, id in zip(orig_ids, ids):
              if orig_id not in orig_id_to_ids:
-                 orig_id_to_ids[orig_id] = []  # type: ignore
-             orig_id_to_ids[orig_id].append(id)  # type: ignore
+                 orig_id_to_ids[orig_id] = []
+             orig_id_to_ids[orig_id].append(id)

          # now each orig_id maps to a sequence of ids within a single doc

@@ -77,8 +92,6 @@ class Parser:
          window_ids = [ids[max(0, i - k) : min(n, i + k + 1)] for i in range(n)]
          for i, _ in enumerate(ids):
              c = id2chunk[ids[i]]
-             if c.content.strip() == "":
-                 continue
              c.metadata.window_ids = window_ids[i]
              c.metadata.id = ids[i]
              c.metadata.is_chunk = True