langroid 0.1.139__py3-none-any.whl → 0.1.219__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langroid/__init__.py +70 -0
- langroid/agent/__init__.py +22 -0
- langroid/agent/base.py +120 -33
- langroid/agent/batch.py +134 -35
- langroid/agent/callbacks/__init__.py +0 -0
- langroid/agent/callbacks/chainlit.py +608 -0
- langroid/agent/chat_agent.py +164 -100
- langroid/agent/chat_document.py +19 -2
- langroid/agent/openai_assistant.py +20 -10
- langroid/agent/special/__init__.py +33 -10
- langroid/agent/special/doc_chat_agent.py +521 -108
- langroid/agent/special/lance_doc_chat_agent.py +258 -0
- langroid/agent/special/lance_rag/__init__.py +9 -0
- langroid/agent/special/lance_rag/critic_agent.py +136 -0
- langroid/agent/special/lance_rag/lance_rag_task.py +80 -0
- langroid/agent/special/lance_rag/query_planner_agent.py +180 -0
- langroid/agent/special/lance_tools.py +44 -0
- langroid/agent/special/neo4j/__init__.py +0 -0
- langroid/agent/special/neo4j/csv_kg_chat.py +174 -0
- langroid/agent/special/neo4j/neo4j_chat_agent.py +370 -0
- langroid/agent/special/neo4j/utils/__init__.py +0 -0
- langroid/agent/special/neo4j/utils/system_message.py +46 -0
- langroid/agent/special/relevance_extractor_agent.py +23 -7
- langroid/agent/special/retriever_agent.py +29 -174
- langroid/agent/special/sql/__init__.py +7 -0
- langroid/agent/special/sql/sql_chat_agent.py +47 -23
- langroid/agent/special/sql/utils/__init__.py +11 -0
- langroid/agent/special/sql/utils/description_extractors.py +95 -46
- langroid/agent/special/sql/utils/populate_metadata.py +28 -21
- langroid/agent/special/table_chat_agent.py +43 -9
- langroid/agent/task.py +423 -114
- langroid/agent/tool_message.py +67 -10
- langroid/agent/tools/__init__.py +8 -0
- langroid/agent/tools/duckduckgo_search_tool.py +66 -0
- langroid/agent/tools/google_search_tool.py +11 -0
- langroid/agent/tools/metaphor_search_tool.py +67 -0
- langroid/agent/tools/recipient_tool.py +6 -24
- langroid/agent/tools/sciphi_search_rag_tool.py +79 -0
- langroid/cachedb/__init__.py +6 -0
- langroid/embedding_models/__init__.py +24 -0
- langroid/embedding_models/base.py +9 -1
- langroid/embedding_models/models.py +117 -17
- langroid/embedding_models/protoc/embeddings.proto +19 -0
- langroid/embedding_models/protoc/embeddings_pb2.py +33 -0
- langroid/embedding_models/protoc/embeddings_pb2.pyi +50 -0
- langroid/embedding_models/protoc/embeddings_pb2_grpc.py +79 -0
- langroid/embedding_models/remote_embeds.py +153 -0
- langroid/language_models/__init__.py +22 -0
- langroid/language_models/azure_openai.py +47 -4
- langroid/language_models/base.py +26 -10
- langroid/language_models/config.py +5 -0
- langroid/language_models/openai_gpt.py +407 -121
- langroid/language_models/prompt_formatter/__init__.py +9 -0
- langroid/language_models/prompt_formatter/base.py +4 -6
- langroid/language_models/prompt_formatter/hf_formatter.py +135 -0
- langroid/language_models/utils.py +10 -9
- langroid/mytypes.py +10 -4
- langroid/parsing/__init__.py +33 -1
- langroid/parsing/document_parser.py +259 -63
- langroid/parsing/image_text.py +32 -0
- langroid/parsing/parse_json.py +143 -0
- langroid/parsing/parser.py +20 -7
- langroid/parsing/repo_loader.py +108 -46
- langroid/parsing/search.py +8 -0
- langroid/parsing/table_loader.py +44 -0
- langroid/parsing/url_loader.py +59 -13
- langroid/parsing/urls.py +18 -9
- langroid/parsing/utils.py +130 -9
- langroid/parsing/web_search.py +73 -0
- langroid/prompts/__init__.py +7 -0
- langroid/prompts/chat-gpt4-system-prompt.md +68 -0
- langroid/prompts/prompts_config.py +1 -1
- langroid/utils/__init__.py +10 -0
- langroid/utils/algorithms/__init__.py +3 -0
- langroid/utils/configuration.py +0 -1
- langroid/utils/constants.py +4 -0
- langroid/utils/logging.py +2 -5
- langroid/utils/output/__init__.py +15 -2
- langroid/utils/output/status.py +33 -0
- langroid/utils/pandas_utils.py +30 -0
- langroid/utils/pydantic_utils.py +446 -4
- langroid/utils/system.py +36 -1
- langroid/vector_store/__init__.py +34 -2
- langroid/vector_store/base.py +33 -2
- langroid/vector_store/chromadb.py +42 -13
- langroid/vector_store/lancedb.py +226 -60
- langroid/vector_store/meilisearch.py +7 -6
- langroid/vector_store/momento.py +3 -2
- langroid/vector_store/qdrantdb.py +82 -11
- {langroid-0.1.139.dist-info → langroid-0.1.219.dist-info}/METADATA +190 -129
- langroid-0.1.219.dist-info/RECORD +127 -0
- langroid/agent/special/recipient_validator_agent.py +0 -157
- langroid/parsing/json.py +0 -64
- langroid/utils/web/selenium_login.py +0 -36
- langroid-0.1.139.dist-info/RECORD +0 -103
- {langroid-0.1.139.dist-info → langroid-0.1.219.dist-info}/LICENSE +0 -0
- {langroid-0.1.139.dist-info → langroid-0.1.219.dist-info}/WHEEL +0 -0
langroid/parsing/repo_loader.py
CHANGED

@@ -10,15 +10,15 @@ from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union
 from urllib.parse import urlparse
 
-from bs4 import BeautifulSoup
 from dotenv import load_dotenv
 from github import Github
 from github.ContentFile import ContentFile
+from github.Label import Label
 from github.Repository import Repository
-from pydantic import BaseSettings
+from pydantic import BaseModel, BaseSettings, Field
 
 from langroid.mytypes import DocMetaData, Document
-from langroid.parsing.document_parser import DocumentParser
+from langroid.parsing.document_parser import DocumentParser, DocumentType
 from langroid.parsing.parser import Parser, ParsingConfig
 
 logger = logging.getLogger(__name__)

@@ -43,6 +43,22 @@ def _has_files(directory: str) -> bool:
     return False
 
 
+# Pydantic model for GitHub issue data
+class IssueData(BaseModel):
+    state: str = Field(..., description="State of issue e.g. open or closed")
+    year: int = Field(..., description="Year issue was created")
+    month: int = Field(..., description="Month issue was created")
+    day: int = Field(..., description="Day issue was created")
+    assignee: Optional[str] = Field(..., description="Assignee of issue")
+    size: Optional[str] = Field(..., description="Size of issue, e.g. XS, S, M, L, XXL")
+    text: str = Field(..., description="Text of issue, i.e. description body")
+
+
+def get_issue_size(labels: List[Label]) -> str | None:
+    sizes = ["XS", "S", "M", "L", "XL", "XXL"]
+    return next((label.name for label in labels if label.name in sizes), None)
+
+
 class RepoLoaderConfig(BaseSettings):
     """
     Configuration for RepoLoader.

@@ -155,6 +171,27 @@ class RepoLoader:
     def _get_dir_name(self) -> str:
         return urlparse(self.url).path.replace("/", "_")
 
+    def get_issues(self, k: int | None = 100) -> List[IssueData]:
+        """Get up to k issues from the GitHub repo."""
+        if k is None:
+            issues = self.repo.get_issues(state="all")
+        else:
+            issues = self.repo.get_issues(state="all")[:k]
+        issue_data_list = []
+        for issue in issues:
+            issue_data = IssueData(
+                state=issue.state,
+                year=issue.created_at.year,
+                month=issue.created_at.month,
+                day=issue.created_at.day,
+                assignee=issue.assignee.login if issue.assignee else None,
+                size=get_issue_size(issue.labels),
+                text=issue.body or "No issue description body.",
+            )
+            issue_data_list.append(issue_data)
+
+        return issue_data_list
+
     @staticmethod
     def _file_type(name: str) -> str:
         """

@@ -453,18 +490,25 @@ class RepoLoader:
 
     @staticmethod
     def get_documents(
-        path: str,
+        path: str | bytes,
         parser: Parser = Parser(ParsingConfig()),
         file_types: Optional[List[str]] = None,
         exclude_dirs: Optional[List[str]] = None,
         depth: int = -1,
         lines: Optional[int] = None,
+        doc_type: str | DocumentType | None = None,
     ) -> List[Document]:
         """
         Recursively get all files under a path as Document objects.
 
         Args:
-            path (str): The path to the directory or file.
+            path (str|bytes): The path to the directory or file, or bytes content.
+                The bytes option is meant to support the case where the content
+                has already been read from a file in an upstream process
+                (e.g. from an API or a database), and we want to avoid having to
+                write it to a temporary file just to read it again.
+                (which can be very slow for large files,
+                especially in a docker container)
             parser (Parser): Parser to use to parse files.
             file_types (List[str], optional): List of file extensions OR
                 filenames OR file_path_names to include.

@@ -475,6 +519,7 @@ class RepoLoader:
                 which includes all depths.
             lines (int, optional): Number of lines to read from each file.
                 Defaults to None, which reads all lines.
+            doc_type (str|DocumentType, optional): The type of document to parse.
 
         Returns:
             List[Document]: List of Document objects representing files.

@@ -482,52 +527,69 @@ class RepoLoader:
         """
         docs = []
         file_paths = []
-
-
-        if path_obj.is_file():
-            file_paths.append(str(path_obj))
+        if isinstance(path, bytes):
+            file_paths.append(path)
         else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            path_obj = Path(path).resolve()
+
+            if path_obj.is_file():
+                file_paths.append(str(path_obj))
+            else:
+                path_depth = len(path_obj.parts)
+                for root, dirs, files in os.walk(path):
+                    # Exclude directories if needed
+                    if exclude_dirs:
+                        dirs[:] = [d for d in dirs if d not in exclude_dirs]
+
+                    current_depth = len(Path(root).resolve().parts) - path_depth
+                    if depth == -1 or current_depth <= depth:
+                        for file in files:
+                            file_path = str(Path(root) / file)
+                            if (
+                                file_types is None
+                                or RepoLoader._file_type(file_path) in file_types
+                                or os.path.basename(file_path) in file_types
+                                or file_path in file_types
+                            ):
+                                file_paths.append(file_path)
 
         for file_path in file_paths:
-
-
-            doc_parser = DocumentParser.create(
+            docs.extend(
+                DocumentParser.chunks_from_path_or_bytes(
                     file_path,
-                parser
-
-
-            else:
-                with open(file_path, "r") as f:
-                    if lines is not None:
-                        file_lines = list(itertools.islice(f, lines))
-                        content = "\n".join(line.strip() for line in file_lines)
-                    else:
-                        content = f.read()
-                soup = BeautifulSoup(content, "html.parser")
-                text = soup.get_text()
-                docs.append(
-                    Document(
-                        content=text,
-                        metadata=DocMetaData(source=str(file_path)),
-                    )
+                    parser,
+                    doc_type=doc_type,
+                    lines=lines,
                 )
+            )
+            # dtype: DocumentType = DocumentParser._document_type(file_path, doc_type)
+            # if dtype in [DocumentType.PDF, DocumentType.DOC, DocumentType.DOCX]:
+            #     doc_parser = DocumentParser.create(
+            #         file_path,
+            #         parser.config,
+            #         doc_type=doc_type,
+            #     )
+            #     new_chunks = doc_parser.get_doc_chunks()
+            #     if len(new_chunks) == 0 and file_extension.lower() == ".pdf":
+            #         doc_parser = ImagePdfParser(file_path, parser.config)
+            #         new_chunks = doc_parser.get_doc_chunks()
+            #     docs.extend(new_chunks)
+            # else:
+            #     # try getting as plain text; these will be chunked downstream
+            #     with open(file_path, "r") as f:
+            #         if lines is not None:
+            #             file_lines = list(itertools.islice(f, lines))
+            #             content = "\n".join(line.strip() for line in file_lines)
+            #         else:
+            #             content = f.read()
+            #     soup = BeautifulSoup(content, "html.parser")
+            #     text = soup.get_text()
+            #     docs.append(
+            #         Document(
+            #             content=text,
+            #             metadata=DocMetaData(source=str(file_path)),
+            #         )
+            #     )
 
         return docs
 
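For orientation, here is a minimal sketch of how the new `IssueData` / `get_issues` API might be used. The `RepoLoader(url)` constructor call and the availability of a GitHub access token in the environment are assumptions based on the surrounding class, not shown in this diff.

```python
# Minimal usage sketch (assumed, not part of the diff):
# RepoLoader is assumed to take a GitHub repo URL, with a GitHub access
# token available in the environment for PyGithub.
from langroid.parsing.repo_loader import RepoLoader

loader = RepoLoader("https://github.com/langroid/langroid")  # illustrative repo URL
issues = loader.get_issues(k=20)  # up to 20 issues, open and closed

# Each element is an IssueData pydantic model with flattened fields,
# convenient to dump into a DataFrame or an LLM prompt.
open_texts = [iss.text for iss in issues if iss.state == "open"]
print(f"{len(open_texts)} open issues fetched")
```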
langroid/parsing/search.py
CHANGED

@@ -64,6 +64,14 @@ def find_fuzzy_matches_in_docs(
             break
     if words_after is None and words_before is None:
         return orig_doc_matches
+    if len(orig_doc_matches) == 0:
+        return []
+    if set(orig_doc_matches[0].__fields__) != {"content", "metadata"}:
+        # If there are fields beyond just content and metadata,
+        # we do NOT want to create new document objects with content fields
+        # based on words_before and words_after, since we don't know how to
+        # set those other fields.
+        return orig_doc_matches
 
     contextual_matches = []
     for match in orig_doc_matches:
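The new guard skips the context-expansion step for Document subclasses that carry extra fields. A small illustrative sketch (the subclass below is hypothetical, not part of the diff):

```python
# Hypothetical Document subclass used only to illustrate the guard condition.
from langroid.mytypes import DocMetaData, Document


class ScoredDoc(Document):
    score: float = 0.0  # extra field beyond content/metadata


doc = ScoredDoc(content="hello world", metadata=DocMetaData(source="test"))
# __fields__ now includes "score", so find_fuzzy_matches_in_docs returns the
# original matches unchanged rather than building new Documents with
# words_before/words_after context, since it cannot populate "score".
assert set(doc.__fields__) != {"content", "metadata"}
```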
langroid/parsing/table_loader.py
CHANGED

@@ -1,4 +1,5 @@
 from csv import Sniffer
+from typing import List
 
 import pandas as pd
 

@@ -48,3 +49,46 @@ def read_tabular_data(path_or_url: str, sep: None | str = None) -> pd.DataFrame:
             "Unable to read data. "
             "Please ensure it is correctly formatted. Error: " + str(e)
         )
+
+
+def describe_dataframe(
+    df: pd.DataFrame, filter_fields: List[str] = [], n_vals: int = 10
+) -> str:
+    """
+    Generates a description of the columns in the dataframe,
+    along with a listing of up to `n_vals` unique values for each column.
+    Intended to be used to insert into an LLM context so it can generate
+    appropriate queries or filters on the df.
+
+    Args:
+        df (pd.DataFrame): The dataframe to describe.
+        filter_fields (list): A list of fields that can be used for filtering.
+            When non-empty, the values-list will be restricted to these.
+        n_vals (int): How many unique values to show for each column.
+
+    Returns:
+        str: A description of the dataframe.
+    """
+    description = []
+    for column in df.columns.to_list():
+        unique_values = df[column].dropna().unique()
+        unique_count = len(unique_values)
+        if column not in filter_fields:
+            values_desc = f"{unique_count} unique values"
+        else:
+            if unique_count > n_vals:
+                displayed_values = unique_values[:n_vals]
+                more_count = unique_count - n_vals
+                values_desc = f" Values - {displayed_values}, ... {more_count} more"
+            else:
+                values_desc = f" Values - {unique_values}"
+        col_type = "string" if df[column].dtype == "object" else df[column].dtype
+        col_desc = f"* {column} ({col_type}); {values_desc}"
+        description.append(col_desc)
+
+    all_cols = "\n".join(description)
+
+    return f"""
+        Name of each field, its type and unique values (up to {n_vals}):
+        {all_cols}
+        """
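A short sketch of the new `describe_dataframe` helper in use; the sample data is illustrative and the exact rendering of the returned string is determined by the code above.

```python
import pandas as pd

from langroid.parsing.table_loader import describe_dataframe

df = pd.DataFrame(
    {
        "state": ["CA", "NY", "CA", "TX"],
        "population": [39_500_000, 19_500_000, 39_500_000, 29_000_000],
    }
)

# Columns in filter_fields get their unique values listed (up to n_vals);
# other columns only report a count of unique values.
print(describe_dataframe(df, filter_fields=["state"], n_vals=5))
```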
langroid/parsing/url_loader.py
CHANGED

@@ -1,6 +1,9 @@
 import logging
+import os
+from tempfile import NamedTemporaryFile
 from typing import List, no_type_check
 
+import requests
 import trafilatura
 from trafilatura.downloads import (
     add_to_compressed_dict,

@@ -9,7 +12,7 @@ from trafilatura.downloads import (
 )
 
 from langroid.mytypes import DocMetaData, Document
-from langroid.parsing.document_parser import DocumentParser
+from langroid.parsing.document_parser import DocumentParser, ImagePdfParser
 from langroid.parsing.parser import Parser, ParsingConfig
 
 logging.getLogger("trafilatura").setLevel(logging.ERROR)

@@ -44,22 +47,65 @@ class URLLoader:
             sleep_time=5,
         )
         for url, result in buffered_downloads(buffer, threads):
-            if
+            if (
+                url.lower().endswith(".pdf")
+                or url.lower().endswith(".docx")
+                or url.lower().endswith(".doc")
+            ):
                 doc_parser = DocumentParser.create(
                     url,
                     self.parser.config,
                 )
-
+                new_chunks = doc_parser.get_doc_chunks()
+                if len(new_chunks) == 0:
+                    # If the document is empty, try to extract images
+                    img_parser = ImagePdfParser(url, self.parser.config)
+                    new_chunks = img_parser.get_doc_chunks()
+                docs.extend(new_chunks)
             else:
-
-
-
-
-
-
-
-
-
+                # Try to detect content type and handle accordingly
+                headers = requests.head(url).headers
+                content_type = headers.get("Content-Type", "").lower()
+                temp_file_suffix = None
+                if "application/pdf" in content_type:
+                    temp_file_suffix = ".pdf"
+                elif (
+                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+                    in content_type
+                ):
+                    temp_file_suffix = ".docx"
+                elif "application/msword" in content_type:
+                    temp_file_suffix = ".doc"
+
+                if temp_file_suffix:
+                    # Download the document content
+                    response = requests.get(url)
+                    with NamedTemporaryFile(
+                        delete=False, suffix=temp_file_suffix
+                    ) as temp_file:
+                        temp_file.write(response.content)
+                        temp_file_path = temp_file.name
+                    # Process the downloaded document
+                    doc_parser = DocumentParser.create(
+                        temp_file_path, self.parser.config
+                    )
+                    docs.extend(doc_parser.get_doc_chunks())
+                    # Clean up the temporary file
+                    os.remove(temp_file_path)
+                else:
+                    text = trafilatura.extract(
+                        result,
+                        no_fallback=False,
+                        favor_recall=True,
                     )
+                    if (
+                        text is None
+                        and result is not None
+                        and isinstance(result, str)
+                    ):
+                        text = result
+                    if text is not None and text != "":
+                        docs.append(
+                            Document(content=text, metadata=DocMetaData(source=url))
+                        )
         return docs
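A sketch of the updated URLLoader behavior: document-like URLs (by extension, or by Content-Type from a HEAD request) are routed through DocumentParser, while other pages fall back to trafilatura text extraction. The URLs below are illustrative; the constructor and `load()` call follow the usage visible elsewhere in this diff.

```python
from langroid.parsing.parser import Parser, ParsingConfig
from langroid.parsing.url_loader import URLLoader

loader = URLLoader(
    urls=[
        "https://example.com/report.pdf",      # parsed via DocumentParser
        "https://example.com/blog/some-post",  # extracted via trafilatura
    ],
    parser=Parser(ParsingConfig()),
)
docs = loader.load()
for d in docs:
    print(d.metadata.source, len(d.content))
```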
langroid/parsing/urls.py
CHANGED

@@ -112,26 +112,35 @@ def is_url(s: str) -> bool:
         return False
 
 
-def
+def get_urls_paths_bytes_indices(
+    inputs: List[str | bytes],
+) -> Tuple[List[int], List[int], List[int]]:
     """
-    Given a list of inputs, return a
+    Given a list of inputs, return a
+    list of indices of URLs, list of indices of paths, list of indices of byte-contents.
     Args:
-        inputs: list of strings
+        inputs: list of strings or bytes
     Returns:
-        list of
+        list of Indices of URLs,
+        list of indices of paths,
+        list of indices of byte-contents
     """
     urls = []
     paths = []
-
+    byte_list = []
+    for i, item in enumerate(inputs):
+        if isinstance(item, bytes):
+            byte_list.append(i)
+            continue
         try:
-
-            urls.append(
+            Url(url=parse_obj_as(HttpUrl, item))
+            urls.append(i)
         except ValidationError:
             if os.path.exists(item):
-                paths.append(
+                paths.append(i)
             else:
                 logger.warning(f"{item} is neither a URL nor a path.")
-    return urls, paths
+    return urls, paths, byte_list
 
 
 def crawl_url(url: str, max_urls: int = 1) -> List[str]:
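The reworked helper now returns indices rather than items, with a third bucket for raw bytes. A small sketch (the inputs are illustrative):

```python
from langroid.parsing.urls import get_urls_paths_bytes_indices

inputs = [
    "https://example.com/page.html",  # URL
    "README.md",                      # local path (counted only if it exists)
    b"%PDF-1.4 ...",                  # already-loaded bytes content
]
url_idxs, path_idxs, byte_idxs = get_urls_paths_bytes_indices(inputs)
# e.g. url_idxs == [0], path_idxs == [1] (when README.md exists), byte_idxs == [2]
originals = [inputs[i] for i in url_idxs + path_idxs + byte_idxs]
```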
langroid/parsing/utils.py
CHANGED

@@ -1,16 +1,26 @@
 import difflib
+import logging
 import random
 import re
 from functools import cache
 from itertools import islice
-from typing import
+from typing import Iterable, List, Sequence, TypeVar
 
 import nltk
 from faker import Faker
 
+from langroid.mytypes import Document
+from langroid.parsing.document_parser import DocumentType
+from langroid.parsing.parser import Parser, ParsingConfig
+from langroid.parsing.repo_loader import RepoLoader
+from langroid.parsing.url_loader import URLLoader
+from langroid.parsing.urls import get_urls_paths_bytes_indices
+
 Faker.seed(23)
 random.seed(43)
 
+logger = logging.getLogger(__name__)
+
 
 # Ensures the NLTK resource is available
 @cache

@@ -21,7 +31,10 @@ def download_nltk_resource(resource: str) -> None:
     nltk.download(resource, quiet=True)
 
 
-
+T = TypeVar("T")
+
+
+def batched(iterable: Iterable[T], n: int) -> Iterable[Sequence[T]]:
     """Batch data into tuples of length n. The last batch may be shorter."""
     # batched('ABCDEFG', 3) --> ABC DEF G
     if n < 1:

@@ -101,14 +114,35 @@ def split_paragraphs(text: str) -> List[str]:
     return [para.strip() for para in paras if para.strip()]
 
 
-def
+def split_newlines(text: str) -> List[str]:
+    """
+    Split the input text into lines using "\n" as the delimiter.
+
+    Args:
+        text (str): The input text.
+
+    Returns:
+        list: A list of lines.
+    """
+    lines = re.split(r"\n", text)
+    return [line.strip() for line in lines if line.strip()]
+
+
+def number_segments(s: str, granularity: int = 1) -> str:
     """
     Number the segments in a given text, preserving paragraph structure.
-    A segment is a sequence of `len` consecutive sentences
+    A segment is a sequence of `len` consecutive "sentences", where a "sentence"
+    is either a normal sentence, or if there isn't enough punctuation to properly
+    identify sentences, then we use a pseudo-sentence via heuristics (split by newline
+    or failing that, just split every 40 words). The goal here is simply to number
+    segments at a reasonable granularity so the LLM can identify relevant segments,
+    in the RelevanceExtractorAgent.
 
     Args:
         s (str): The input text.
-
+        granularity (int): The number of sentences in a segment.
+            If this is -1, then the entire text is treated as a single segment,
+            and is numbered as <#1#>.
 
     Returns:
         str: The text with segments numbered in the style <#1#>, <#2#> etc.

@@ -117,15 +151,42 @@ def number_segments(s: str, len: int = 1) -> str:
     >>> number_segments("Hello world! How are you? Have a good day.")
     '<#1#> Hello world! <#2#> How are you? <#3#> Have a good day.'
     """
+    if granularity < 0:
+        return "<#1#> " + s
     numbered_text = []
     count = 0
 
     paragraphs = split_paragraphs(s)
     for paragraph in paragraphs:
         sentences = nltk.sent_tokenize(paragraph)
+        # Some docs are problematic (e.g. resumes) and have no (or too few) periods,
+        # so we can't split usefully into sentences.
+        # We try a series of heuristics to split into sentences,
+        # until the avg num words per sentence is less than 40.
+        avg_words_per_sentence = sum(
+            len(nltk.word_tokenize(sentence)) for sentence in sentences
+        ) / len(sentences)
+        if avg_words_per_sentence > 40:
+            sentences = split_newlines(paragraph)
+            avg_words_per_sentence = sum(
+                len(nltk.word_tokenize(sentence)) for sentence in sentences
+            ) / len(sentences)
+            if avg_words_per_sentence > 40:
+                # Still too long, just split on every 40 words
+                sentences = []
+                for sentence in nltk.sent_tokenize(paragraph):
+                    words = nltk.word_tokenize(sentence)
+                    for i in range(0, len(words), 40):
+                        # if there are less than 20 words left after this,
+                        # just add them to the last sentence and break
+                        if len(words) - i < 20:
+                            sentences.append(" ".join(words[i:]))
+                            break
+                        else:
+                            sentences.append(" ".join(words[i : i + 40]))
         for i, sentence in enumerate(sentences):
-            num = count //
-            number_prefix = f"<#{num}#>" if count %
+            num = count // granularity + 1
+            number_prefix = f"<#{num}#>" if count % granularity == 0 else ""
             sentence = f"{number_prefix} {sentence}"
             count += 1
             sentences[i] = sentence

@@ -136,7 +197,7 @@ def number_segments(s: str, len: int = 1) -> str:
 
 
 def number_sentences(s: str) -> str:
-    return number_segments(s,
+    return number_segments(s, granularity=1)
 
 
 def parse_number_range_list(specs: str) -> List[int]:

@@ -156,6 +217,9 @@ def parse_number_range_list(specs: str) -> List[int]:
     """
     spec_indices = set()  # type: ignore
     for part in specs.split(","):
+        # some weak LLMs may generate <#1#> instead of 1, so extract just the digits
+        # or the "-"
+        part = "".join(char for char in part if char.isdigit() or char == "-")
         if "-" in part:
             start, end = map(int, part.split("-"))
             spec_indices.update(range(start, end + 1))

@@ -224,7 +288,8 @@ def extract_numbered_segments(s: str, specs: str) -> str:
 
     # Regular expression to identify numbered segments like
    # <#1#> Hello world! This is me. <#2#> How are you? <#3#> Have a good day.
-
+    # Note we match any character between segment markers, including newlines.
+    segment_pattern = re.compile(r"<#(\d+)#>([\s\S]*?)(?=<#\d+#>|$)")
 
     # Split the text into paragraphs while preserving their boundaries
     paragraphs = split_paragraphs(s)

@@ -247,3 +312,59 @@ def extract_numbered_segments(s: str, specs: str) -> str:
         extracted_paragraphs.append(" ".join(extracted_segments))
 
     return "\n\n".join(extracted_paragraphs)
+
+
+def extract_content_from_path(
+    path: bytes | str | List[bytes | str],
+    parsing: ParsingConfig,
+    doc_type: str | DocumentType | None = None,
+) -> str | List[str]:
+    """
+    Extract the content from a file path or URL, or a list of file paths or URLs.
+
+    Args:
+        path (bytes | str | List[str]): The file path or URL, or a list of file paths or
+            URLs, or bytes content. The bytes option is meant to support cases
+            where upstream code may have already loaded the content (e.g., from a
+            database or API) and we want to avoid having to copy the content to a
+            temporary file.
+        parsing (ParsingConfig): The parsing configuration.
+        doc_type (str | DocumentType | None): The document type if known.
+            If multiple paths are given, this MUST apply to ALL docs.
+
+    Returns:
+        str | List[str]: The extracted content if a single file path or URL is provided,
+            or a list of extracted contents if a
+            list of file paths or URLs is provided.
+    """
+    if isinstance(path, str) or isinstance(path, bytes):
+        paths = [path]
+    elif isinstance(path, list) and len(path) == 0:
+        return ""
+    else:
+        paths = path
+
+    url_idxs, path_idxs, byte_idxs = get_urls_paths_bytes_indices(paths)
+    urls = [paths[i] for i in url_idxs]
+    path_list = [paths[i] for i in path_idxs]
+    byte_list = [paths[i] for i in byte_idxs]
+    path_list.extend(byte_list)
+    parser = Parser(parsing)
+    docs: List[Document] = []
+    try:
+        if len(urls) > 0:
+            loader = URLLoader(urls=urls, parser=parser)  # type: ignore
+            docs = loader.load()
+        if len(path_list) > 0:
+            for p in path_list:
+                path_docs = RepoLoader.get_documents(
+                    p, parser=parser, doc_type=doc_type
+                )
+                docs.extend(path_docs)
+    except Exception as e:
+        logger.warning(f"Error loading path {paths}: {e}")
+        return ""
+    if len(docs) == 1:
+        return docs[0].content
+    else:
+        return [d.content for d in docs]
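Finally, a sketch of the new top-level convenience function: a single path, URL, or bytes input yields a string, and a list yields a list of strings. The file names below are illustrative.

```python
from langroid.parsing.parser import ParsingConfig
from langroid.parsing.utils import extract_content_from_path

# Single input -> str
text = extract_content_from_path("notes.pdf", ParsingConfig())

# Mixed list of paths, URLs, and raw bytes -> List[str]
with open("report.docx", "rb") as f:
    raw = f.read()
texts = extract_content_from_path(
    ["notes.pdf", "https://example.com/paper.pdf", raw],
    ParsingConfig(),
)
```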