langroid 0.1.85__py3-none-any.whl → 0.1.219__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langroid/__init__.py +95 -0
- langroid/agent/__init__.py +40 -0
- langroid/agent/base.py +222 -91
- langroid/agent/batch.py +264 -0
- langroid/agent/callbacks/chainlit.py +608 -0
- langroid/agent/chat_agent.py +247 -101
- langroid/agent/chat_document.py +41 -4
- langroid/agent/openai_assistant.py +842 -0
- langroid/agent/special/__init__.py +50 -0
- langroid/agent/special/doc_chat_agent.py +837 -141
- langroid/agent/special/lance_doc_chat_agent.py +258 -0
- langroid/agent/special/lance_rag/__init__.py +9 -0
- langroid/agent/special/lance_rag/critic_agent.py +136 -0
- langroid/agent/special/lance_rag/lance_rag_task.py +80 -0
- langroid/agent/special/lance_rag/query_planner_agent.py +180 -0
- langroid/agent/special/lance_tools.py +44 -0
- langroid/agent/special/neo4j/__init__.py +0 -0
- langroid/agent/special/neo4j/csv_kg_chat.py +174 -0
- langroid/agent/special/neo4j/neo4j_chat_agent.py +370 -0
- langroid/agent/special/neo4j/utils/__init__.py +0 -0
- langroid/agent/special/neo4j/utils/system_message.py +46 -0
- langroid/agent/special/relevance_extractor_agent.py +127 -0
- langroid/agent/special/retriever_agent.py +32 -198
- langroid/agent/special/sql/__init__.py +11 -0
- langroid/agent/special/sql/sql_chat_agent.py +47 -23
- langroid/agent/special/sql/utils/__init__.py +22 -0
- langroid/agent/special/sql/utils/description_extractors.py +95 -46
- langroid/agent/special/sql/utils/populate_metadata.py +28 -21
- langroid/agent/special/table_chat_agent.py +43 -9
- langroid/agent/task.py +475 -122
- langroid/agent/tool_message.py +75 -13
- langroid/agent/tools/__init__.py +13 -0
- langroid/agent/tools/duckduckgo_search_tool.py +66 -0
- langroid/agent/tools/google_search_tool.py +11 -0
- langroid/agent/tools/metaphor_search_tool.py +67 -0
- langroid/agent/tools/recipient_tool.py +16 -29
- langroid/agent/tools/run_python_code.py +60 -0
- langroid/agent/tools/sciphi_search_rag_tool.py +79 -0
- langroid/agent/tools/segment_extract_tool.py +36 -0
- langroid/cachedb/__init__.py +9 -0
- langroid/cachedb/base.py +22 -2
- langroid/cachedb/momento_cachedb.py +26 -2
- langroid/cachedb/redis_cachedb.py +78 -11
- langroid/embedding_models/__init__.py +34 -0
- langroid/embedding_models/base.py +21 -2
- langroid/embedding_models/models.py +120 -18
- langroid/embedding_models/protoc/embeddings.proto +19 -0
- langroid/embedding_models/protoc/embeddings_pb2.py +33 -0
- langroid/embedding_models/protoc/embeddings_pb2.pyi +50 -0
- langroid/embedding_models/protoc/embeddings_pb2_grpc.py +79 -0
- langroid/embedding_models/remote_embeds.py +153 -0
- langroid/language_models/__init__.py +45 -0
- langroid/language_models/azure_openai.py +80 -27
- langroid/language_models/base.py +117 -12
- langroid/language_models/config.py +5 -0
- langroid/language_models/openai_assistants.py +3 -0
- langroid/language_models/openai_gpt.py +558 -174
- langroid/language_models/prompt_formatter/__init__.py +15 -0
- langroid/language_models/prompt_formatter/base.py +4 -6
- langroid/language_models/prompt_formatter/hf_formatter.py +135 -0
- langroid/language_models/utils.py +18 -21
- langroid/mytypes.py +25 -8
- langroid/parsing/__init__.py +46 -0
- langroid/parsing/document_parser.py +260 -63
- langroid/parsing/image_text.py +32 -0
- langroid/parsing/parse_json.py +143 -0
- langroid/parsing/parser.py +122 -59
- langroid/parsing/repo_loader.py +114 -52
- langroid/parsing/search.py +68 -63
- langroid/parsing/spider.py +3 -2
- langroid/parsing/table_loader.py +44 -0
- langroid/parsing/url_loader.py +59 -11
- langroid/parsing/urls.py +85 -37
- langroid/parsing/utils.py +298 -4
- langroid/parsing/web_search.py +73 -0
- langroid/prompts/__init__.py +11 -0
- langroid/prompts/chat-gpt4-system-prompt.md +68 -0
- langroid/prompts/prompts_config.py +1 -1
- langroid/utils/__init__.py +17 -0
- langroid/utils/algorithms/__init__.py +3 -0
- langroid/utils/algorithms/graph.py +103 -0
- langroid/utils/configuration.py +36 -5
- langroid/utils/constants.py +4 -0
- langroid/utils/globals.py +2 -2
- langroid/utils/logging.py +2 -5
- langroid/utils/output/__init__.py +21 -0
- langroid/utils/output/printing.py +47 -1
- langroid/utils/output/status.py +33 -0
- langroid/utils/pandas_utils.py +30 -0
- langroid/utils/pydantic_utils.py +616 -2
- langroid/utils/system.py +98 -0
- langroid/vector_store/__init__.py +40 -0
- langroid/vector_store/base.py +203 -6
- langroid/vector_store/chromadb.py +59 -32
- langroid/vector_store/lancedb.py +463 -0
- langroid/vector_store/meilisearch.py +10 -7
- langroid/vector_store/momento.py +262 -0
- langroid/vector_store/qdrantdb.py +104 -22
- {langroid-0.1.85.dist-info → langroid-0.1.219.dist-info}/METADATA +329 -149
- langroid-0.1.219.dist-info/RECORD +127 -0
- {langroid-0.1.85.dist-info → langroid-0.1.219.dist-info}/WHEEL +1 -1
- langroid/agent/special/recipient_validator_agent.py +0 -157
- langroid/parsing/json.py +0 -64
- langroid/utils/web/selenium_login.py +0 -36
- langroid-0.1.85.dist-info/RECORD +0 -94
- /langroid/{scripts → agent/callbacks}/__init__.py +0 -0
- {langroid-0.1.85.dist-info → langroid-0.1.219.dist-info}/LICENSE +0 -0
langroid/parsing/search.py
CHANGED
@@ -7,10 +7,8 @@ See tests for examples: tests/main/test_string_search.py
 """
 
 import difflib
-import re
 from typing import List, Tuple
 
-import nltk
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
 from nltk.tokenize import RegexpTokenizer
@@ -19,10 +17,13 @@ from thefuzz import fuzz, process
 
 from langroid.mytypes import Document
 
+from .utils import download_nltk_resource
+
 
 def find_fuzzy_matches_in_docs(
     query: str,
     docs: List[Document],
+    docs_clean: List[Document],
     k: int,
     words_before: int | None = None,
     words_after: int | None = None,
@@ -48,58 +49,53 @@ def find_fuzzy_matches_in_docs(
         return []
     best_matches = process.extract(
         query,
-        [d.content for d in docs],
+        [d.content for d in docs_clean],
         limit=k,
         scorer=fuzz.partial_ratio,
     )
 
     real_matches = [m for m, score in best_matches if score > 50]
-                words_in_text = doc.content.split()
-                first_word_idx = next(
-                    (
-                        i
-                        for i, word in enumerate(words_in_text)
-                        if word.startswith(words[0])
-                    ),
-                    -1,
-                )
-                if words_before is None:
-                    words_before = len(words_in_text)
-                if words_after is None:
-                    words_after = len(words_in_text)
-                if first_word_idx != -1:
-                    start_idx = max(0, first_word_idx - words_before)
-                    end_idx = min(
-                        len(words_in_text),
-                        first_word_idx + len(words) + words_after,
-                    )
-                    doc_match = Document(
-                        content=" ".join(words_in_text[start_idx:end_idx]),
-                        metadata=doc.metadata,
-                    )
-                    results.append(doc_match)
+    # find the original docs that corresponding to the matches
+    orig_doc_matches = []
+    for i, m in enumerate(real_matches):
+        for j, doc_clean in enumerate(docs_clean):
+            if m in doc_clean.content:
+                orig_doc_matches.append(docs[j])
                 break
+    if words_after is None and words_before is None:
+        return orig_doc_matches
+    if len(orig_doc_matches) == 0:
+        return []
+    if set(orig_doc_matches[0].__fields__) != {"content", "metadata"}:
+        # If there are fields beyond just content and metadata,
+        # we do NOT want to create new document objects with content fields
+        # based on words_before and words_after, since we don't know how to
+        # set those other fields.
+        return orig_doc_matches
+
+    contextual_matches = []
+    for match in orig_doc_matches:
+        choice_text = match.content
+        contexts = []
+        while choice_text != "":
+            context, start_pos, end_pos = get_context(
+                query, choice_text, words_before, words_after
+            )
+            if context == "" or end_pos == 0:
+                break
+            contexts.append(context)
+            words = choice_text.split()
+            end_pos = min(end_pos, len(words))
+            choice_text = " ".join(words[end_pos:])
+        if len(contexts) > 0:
+            contextual_matches.append(
+                Document(
+                    content=" ... ".join(contexts),
+                    metadata=match.metadata,
+                )
+            )
 
-    return results
-
-
-# Ensure NLTK resources are available
-def download_nltk_resources() -> None:
-    resources = ["punkt", "wordnet", "stopwords"]
-    for resource in resources:
-        try:
-            nltk.data.find(resource)
-        except LookupError:
-            nltk.download(resource)
-
-
-download_nltk_resources()
+    return contextual_matches
 
 
 def preprocess_text(text: str) -> str:
@@ -117,6 +113,10 @@ def preprocess_text(text: str) -> str:
     Returns:
         str: The preprocessed text.
     """
+    # Ensure the NLTK resources are available
+    for resource in ["punkt", "wordnet", "stopwords"]:
+        download_nltk_resource(resource)
+
     # Lowercase the text
     text = text.lower()
 
@@ -179,7 +179,7 @@ def get_context(
     text: str,
     words_before: int | None = 100,
     words_after: int | None = 100,
-) -> str:
+) -> Tuple[str, int, int]:
     """
     Returns a portion of text containing the best approximate match of the query,
     including b words before and a words after the match.
@@ -193,7 +193,9 @@ def get_context(
     Returns:
         str: A string containing b words before, the match, and a words after
             the best approximate match position of the query in the text. If no
-            match is found, returns
+            match is found, returns empty string.
+        int: The start position of the match in the text.
+        int: The end position of the match in the text.
 
     Example:
     >>> get_context("apple", "The quick brown fox jumps over the apple.", 3, 2)
@@ -201,26 +203,29 @@ def get_context(
     """
     if words_after is None and words_before is None:
         # return entire text since we're not asked to return a bounded context
-        return text
+        return text, 0, 0
+
+    # make sure there is a good enough match to the query
+    if fuzz.partial_ratio(query, text) < 40:
+        return "", 0, 0
 
     sequence_matcher = difflib.SequenceMatcher(None, text, query)
     match = sequence_matcher.find_longest_match(0, len(text), 0, len(query))
 
     if match.size == 0:
-        return ""
+        return "", 0, 0
+
+    segments = text.split()
+    n_segs = len(segments)
+
+    start_segment_pos = len(text[: match.a].split())
+
+    words_before = words_before or n_segs
+    words_after = words_after or n_segs
+    start_pos = max(0, start_segment_pos - words_before)
+    end_pos = min(len(segments), start_segment_pos + words_after + len(query.split()))
 
-    return " ".join(
+    return " ".join(segments[start_pos:end_pos]), start_pos, end_pos
 
 
 def eliminate_near_duplicates(passages: List[str], threshold: float = 0.8) -> List[str]:
langroid/parsing/spider.py
CHANGED
@@ -4,6 +4,7 @@ from urllib.parse import urlparse
 from pydispatch import dispatcher
 from scrapy import signals
 from scrapy.crawler import CrawlerRunner
+from scrapy.http import Response
 from scrapy.linkextractors import LinkExtractor
 from scrapy.spiders import CrawlSpider, Rule
 from twisted.internet import defer, reactor
@@ -30,7 +31,7 @@ class DomainSpecificSpider(CrawlSpider): # type: ignore
         self.k = k
         self.visited_urls: Set[str] = set()
 
-    def parse_item(self, response): # type: ignore
+    def parse_item(self, response: Response): # type: ignore
         """Extracts URLs that are within the same domain.
 
         Args:
@@ -57,7 +58,7 @@ def scrapy_fetch_urls(url: str, k: int = 20) -> List[str]:
     """
     urls = []
 
-    def _collect_urls(spider
+    def _collect_urls(spider):
         """Handler for the spider_closed signal. Collects the visited URLs."""
         nonlocal urls
         urls.extend(list(spider.visited_urls))
langroid/parsing/table_loader.py
CHANGED
@@ -1,4 +1,5 @@
 from csv import Sniffer
+from typing import List
 
 import pandas as pd
 
@@ -48,3 +49,46 @@ def read_tabular_data(path_or_url: str, sep: None | str = None) -> pd.DataFrame:
             "Unable to read data. "
             "Please ensure it is correctly formatted. Error: " + str(e)
         )
+
+
+def describe_dataframe(
+    df: pd.DataFrame, filter_fields: List[str] = [], n_vals: int = 10
+) -> str:
+    """
+    Generates a description of the columns in the dataframe,
+    along with a listing of up to `n_vals` unique values for each column.
+    Intended to be used to insert into an LLM context so it can generate
+    appropriate queries or filters on the df.
+
+    Args:
+        df (pd.DataFrame): The dataframe to describe.
+        filter_fields (list): A list of fields that can be used for filtering.
+            When non-empty, the values-list will be restricted to these.
+        n_vals (int): How many unique values to show for each column.
+
+    Returns:
+        str: A description of the dataframe.
+    """
+    description = []
+    for column in df.columns.to_list():
+        unique_values = df[column].dropna().unique()
+        unique_count = len(unique_values)
+        if column not in filter_fields:
+            values_desc = f"{unique_count} unique values"
+        else:
+            if unique_count > n_vals:
+                displayed_values = unique_values[:n_vals]
+                more_count = unique_count - n_vals
+                values_desc = f" Values - {displayed_values}, ... {more_count} more"
+            else:
+                values_desc = f" Values - {unique_values}"
+        col_type = "string" if df[column].dtype == "object" else df[column].dtype
+        col_desc = f"* {column} ({col_type}); {values_desc}"
+        description.append(col_desc)
+
+    all_cols = "\n".join(description)
+
+    return f"""
+        Name of each field, its type and unique values (up to {n_vals}):
+        {all_cols}
+        """
langroid/parsing/url_loader.py
CHANGED
@@ -1,6 +1,9 @@
 import logging
+import os
+from tempfile import NamedTemporaryFile
 from typing import List, no_type_check
 
+import requests
 import trafilatura
 from trafilatura.downloads import (
     add_to_compressed_dict,
@@ -9,7 +12,7 @@ from trafilatura.downloads import (
 )
 
 from langroid.mytypes import DocMetaData, Document
-from langroid.parsing.document_parser import DocumentParser
+from langroid.parsing.document_parser import DocumentParser, ImagePdfParser
 from langroid.parsing.parser import Parser, ParsingConfig
 
 logging.getLogger("trafilatura").setLevel(logging.ERROR)
@@ -44,20 +47,65 @@ class URLLoader:
             sleep_time=5,
         )
         for url, result in buffered_downloads(buffer, threads):
+            if (
+                url.lower().endswith(".pdf")
+                or url.lower().endswith(".docx")
+                or url.lower().endswith(".doc")
+            ):
                 doc_parser = DocumentParser.create(
                     url,
                     self.parser.config,
                 )
+                new_chunks = doc_parser.get_doc_chunks()
+                if len(new_chunks) == 0:
+                    # If the document is empty, try to extract images
+                    img_parser = ImagePdfParser(url, self.parser.config)
+                    new_chunks = img_parser.get_doc_chunks()
+                docs.extend(new_chunks)
             else:
+                # Try to detect content type and handle accordingly
+                headers = requests.head(url).headers
+                content_type = headers.get("Content-Type", "").lower()
+                temp_file_suffix = None
+                if "application/pdf" in content_type:
+                    temp_file_suffix = ".pdf"
+                elif (
+                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+                    in content_type
+                ):
+                    temp_file_suffix = ".docx"
+                elif "application/msword" in content_type:
+                    temp_file_suffix = ".doc"
+
+                if temp_file_suffix:
+                    # Download the document content
+                    response = requests.get(url)
+                    with NamedTemporaryFile(
+                        delete=False, suffix=temp_file_suffix
+                    ) as temp_file:
+                        temp_file.write(response.content)
+                        temp_file_path = temp_file.name
+                    # Process the downloaded document
+                    doc_parser = DocumentParser.create(
+                        temp_file_path, self.parser.config
+                    )
+                    docs.extend(doc_parser.get_doc_chunks())
+                    # Clean up the temporary file
+                    os.remove(temp_file_path)
+                else:
+                    text = trafilatura.extract(
+                        result,
+                        no_fallback=False,
+                        favor_recall=True,
                     )
+                    if (
+                        text is None
+                        and result is not None
+                        and isinstance(result, str)
+                    ):
+                        text = result
+                    if text is not None and text != "":
+                        docs.append(
+                            Document(content=text, metadata=DocMetaData(source=url))
+                        )
         return docs
langroid/parsing/urls.py
CHANGED
@@ -4,7 +4,7 @@ import tempfile
 import urllib.parse
 import urllib.robotparser
 from typing import List, Optional, Set, Tuple
-from urllib.parse import urljoin
+from urllib.parse import urldefrag, urljoin, urlparse
 
 import fire
 import requests
@@ -14,8 +14,6 @@ from rich import print
 from rich.prompt import Prompt
 from trafilatura.spider import focused_crawler
 
-from langroid.parsing.spider import scrapy_fetch_urls
-
 logger = logging.getLogger(__name__)
 
 
@@ -86,7 +84,15 @@ def get_list_from_user(
             url = input_str
             input_str = Prompt.ask("[blue] How many new URLs to crawl?", default="0")
             max_urls = int(input_str) + 1
-            tot_urls =
+            tot_urls = list(find_urls(url, max_links=max_urls, max_depth=2))
+            tot_urls_str = "\n".join(tot_urls)
+            print(
+                f"""
+                Found these {len(tot_urls)} links upto depth 2:
+                {tot_urls_str}
+                """
+            )
+
             input_set.update(tot_urls)
         else:
             input_set.add(input_str.strip())
@@ -106,32 +112,42 @@ def is_url(s: str) -> bool:
         return False
 
 
+def get_urls_paths_bytes_indices(
+    inputs: List[str | bytes],
+) -> Tuple[List[int], List[int], List[int]]:
     """
-    Given a list of inputs, return a
+    Given a list of inputs, return a
+    list of indices of URLs, list of indices of paths, list of indices of byte-contents.
     Args:
-        inputs: list of strings
+        inputs: list of strings or bytes
     Returns:
+        list of Indices of URLs,
+        list of indices of paths,
+        list of indices of byte-contents
     """
     urls = []
     paths = []
+    byte_list = []
+    for i, item in enumerate(inputs):
+        if isinstance(item, bytes):
+            byte_list.append(i)
+            continue
         try:
-            urls.append(
+            Url(url=parse_obj_as(HttpUrl, item))
+            urls.append(i)
         except ValidationError:
             if os.path.exists(item):
-                paths.append(
+                paths.append(i)
             else:
                 logger.warning(f"{item} is neither a URL nor a path.")
-    return urls, paths
+    return urls, paths, byte_list
 
 
 def crawl_url(url: str, max_urls: int = 1) -> List[str]:
     """
     Crawl starting at the url and return a list of URLs to be parsed,
     up to a maximum of `max_urls`.
+    This has not been tested to work as intended. Ignore.
     """
     if max_urls == 1:
         # no need to crawl, just return the original list
@@ -161,6 +177,7 @@ def crawl_url(url: str, max_urls: int = 1) -> List[str]:
         )
         if to_visit is None:
             break
+
     if known_urls is None:
         return [url]
     final_urls = [s.strip() for s in known_urls]
@@ -169,46 +186,77 @@ def crawl_url(url: str, max_urls: int = 1) -> List[str]:
 
 def find_urls(
     url: str = "https://en.wikipedia.org/wiki/Generative_pre-trained_transformer",
+    max_links: int = 20,
     visited: Optional[Set[str]] = None,
     depth: int = 0,
     max_depth: int = 2,
+    match_domain: bool = True,
 ) -> Set[str]:
     """
     Recursively find all URLs on a given page.
+
     Args:
-        url:
+        url (str): The URL to start from.
+        max_links (int): The maximum number of links to find.
+        visited (set): A set of URLs that have already been visited.
+        depth (int): The current depth of the recursion.
+        max_depth (int): The maximum depth of the recursion.
+        match_domain (bool): Whether to only return URLs that are on the same domain.
 
     Returns:
+        set: A set of URLs found on the page.
     """
+
     if visited is None:
         visited = set()
-    visited.add(url)
 
-        response = requests.get(url)
-        response.raise_for_status()
-    except (
-        requests.exceptions.HTTPError,
-        requests.exceptions.RequestException,
-    ):
-        print(f"Failed to fetch '{url}'")
+    if url in visited or depth > max_depth:
         return visited
 
-    urls = [urljoin(url, link["href"]) for link in links] # Construct full URLs
-    if depth < max_depth:
-        for link_url in urls:
-            if link_url not in visited:
-                find_urls(link_url, visited, depth + 1, max_depth)
+    visited.add(url)
+    base_domain = urlparse(url).netloc
 
+    try:
+        response = requests.get(url, timeout=5)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.text, "html.parser")
+        links = [urljoin(url, a["href"]) for a in soup.find_all("a", href=True)]
+
+        # Defrag links: discard links that are to portions of same page
+        defragged_links = list(set(urldefrag(link).url for link in links))
+
+        # Filter links based on domain matching requirement
+        domain_matching_links = [
+            link for link in defragged_links if urlparse(link).netloc == base_domain
+        ]
+
+        # ensure url is first, since below we are taking first max_links urls
+        domain_matching_links = [url] + [x for x in domain_matching_links if x != url]
+
+        # If found links exceed max_links, return immediately
+        if len(domain_matching_links) >= max_links:
+            return set(domain_matching_links[:max_links])
+
+        for link in domain_matching_links:
+            if len(visited) >= max_links:
+                break
+
+            if link not in visited:
+                visited.update(
+                    find_urls(
+                        link,
+                        max_links,
+                        visited,
+                        depth + 1,
+                        max_depth,
+                        match_domain,
+                    )
+                )
+
+    except (requests.RequestException, Exception) as e:
+        print(f"Error fetching {url}. Error: {e}")
+
+    return set(list(visited)[:max_links])
 
 
 def org_user_from_github(url: str) -> str:
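
With these changes, `find_urls` becomes a bounded, same-domain crawler: it defrags links, filters them to the starting domain, and stops once `max_links` URLs are collected. A usage sketch based on the updated signature shown above (the starting URL is just the function's default argument):

```python
from langroid.parsing.urls import find_urls

# Collect at most 10 same-domain links, following pages up to 2 levels deep.
links = find_urls(
    "https://en.wikipedia.org/wiki/Generative_pre-trained_transformer",
    max_links=10,
    max_depth=2,
)
for link in sorted(links):
    print(link)
```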
|