langroid 0.1.139__py3-none-any.whl → 0.1.219__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langroid/__init__.py +70 -0
- langroid/agent/__init__.py +22 -0
- langroid/agent/base.py +120 -33
- langroid/agent/batch.py +134 -35
- langroid/agent/callbacks/__init__.py +0 -0
- langroid/agent/callbacks/chainlit.py +608 -0
- langroid/agent/chat_agent.py +164 -100
- langroid/agent/chat_document.py +19 -2
- langroid/agent/openai_assistant.py +20 -10
- langroid/agent/special/__init__.py +33 -10
- langroid/agent/special/doc_chat_agent.py +521 -108
- langroid/agent/special/lance_doc_chat_agent.py +258 -0
- langroid/agent/special/lance_rag/__init__.py +9 -0
- langroid/agent/special/lance_rag/critic_agent.py +136 -0
- langroid/agent/special/lance_rag/lance_rag_task.py +80 -0
- langroid/agent/special/lance_rag/query_planner_agent.py +180 -0
- langroid/agent/special/lance_tools.py +44 -0
- langroid/agent/special/neo4j/__init__.py +0 -0
- langroid/agent/special/neo4j/csv_kg_chat.py +174 -0
- langroid/agent/special/neo4j/neo4j_chat_agent.py +370 -0
- langroid/agent/special/neo4j/utils/__init__.py +0 -0
- langroid/agent/special/neo4j/utils/system_message.py +46 -0
- langroid/agent/special/relevance_extractor_agent.py +23 -7
- langroid/agent/special/retriever_agent.py +29 -174
- langroid/agent/special/sql/__init__.py +7 -0
- langroid/agent/special/sql/sql_chat_agent.py +47 -23
- langroid/agent/special/sql/utils/__init__.py +11 -0
- langroid/agent/special/sql/utils/description_extractors.py +95 -46
- langroid/agent/special/sql/utils/populate_metadata.py +28 -21
- langroid/agent/special/table_chat_agent.py +43 -9
- langroid/agent/task.py +423 -114
- langroid/agent/tool_message.py +67 -10
- langroid/agent/tools/__init__.py +8 -0
- langroid/agent/tools/duckduckgo_search_tool.py +66 -0
- langroid/agent/tools/google_search_tool.py +11 -0
- langroid/agent/tools/metaphor_search_tool.py +67 -0
- langroid/agent/tools/recipient_tool.py +6 -24
- langroid/agent/tools/sciphi_search_rag_tool.py +79 -0
- langroid/cachedb/__init__.py +6 -0
- langroid/embedding_models/__init__.py +24 -0
- langroid/embedding_models/base.py +9 -1
- langroid/embedding_models/models.py +117 -17
- langroid/embedding_models/protoc/embeddings.proto +19 -0
- langroid/embedding_models/protoc/embeddings_pb2.py +33 -0
- langroid/embedding_models/protoc/embeddings_pb2.pyi +50 -0
- langroid/embedding_models/protoc/embeddings_pb2_grpc.py +79 -0
- langroid/embedding_models/remote_embeds.py +153 -0
- langroid/language_models/__init__.py +22 -0
- langroid/language_models/azure_openai.py +47 -4
- langroid/language_models/base.py +26 -10
- langroid/language_models/config.py +5 -0
- langroid/language_models/openai_gpt.py +407 -121
- langroid/language_models/prompt_formatter/__init__.py +9 -0
- langroid/language_models/prompt_formatter/base.py +4 -6
- langroid/language_models/prompt_formatter/hf_formatter.py +135 -0
- langroid/language_models/utils.py +10 -9
- langroid/mytypes.py +10 -4
- langroid/parsing/__init__.py +33 -1
- langroid/parsing/document_parser.py +259 -63
- langroid/parsing/image_text.py +32 -0
- langroid/parsing/parse_json.py +143 -0
- langroid/parsing/parser.py +20 -7
- langroid/parsing/repo_loader.py +108 -46
- langroid/parsing/search.py +8 -0
- langroid/parsing/table_loader.py +44 -0
- langroid/parsing/url_loader.py +59 -13
- langroid/parsing/urls.py +18 -9
- langroid/parsing/utils.py +130 -9
- langroid/parsing/web_search.py +73 -0
- langroid/prompts/__init__.py +7 -0
- langroid/prompts/chat-gpt4-system-prompt.md +68 -0
- langroid/prompts/prompts_config.py +1 -1
- langroid/utils/__init__.py +10 -0
- langroid/utils/algorithms/__init__.py +3 -0
- langroid/utils/configuration.py +0 -1
- langroid/utils/constants.py +4 -0
- langroid/utils/logging.py +2 -5
- langroid/utils/output/__init__.py +15 -2
- langroid/utils/output/status.py +33 -0
- langroid/utils/pandas_utils.py +30 -0
- langroid/utils/pydantic_utils.py +446 -4
- langroid/utils/system.py +36 -1
- langroid/vector_store/__init__.py +34 -2
- langroid/vector_store/base.py +33 -2
- langroid/vector_store/chromadb.py +42 -13
- langroid/vector_store/lancedb.py +226 -60
- langroid/vector_store/meilisearch.py +7 -6
- langroid/vector_store/momento.py +3 -2
- langroid/vector_store/qdrantdb.py +82 -11
- {langroid-0.1.139.dist-info → langroid-0.1.219.dist-info}/METADATA +190 -129
- langroid-0.1.219.dist-info/RECORD +127 -0
- langroid/agent/special/recipient_validator_agent.py +0 -157
- langroid/parsing/json.py +0 -64
- langroid/utils/web/selenium_login.py +0 -36
- langroid-0.1.139.dist-info/RECORD +0 -103
- {langroid-0.1.139.dist-info → langroid-0.1.219.dist-info}/LICENSE +0 -0
- {langroid-0.1.139.dist-info → langroid-0.1.219.dist-info}/WHEEL +0 -0
langroid/parsing/repo_loader.py
CHANGED

@@ -10,15 +10,15 @@ from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union
 from urllib.parse import urlparse
 
-from bs4 import BeautifulSoup
 from dotenv import load_dotenv
 from github import Github
 from github.ContentFile import ContentFile
+from github.Label import Label
 from github.Repository import Repository
-from pydantic import BaseSettings
+from pydantic import BaseModel, BaseSettings, Field
 
 from langroid.mytypes import DocMetaData, Document
-from langroid.parsing.document_parser import DocumentParser
+from langroid.parsing.document_parser import DocumentParser, DocumentType
 from langroid.parsing.parser import Parser, ParsingConfig
 
 logger = logging.getLogger(__name__)

@@ -43,6 +43,22 @@ def _has_files(directory: str) -> bool:
     return False
 
 
+# Pydantic model for GitHub issue data
+class IssueData(BaseModel):
+    state: str = Field(..., description="State of issue e.g. open or closed")
+    year: int = Field(..., description="Year issue was created")
+    month: int = Field(..., description="Month issue was created")
+    day: int = Field(..., description="Day issue was created")
+    assignee: Optional[str] = Field(..., description="Assignee of issue")
+    size: Optional[str] = Field(..., description="Size of issue, e.g. XS, S, M, L, XXL")
+    text: str = Field(..., description="Text of issue, i.e. description body")
+
+
+def get_issue_size(labels: List[Label]) -> str | None:
+    sizes = ["XS", "S", "M", "L", "XL", "XXL"]
+    return next((label.name for label in labels if label.name in sizes), None)
+
+
 class RepoLoaderConfig(BaseSettings):
     """
     Configuration for RepoLoader.

@@ -155,6 +171,27 @@ class RepoLoader:
     def _get_dir_name(self) -> str:
         return urlparse(self.url).path.replace("/", "_")
 
+    def get_issues(self, k: int | None = 100) -> List[IssueData]:
+        """Get up to k issues from the GitHub repo."""
+        if k is None:
+            issues = self.repo.get_issues(state="all")
+        else:
+            issues = self.repo.get_issues(state="all")[:k]
+        issue_data_list = []
+        for issue in issues:
+            issue_data = IssueData(
+                state=issue.state,
+                year=issue.created_at.year,
+                month=issue.created_at.month,
+                day=issue.created_at.day,
+                assignee=issue.assignee.login if issue.assignee else None,
+                size=get_issue_size(issue.labels),
+                text=issue.body or "No issue description body.",
+            )
+            issue_data_list.append(issue_data)
+
+        return issue_data_list
+
     @staticmethod
     def _file_type(name: str) -> str:
         """

@@ -453,18 +490,25 @@ class RepoLoader:
 
     @staticmethod
     def get_documents(
-        path: str,
+        path: str | bytes,
         parser: Parser = Parser(ParsingConfig()),
         file_types: Optional[List[str]] = None,
         exclude_dirs: Optional[List[str]] = None,
         depth: int = -1,
         lines: Optional[int] = None,
+        doc_type: str | DocumentType | None = None,
     ) -> List[Document]:
         """
         Recursively get all files under a path as Document objects.
 
         Args:
-            path (str): The path to the directory or file.
+            path (str|bytes): The path to the directory or file, or bytes content.
+                The bytes option is meant to support the case where the content
+                has already been read from a file in an upstream process
+                (e.g. from an API or a database), and we want to avoid having to
+                write it to a temporary file just to read it again.
+                (which can be very slow for large files,
+                especially in a docker container)
             parser (Parser): Parser to use to parse files.
             file_types (List[str], optional): List of file extensions OR
                 filenames OR file_path_names to include.

@@ -475,6 +519,7 @@ class RepoLoader:
                 which includes all depths.
             lines (int, optional): Number of lines to read from each file.
                 Defaults to None, which reads all lines.
+            doc_type (str|DocumentType, optional): The type of document to parse.
 
         Returns:
             List[Document]: List of Document objects representing files.

@@ -482,52 +527,69 @@ class RepoLoader:
         """
         docs = []
         file_paths = []
-
-
-        if path_obj.is_file():
-            file_paths.append(str(path_obj))
+        if isinstance(path, bytes):
+            file_paths.append(path)
         else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            path_obj = Path(path).resolve()
+
+            if path_obj.is_file():
+                file_paths.append(str(path_obj))
+            else:
+                path_depth = len(path_obj.parts)
+                for root, dirs, files in os.walk(path):
+                    # Exclude directories if needed
+                    if exclude_dirs:
+                        dirs[:] = [d for d in dirs if d not in exclude_dirs]
+
+                    current_depth = len(Path(root).resolve().parts) - path_depth
+                    if depth == -1 or current_depth <= depth:
+                        for file in files:
+                            file_path = str(Path(root) / file)
+                            if (
+                                file_types is None
+                                or RepoLoader._file_type(file_path) in file_types
+                                or os.path.basename(file_path) in file_types
+                                or file_path in file_types
+                            ):
+                                file_paths.append(file_path)
 
         for file_path in file_paths:
-
-
-            doc_parser = DocumentParser.create(
+            docs.extend(
+                DocumentParser.chunks_from_path_or_bytes(
                     file_path,
-                parser
-
-
-            else:
-                with open(file_path, "r") as f:
-                    if lines is not None:
-                        file_lines = list(itertools.islice(f, lines))
-                        content = "\n".join(line.strip() for line in file_lines)
-                    else:
-                        content = f.read()
-                soup = BeautifulSoup(content, "html.parser")
-                text = soup.get_text()
-                docs.append(
-                    Document(
-                        content=text,
-                        metadata=DocMetaData(source=str(file_path)),
-                    )
+                    parser,
+                    doc_type=doc_type,
+                    lines=lines,
                 )
+            )
+            # dtype: DocumentType = DocumentParser._document_type(file_path, doc_type)
+            # if dtype in [DocumentType.PDF, DocumentType.DOC, DocumentType.DOCX]:
+            #     doc_parser = DocumentParser.create(
+            #         file_path,
+            #         parser.config,
+            #         doc_type=doc_type,
+            #     )
+            #     new_chunks = doc_parser.get_doc_chunks()
+            #     if len(new_chunks) == 0 and file_extension.lower() == ".pdf":
+            #         doc_parser = ImagePdfParser(file_path, parser.config)
+            #         new_chunks = doc_parser.get_doc_chunks()
+            #     docs.extend(new_chunks)
+            # else:
+            #     # try getting as plain text; these will be chunked downstream
+            #     with open(file_path, "r") as f:
+            #         if lines is not None:
+            #             file_lines = list(itertools.islice(f, lines))
+            #             content = "\n".join(line.strip() for line in file_lines)
+            #         else:
+            #             content = f.read()
+            #     soup = BeautifulSoup(content, "html.parser")
+            #     text = soup.get_text()
+            #     docs.append(
+            #         Document(
+            #             content=text,
+            #             metadata=DocMetaData(source=str(file_path)),
+            #         )
+            #     )
 
         return docs
 
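For orientation, here is a minimal sketch of how the new `IssueData` / `get_issues` API might be used. The `RepoLoader(url)` constructor call and the availability of a GitHub access token in the environment are assumptions based on the surrounding class, not shown in this diff.

```python
# Minimal usage sketch (assumed, not part of the diff):
# RepoLoader is assumed to take a GitHub repo URL, with a GitHub access
# token available in the environment for PyGithub.
from langroid.parsing.repo_loader import RepoLoader

loader = RepoLoader("https://github.com/langroid/langroid")  # illustrative repo URL
issues = loader.get_issues(k=20)  # up to 20 issues, open and closed

# Each element is an IssueData pydantic model with flattened fields,
# convenient to dump into a DataFrame or an LLM prompt.
open_texts = [iss.text for iss in issues if iss.state == "open"]
print(f"{len(open_texts)} open issues fetched")
```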
langroid/parsing/search.py
CHANGED

@@ -64,6 +64,14 @@ def find_fuzzy_matches_in_docs(
             break
     if words_after is None and words_before is None:
         return orig_doc_matches
+    if len(orig_doc_matches) == 0:
+        return []
+    if set(orig_doc_matches[0].__fields__) != {"content", "metadata"}:
+        # If there are fields beyond just content and metadata,
+        # we do NOT want to create new document objects with content fields
+        # based on words_before and words_after, since we don't know how to
+        # set those other fields.
+        return orig_doc_matches
 
     contextual_matches = []
     for match in orig_doc_matches:
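The new guard skips the context-expansion step for Document subclasses that carry extra fields. A small illustrative sketch (the subclass below is hypothetical, not part of the diff):

```python
# Hypothetical Document subclass used only to illustrate the guard condition.
from langroid.mytypes import DocMetaData, Document


class ScoredDoc(Document):
    score: float = 0.0  # extra field beyond content/metadata


doc = ScoredDoc(content="hello world", metadata=DocMetaData(source="test"))
# __fields__ now includes "score", so find_fuzzy_matches_in_docs returns the
# original matches unchanged rather than building new Documents with
# words_before/words_after context, since it cannot populate "score".
assert set(doc.__fields__) != {"content", "metadata"}
```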
langroid/parsing/table_loader.py
CHANGED

@@ -1,4 +1,5 @@
 from csv import Sniffer
+from typing import List
 
 import pandas as pd
 

@@ -48,3 +49,46 @@ def read_tabular_data(path_or_url: str, sep: None | str = None) -> pd.DataFrame:
             "Unable to read data. "
             "Please ensure it is correctly formatted. Error: " + str(e)
         )
+
+
+def describe_dataframe(
+    df: pd.DataFrame, filter_fields: List[str] = [], n_vals: int = 10
+) -> str:
+    """
+    Generates a description of the columns in the dataframe,
+    along with a listing of up to `n_vals` unique values for each column.
+    Intended to be used to insert into an LLM context so it can generate
+    appropriate queries or filters on the df.
+
+    Args:
+        df (pd.DataFrame): The dataframe to describe.
+        filter_fields (list): A list of fields that can be used for filtering.
+            When non-empty, the values-list will be restricted to these.
+        n_vals (int): How many unique values to show for each column.
+
+    Returns:
+        str: A description of the dataframe.
+    """
+    description = []
+    for column in df.columns.to_list():
+        unique_values = df[column].dropna().unique()
+        unique_count = len(unique_values)
+        if column not in filter_fields:
+            values_desc = f"{unique_count} unique values"
+        else:
+            if unique_count > n_vals:
+                displayed_values = unique_values[:n_vals]
+                more_count = unique_count - n_vals
+                values_desc = f" Values - {displayed_values}, ... {more_count} more"
+            else:
+                values_desc = f" Values - {unique_values}"
+        col_type = "string" if df[column].dtype == "object" else df[column].dtype
+        col_desc = f"* {column} ({col_type}); {values_desc}"
+        description.append(col_desc)
+
+    all_cols = "\n".join(description)
+
+    return f"""
+        Name of each field, its type and unique values (up to {n_vals}):
+        {all_cols}
+        """
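A short sketch of the new `describe_dataframe` helper in use; the sample data is illustrative and the exact rendering of the returned string is determined by the code above.

```python
import pandas as pd

from langroid.parsing.table_loader import describe_dataframe

df = pd.DataFrame(
    {
        "state": ["CA", "NY", "CA", "TX"],
        "population": [39_500_000, 19_500_000, 39_500_000, 29_000_000],
    }
)

# Columns in filter_fields get their unique values listed (up to n_vals);
# other columns only report a count of unique values.
print(describe_dataframe(df, filter_fields=["state"], n_vals=5))
```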
langroid/parsing/url_loader.py
CHANGED

@@ -1,6 +1,9 @@
 import logging
+import os
+from tempfile import NamedTemporaryFile
 from typing import List, no_type_check
 
+import requests
 import trafilatura
 from trafilatura.downloads import (
     add_to_compressed_dict,

@@ -9,7 +12,7 @@ from trafilatura.downloads import (
 )
 
 from langroid.mytypes import DocMetaData, Document
-from langroid.parsing.document_parser import DocumentParser
+from langroid.parsing.document_parser import DocumentParser, ImagePdfParser
 from langroid.parsing.parser import Parser, ParsingConfig
 
 logging.getLogger("trafilatura").setLevel(logging.ERROR)

@@ -44,22 +47,65 @@ class URLLoader:
             sleep_time=5,
         )
         for url, result in buffered_downloads(buffer, threads):
-            if
+            if (
+                url.lower().endswith(".pdf")
+                or url.lower().endswith(".docx")
+                or url.lower().endswith(".doc")
+            ):
                 doc_parser = DocumentParser.create(
                     url,
                     self.parser.config,
                 )
-
+                new_chunks = doc_parser.get_doc_chunks()
+                if len(new_chunks) == 0:
+                    # If the document is empty, try to extract images
+                    img_parser = ImagePdfParser(url, self.parser.config)
+                    new_chunks = img_parser.get_doc_chunks()
+                docs.extend(new_chunks)
             else:
-
-
-
-
-
-
-
-
-
+                # Try to detect content type and handle accordingly
+                headers = requests.head(url).headers
+                content_type = headers.get("Content-Type", "").lower()
+                temp_file_suffix = None
+                if "application/pdf" in content_type:
+                    temp_file_suffix = ".pdf"
+                elif (
+                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+                    in content_type
+                ):
+                    temp_file_suffix = ".docx"
+                elif "application/msword" in content_type:
+                    temp_file_suffix = ".doc"
+
+                if temp_file_suffix:
+                    # Download the document content
+                    response = requests.get(url)
+                    with NamedTemporaryFile(
+                        delete=False, suffix=temp_file_suffix
+                    ) as temp_file:
+                        temp_file.write(response.content)
+                        temp_file_path = temp_file.name
+                    # Process the downloaded document
+                    doc_parser = DocumentParser.create(
+                        temp_file_path, self.parser.config
+                    )
+                    docs.extend(doc_parser.get_doc_chunks())
+                    # Clean up the temporary file
+                    os.remove(temp_file_path)
+                else:
+                    text = trafilatura.extract(
+                        result,
+                        no_fallback=False,
+                        favor_recall=True,
                     )
+                    if (
+                        text is None
+                        and result is not None
+                        and isinstance(result, str)
+                    ):
+                        text = result
+                    if text is not None and text != "":
+                        docs.append(
+                            Document(content=text, metadata=DocMetaData(source=url))
+                        )
         return docs
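A sketch of the updated URLLoader behavior: document-like URLs (by extension, or by Content-Type from a HEAD request) are routed through DocumentParser, while other pages fall back to trafilatura text extraction. The URLs below are illustrative; the constructor and `load()` call follow the usage visible elsewhere in this diff.

```python
from langroid.parsing.parser import Parser, ParsingConfig
from langroid.parsing.url_loader import URLLoader

loader = URLLoader(
    urls=[
        "https://example.com/report.pdf",      # parsed via DocumentParser
        "https://example.com/blog/some-post",  # extracted via trafilatura
    ],
    parser=Parser(ParsingConfig()),
)
docs = loader.load()
for d in docs:
    print(d.metadata.source, len(d.content))
```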
langroid/parsing/urls.py
CHANGED

@@ -112,26 +112,35 @@ def is_url(s: str) -> bool:
         return False
 
 
-def
+def get_urls_paths_bytes_indices(
+    inputs: List[str | bytes],
+) -> Tuple[List[int], List[int], List[int]]:
     """
-    Given a list of inputs, return a
+    Given a list of inputs, return a
+    list of indices of URLs, list of indices of paths, list of indices of byte-contents.
     Args:
-        inputs: list of strings
+        inputs: list of strings or bytes
     Returns:
-        list of
+        list of Indices of URLs,
+        list of indices of paths,
+        list of indices of byte-contents
     """
     urls = []
     paths = []
-
+    byte_list = []
+    for i, item in enumerate(inputs):
+        if isinstance(item, bytes):
+            byte_list.append(i)
+            continue
         try:
-
-            urls.append(
+            Url(url=parse_obj_as(HttpUrl, item))
+            urls.append(i)
         except ValidationError:
             if os.path.exists(item):
-                paths.append(
+                paths.append(i)
             else:
                 logger.warning(f"{item} is neither a URL nor a path.")
-    return urls, paths
+    return urls, paths, byte_list
 
 
 def crawl_url(url: str, max_urls: int = 1) -> List[str]:
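The reworked helper now returns indices rather than items, with a third bucket for raw bytes. A small sketch (the inputs are illustrative):

```python
from langroid.parsing.urls import get_urls_paths_bytes_indices

inputs = [
    "https://example.com/page.html",  # URL
    "README.md",                      # local path (counted only if it exists)
    b"%PDF-1.4 ...",                  # already-loaded bytes content
]
url_idxs, path_idxs, byte_idxs = get_urls_paths_bytes_indices(inputs)
# e.g. url_idxs == [0], path_idxs == [1] (when README.md exists), byte_idxs == [2]
originals = [inputs[i] for i in url_idxs + path_idxs + byte_idxs]
```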
langroid/parsing/utils.py
CHANGED

@@ -1,16 +1,26 @@
 import difflib
+import logging
 import random
 import re
 from functools import cache
 from itertools import islice
-from typing import
+from typing import Iterable, List, Sequence, TypeVar
 
 import nltk
 from faker import Faker
 
+from langroid.mytypes import Document
+from langroid.parsing.document_parser import DocumentType
+from langroid.parsing.parser import Parser, ParsingConfig
+from langroid.parsing.repo_loader import RepoLoader
+from langroid.parsing.url_loader import URLLoader
+from langroid.parsing.urls import get_urls_paths_bytes_indices
+
 Faker.seed(23)
 random.seed(43)
 
+logger = logging.getLogger(__name__)
+
 
 # Ensures the NLTK resource is available
 @cache

@@ -21,7 +31,10 @@ def download_nltk_resource(resource: str) -> None:
     nltk.download(resource, quiet=True)
 
 
-
+T = TypeVar("T")
+
+
+def batched(iterable: Iterable[T], n: int) -> Iterable[Sequence[T]]:
     """Batch data into tuples of length n. The last batch may be shorter."""
     # batched('ABCDEFG', 3) --> ABC DEF G
     if n < 1:

@@ -101,14 +114,35 @@ def split_paragraphs(text: str) -> List[str]:
     return [para.strip() for para in paras if para.strip()]
 
 
-def
+def split_newlines(text: str) -> List[str]:
+    """
+    Split the input text into lines using "\n" as the delimiter.
+
+    Args:
+        text (str): The input text.
+
+    Returns:
+        list: A list of lines.
+    """
+    lines = re.split(r"\n", text)
+    return [line.strip() for line in lines if line.strip()]
+
+
+def number_segments(s: str, granularity: int = 1) -> str:
     """
     Number the segments in a given text, preserving paragraph structure.
-    A segment is a sequence of `len` consecutive sentences
+    A segment is a sequence of `len` consecutive "sentences", where a "sentence"
+    is either a normal sentence, or if there isn't enough punctuation to properly
+    identify sentences, then we use a pseudo-sentence via heuristics (split by newline
+    or failing that, just split every 40 words). The goal here is simply to number
+    segments at a reasonable granularity so the LLM can identify relevant segments,
+    in the RelevanceExtractorAgent.
 
     Args:
         s (str): The input text.
-
+        granularity (int): The number of sentences in a segment.
+            If this is -1, then the entire text is treated as a single segment,
+            and is numbered as <#1#>.
 
     Returns:
         str: The text with segments numbered in the style <#1#>, <#2#> etc.

@@ -117,15 +151,42 @@ def number_segments(s: str, len: int = 1) -> str:
     >>> number_segments("Hello world! How are you? Have a good day.")
     '<#1#> Hello world! <#2#> How are you? <#3#> Have a good day.'
     """
+    if granularity < 0:
+        return "<#1#> " + s
     numbered_text = []
     count = 0
 
     paragraphs = split_paragraphs(s)
     for paragraph in paragraphs:
         sentences = nltk.sent_tokenize(paragraph)
+        # Some docs are problematic (e.g. resumes) and have no (or too few) periods,
+        # so we can't split usefully into sentences.
+        # We try a series of heuristics to split into sentences,
+        # until the avg num words per sentence is less than 40.
+        avg_words_per_sentence = sum(
+            len(nltk.word_tokenize(sentence)) for sentence in sentences
+        ) / len(sentences)
+        if avg_words_per_sentence > 40:
+            sentences = split_newlines(paragraph)
+            avg_words_per_sentence = sum(
+                len(nltk.word_tokenize(sentence)) for sentence in sentences
+            ) / len(sentences)
+            if avg_words_per_sentence > 40:
+                # Still too long, just split on every 40 words
+                sentences = []
+                for sentence in nltk.sent_tokenize(paragraph):
+                    words = nltk.word_tokenize(sentence)
+                    for i in range(0, len(words), 40):
+                        # if there are less than 20 words left after this,
+                        # just add them to the last sentence and break
+                        if len(words) - i < 20:
+                            sentences.append(" ".join(words[i:]))
+                            break
+                        else:
+                            sentences.append(" ".join(words[i : i + 40]))
         for i, sentence in enumerate(sentences):
-            num = count //
-            number_prefix = f"<#{num}#>" if count %
+            num = count // granularity + 1
+            number_prefix = f"<#{num}#>" if count % granularity == 0 else ""
             sentence = f"{number_prefix} {sentence}"
             count += 1
             sentences[i] = sentence

@@ -136,7 +197,7 @@ def number_segments(s: str, len: int = 1) -> str:
 
 
 def number_sentences(s: str) -> str:
-    return number_segments(s,
+    return number_segments(s, granularity=1)
 
 
 def parse_number_range_list(specs: str) -> List[int]:

@@ -156,6 +217,9 @@ def parse_number_range_list(specs: str) -> List[int]:
     """
     spec_indices = set()  # type: ignore
     for part in specs.split(","):
+        # some weak LLMs may generate <#1#> instead of 1, so extract just the digits
+        # or the "-"
+        part = "".join(char for char in part if char.isdigit() or char == "-")
         if "-" in part:
             start, end = map(int, part.split("-"))
             spec_indices.update(range(start, end + 1))

@@ -224,7 +288,8 @@ def extract_numbered_segments(s: str, specs: str) -> str:
 
     # Regular expression to identify numbered segments like
    # <#1#> Hello world! This is me. <#2#> How are you? <#3#> Have a good day.
-
+    # Note we match any character between segment markers, including newlines.
+    segment_pattern = re.compile(r"<#(\d+)#>([\s\S]*?)(?=<#\d+#>|$)")
 
     # Split the text into paragraphs while preserving their boundaries
     paragraphs = split_paragraphs(s)

@@ -247,3 +312,59 @@ def extract_numbered_segments(s: str, specs: str) -> str:
         extracted_paragraphs.append(" ".join(extracted_segments))
 
     return "\n\n".join(extracted_paragraphs)
+
+
+def extract_content_from_path(
+    path: bytes | str | List[bytes | str],
+    parsing: ParsingConfig,
+    doc_type: str | DocumentType | None = None,
+) -> str | List[str]:
+    """
+    Extract the content from a file path or URL, or a list of file paths or URLs.
+
+    Args:
+        path (bytes | str | List[str]): The file path or URL, or a list of file paths or
+            URLs, or bytes content. The bytes option is meant to support cases
+            where upstream code may have already loaded the content (e.g., from a
+            database or API) and we want to avoid having to copy the content to a
+            temporary file.
+        parsing (ParsingConfig): The parsing configuration.
+        doc_type (str | DocumentType | None): The document type if known.
+            If multiple paths are given, this MUST apply to ALL docs.
+
+    Returns:
+        str | List[str]: The extracted content if a single file path or URL is provided,
+            or a list of extracted contents if a
+            list of file paths or URLs is provided.
+    """
+    if isinstance(path, str) or isinstance(path, bytes):
+        paths = [path]
+    elif isinstance(path, list) and len(path) == 0:
+        return ""
+    else:
+        paths = path
+
+    url_idxs, path_idxs, byte_idxs = get_urls_paths_bytes_indices(paths)
+    urls = [paths[i] for i in url_idxs]
+    path_list = [paths[i] for i in path_idxs]
+    byte_list = [paths[i] for i in byte_idxs]
+    path_list.extend(byte_list)
+    parser = Parser(parsing)
+    docs: List[Document] = []
+    try:
+        if len(urls) > 0:
+            loader = URLLoader(urls=urls, parser=parser)  # type: ignore
+            docs = loader.load()
+        if len(path_list) > 0:
+            for p in path_list:
+                path_docs = RepoLoader.get_documents(
+                    p, parser=parser, doc_type=doc_type
+                )
+                docs.extend(path_docs)
+    except Exception as e:
+        logger.warning(f"Error loading path {paths}: {e}")
+        return ""
+    if len(docs) == 1:
+        return docs[0].content
+    else:
+        return [d.content for d in docs]
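Finally, a sketch of the new top-level convenience function: a single path, URL, or bytes input yields a string, and a list yields a list of strings. The file names below are illustrative.

```python
from langroid.parsing.parser import ParsingConfig
from langroid.parsing.utils import extract_content_from_path

# Single input -> str
text = extract_content_from_path("notes.pdf", ParsingConfig())

# Mixed list of paths, URLs, and raw bytes -> List[str]
with open("report.docx", "rb") as f:
    raw = f.read()
texts = extract_content_from_path(
    ["notes.pdf", "https://example.com/paper.pdf", raw],
    ParsingConfig(),
)
```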