langroid 0.1.85__py3-none-any.whl → 0.1.219__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langroid/__init__.py +95 -0
- langroid/agent/__init__.py +40 -0
- langroid/agent/base.py +222 -91
- langroid/agent/batch.py +264 -0
- langroid/agent/callbacks/chainlit.py +608 -0
- langroid/agent/chat_agent.py +247 -101
- langroid/agent/chat_document.py +41 -4
- langroid/agent/openai_assistant.py +842 -0
- langroid/agent/special/__init__.py +50 -0
- langroid/agent/special/doc_chat_agent.py +837 -141
- langroid/agent/special/lance_doc_chat_agent.py +258 -0
- langroid/agent/special/lance_rag/__init__.py +9 -0
- langroid/agent/special/lance_rag/critic_agent.py +136 -0
- langroid/agent/special/lance_rag/lance_rag_task.py +80 -0
- langroid/agent/special/lance_rag/query_planner_agent.py +180 -0
- langroid/agent/special/lance_tools.py +44 -0
- langroid/agent/special/neo4j/__init__.py +0 -0
- langroid/agent/special/neo4j/csv_kg_chat.py +174 -0
- langroid/agent/special/neo4j/neo4j_chat_agent.py +370 -0
- langroid/agent/special/neo4j/utils/__init__.py +0 -0
- langroid/agent/special/neo4j/utils/system_message.py +46 -0
- langroid/agent/special/relevance_extractor_agent.py +127 -0
- langroid/agent/special/retriever_agent.py +32 -198
- langroid/agent/special/sql/__init__.py +11 -0
- langroid/agent/special/sql/sql_chat_agent.py +47 -23
- langroid/agent/special/sql/utils/__init__.py +22 -0
- langroid/agent/special/sql/utils/description_extractors.py +95 -46
- langroid/agent/special/sql/utils/populate_metadata.py +28 -21
- langroid/agent/special/table_chat_agent.py +43 -9
- langroid/agent/task.py +475 -122
- langroid/agent/tool_message.py +75 -13
- langroid/agent/tools/__init__.py +13 -0
- langroid/agent/tools/duckduckgo_search_tool.py +66 -0
- langroid/agent/tools/google_search_tool.py +11 -0
- langroid/agent/tools/metaphor_search_tool.py +67 -0
- langroid/agent/tools/recipient_tool.py +16 -29
- langroid/agent/tools/run_python_code.py +60 -0
- langroid/agent/tools/sciphi_search_rag_tool.py +79 -0
- langroid/agent/tools/segment_extract_tool.py +36 -0
- langroid/cachedb/__init__.py +9 -0
- langroid/cachedb/base.py +22 -2
- langroid/cachedb/momento_cachedb.py +26 -2
- langroid/cachedb/redis_cachedb.py +78 -11
- langroid/embedding_models/__init__.py +34 -0
- langroid/embedding_models/base.py +21 -2
- langroid/embedding_models/models.py +120 -18
- langroid/embedding_models/protoc/embeddings.proto +19 -0
- langroid/embedding_models/protoc/embeddings_pb2.py +33 -0
- langroid/embedding_models/protoc/embeddings_pb2.pyi +50 -0
- langroid/embedding_models/protoc/embeddings_pb2_grpc.py +79 -0
- langroid/embedding_models/remote_embeds.py +153 -0
- langroid/language_models/__init__.py +45 -0
- langroid/language_models/azure_openai.py +80 -27
- langroid/language_models/base.py +117 -12
- langroid/language_models/config.py +5 -0
- langroid/language_models/openai_assistants.py +3 -0
- langroid/language_models/openai_gpt.py +558 -174
- langroid/language_models/prompt_formatter/__init__.py +15 -0
- langroid/language_models/prompt_formatter/base.py +4 -6
- langroid/language_models/prompt_formatter/hf_formatter.py +135 -0
- langroid/language_models/utils.py +18 -21
- langroid/mytypes.py +25 -8
- langroid/parsing/__init__.py +46 -0
- langroid/parsing/document_parser.py +260 -63
- langroid/parsing/image_text.py +32 -0
- langroid/parsing/parse_json.py +143 -0
- langroid/parsing/parser.py +122 -59
- langroid/parsing/repo_loader.py +114 -52
- langroid/parsing/search.py +68 -63
- langroid/parsing/spider.py +3 -2
- langroid/parsing/table_loader.py +44 -0
- langroid/parsing/url_loader.py +59 -11
- langroid/parsing/urls.py +85 -37
- langroid/parsing/utils.py +298 -4
- langroid/parsing/web_search.py +73 -0
- langroid/prompts/__init__.py +11 -0
- langroid/prompts/chat-gpt4-system-prompt.md +68 -0
- langroid/prompts/prompts_config.py +1 -1
- langroid/utils/__init__.py +17 -0
- langroid/utils/algorithms/__init__.py +3 -0
- langroid/utils/algorithms/graph.py +103 -0
- langroid/utils/configuration.py +36 -5
- langroid/utils/constants.py +4 -0
- langroid/utils/globals.py +2 -2
- langroid/utils/logging.py +2 -5
- langroid/utils/output/__init__.py +21 -0
- langroid/utils/output/printing.py +47 -1
- langroid/utils/output/status.py +33 -0
- langroid/utils/pandas_utils.py +30 -0
- langroid/utils/pydantic_utils.py +616 -2
- langroid/utils/system.py +98 -0
- langroid/vector_store/__init__.py +40 -0
- langroid/vector_store/base.py +203 -6
- langroid/vector_store/chromadb.py +59 -32
- langroid/vector_store/lancedb.py +463 -0
- langroid/vector_store/meilisearch.py +10 -7
- langroid/vector_store/momento.py +262 -0
- langroid/vector_store/qdrantdb.py +104 -22
- {langroid-0.1.85.dist-info → langroid-0.1.219.dist-info}/METADATA +329 -149
- langroid-0.1.219.dist-info/RECORD +127 -0
- {langroid-0.1.85.dist-info → langroid-0.1.219.dist-info}/WHEEL +1 -1
- langroid/agent/special/recipient_validator_agent.py +0 -157
- langroid/parsing/json.py +0 -64
- langroid/utils/web/selenium_login.py +0 -36
- langroid-0.1.85.dist-info/RECORD +0 -94
- /langroid/{scripts → agent/callbacks}/__init__.py +0 -0
- {langroid-0.1.85.dist-info → langroid-0.1.219.dist-info}/LICENSE +0 -0
langroid/parsing/utils.py
CHANGED
@@ -1,19 +1,40 @@
 import difflib
+import logging
 import random
+import re
+from functools import cache
 from itertools import islice
-from typing import
+from typing import Iterable, List, Sequence, TypeVar
 
 import nltk
 from faker import Faker
 
-
-
+from langroid.mytypes import Document
+from langroid.parsing.document_parser import DocumentType
+from langroid.parsing.parser import Parser, ParsingConfig
+from langroid.parsing.repo_loader import RepoLoader
+from langroid.parsing.url_loader import URLLoader
+from langroid.parsing.urls import get_urls_paths_bytes_indices
 
 Faker.seed(23)
 random.seed(43)
 
+logger = logging.getLogger(__name__)
 
-
+
+# Ensures the NLTK resource is available
+@cache
+def download_nltk_resource(resource: str) -> None:
+    try:
+        nltk.data.find(resource)
+    except LookupError:
+        nltk.download(resource, quiet=True)
+
+
+T = TypeVar("T")
+
+
+def batched(iterable: Iterable[T], n: int) -> Iterable[Sequence[T]]:
     """Batch data into tuples of length n. The last batch may be shorter."""
     # batched('ABCDEFG', 3) --> ABC DEF G
     if n < 1:
@@ -25,6 +46,8 @@ def batched(iterable: Iterable[Any], n: int) -> Iterable[Any]:
 
 def generate_random_sentences(k: int) -> str:
     # Load the sample text
+    download_nltk_resource("gutenberg")
+
     from nltk.corpus import gutenberg
 
     text = gutenberg.raw("austen-emma.txt")
@@ -74,3 +97,274 @@ def closest_string(query: str, string_list: List[str]) -> str:
     )
 
     return original_closest_match
+
+
+def split_paragraphs(text: str) -> List[str]:
+    """
+    Split the input text into paragraphs using "\n\n" as the delimiter.
+
+    Args:
+        text (str): The input text.
+
+    Returns:
+        list: A list of paragraphs.
+    """
+    # Split based on a newline, followed by spaces/tabs, then another newline.
+    paras = re.split(r"\n[ \t]*\n", text)
+    return [para.strip() for para in paras if para.strip()]
+
+
+def split_newlines(text: str) -> List[str]:
+    """
+    Split the input text into lines using "\n" as the delimiter.
+
+    Args:
+        text (str): The input text.
+
+    Returns:
+        list: A list of lines.
+    """
+    lines = re.split(r"\n", text)
+    return [line.strip() for line in lines if line.strip()]
+
+
+def number_segments(s: str, granularity: int = 1) -> str:
+    """
+    Number the segments in a given text, preserving paragraph structure.
+    A segment is a sequence of `granularity` consecutive "sentences", where a
+    "sentence" is either a normal sentence or, if there isn't enough punctuation
+    to properly identify sentences, a pseudo-sentence derived via heuristics
+    (split by newline, or failing that, split every 40 words). The goal here is
+    simply to number segments at a reasonable granularity so the LLM can
+    identify relevant segments, in the RelevanceExtractorAgent.
+
+    Args:
+        s (str): The input text.
+        granularity (int): The number of sentences in a segment.
+            If this is -1, then the entire text is treated as a single segment,
+            and is numbered as <#1#>.
+
+    Returns:
+        str: The text with segments numbered in the style <#1#>, <#2#> etc.
+
+    Example:
+        >>> number_segments("Hello world! How are you? Have a good day.")
+        '<#1#> Hello world! <#2#> How are you? <#3#> Have a good day.'
+    """
+    if granularity < 0:
+        return "<#1#> " + s
+    numbered_text = []
+    count = 0
+
+    paragraphs = split_paragraphs(s)
+    for paragraph in paragraphs:
+        sentences = nltk.sent_tokenize(paragraph)
+        # Some docs are problematic (e.g. resumes) and have no (or too few) periods,
+        # so we can't split usefully into sentences.
+        # We try a series of heuristics to split into sentences,
+        # until the avg num words per sentence is less than 40.
+        avg_words_per_sentence = sum(
+            len(nltk.word_tokenize(sentence)) for sentence in sentences
+        ) / len(sentences)
+        if avg_words_per_sentence > 40:
+            sentences = split_newlines(paragraph)
+            avg_words_per_sentence = sum(
+                len(nltk.word_tokenize(sentence)) for sentence in sentences
+            ) / len(sentences)
+            if avg_words_per_sentence > 40:
+                # Still too long, just split on every 40 words
+                sentences = []
+                for sentence in nltk.sent_tokenize(paragraph):
+                    words = nltk.word_tokenize(sentence)
+                    for i in range(0, len(words), 40):
+                        # if there are less than 20 words left after this,
+                        # just add them to the last sentence and break
+                        if len(words) - i < 20:
+                            sentences.append(" ".join(words[i:]))
+                            break
+                        else:
+                            sentences.append(" ".join(words[i : i + 40]))
+        for i, sentence in enumerate(sentences):
+            num = count // granularity + 1
+            number_prefix = f"<#{num}#>" if count % granularity == 0 else ""
+            sentence = f"{number_prefix} {sentence}"
+            count += 1
+            sentences[i] = sentence
+        numbered_paragraph = " ".join(sentences)
+        numbered_text.append(numbered_paragraph)
+
+    return " \n\n ".join(numbered_text)
+
+
+def number_sentences(s: str) -> str:
+    return number_segments(s, granularity=1)
+
+
+def parse_number_range_list(specs: str) -> List[int]:
+    """
+    Parse a specs string like "3,5,7-10" into a list of integers.
+
+    Args:
+        specs (str): A string containing segment numbers and/or ranges
+            (e.g., "3,5,7-10").
+
+    Returns:
+        List[int]: List of segment numbers.
+
+    Example:
+        >>> parse_number_range_list("3,5,7-10")
+        [3, 5, 7, 8, 9, 10]
+    """
+    spec_indices = set()  # type: ignore
+    for part in specs.split(","):
+        # some weak LLMs may generate <#1#> instead of 1, so extract just the digits
+        # or the "-"
+        part = "".join(char for char in part if char.isdigit() or char == "-")
+        if "-" in part:
+            start, end = map(int, part.split("-"))
+            spec_indices.update(range(start, end + 1))
+        else:
+            spec_indices.add(int(part))
+
+    return sorted(list(spec_indices))
+
+
+def strip_k(s: str, k: int = 2) -> str:
+    """
+    Strip any leading and trailing whitespaces from the input text beyond length k.
+    This is useful for removing leading/trailing whitespaces from a text while
+    preserving paragraph structure.
+
+    Args:
+        s (str): The input text.
+        k (int): The number of leading and trailing whitespaces to retain.
+
+    Returns:
+        str: The text with leading and trailing whitespaces removed beyond length k.
+    """
+
+    # Count leading and trailing whitespaces
+    leading_count = len(s) - len(s.lstrip())
+    trailing_count = len(s) - len(s.rstrip())
+
+    # Determine how many whitespaces to retain
+    leading_keep = min(leading_count, k)
+    trailing_keep = min(trailing_count, k)
+
+    # Use slicing to get the desired output
+    return s[leading_count - leading_keep : len(s) - (trailing_count - trailing_keep)]
+
+
+def clean_whitespace(text: str) -> str:
+    """Remove extra whitespace from the input text, while preserving
+    paragraph structure.
+    """
+    paragraphs = split_paragraphs(text)
+    cleaned_paragraphs = [" ".join(p.split()) for p in paragraphs if p]
+    return "\n\n".join(cleaned_paragraphs)  # Join the cleaned paragraphs.
+
+
+def extract_numbered_segments(s: str, specs: str) -> str:
+    """
+    Extract specified segments from a numbered text, preserving paragraph structure.
+
+    Args:
+        s (str): The input text containing numbered segments.
+        specs (str): A string containing segment numbers and/or ranges
+            (e.g., "3,5,7-10").
+
+    Returns:
+        str: Extracted segments, keeping original paragraph structures.
+
+    Example:
+        >>> text = "<#1#> Hello world! <#2#> How are you? <#3#> Have a good day."
+        >>> extract_numbered_segments(text, "1,3")
+        'Hello world! Have a good day.'
+    """
+    # Use the helper function to get the list of indices from specs
+    if specs.strip() == "":
+        return ""
+    spec_indices = parse_number_range_list(specs)
+
+    # Regular expression to identify numbered segments like
+    # <#1#> Hello world! This is me. <#2#> How are you? <#3#> Have a good day.
+    # Note we match any character between segment markers, including newlines.
+    segment_pattern = re.compile(r"<#(\d+)#>([\s\S]*?)(?=<#\d+#>|$)")
+
+    # Split the text into paragraphs while preserving their boundaries
+    paragraphs = split_paragraphs(s)
+
+    extracted_paragraphs = []
+
+    for paragraph in paragraphs:
+        segments_with_numbers = segment_pattern.findall(paragraph)
+
+        # Extract the desired segments from this paragraph
+        extracted_segments = [
+            segment
+            for num, segment in segments_with_numbers
+            if int(num) in spec_indices
+        ]
+
+        # If we extracted any segments from this paragraph,
+        # join them and append to results
+        if extracted_segments:
+            extracted_paragraphs.append(" ".join(extracted_segments))
+
+    return "\n\n".join(extracted_paragraphs)
+
+
+def extract_content_from_path(
+    path: bytes | str | List[bytes | str],
+    parsing: ParsingConfig,
+    doc_type: str | DocumentType | None = None,
+) -> str | List[str]:
+    """
+    Extract the content from a file path or URL, or a list of file paths or URLs.
+
+    Args:
+        path (bytes | str | List[bytes | str]): The file path or URL, or a list
+            of file paths or URLs, or bytes content. The bytes option is meant
+            to support cases where upstream code may have already loaded the
+            content (e.g., from a database or API) and we want to avoid having
+            to copy the content to a temporary file.
+        parsing (ParsingConfig): The parsing configuration.
+        doc_type (str | DocumentType | None): The document type if known.
+            If multiple paths are given, this MUST apply to ALL docs.
+
+    Returns:
+        str | List[str]: The extracted content if a single file path or URL is
+            provided, or a list of extracted contents if a list of file paths
+            or URLs is provided.
+    """
+    if isinstance(path, str) or isinstance(path, bytes):
+        paths = [path]
+    elif isinstance(path, list) and len(path) == 0:
+        return ""
+    else:
+        paths = path
+
+    url_idxs, path_idxs, byte_idxs = get_urls_paths_bytes_indices(paths)
+    urls = [paths[i] for i in url_idxs]
+    path_list = [paths[i] for i in path_idxs]
+    byte_list = [paths[i] for i in byte_idxs]
+    path_list.extend(byte_list)
+    parser = Parser(parsing)
+    docs: List[Document] = []
+    try:
+        if len(urls) > 0:
+            loader = URLLoader(urls=urls, parser=parser)  # type: ignore
+            docs = loader.load()
+        if len(path_list) > 0:
+            for p in path_list:
+                path_docs = RepoLoader.get_documents(
+                    p, parser=parser, doc_type=doc_type
+                )
+                docs.extend(path_docs)
+    except Exception as e:
+        logger.warning(f"Error loading path {paths}: {e}")
+        return ""
+    if len(docs) == 1:
+        return docs[0].content
+    else:
+        return [d.content for d in docs]
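To make the new segment-numbering helpers concrete, here is a minimal usage sketch (not part of the diff); it assumes langroid 0.1.219 is installed and the NLTK `punkt` tokenizer data is available (`nltk.download("punkt")`):

```python
# Sketch of the numbering round-trip used by the RelevanceExtractorAgent:
# number the text, let an LLM cite segments as a spec string like "1,3",
# then pull those segments back out verbatim.
from langroid.parsing.utils import (
    extract_numbered_segments,
    number_segments,
    parse_number_range_list,
)

text = "Hello world! How are you? Have a good day."
numbered = number_segments(text, granularity=1)
# '<#1#> Hello world! <#2#> How are you? <#3#> Have a good day.'

# A weak LLM may answer "<#1#>,<#3#>" instead of "1,3"; per the code's
# comment, the parser strips everything but digits and "-", so both work.
assert parse_number_range_list("1,3") == [1, 3]

print(extract_numbered_segments(numbered, "1,3"))
# Hello world! ... Have a good day.  (whitespace may vary)
```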
langroid/parsing/web_search.py
CHANGED
@@ -12,6 +12,7 @@ from typing import Dict, List
 import requests
 from bs4 import BeautifulSoup
 from dotenv import load_dotenv
+from duckduckgo_search import DDGS
 from googleapiclient.discovery import Resource, build
 from requests.models import Response
 
@@ -77,3 +78,75 @@ def google_search(query: str, num_results: int = 5) -> List[WebSearchResult]:
         WebSearchResult(result["title"], result["link"], 3500, 300)
         for result in raw_results
     ]
+
+
+def metaphor_search(query: str, num_results: int = 5) -> List[WebSearchResult]:
+    """
+    Method that makes an API call via the Metaphor client to query the top
+    `num_results` links that match the query. Returns a list of
+    WebSearchResult objects.
+
+    Args:
+        query (str): The query the user wants to make.
+        num_results (int): Number of top matching results that we want to grab.
+    """
+
+    load_dotenv()
+
+    api_key = os.getenv("METAPHOR_API_KEY") or os.getenv("EXA_API_KEY")
+    if not api_key:
+        raise ValueError(
+            """
+            Neither METAPHOR_API_KEY nor EXA_API_KEY environment variables are set.
+            Please set one of them to your API key, and try again.
+            """
+        )
+
+    try:
+        from metaphor_python import Metaphor
+    except ImportError:
+        raise ImportError(
+            "You are attempting to use the `metaphor_python` library; "
+            "to use it, please install langroid with the `metaphor` extra, e.g. "
+            "`pip install langroid[metaphor]` or `poetry add langroid[metaphor]` "
+            "(it installs the `metaphor_python` package from pypi)."
+        )
+
+    client = Metaphor(api_key=api_key)
+
+    response = client.search(
+        query=query,
+        num_results=num_results,
+    )
+    raw_results = response.results
+
+    return [
+        WebSearchResult(result.title, result.url, 3500, 300) for result in raw_results
+    ]
+
+
+def duckduckgo_search(query: str, num_results: int = 5) -> List[WebSearchResult]:
+    """
+    Method that makes an API call via the DuckDuckGo client to query the top
+    `num_results` links that match the query. Returns a list of
+    WebSearchResult objects.
+
+    Args:
+        query (str): The query the user wants to make.
+        num_results (int): Number of top matching results that we want to grab.
+    """
+
+    with DDGS() as ddgs:
+        search_results = [r for r in ddgs.text(query, max_results=num_results)]
+
+    return [
+        WebSearchResult(
+            title=result["title"],
+            link=result["href"],
+            max_content_length=3500,
+            max_summary_length=300,
+        )
+        for result in search_results
+    ]
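A quick sketch (not from the diff) of calling the new DuckDuckGo helper; it assumes langroid 0.1.219 with the `duckduckgo_search` package installed, and that `WebSearchResult` exposes its constructor arguments as attributes. DuckDuckGo needs no API key, while `metaphor_search` requires `METAPHOR_API_KEY` or `EXA_API_KEY` to be set.

```python
from langroid.parsing.web_search import duckduckgo_search

# Top-3 results; each WebSearchResult is built with content/summary lengths
# capped at 3500/300 characters respectively (per the diff above).
for r in duckduckgo_search("langroid multi-agent LLM framework", num_results=3):
    print(r.title, r.link)
```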
langroid/prompts/chat-gpt4-system-prompt.md
CHANGED
@@ -0,0 +1,68 @@
+Image input capabilities: Enabled
+
+Tools
+python
+When you send a message containing Python code to python, it will be executed in a
+stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 60.0
+seconds. The drive at '/mnt/data' can be used to save and persist user files. Internet access for this session is disabled. Do not make external web requests or API calls as they will fail.
+
+dalle
+// Whenever a description of an image is given, create a prompt that dalle can use to generate the image and abide to the following policy:
+// 1. The prompt must be in English. Translate to English if needed.
+// 2. DO NOT ask for permission to generate the image, just do it!
+// 3. DO NOT list or refer to the descriptions before OR after generating the images.
+// 4. Do not create more than 1 image, even if the user requests more.
+// 5. Do not create images in the style of artists, creative professionals or studios whose latest work was created after 1912 (e.g. Picasso, Kahlo).
+// - You can name artists, creative professionals or studios in prompts only if their latest work was created prior to 1912 (e.g. Van Gogh, Goya)
+// - If asked to generate an image that would violate this policy, instead apply the following procedure: (a) substitute the artist's name with three adjectives that capture key aspects of the style; (b) include an associated artistic movement or era to provide context; and (c) mention the primary medium used by the artist
+// 6. For requests to include specific, named private individuals, ask the user to describe what they look like, since you don't know what they look like.
+// 7. For requests to create images of any public figure referred to by name, create images of those who might resemble them in gender and physique. But they shouldn't look like them. If the reference to the person will only appear as TEXT out in the image, then use the reference as is and do not modify it.
+// 8. Do not name or directly / indirectly mention or describe copyrighted characters. Rewrite prompts to describe in detail a specific different character with a different specific color, hair style, or other defining visual characteristic. Do not discuss copyright policies in responses.
+// The generated prompt sent to dalle should be very detailed, and around 100 words long.
+// Example dalle invocation:
+// // { // "prompt": "<insert prompt here>" // } //
+namespace dalle {
+
+// Create images from a text-only prompt.
+type text2im = (_: {
+// The size of the requested image. Use 1024x1024 (square) as the default, 1792x1024 if the user requests a wide image, and 1024x1792 for full-body portraits. Always include this parameter in the request.
+size?: "1792x1024" | "1024x1024" | "1024x1792",
+// The number of images to generate. If the user does not specify a number, generate 1 image.
+n?: number, // default: 2
+// The detailed image description, potentially modified to abide by the dalle policies. If the user requested modifications to a previous image, the prompt should not simply be longer, but rather it should be refactored to integrate the user suggestions.
+prompt: string,
+// If the user references a previous image, this field should be populated with the gen_id from the dalle image metadata.
+referenced_image_ids?: string[],
+}) => any;
+
+} // namespace dalle
+
+voice_mode
+// Voice mode functions are not available in text conversations.
+namespace voice_mode {
+
+} // namespace voice_mode
+
+browser
+You have the tool browser. Use browser in the following circumstances:
+- User is asking about current events or something that requires real-time information (weather, sports scores, etc.)
+- User is asking about some term you are totally unfamiliar with (it might be new)
+- User explicitly asks you to browse or provide links to references
+
+Given a query that requires retrieval, your turn will consist of three steps:
+
+Call the search function to get a list of results.
+Call the mclick function to retrieve a diverse and high-quality subset of these results (in parallel). Remember to SELECT AT LEAST 3 sources when using mclick.
+Write a response to the user based on these results. In your response, cite sources using the citation format below.
+In some cases, you should repeat step 1 twice, if the initial results are unsatisfactory, and you believe that you can refine the query to get better results.
+
+You can also open a url directly if one is provided by the user. Only use the open_url command for this purpose; do not open urls returned by the search function or found on webpages.
+
+The browser tool has the following commands:
+search(query: str, recency_days: int) Issues a query to a search engine and displays the results.
+mclick(ids: list[str]). Retrieves the contents of the webpages with provided IDs (indices). You should ALWAYS SELECT AT LEAST 3 and at most 10 pages. Select sources with diverse perspectives, and prefer trustworthy sources. Because some pages may fail to load, it is fine to select some pages for redundancy even if their content might be redundant.
+open_url(url: str) Opens the given URL and displays it.
+
+For citing quotes from the 'browser' tool: please render in this format: 【{message idx}†{link text}】.
+For long citations: please render in this format: [link text](message idx).
+Otherwise do not render links.
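Since the wheel now packages this prompt as a data file, here is a small sketch (an assumed usage pattern, not shown in the diff) of reading it from the installed package:

```python
# Sketch: load the packaged GPT-4 system prompt as text. The file name comes
# from the diff's file list; how langroid itself consumes it is not shown here.
from importlib import resources

prompt_md = (
    resources.files("langroid.prompts")
    .joinpath("chat-gpt4-system-prompt.md")
    .read_text()
)
print(prompt_md.splitlines()[0])  # 'Image input capabilities: Enabled'
```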
langroid/utils/__init__.py
CHANGED
@@ -0,0 +1,17 @@
+from . import configuration
+from . import globals
+from . import constants
+from . import logging
+from . import pydantic_utils
+from . import system
+from . import output
+
+__all__ = [
+    "configuration",
+    "globals",
+    "constants",
+    "logging",
+    "pydantic_utils",
+    "system",
+    "output",
+]
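For completeness, a trivial sketch (not from the diff) of what the new aggregated namespace allows:

```python
# Sketch: with 0.1.219's langroid/utils/__init__.py, the listed submodules
# are imported eagerly and reachable as package attributes.
import langroid.utils as lu

print(lu.__all__)    # the seven re-exported submodule names
print(lu.constants)  # e.g. langroid.utils.constants, already loaded
```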
langroid/utils/algorithms/graph.py
CHANGED
@@ -0,0 +1,103 @@
+"""
+Graph algos.
+"""
+
+from typing import Dict, List, no_type_check
+
+import numpy as np
+
+
+@no_type_check
+def topological_sort(order: np.array) -> List[int]:
+    """
+    Given a directed adjacency matrix, return a topological sort of the nodes.
+    order[i,j] = -1 means there is an edge from i to j.
+    order[i,j] = 0 means there is no edge from i to j.
+    order[i,j] = 1 means there is an edge from j to i.
+
+    Args:
+        order (np.array): The adjacency matrix.
+
+    Returns:
+        List[int]: The topological sort of the nodes.
+
+    """
+    n = order.shape[0]
+
+    # Calculate the in-degrees
+    in_degree = [0] * n
+    for i in range(n):
+        for j in range(n):
+            if order[i, j] == -1:
+                in_degree[j] += 1
+
+    # Initialize the queue with nodes of in-degree 0
+    queue = [i for i in range(n) if in_degree[i] == 0]
+    result = []
+
+    while queue:
+        node = queue.pop(0)
+        result.append(node)
+
+        for i in range(n):
+            if order[node, i] == -1:
+                in_degree[i] -= 1
+                if in_degree[i] == 0:
+                    queue.append(i)
+
+    assert len(result) == n, "Cycle detected"
+    return result
+
+
+@no_type_check
+def components(order: np.ndarray) -> List[List[int]]:
+    """
+    Find the connected components in an undirected graph represented by a matrix.
+
+    Args:
+        order (np.ndarray): A matrix with values 0 or 1 indicating
+            undirected graph edges. `order[i][j] = 1` means an edge between `i`
+            and `j`, and `0` means no edge.
+
+    Returns:
+        List[List[int]]: A list of Lists where each List contains the indices
+            of nodes in the same connected component.
+
+    Example:
+        order = np.array([
+            [1, 1, 0, 0],
+            [1, 1, 1, 0],
+            [0, 1, 1, 0],
+            [0, 0, 0, 1]
+        ])
+        components(order)
+        # [[0, 1, 2], [3]]
+    """
+
+    i2g: Dict[int, int] = {}  # index to group mapping
+    next_group = 0
+    n = order.shape[0]
+    for i in range(n):
+        connected_groups = {i2g[j] for j in np.nonzero(order[i, :])[0] if j in i2g}
+
+        # If the node is not part of any group
+        # and is not connected to any groups, assign a new group
+        if not connected_groups:
+            i2g[i] = next_group
+            next_group += 1
+        else:
+            # If the node is connected to multiple groups, we merge them
+            main_group = min(connected_groups)
+            for j in np.nonzero(order[i, :])[0]:
+                if i2g.get(j) in connected_groups:
+                    i2g[j] = main_group
+            i2g[i] = main_group
+
+    # Convert i2g to a list of Lists
+    groups: Dict[int, List[int]] = {}
+    for index, group in i2g.items():
+        if group not in groups:
+            groups[group] = []
+        groups[group].append(index)
+
+    return list(groups.values())
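A short usage sketch (not in the diff) exercising both helpers, following the edge conventions documented in the docstrings above:

```python
# Sketch, assuming langroid 0.1.219 is installed.
import numpy as np

from langroid.utils.algorithms.graph import components, topological_sort

# Directed graph 0 -> 1 -> 2: order[i, j] == -1 encodes an edge i -> j,
# with 1 in the mirrored entry, per the topological_sort docstring.
dag = np.zeros((3, 3), dtype=int)
dag[0, 1], dag[1, 0] = -1, 1
dag[1, 2], dag[2, 1] = -1, 1
print(topological_sort(dag))  # [0, 1, 2]

# Undirected 0/1 matrix taken from the components() docstring example.
und = np.array(
    [
        [1, 1, 0, 0],
        [1, 1, 1, 0],
        [0, 1, 1, 0],
        [0, 0, 0, 1],
    ]
)
print(components(und))  # [[0, 1, 2], [3]]
```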