langroid 0.31.1__py3-none-any.whl → 0.33.3__py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- {langroid-0.31.1.dist-info → langroid-0.33.3.dist-info}/METADATA +150 -124
- langroid-0.33.3.dist-info/RECORD +7 -0
- {langroid-0.31.1.dist-info → langroid-0.33.3.dist-info}/WHEEL +1 -1
- langroid-0.33.3.dist-info/entry_points.txt +4 -0
- pyproject.toml +317 -212
- langroid/__init__.py +0 -106
- langroid/agent/.chainlit/config.toml +0 -121
- langroid/agent/.chainlit/translations/bn.json +0 -231
- langroid/agent/.chainlit/translations/en-US.json +0 -229
- langroid/agent/.chainlit/translations/gu.json +0 -231
- langroid/agent/.chainlit/translations/he-IL.json +0 -231
- langroid/agent/.chainlit/translations/hi.json +0 -231
- langroid/agent/.chainlit/translations/kn.json +0 -231
- langroid/agent/.chainlit/translations/ml.json +0 -231
- langroid/agent/.chainlit/translations/mr.json +0 -231
- langroid/agent/.chainlit/translations/ta.json +0 -231
- langroid/agent/.chainlit/translations/te.json +0 -231
- langroid/agent/.chainlit/translations/zh-CN.json +0 -229
- langroid/agent/__init__.py +0 -41
- langroid/agent/base.py +0 -1981
- langroid/agent/batch.py +0 -398
- langroid/agent/callbacks/__init__.py +0 -0
- langroid/agent/callbacks/chainlit.py +0 -598
- langroid/agent/chat_agent.py +0 -1899
- langroid/agent/chat_document.py +0 -454
- langroid/agent/helpers.py +0 -0
- langroid/agent/junk +0 -13
- langroid/agent/openai_assistant.py +0 -882
- langroid/agent/special/__init__.py +0 -59
- langroid/agent/special/arangodb/__init__.py +0 -0
- langroid/agent/special/arangodb/arangodb_agent.py +0 -656
- langroid/agent/special/arangodb/system_messages.py +0 -186
- langroid/agent/special/arangodb/tools.py +0 -107
- langroid/agent/special/arangodb/utils.py +0 -36
- langroid/agent/special/doc_chat_agent.py +0 -1466
- langroid/agent/special/lance_doc_chat_agent.py +0 -262
- langroid/agent/special/lance_rag/__init__.py +0 -9
- langroid/agent/special/lance_rag/critic_agent.py +0 -198
- langroid/agent/special/lance_rag/lance_rag_task.py +0 -82
- langroid/agent/special/lance_rag/query_planner_agent.py +0 -260
- langroid/agent/special/lance_tools.py +0 -61
- langroid/agent/special/neo4j/__init__.py +0 -0
- langroid/agent/special/neo4j/csv_kg_chat.py +0 -174
- langroid/agent/special/neo4j/neo4j_chat_agent.py +0 -433
- langroid/agent/special/neo4j/system_messages.py +0 -120
- langroid/agent/special/neo4j/tools.py +0 -32
- langroid/agent/special/relevance_extractor_agent.py +0 -127
- langroid/agent/special/retriever_agent.py +0 -56
- langroid/agent/special/sql/__init__.py +0 -17
- langroid/agent/special/sql/sql_chat_agent.py +0 -654
- langroid/agent/special/sql/utils/__init__.py +0 -21
- langroid/agent/special/sql/utils/description_extractors.py +0 -190
- langroid/agent/special/sql/utils/populate_metadata.py +0 -85
- langroid/agent/special/sql/utils/system_message.py +0 -35
- langroid/agent/special/sql/utils/tools.py +0 -64
- langroid/agent/special/table_chat_agent.py +0 -263
- langroid/agent/structured_message.py +0 -9
- langroid/agent/task.py +0 -2093
- langroid/agent/tool_message.py +0 -393
- langroid/agent/tools/__init__.py +0 -38
- langroid/agent/tools/duckduckgo_search_tool.py +0 -50
- langroid/agent/tools/file_tools.py +0 -234
- langroid/agent/tools/google_search_tool.py +0 -39
- langroid/agent/tools/metaphor_search_tool.py +0 -67
- langroid/agent/tools/orchestration.py +0 -303
- langroid/agent/tools/recipient_tool.py +0 -235
- langroid/agent/tools/retrieval_tool.py +0 -32
- langroid/agent/tools/rewind_tool.py +0 -137
- langroid/agent/tools/segment_extract_tool.py +0 -41
- langroid/agent/typed_task.py +0 -19
- langroid/agent/xml_tool_message.py +0 -382
- langroid/agent_config.py +0 -0
- langroid/cachedb/__init__.py +0 -17
- langroid/cachedb/base.py +0 -58
- langroid/cachedb/momento_cachedb.py +0 -108
- langroid/cachedb/redis_cachedb.py +0 -153
- langroid/embedding_models/__init__.py +0 -39
- langroid/embedding_models/base.py +0 -74
- langroid/embedding_models/clustering.py +0 -189
- langroid/embedding_models/models.py +0 -461
- langroid/embedding_models/protoc/__init__.py +0 -0
- langroid/embedding_models/protoc/embeddings.proto +0 -19
- langroid/embedding_models/protoc/embeddings_pb2.py +0 -33
- langroid/embedding_models/protoc/embeddings_pb2.pyi +0 -50
- langroid/embedding_models/protoc/embeddings_pb2_grpc.py +0 -79
- langroid/embedding_models/remote_embeds.py +0 -153
- langroid/exceptions.py +0 -65
- langroid/experimental/team-save.py +0 -391
- langroid/language_models/.chainlit/config.toml +0 -121
- langroid/language_models/.chainlit/translations/en-US.json +0 -231
- langroid/language_models/__init__.py +0 -53
- langroid/language_models/azure_openai.py +0 -153
- langroid/language_models/base.py +0 -678
- langroid/language_models/config.py +0 -18
- langroid/language_models/mock_lm.py +0 -124
- langroid/language_models/openai_gpt.py +0 -1923
- langroid/language_models/prompt_formatter/__init__.py +0 -16
- langroid/language_models/prompt_formatter/base.py +0 -40
- langroid/language_models/prompt_formatter/hf_formatter.py +0 -132
- langroid/language_models/prompt_formatter/llama2_formatter.py +0 -75
- langroid/language_models/utils.py +0 -147
- langroid/mytypes.py +0 -84
- langroid/parsing/__init__.py +0 -52
- langroid/parsing/agent_chats.py +0 -38
- langroid/parsing/code-parsing.md +0 -86
- langroid/parsing/code_parser.py +0 -121
- langroid/parsing/config.py +0 -0
- langroid/parsing/document_parser.py +0 -718
- langroid/parsing/image_text.py +0 -32
- langroid/parsing/para_sentence_split.py +0 -62
- langroid/parsing/parse_json.py +0 -155
- langroid/parsing/parser.py +0 -313
- langroid/parsing/repo_loader.py +0 -790
- langroid/parsing/routing.py +0 -36
- langroid/parsing/search.py +0 -275
- langroid/parsing/spider.py +0 -102
- langroid/parsing/table_loader.py +0 -94
- langroid/parsing/url_loader.py +0 -111
- langroid/parsing/url_loader_cookies.py +0 -73
- langroid/parsing/urls.py +0 -273
- langroid/parsing/utils.py +0 -373
- langroid/parsing/web_search.py +0 -155
- langroid/prompts/__init__.py +0 -9
- langroid/prompts/chat-gpt4-system-prompt.md +0 -68
- langroid/prompts/dialog.py +0 -17
- langroid/prompts/prompts_config.py +0 -5
- langroid/prompts/templates.py +0 -141
- langroid/pydantic_v1/__init__.py +0 -10
- langroid/pydantic_v1/main.py +0 -4
- langroid/utils/.chainlit/config.toml +0 -121
- langroid/utils/.chainlit/translations/en-US.json +0 -231
- langroid/utils/__init__.py +0 -19
- langroid/utils/algorithms/__init__.py +0 -3
- langroid/utils/algorithms/graph.py +0 -103
- langroid/utils/configuration.py +0 -98
- langroid/utils/constants.py +0 -30
- langroid/utils/docker.py +0 -37
- langroid/utils/git_utils.py +0 -252
- langroid/utils/globals.py +0 -49
- langroid/utils/llms/__init__.py +0 -0
- langroid/utils/llms/strings.py +0 -8
- langroid/utils/logging.py +0 -135
- langroid/utils/object_registry.py +0 -66
- langroid/utils/output/__init__.py +0 -20
- langroid/utils/output/citations.py +0 -41
- langroid/utils/output/printing.py +0 -99
- langroid/utils/output/status.py +0 -40
- langroid/utils/pandas_utils.py +0 -30
- langroid/utils/pydantic_utils.py +0 -602
- langroid/utils/system.py +0 -286
- langroid/utils/types.py +0 -93
- langroid/utils/web/__init__.py +0 -0
- langroid/utils/web/login.py +0 -83
- langroid/vector_store/__init__.py +0 -50
- langroid/vector_store/base.py +0 -357
- langroid/vector_store/chromadb.py +0 -214
- langroid/vector_store/lancedb.py +0 -401
- langroid/vector_store/meilisearch.py +0 -299
- langroid/vector_store/momento.py +0 -278
- langroid/vector_store/qdrant_cloud.py +0 -6
- langroid/vector_store/qdrantdb.py +0 -468
- langroid-0.31.1.dist-info/RECORD +0 -162
- {langroid-0.31.1.dist-info → langroid-0.33.3.dist-info/licenses}/LICENSE +0 -0
langroid/parsing/routing.py
DELETED
@@ -1,36 +0,0 @@
import re
from typing import Optional, Tuple


def parse_addressed_message(
    content: str, addressing: str = "@"
) -> Tuple[Optional[str], str]:
    """In a message-string containing possibly multiple @<recipient> occurrences,
    find the last addressee and extract their name,
    and the message content following it.

    E.g. "thank you @bob, now I will ask @alice again. @alice, where is the mirror?" =>
    ("alice", "where is the mirror?")

    Args:
        content (str): The message content.
        addressing (str, optional): The addressing character. Defaults to "@".

    Returns:
        Tuple[Optional[str], str]:
            A tuple containing the last addressee and the subsequent message content.
    """
    # Regex to find all occurrences of the pattern
    pattern = re.compile(rf"{re.escape(addressing)}(\w+)[^\w]")
    matches = list(pattern.finditer(content))

    if not matches:
        return None, content  # No addressee found, return None and original content

    # Get the last match
    last_match = matches[-1]
    last_addressee = last_match.group(1)
    # Extract content after the last addressee
    content_after = content[last_match.end() :].strip()

    return last_addressee, content_after
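For reference, a minimal usage sketch of the removed `parse_addressed_message` helper; the import path shown is the pre-0.33 location and is included for illustration only.

# Hypothetical usage of the removed helper; the import path is the
# pre-0.33 location and is no longer available in 0.33.x.
from langroid.parsing.routing import parse_addressed_message

msg = "thank you @bob, now I will ask @alice again. @alice, where is the mirror?"
recipient, text = parse_addressed_message(msg, addressing="@")
print(recipient)  # "alice"
print(text)       # "where is the mirror?"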
langroid/parsing/search.py
DELETED
@@ -1,275 +0,0 @@
"""
Utils to search for close matches in (a list of) strings.
Useful for retrieval of docs/chunks relevant to a query, in the context of
Retrieval-Augmented Generation (RAG), and SQLChat (e.g., to pull relevant parts of a
large schema).
See tests for examples: tests/main/test_string_search.py
"""

import difflib
from typing import List, Tuple

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from rank_bm25 import BM25Okapi
from thefuzz import fuzz, process

from langroid.mytypes import Document

from .utils import download_nltk_resource


def find_fuzzy_matches_in_docs(
    query: str,
    docs: List[Document],
    docs_clean: List[Document],
    k: int,
    words_before: int | None = None,
    words_after: int | None = None,
) -> List[Tuple[Document, float]]:
    """
    Find approximate matches of the query in the docs and return surrounding
    characters.

    Args:
        query (str): The search string.
        docs (List[Document]): List of Document objects to search through.
        docs_clean (List[Document]): List of Document objects with cleaned content.
        k (int): Number of best matches to return.
        words_before (int|None): Number of words to include before each match.
            Default None => return max
        words_after (int|None): Number of words to include after each match.
            Default None => return max

    Returns:
        List[Tuple[Document,float]]: List of (Document, score) tuples.
    """
    if len(docs) == 0:
        return []
    best_matches = process.extract(
        query,
        [d.content for d in docs_clean],
        limit=k,
        scorer=fuzz.partial_ratio,
    )

    real_matches = [(m, score) for m, score in best_matches if score > 50]
    # find the original docs corresponding to the matches
    orig_doc_matches = []
    for i, (m, s) in enumerate(real_matches):
        for j, doc_clean in enumerate(docs_clean):
            if m in doc_clean.content:
                orig_doc_matches.append((docs[j], s))
                break
    if words_after is None and words_before is None:
        return orig_doc_matches
    if len(orig_doc_matches) == 0:
        return []
    if set(orig_doc_matches[0][0].__fields__) != {"content", "metadata"}:
        # If there are fields beyond just content and metadata,
        # we do NOT want to create new document objects with content fields
        # based on words_before and words_after, since we don't know how to
        # set those other fields.
        return orig_doc_matches

    contextual_matches = []
    for match, score in orig_doc_matches:
        choice_text = match.content
        contexts = []
        while choice_text != "":
            context, start_pos, end_pos = get_context(
                query, choice_text, words_before, words_after
            )
            if context == "" or end_pos == 0:
                break
            contexts.append(context)
            words = choice_text.split()
            end_pos = min(end_pos, len(words))
            choice_text = " ".join(words[end_pos:])
        if len(contexts) > 0:
            contextual_matches.append(
                (
                    Document(
                        content=" ... ".join(contexts),
                        metadata=match.metadata,
                    ),
                    score,
                )
            )

    return contextual_matches


def preprocess_text(text: str) -> str:
    """
    Preprocesses the given text by:
    1. Lowercasing all words.
    2. Tokenizing (splitting the text into words).
    3. Removing punctuation.
    4. Removing stopwords.
    5. Lemmatizing words.

    Args:
        text (str): The input text.

    Returns:
        str: The preprocessed text.
    """
    # Ensure the NLTK resources are available
    for resource in ["punkt", "wordnet", "stopwords"]:
        download_nltk_resource(resource)

    # Lowercase the text
    text = text.lower()

    # Tokenize the text and remove punctuation
    tokenizer = RegexpTokenizer(r"\w+")
    tokens = tokenizer.tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    tokens = [t for t in tokens if t not in stop_words]

    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]

    # Join the words back into a string
    text = " ".join(tokens)

    return text


def find_closest_matches_with_bm25(
    docs: List[Document],
    docs_clean: List[Document],
    query: str,
    k: int = 5,
) -> List[Tuple[Document, float]]:
    """
    Finds the k closest approximate matches using the BM25 algorithm.

    Args:
        docs (List[Document]): List of Documents to search through.
        docs_clean (List[Document]): List of cleaned Documents
        query (str): The search query.
        k (int, optional): Number of matches to retrieve. Defaults to 5.

    Returns:
        List[Tuple[Document,float]]: List of (Document, score) tuples.
    """
    if len(docs) == 0:
        return []
    texts = [doc.content for doc in docs_clean]
    query = preprocess_text(query)

    text_words = [text.split() for text in texts]

    bm25 = BM25Okapi(text_words)
    query_words = query.split()
    doc_scores = bm25.get_scores(query_words)

    # Get indices of top k scores
    top_indices = sorted(range(len(doc_scores)), key=lambda i: -doc_scores[i])[:k]

    # return the original docs, based on the scores from cleaned docs
    return [(docs[i], doc_scores[i]) for i in top_indices]


def get_context(
    query: str,
    text: str,
    words_before: int | None = 100,
    words_after: int | None = 100,
) -> Tuple[str, int, int]:
    """
    Returns a portion of text containing the best approximate match of the query,
    including `words_before` words before and `words_after` words after the match.

    Args:
        query (str): The string to search for.
        text (str): The body of text in which to search.
        words_before (int|None): The number of words before the query to return.
        words_after (int|None): The number of words after the query to return.

    Returns:
        str: A string containing `words_before` words before, the match, and
            `words_after` words after the best approximate match position of the
            query in the text. If no match is found, returns an empty string.
        int: The start position of the match in the text.
        int: The end position of the match in the text.

    Example:
        >>> get_context("apple", "The quick brown fox jumps over the apple.", 3, 2)
        # 'fox jumps over the apple.'
    """
    if words_after is None and words_before is None:
        # return entire text since we're not asked to return a bounded context
        return text, 0, 0

    # make sure there is a good enough match to the query
    if fuzz.partial_ratio(query, text) < 40:
        return "", 0, 0

    sequence_matcher = difflib.SequenceMatcher(None, text, query)
    match = sequence_matcher.find_longest_match(0, len(text), 0, len(query))

    if match.size == 0:
        return "", 0, 0

    segments = text.split()
    n_segs = len(segments)

    start_segment_pos = len(text[: match.a].split())

    words_before = words_before or n_segs
    words_after = words_after or n_segs
    start_pos = max(0, start_segment_pos - words_before)
    end_pos = min(len(segments), start_segment_pos + words_after + len(query.split()))

    return " ".join(segments[start_pos:end_pos]), start_pos, end_pos


def eliminate_near_duplicates(passages: List[str], threshold: float = 0.8) -> List[str]:
    """
    Eliminate near-duplicate text passages from a given list using MinHash and LSH.
    TODO: this has not been tested and the datasketch lib is not a dependency.
    Args:
        passages (List[str]): A list of text passages.
        threshold (float, optional): Jaccard similarity threshold to consider two
            passages as near-duplicates. Default is 0.8.

    Returns:
        List[str]: A list of passages after eliminating near duplicates.

    Example:
        passages = ["Hello world", "Hello, world!", "Hi there", "Hello world!"]
        print(eliminate_near_duplicates(passages))
        # ['Hello world', 'Hi there']
    """

    from datasketch import MinHash, MinHashLSH

    # Create LSH index
    lsh = MinHashLSH(threshold=threshold, num_perm=128)

    # Create MinHash objects for each passage and insert to LSH
    minhashes = {}
    for idx, passage in enumerate(passages):
        m = MinHash(num_perm=128)
        for word in passage.split():
            m.update(word.encode("utf-8"))
        lsh.insert(idx, m)
        minhashes[idx] = m

    unique_idxs = set()
    for idx in minhashes.keys():
        # Query for similar passages (including itself)
        result = lsh.query(minhashes[idx])

        # If only the passage itself is returned, it's unique
        if len(result) == 1 and idx in result:
            unique_idxs.add(idx)

    return [passages[idx] for idx in unique_idxs]
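For reference, a minimal sketch of calling the removed BM25 helper. The import paths are the pre-0.33 locations, and the sample documents and `DocMetaData(source=...)` usage are assumptions for illustration (mirroring how `Document` is constructed elsewhere in this diff).

# Hypothetical usage of the removed BM25 helper; import paths are the
# pre-0.33 locations. NLTK resources are downloaded on first use.
from langroid.mytypes import DocMetaData, Document
from langroid.parsing.search import find_closest_matches_with_bm25, preprocess_text

docs = [
    Document(
        content="The quick brown fox jumps over the lazy dog.",
        metadata=DocMetaData(source="doc-a"),
    ),
    Document(
        content="Retrieval-Augmented Generation pairs search with an LLM.",
        metadata=DocMetaData(source="doc-b"),
    ),
]
# Scores are computed on the cleaned docs; the original docs are returned.
docs_clean = [
    Document(content=preprocess_text(d.content), metadata=d.metadata) for d in docs
]
for doc, score in find_closest_matches_with_bm25(docs, docs_clean, "fox and dog", k=1):
    print(doc.metadata.source, score)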
langroid/parsing/spider.py
DELETED
@@ -1,102 +0,0 @@
from typing import List, Set, no_type_check
from urllib.parse import urlparse

from langroid.exceptions import LangroidImportError

try:
    from pydispatch import dispatcher
    from scrapy import signals
    from scrapy.crawler import CrawlerRunner
    from scrapy.http import Response
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from twisted.internet import defer, reactor
except ImportError:
    raise LangroidImportError("scrapy", "scrapy")


@no_type_check
class DomainSpecificSpider(CrawlSpider):  # type: ignore
    name = "domain_specific_spider"

    custom_settings = {"DEPTH_LIMIT": 1, "CLOSESPIDER_ITEMCOUNT": 20}

    rules = (Rule(LinkExtractor(), callback="parse_item", follow=True),)

    def __init__(self, start_url: str, k: int = 20, *args, **kwargs):  # type: ignore
        """Initialize the spider with start_url and k.

        Args:
            start_url (str): The starting URL.
            k (int, optional): The max desired final URLs. Defaults to 20.
        """
        super(DomainSpecificSpider, self).__init__(*args, **kwargs)
        self.start_urls = [start_url]
        self.allowed_domains = [urlparse(start_url).netloc]
        self.k = k
        self.visited_urls: Set[str] = set()

    def parse_item(self, response: Response):  # type: ignore
        """Extracts URLs that are within the same domain.

        Args:
            response: The scrapy response object.
        """
        for link in LinkExtractor(allow_domains=self.allowed_domains).extract_links(
            response
        ):
            if len(self.visited_urls) < self.k:
                self.visited_urls.add(link.url)
                yield {"url": link.url}


@no_type_check
def scrapy_fetch_urls(url: str, k: int = 20) -> List[str]:
    """Fetches up to k URLs reachable from the input URL using Scrapy.

    Args:
        url (str): The starting URL.
        k (int, optional): The max desired final URLs. Defaults to 20.

    Returns:
        List[str]: List of URLs within the same domain as the input URL.
    """
    urls = []

    def _collect_urls(spider):
        """Handler for the spider_closed signal. Collects the visited URLs."""
        nonlocal urls
        urls.extend(list(spider.visited_urls))

    # Connect the spider_closed signal with our handler
    dispatcher.connect(_collect_urls, signal=signals.spider_closed)

    runner = CrawlerRunner(
        {
            "USER_AGENT": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
        }
    )

    d = runner.crawl(DomainSpecificSpider, start_url=url, k=k)

    # Block until crawling is done and then stop the reactor
    crawl_deferred = defer.Deferred()

    def _crawl_done(_):
        reactor.stop()
        crawl_deferred.callback(urls)

    d.addBoth(_crawl_done)

    # Start the reactor, it will stop once the crawl is done
    reactor.run(installSignalHandlers=0)

    # This will block until the deferred gets a result
    return crawl_deferred.result


# Test the function
if __name__ == "__main__":
    fetched_urls = scrapy_fetch_urls("https://example.com", 5)
    for url in fetched_urls:
        print(url)
langroid/parsing/table_loader.py
DELETED
@@ -1,94 +0,0 @@
from csv import Sniffer
from typing import List

import pandas as pd


def read_tabular_data(path_or_url: str, sep: None | str = None) -> pd.DataFrame:
    """
    Reads tabular data from a file or URL and returns a pandas DataFrame.
    The separator is auto-detected if not specified.

    Args:
        path_or_url (str): Path or URL to the file to be read.

    Returns:
        pd.DataFrame: Data from file or URL as a pandas DataFrame.

    Raises:
        ValueError: If the data cannot be read or is misformatted.
    """
    try:
        if sep is None:
            # Read the first few lines to guess the separator
            with pd.io.common.get_handle(path_or_url, "r") as file_handler:
                first_lines = "".join(file_handler.handle.readlines(5))
                sep = Sniffer().sniff(first_lines).delimiter
                # If it's a local file, reset to the beginning
                if hasattr(file_handler.handle, "seek"):
                    file_handler.handle.seek(0)

        # Read the data

        # get non-blank column names
        with pd.io.common.get_handle(path_or_url, "r") as f:
            header_line = f.handle.readline().strip()
            valid_cols = [col for col in header_line.split(sep) if col]
            valid_cols = [c.replace('"', "").replace("'", "") for c in valid_cols]
            if hasattr(f.handle, "seek"):
                f.handle.seek(0)

        # use only those columns
        data = pd.read_csv(path_or_url, sep=sep, usecols=valid_cols)
        data.columns = data.columns.str.strip()  # e.g. " column 1 " -> "column 1"

        return data

    except Exception as e:
        raise ValueError(
            "Unable to read data. "
            "Please ensure it is correctly formatted. Error: " + str(e)
        )


def describe_dataframe(
    df: pd.DataFrame, filter_fields: List[str] = [], n_vals: int = 10
) -> str:
    """
    Generates a description of the columns in the dataframe,
    along with a listing of up to `n_vals` unique values for each column.
    Intended to be used to insert into an LLM context so it can generate
    appropriate queries or filters on the df.

    Args:
        df (pd.DataFrame): The dataframe to describe.
        filter_fields (list): A list of fields that can be used for filtering.
            When non-empty, the values-list will be restricted to these.
        n_vals (int): How many unique values to show for each column.

    Returns:
        str: A description of the dataframe.
    """
    description = []
    for column in df.columns.to_list():
        unique_values = df[column].dropna().unique()
        unique_count = len(unique_values)
        if column not in filter_fields:
            values_desc = f"{unique_count} unique values"
        else:
            if unique_count > n_vals:
                displayed_values = unique_values[:n_vals]
                more_count = unique_count - n_vals
                values_desc = f" Values - {displayed_values}, ... {more_count} more"
            else:
                values_desc = f" Values - {unique_values}"
        col_type = "string" if df[column].dtype == "object" else df[column].dtype
        col_desc = f"* {column} ({col_type}); {values_desc}"
        description.append(col_desc)

    all_cols = "\n".join(description)

    return f"""
        Name of each field, its type and unique values (up to {n_vals}):
        {all_cols}
        """
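For reference, a small self-contained sketch of the removed `describe_dataframe` helper on an in-memory DataFrame; the import path is the pre-0.33 location and the sample data is an assumption for illustration.

# Hypothetical usage of the removed describe_dataframe helper;
# the import path is the pre-0.33 location.
import pandas as pd

from langroid.parsing.table_loader import describe_dataframe

df = pd.DataFrame(
    {
        "state": ["CA", "NY", "CA", "TX"],
        "population": [39_500_000, 19_500_000, 39_500_000, 29_100_000],
    }
)
# "state" is a filter field, so its values are listed;
# "population" is summarized only by its unique-value count.
print(describe_dataframe(df, filter_fields=["state"], n_vals=5))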
langroid/parsing/url_loader.py
DELETED
@@ -1,111 +0,0 @@
import logging
import os
from tempfile import NamedTemporaryFile
from typing import List, no_type_check

import requests
import trafilatura
from trafilatura.downloads import (
    add_to_compressed_dict,
    buffered_downloads,
    load_download_buffer,
)

from langroid.mytypes import DocMetaData, Document
from langroid.parsing.document_parser import DocumentParser, ImagePdfParser
from langroid.parsing.parser import Parser, ParsingConfig

logging.getLogger("trafilatura").setLevel(logging.ERROR)


class URLLoader:
    """
    Load a list of URLs and extract the text content.
    Alternative approaches could use `bs4` or `scrapy`.

    TODO - this currently does not handle cookie dialogs,
    i.e. if there is a cookie pop-up, most/all of the extracted
    content could be cookie policy text.
    We could use `playwright` to simulate a user clicking
    the "accept" button on the cookie dialog.
    """

    def __init__(self, urls: List[str], parser: Parser = Parser(ParsingConfig())):
        self.urls = urls
        self.parser = parser

    @no_type_check
    def load(self) -> List[Document]:
        docs = []
        threads = 4
        # convert the input list to an internal format
        dl_dict = add_to_compressed_dict(self.urls)
        # processing loop
        while not dl_dict.done:
            buffer, dl_dict = load_download_buffer(
                dl_dict,
                sleep_time=5,
            )
            for url, result in buffered_downloads(buffer, threads):
                if (
                    url.lower().endswith(".pdf")
                    or url.lower().endswith(".docx")
                    or url.lower().endswith(".doc")
                ):
                    doc_parser = DocumentParser.create(
                        url,
                        self.parser.config,
                    )
                    new_chunks = doc_parser.get_doc_chunks()
                    if len(new_chunks) == 0:
                        # If the document is empty, try to extract images
                        img_parser = ImagePdfParser(url, self.parser.config)
                        new_chunks = img_parser.get_doc_chunks()
                    docs.extend(new_chunks)
                else:
                    # Try to detect content type and handle accordingly
                    headers = requests.head(url).headers
                    content_type = headers.get("Content-Type", "").lower()
                    temp_file_suffix = None
                    if "application/pdf" in content_type:
                        temp_file_suffix = ".pdf"
                    elif (
                        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
                        in content_type
                    ):
                        temp_file_suffix = ".docx"
                    elif "application/msword" in content_type:
                        temp_file_suffix = ".doc"

                    if temp_file_suffix:
                        # Download the document content
                        response = requests.get(url)
                        with NamedTemporaryFile(
                            delete=False, suffix=temp_file_suffix
                        ) as temp_file:
                            temp_file.write(response.content)
                            temp_file_path = temp_file.name
                        # Process the downloaded document
                        doc_parser = DocumentParser.create(
                            temp_file_path, self.parser.config
                        )
                        docs.extend(doc_parser.get_doc_chunks())
                        # Clean up the temporary file
                        os.remove(temp_file_path)
                    else:
                        text = trafilatura.extract(
                            result,
                            no_fallback=False,
                            favor_recall=True,
                        )
                        if (
                            text is None
                            and result is not None
                            and isinstance(result, str)
                        ):
                            text = result
                        if text is not None and text != "":
                            docs.append(
                                Document(content=text, metadata=DocMetaData(source=url))
                            )
        return docs
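For reference, a minimal sketch of driving the removed `URLLoader`; the import paths and the URL are assumptions for illustration, and a live network connection is needed at run time.

# Hypothetical usage of the removed URLLoader; import paths are the
# pre-0.33 locations, and the URL is a placeholder.
from langroid.parsing.parser import Parser, ParsingConfig
from langroid.parsing.url_loader import URLLoader

loader = URLLoader(
    urls=["https://example.com"],
    parser=Parser(ParsingConfig()),
)
# HTML pages go through trafilatura; PDF/Word URLs go through DocumentParser.
docs = loader.load()
for doc in docs:
    print(doc.metadata.source, len(doc.content))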
langroid/parsing/url_loader_cookies.py
DELETED
@@ -1,73 +0,0 @@
import logging
from typing import List, no_type_check

import trafilatura
from playwright.sync_api import sync_playwright

from langroid.mytypes import DocMetaData, Document

logging.getLogger("trafilatura").setLevel(logging.ERROR)


def accept_cookies_and_extract_content(url: str) -> str:
    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(headless=True)
        context = browser.new_context()
        page = context.new_page()
        page.goto(url)

        # List of possible selectors or texts on the cookie consent buttons
        possible_selectors = [
            'text="Accept"',
            'text="Agree"',
            'text="OK"',
            'text="Continue"',
        ]

        # Try to click each possible consent button
        for selector in possible_selectors:
            try:
                page.click(selector)
                print(f"Clicked {selector}")
                break  # If click is successful, break out of the loop
            except Exception:
                print(f"Could not click {selector}")

        # Extract and return the page's text content
        content = page.content()

        context.close()
        browser.close()
        content_str: str = content if isinstance(content, str) else ""
        return content_str


class URLLoader:
    """
    Load a list of URLs and extract the text content.
    Alternative approaches could use `bs4` or `scrapy`.

    TODO - this currently does not handle cookie dialogs,
    i.e. if there is a cookie pop-up, most/all of the extracted
    content could be cookie policy text.
    We could use `playwright` to simulate a user clicking
    the "accept" button on the cookie dialog.
    """

    def __init__(self, urls: List[str]):
        self.urls = urls

    @no_type_check
    def load(self) -> List[Document]:
        docs = []
        for url in self.urls:
            html_content = accept_cookies_and_extract_content(url)
            text = trafilatura.extract(
                html_content,
                no_fallback=False,
                favor_recall=True,
            )
            if text is not None and text != "":
                docs.append(Document(content=text, metadata=DocMetaData(source=url)))
        return docs