langroid 0.31.1__py3-none-any.whl → 0.33.3__py3-none-any.whl

This diff shows the changes between two publicly released package versions, as they appear in their public registry. It is provided for informational purposes only.
Files changed (163)
  1. {langroid-0.31.1.dist-info → langroid-0.33.3.dist-info}/METADATA +150 -124
  2. langroid-0.33.3.dist-info/RECORD +7 -0
  3. {langroid-0.31.1.dist-info → langroid-0.33.3.dist-info}/WHEEL +1 -1
  4. langroid-0.33.3.dist-info/entry_points.txt +4 -0
  5. pyproject.toml +317 -212
  6. langroid/__init__.py +0 -106
  7. langroid/agent/.chainlit/config.toml +0 -121
  8. langroid/agent/.chainlit/translations/bn.json +0 -231
  9. langroid/agent/.chainlit/translations/en-US.json +0 -229
  10. langroid/agent/.chainlit/translations/gu.json +0 -231
  11. langroid/agent/.chainlit/translations/he-IL.json +0 -231
  12. langroid/agent/.chainlit/translations/hi.json +0 -231
  13. langroid/agent/.chainlit/translations/kn.json +0 -231
  14. langroid/agent/.chainlit/translations/ml.json +0 -231
  15. langroid/agent/.chainlit/translations/mr.json +0 -231
  16. langroid/agent/.chainlit/translations/ta.json +0 -231
  17. langroid/agent/.chainlit/translations/te.json +0 -231
  18. langroid/agent/.chainlit/translations/zh-CN.json +0 -229
  19. langroid/agent/__init__.py +0 -41
  20. langroid/agent/base.py +0 -1981
  21. langroid/agent/batch.py +0 -398
  22. langroid/agent/callbacks/__init__.py +0 -0
  23. langroid/agent/callbacks/chainlit.py +0 -598
  24. langroid/agent/chat_agent.py +0 -1899
  25. langroid/agent/chat_document.py +0 -454
  26. langroid/agent/helpers.py +0 -0
  27. langroid/agent/junk +0 -13
  28. langroid/agent/openai_assistant.py +0 -882
  29. langroid/agent/special/__init__.py +0 -59
  30. langroid/agent/special/arangodb/__init__.py +0 -0
  31. langroid/agent/special/arangodb/arangodb_agent.py +0 -656
  32. langroid/agent/special/arangodb/system_messages.py +0 -186
  33. langroid/agent/special/arangodb/tools.py +0 -107
  34. langroid/agent/special/arangodb/utils.py +0 -36
  35. langroid/agent/special/doc_chat_agent.py +0 -1466
  36. langroid/agent/special/lance_doc_chat_agent.py +0 -262
  37. langroid/agent/special/lance_rag/__init__.py +0 -9
  38. langroid/agent/special/lance_rag/critic_agent.py +0 -198
  39. langroid/agent/special/lance_rag/lance_rag_task.py +0 -82
  40. langroid/agent/special/lance_rag/query_planner_agent.py +0 -260
  41. langroid/agent/special/lance_tools.py +0 -61
  42. langroid/agent/special/neo4j/__init__.py +0 -0
  43. langroid/agent/special/neo4j/csv_kg_chat.py +0 -174
  44. langroid/agent/special/neo4j/neo4j_chat_agent.py +0 -433
  45. langroid/agent/special/neo4j/system_messages.py +0 -120
  46. langroid/agent/special/neo4j/tools.py +0 -32
  47. langroid/agent/special/relevance_extractor_agent.py +0 -127
  48. langroid/agent/special/retriever_agent.py +0 -56
  49. langroid/agent/special/sql/__init__.py +0 -17
  50. langroid/agent/special/sql/sql_chat_agent.py +0 -654
  51. langroid/agent/special/sql/utils/__init__.py +0 -21
  52. langroid/agent/special/sql/utils/description_extractors.py +0 -190
  53. langroid/agent/special/sql/utils/populate_metadata.py +0 -85
  54. langroid/agent/special/sql/utils/system_message.py +0 -35
  55. langroid/agent/special/sql/utils/tools.py +0 -64
  56. langroid/agent/special/table_chat_agent.py +0 -263
  57. langroid/agent/structured_message.py +0 -9
  58. langroid/agent/task.py +0 -2093
  59. langroid/agent/tool_message.py +0 -393
  60. langroid/agent/tools/__init__.py +0 -38
  61. langroid/agent/tools/duckduckgo_search_tool.py +0 -50
  62. langroid/agent/tools/file_tools.py +0 -234
  63. langroid/agent/tools/google_search_tool.py +0 -39
  64. langroid/agent/tools/metaphor_search_tool.py +0 -67
  65. langroid/agent/tools/orchestration.py +0 -303
  66. langroid/agent/tools/recipient_tool.py +0 -235
  67. langroid/agent/tools/retrieval_tool.py +0 -32
  68. langroid/agent/tools/rewind_tool.py +0 -137
  69. langroid/agent/tools/segment_extract_tool.py +0 -41
  70. langroid/agent/typed_task.py +0 -19
  71. langroid/agent/xml_tool_message.py +0 -382
  72. langroid/agent_config.py +0 -0
  73. langroid/cachedb/__init__.py +0 -17
  74. langroid/cachedb/base.py +0 -58
  75. langroid/cachedb/momento_cachedb.py +0 -108
  76. langroid/cachedb/redis_cachedb.py +0 -153
  77. langroid/embedding_models/__init__.py +0 -39
  78. langroid/embedding_models/base.py +0 -74
  79. langroid/embedding_models/clustering.py +0 -189
  80. langroid/embedding_models/models.py +0 -461
  81. langroid/embedding_models/protoc/__init__.py +0 -0
  82. langroid/embedding_models/protoc/embeddings.proto +0 -19
  83. langroid/embedding_models/protoc/embeddings_pb2.py +0 -33
  84. langroid/embedding_models/protoc/embeddings_pb2.pyi +0 -50
  85. langroid/embedding_models/protoc/embeddings_pb2_grpc.py +0 -79
  86. langroid/embedding_models/remote_embeds.py +0 -153
  87. langroid/exceptions.py +0 -65
  88. langroid/experimental/team-save.py +0 -391
  89. langroid/language_models/.chainlit/config.toml +0 -121
  90. langroid/language_models/.chainlit/translations/en-US.json +0 -231
  91. langroid/language_models/__init__.py +0 -53
  92. langroid/language_models/azure_openai.py +0 -153
  93. langroid/language_models/base.py +0 -678
  94. langroid/language_models/config.py +0 -18
  95. langroid/language_models/mock_lm.py +0 -124
  96. langroid/language_models/openai_gpt.py +0 -1923
  97. langroid/language_models/prompt_formatter/__init__.py +0 -16
  98. langroid/language_models/prompt_formatter/base.py +0 -40
  99. langroid/language_models/prompt_formatter/hf_formatter.py +0 -132
  100. langroid/language_models/prompt_formatter/llama2_formatter.py +0 -75
  101. langroid/language_models/utils.py +0 -147
  102. langroid/mytypes.py +0 -84
  103. langroid/parsing/__init__.py +0 -52
  104. langroid/parsing/agent_chats.py +0 -38
  105. langroid/parsing/code-parsing.md +0 -86
  106. langroid/parsing/code_parser.py +0 -121
  107. langroid/parsing/config.py +0 -0
  108. langroid/parsing/document_parser.py +0 -718
  109. langroid/parsing/image_text.py +0 -32
  110. langroid/parsing/para_sentence_split.py +0 -62
  111. langroid/parsing/parse_json.py +0 -155
  112. langroid/parsing/parser.py +0 -313
  113. langroid/parsing/repo_loader.py +0 -790
  114. langroid/parsing/routing.py +0 -36
  115. langroid/parsing/search.py +0 -275
  116. langroid/parsing/spider.py +0 -102
  117. langroid/parsing/table_loader.py +0 -94
  118. langroid/parsing/url_loader.py +0 -111
  119. langroid/parsing/url_loader_cookies.py +0 -73
  120. langroid/parsing/urls.py +0 -273
  121. langroid/parsing/utils.py +0 -373
  122. langroid/parsing/web_search.py +0 -155
  123. langroid/prompts/__init__.py +0 -9
  124. langroid/prompts/chat-gpt4-system-prompt.md +0 -68
  125. langroid/prompts/dialog.py +0 -17
  126. langroid/prompts/prompts_config.py +0 -5
  127. langroid/prompts/templates.py +0 -141
  128. langroid/pydantic_v1/__init__.py +0 -10
  129. langroid/pydantic_v1/main.py +0 -4
  130. langroid/utils/.chainlit/config.toml +0 -121
  131. langroid/utils/.chainlit/translations/en-US.json +0 -231
  132. langroid/utils/__init__.py +0 -19
  133. langroid/utils/algorithms/__init__.py +0 -3
  134. langroid/utils/algorithms/graph.py +0 -103
  135. langroid/utils/configuration.py +0 -98
  136. langroid/utils/constants.py +0 -30
  137. langroid/utils/docker.py +0 -37
  138. langroid/utils/git_utils.py +0 -252
  139. langroid/utils/globals.py +0 -49
  140. langroid/utils/llms/__init__.py +0 -0
  141. langroid/utils/llms/strings.py +0 -8
  142. langroid/utils/logging.py +0 -135
  143. langroid/utils/object_registry.py +0 -66
  144. langroid/utils/output/__init__.py +0 -20
  145. langroid/utils/output/citations.py +0 -41
  146. langroid/utils/output/printing.py +0 -99
  147. langroid/utils/output/status.py +0 -40
  148. langroid/utils/pandas_utils.py +0 -30
  149. langroid/utils/pydantic_utils.py +0 -602
  150. langroid/utils/system.py +0 -286
  151. langroid/utils/types.py +0 -93
  152. langroid/utils/web/__init__.py +0 -0
  153. langroid/utils/web/login.py +0 -83
  154. langroid/vector_store/__init__.py +0 -50
  155. langroid/vector_store/base.py +0 -357
  156. langroid/vector_store/chromadb.py +0 -214
  157. langroid/vector_store/lancedb.py +0 -401
  158. langroid/vector_store/meilisearch.py +0 -299
  159. langroid/vector_store/momento.py +0 -278
  160. langroid/vector_store/qdrant_cloud.py +0 -6
  161. langroid/vector_store/qdrantdb.py +0 -468
  162. langroid-0.31.1.dist-info/RECORD +0 -162
  163. {langroid-0.31.1.dist-info → langroid-0.33.3.dist-info/licenses}/LICENSE +0 -0
@@ -1,36 +0,0 @@
- import re
- from typing import Optional, Tuple
-
-
- def parse_addressed_message(
-     content: str, addressing: str = "@"
- ) -> Tuple[Optional[str], str]:
-     """In a message-string containing possibly multiple @<recipient> occurrences,
-     find the last addressee and extract their name,
-     and the message content following it.
-
-     E.g. "thank you @bob, now I will ask @alice again. @alice, where is the mirror?" =>
-     ("alice", "where is the mirror?")
-
-     Args:
-         content (str): The message content.
-         addressing (str, optional): The addressing character. Defaults to "@".
-
-     Returns:
-         Tuple[Optional[str], str]:
-             A tuple containing the last addressee and the subsequent message content.
-     """
-     # Regex to find all occurrences of the pattern
-     pattern = re.compile(rf"{re.escape(addressing)}(\w+)[^\w]")
-     matches = list(pattern.finditer(content))
-
-     if not matches:
-         return None, content  # No addressee found, return None and original content
-
-     # Get the last match
-     last_match = matches[-1]
-     last_addressee = last_match.group(1)
-     # Extract content after the last addressee
-     content_after = content[last_match.end() :].strip()
-
-     return last_addressee, content_after
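
The hunk above removes a small addressing helper (its 36-line count matches `langroid/parsing/routing.py` in the file list). A minimal usage sketch of the removed function, assuming the pre-0.33 import path shown in that list:

```python
# Hypothetical usage of the removed helper; the import path assumes the
# pre-0.33 package layout listed above.
from langroid.parsing.routing import parse_addressed_message

msg = "thank you @bob, now I will ask @alice again. @alice, where is the mirror?"
addressee, rest = parse_addressed_message(msg, addressing="@")
print(addressee)  # "alice"
print(rest)       # "where is the mirror?"

# No addressee => (None, original content)
print(parse_addressed_message("no mention here"))  # (None, "no mention here")
```
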
@@ -1,275 +0,0 @@
- """
- Utils to search for close matches in (a list of) strings.
- Useful for retrieval of docs/chunks relevant to a query, in the context of
- Retrieval-Augmented Generation (RAG), and SQLChat (e.g., to pull relevant parts of a
- large schema).
- See tests for examples: tests/main/test_string_search.py
- """
-
- import difflib
- from typing import List, Tuple
-
- from nltk.corpus import stopwords
- from nltk.stem import WordNetLemmatizer
- from nltk.tokenize import RegexpTokenizer
- from rank_bm25 import BM25Okapi
- from thefuzz import fuzz, process
-
- from langroid.mytypes import Document
-
- from .utils import download_nltk_resource
-
-
- def find_fuzzy_matches_in_docs(
-     query: str,
-     docs: List[Document],
-     docs_clean: List[Document],
-     k: int,
-     words_before: int | None = None,
-     words_after: int | None = None,
- ) -> List[Tuple[Document, float]]:
-     """
-     Find approximate matches of the query in the docs and return surrounding
-     characters.
-
-     Args:
-         query (str): The search string.
-         docs (List[Document]): List of Document objects to search through.
-         docs_clean (List[Document]): List of Document objects with cleaned content.
-         k (int): Number of best matches to return.
-         words_before (int|None): Number of words to include before each match.
-             Default None => return max
-         words_after (int|None): Number of words to include after each match.
-             Default None => return max
-
-     Returns:
-         List[Tuple[Document,float]]: List of (Document, score) tuples.
-     """
-     if len(docs) == 0:
-         return []
-     best_matches = process.extract(
-         query,
-         [d.content for d in docs_clean],
-         limit=k,
-         scorer=fuzz.partial_ratio,
-     )
-
-     real_matches = [(m, score) for m, score in best_matches if score > 50]
-     # find the original docs that corresponding to the matches
-     orig_doc_matches = []
-     for i, (m, s) in enumerate(real_matches):
-         for j, doc_clean in enumerate(docs_clean):
-             if m in doc_clean.content:
-                 orig_doc_matches.append((docs[j], s))
-                 break
-     if words_after is None and words_before is None:
-         return orig_doc_matches
-     if len(orig_doc_matches) == 0:
-         return []
-     if set(orig_doc_matches[0][0].__fields__) != {"content", "metadata"}:
-         # If there are fields beyond just content and metadata,
-         # we do NOT want to create new document objects with content fields
-         # based on words_before and words_after, since we don't know how to
-         # set those other fields.
-         return orig_doc_matches
-
-     contextual_matches = []
-     for match, score in orig_doc_matches:
-         choice_text = match.content
-         contexts = []
-         while choice_text != "":
-             context, start_pos, end_pos = get_context(
-                 query, choice_text, words_before, words_after
-             )
-             if context == "" or end_pos == 0:
-                 break
-             contexts.append(context)
-             words = choice_text.split()
-             end_pos = min(end_pos, len(words))
-             choice_text = " ".join(words[end_pos:])
-         if len(contexts) > 0:
-             contextual_matches.append(
-                 (
-                     Document(
-                         content=" ... ".join(contexts),
-                         metadata=match.metadata,
-                     ),
-                     score,
-                 )
-             )
-
-     return contextual_matches
-
-
- def preprocess_text(text: str) -> str:
-     """
-     Preprocesses the given text by:
-     1. Lowercasing all words.
-     2. Tokenizing (splitting the text into words).
-     3. Removing punctuation.
-     4. Removing stopwords.
-     5. Lemmatizing words.
-
-     Args:
-         text (str): The input text.
-
-     Returns:
-         str: The preprocessed text.
-     """
-     # Ensure the NLTK resources are available
-     for resource in ["punkt", "wordnet", "stopwords"]:
-         download_nltk_resource(resource)
-
-     # Lowercase the text
-     text = text.lower()
-
-     # Tokenize the text and remove punctuation
-     tokenizer = RegexpTokenizer(r"\w+")
-     tokens = tokenizer.tokenize(text)
-
-     # Remove stopwords
-     stop_words = set(stopwords.words("english"))
-     tokens = [t for t in tokens if t not in stop_words]
-
-     # Lemmatize words
-     lemmatizer = WordNetLemmatizer()
-     tokens = [lemmatizer.lemmatize(t) for t in tokens]
-
-     # Join the words back into a string
-     text = " ".join(tokens)
-
-     return text
-
-
- def find_closest_matches_with_bm25(
-     docs: List[Document],
-     docs_clean: List[Document],
-     query: str,
-     k: int = 5,
- ) -> List[Tuple[Document, float]]:
-     """
-     Finds the k closest approximate matches using the BM25 algorithm.
-
-     Args:
-         docs (List[Document]): List of Documents to search through.
-         docs_clean (List[Document]): List of cleaned Documents
-         query (str): The search query.
-         k (int, optional): Number of matches to retrieve. Defaults to 5.
-
-     Returns:
-         List[Tuple[Document,float]]: List of (Document, score) tuples.
-     """
-     if len(docs) == 0:
-         return []
-     texts = [doc.content for doc in docs_clean]
-     query = preprocess_text(query)
-
-     text_words = [text.split() for text in texts]
-
-     bm25 = BM25Okapi(text_words)
-     query_words = query.split()
-     doc_scores = bm25.get_scores(query_words)
-
-     # Get indices of top k scores
-     top_indices = sorted(range(len(doc_scores)), key=lambda i: -doc_scores[i])[:k]
-
-     # return the original docs, based on the scores from cleaned docs
-     return [(docs[i], doc_scores[i]) for i in top_indices]
-
-
- def get_context(
-     query: str,
-     text: str,
-     words_before: int | None = 100,
-     words_after: int | None = 100,
- ) -> Tuple[str, int, int]:
-     """
-     Returns a portion of text containing the best approximate match of the query,
-     including b words before and a words after the match.
-
-     Args:
-         query (str): The string to search for.
-         text (str): The body of text in which to search.
-         b (int): The number of words before the query to return.
-         a (int): The number of words after the query to return.
-
-     Returns:
-         str: A string containing b words before, the match, and a words after
-             the best approximate match position of the query in the text. If no
-             match is found, returns empty string.
-         int: The start position of the match in the text.
-         int: The end position of the match in the text.
-
-     Example:
-     >>> get_context("apple", "The quick brown fox jumps over the apple.", 3, 2)
-     # 'fox jumps over the apple.'
-     """
-     if words_after is None and words_before is None:
-         # return entire text since we're not asked to return a bounded context
-         return text, 0, 0
-
-     # make sure there is a good enough match to the query
-     if fuzz.partial_ratio(query, text) < 40:
-         return "", 0, 0
-
-     sequence_matcher = difflib.SequenceMatcher(None, text, query)
-     match = sequence_matcher.find_longest_match(0, len(text), 0, len(query))
-
-     if match.size == 0:
-         return "", 0, 0
-
-     segments = text.split()
-     n_segs = len(segments)
-
-     start_segment_pos = len(text[: match.a].split())
-
-     words_before = words_before or n_segs
-     words_after = words_after or n_segs
-     start_pos = max(0, start_segment_pos - words_before)
-     end_pos = min(len(segments), start_segment_pos + words_after + len(query.split()))
-
-     return " ".join(segments[start_pos:end_pos]), start_pos, end_pos
-
-
- def eliminate_near_duplicates(passages: List[str], threshold: float = 0.8) -> List[str]:
-     """
-     Eliminate near duplicate text passages from a given list using MinHash and LSH.
-     TODO: this has not been tested and the datasketch lib is not a dependency.
-     Args:
-         passages (List[str]): A list of text passages.
-         threshold (float, optional): Jaccard similarity threshold to consider two
-             passages as near-duplicates. Default is 0.8.
-
-     Returns:
-         List[str]: A list of passages after eliminating near duplicates.
-
-     Example:
-         passages = ["Hello world", "Hello, world!", "Hi there", "Hello world!"]
-         print(eliminate_near_duplicates(passages))
-         # ['Hello world', 'Hi there']
-     """
-
-     from datasketch import MinHash, MinHashLSH
-
-     # Create LSH index
-     lsh = MinHashLSH(threshold=threshold, num_perm=128)
-
-     # Create MinHash objects for each passage and insert to LSH
-     minhashes = {}
-     for idx, passage in enumerate(passages):
-         m = MinHash(num_perm=128)
-         for word in passage.split():
-             m.update(word.encode("utf-8"))
-         lsh.insert(idx, m)
-         minhashes[idx] = m
-
-     unique_idxs = set()
-     for idx in minhashes.keys():
-         # Query for similar passages (including itself)
-         result = lsh.query(minhashes[idx])
-
-         # If only the passage itself is returned, it's unique
-         if len(result) == 1 and idx in result:
-             unique_idxs.add(idx)
-
-     return [passages[idx] for idx in unique_idxs]
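
This 275-line hunk matches `langroid/parsing/search.py` in the file list; it bundled fuzzy matching (thefuzz), BM25 ranking (rank_bm25), and MinHash-LSH near-duplicate elimination. A minimal sketch of the BM25 ranking step the module wrapped, using `rank_bm25` directly rather than the removed Langroid helpers (the corpus and variable names are illustrative only):

```python
# Standalone sketch of the ranking done inside the removed
# find_closest_matches_with_bm25; uses rank_bm25 directly.
from rank_bm25 import BM25Okapi

corpus = [
    "the quick brown fox jumps over the lazy dog",
    "bm25 ranks documents by term frequency and document length",
    "retrieval augmented generation pulls relevant chunks into the prompt",
]
tokenized = [doc.split() for doc in corpus]

bm25 = BM25Okapi(tokenized)
scores = bm25.get_scores("relevant document retrieval".split())

# Top-k indices by descending score, as in the removed helper
k = 2
top = sorted(range(len(scores)), key=lambda i: -scores[i])[:k]
for i in top:
    print(scores[i], corpus[i])
```
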
@@ -1,102 +0,0 @@
- from typing import List, Set, no_type_check
- from urllib.parse import urlparse
-
- from langroid.exceptions import LangroidImportError
-
- try:
-     from pydispatch import dispatcher
-     from scrapy import signals
-     from scrapy.crawler import CrawlerRunner
-     from scrapy.http import Response
-     from scrapy.linkextractors import LinkExtractor
-     from scrapy.spiders import CrawlSpider, Rule
-     from twisted.internet import defer, reactor
- except ImportError:
-     raise LangroidImportError("scrapy", "scrapy")
-
-
- @no_type_check
- class DomainSpecificSpider(CrawlSpider):  # type: ignore
-     name = "domain_specific_spider"
-
-     custom_settings = {"DEPTH_LIMIT": 1, "CLOSESPIDER_ITEMCOUNT": 20}
-
-     rules = (Rule(LinkExtractor(), callback="parse_item", follow=True),)
-
-     def __init__(self, start_url: str, k: int = 20, *args, **kwargs):  # type: ignore
-         """Initialize the spider with start_url and k.
-
-         Args:
-             start_url (str): The starting URL.
-             k (int, optional): The max desired final URLs. Defaults to 20.
-         """
-         super(DomainSpecificSpider, self).__init__(*args, **kwargs)
-         self.start_urls = [start_url]
-         self.allowed_domains = [urlparse(start_url).netloc]
-         self.k = k
-         self.visited_urls: Set[str] = set()
-
-     def parse_item(self, response: Response):  # type: ignore
-         """Extracts URLs that are within the same domain.
-
-         Args:
-             response: The scrapy response object.
-         """
-         for link in LinkExtractor(allow_domains=self.allowed_domains).extract_links(
-             response
-         ):
-             if len(self.visited_urls) < self.k:
-                 self.visited_urls.add(link.url)
-                 yield {"url": link.url}
-
-
- @no_type_check
- def scrapy_fetch_urls(url: str, k: int = 20) -> List[str]:
-     """Fetches up to k URLs reachable from the input URL using Scrapy.
-
-     Args:
-         url (str): The starting URL.
-         k (int, optional): The max desired final URLs. Defaults to 20.
-
-     Returns:
-         List[str]: List of URLs within the same domain as the input URL.
-     """
-     urls = []
-
-     def _collect_urls(spider):
-         """Handler for the spider_closed signal. Collects the visited URLs."""
-         nonlocal urls
-         urls.extend(list(spider.visited_urls))
-
-     # Connect the spider_closed signal with our handler
-     dispatcher.connect(_collect_urls, signal=signals.spider_closed)
-
-     runner = CrawlerRunner(
-         {
-             "USER_AGENT": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
-         }
-     )
-
-     d = runner.crawl(DomainSpecificSpider, start_url=url, k=k)
-
-     # Block until crawling is done and then stop the reactor
-     crawl_deferred = defer.Deferred()
-
-     def _crawl_done(_):
-         reactor.stop()
-         crawl_deferred.callback(urls)
-
-     d.addBoth(_crawl_done)
-
-     # Start the reactor, it will stop once the crawl is done
-     reactor.run(installSignalHandlers=0)
-
-     # This will block until the deferred gets a result
-     return crawl_deferred.result
-
-
- # Test the function
- if __name__ == "__main__":
-     fetched_urls = scrapy_fetch_urls("https://example.com", 5)
-     for url in fetched_urls:
-         print(url)
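
This 102-line hunk matches `langroid/parsing/spider.py` in the file list; it collected same-domain links with a Scrapy `CrawlSpider` driven by the Twisted reactor. A minimal sketch of how the removed entry point was meant to be called, assuming the pre-0.33 module path and a `scrapy` install:

```python
# Hypothetical call into the removed module (pre-0.33 layout); raises
# LangroidImportError if scrapy is not installed.
from langroid.parsing.spider import scrapy_fetch_urls

# Collect up to 5 same-domain URLs reachable from the start page.
# Note: Scrapy's Twisted reactor is not restartable, so this is a
# one-shot call per process.
urls = scrapy_fetch_urls("https://example.com", k=5)
for u in urls:
    print(u)
```
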
@@ -1,94 +0,0 @@
- from csv import Sniffer
- from typing import List
-
- import pandas as pd
-
-
- def read_tabular_data(path_or_url: str, sep: None | str = None) -> pd.DataFrame:
-     """
-     Reads tabular data from a file or URL and returns a pandas DataFrame.
-     The separator is auto-detected if not specified.
-
-     Args:
-         path_or_url (str): Path or URL to the file to be read.
-
-     Returns:
-         pd.DataFrame: Data from file or URL as a pandas DataFrame.
-
-     Raises:
-         ValueError: If the data cannot be read or is misformatted.
-     """
-     try:
-         if sep is None:
-             # Read the first few lines to guess the separator
-             with pd.io.common.get_handle(path_or_url, "r") as file_handler:
-                 first_lines = "".join(file_handler.handle.readlines(5))
-                 sep = Sniffer().sniff(first_lines).delimiter
-                 # If it's a local file, reset to the beginning
-                 if hasattr(file_handler.handle, "seek"):
-                     file_handler.handle.seek(0)
-
-         # Read the data
-
-         # get non-blank column names
-         with pd.io.common.get_handle(path_or_url, "r") as f:
-             header_line = f.handle.readline().strip()
-             valid_cols = [col for col in header_line.split(sep) if col]
-             valid_cols = [c.replace('"', "").replace("'", "") for c in valid_cols]
-             if hasattr(f.handle, "seek"):
-                 f.handle.seek(0)
-
-         # use only those columns
-         data = pd.read_csv(path_or_url, sep=sep, usecols=valid_cols)
-         data.columns = data.columns.str.strip()  # e.g. " column 1 " -> "column 1"
-
-         return data
-
-     except Exception as e:
-         raise ValueError(
-             "Unable to read data. "
-             "Please ensure it is correctly formatted. Error: " + str(e)
-         )
-
-
- def describe_dataframe(
-     df: pd.DataFrame, filter_fields: List[str] = [], n_vals: int = 10
- ) -> str:
-     """
-     Generates a description of the columns in the dataframe,
-     along with a listing of up to `n_vals` unique values for each column.
-     Intended to be used to insert into an LLM context so it can generate
-     appropriate queries or filters on the df.
-
-     Args:
-         df (pd.DataFrame): The dataframe to describe.
-         filter_fields (list): A list of fields that can be used for filtering.
-             When non-empty, the values-list will be restricted to these.
-         n_vals (int): How many unique values to show for each column.
-
-     Returns:
-         str: A description of the dataframe.
-     """
-     description = []
-     for column in df.columns.to_list():
-         unique_values = df[column].dropna().unique()
-         unique_count = len(unique_values)
-         if column not in filter_fields:
-             values_desc = f"{unique_count} unique values"
-         else:
-             if unique_count > n_vals:
-                 displayed_values = unique_values[:n_vals]
-                 more_count = unique_count - n_vals
-                 values_desc = f" Values - {displayed_values}, ... {more_count} more"
-             else:
-                 values_desc = f" Values - {unique_values}"
-         col_type = "string" if df[column].dtype == "object" else df[column].dtype
-         col_desc = f"* {column} ({col_type}); {values_desc}"
-         description.append(col_desc)
-
-     all_cols = "\n".join(description)
-
-     return f"""
-     Name of each field, its type and unique values (up to {n_vals}):
-     {all_cols}
-     """
@@ -1,111 +0,0 @@
- import logging
- import os
- from tempfile import NamedTemporaryFile
- from typing import List, no_type_check
-
- import requests
- import trafilatura
- from trafilatura.downloads import (
-     add_to_compressed_dict,
-     buffered_downloads,
-     load_download_buffer,
- )
-
- from langroid.mytypes import DocMetaData, Document
- from langroid.parsing.document_parser import DocumentParser, ImagePdfParser
- from langroid.parsing.parser import Parser, ParsingConfig
-
- logging.getLogger("trafilatura").setLevel(logging.ERROR)
-
-
- class URLLoader:
-     """
-     Load a list of URLs and extract the text content.
-     Alternative approaches could use `bs4` or `scrapy`.
-
-     TODO - this currently does not handle cookie dialogs,
-     i.e. if there is a cookie pop-up, most/all of the extracted
-     content could be cookie policy text.
-     We could use `playwright` to simulate a user clicking
-     the "accept" button on the cookie dialog.
-     """
-
-     def __init__(self, urls: List[str], parser: Parser = Parser(ParsingConfig())):
-         self.urls = urls
-         self.parser = parser
-
-     @no_type_check
-     def load(self) -> List[Document]:
-         docs = []
-         threads = 4
-         # converted the input list to an internal format
-         dl_dict = add_to_compressed_dict(self.urls)
-         # processing loop
-         while not dl_dict.done:
-             buffer, dl_dict = load_download_buffer(
-                 dl_dict,
-                 sleep_time=5,
-             )
-             for url, result in buffered_downloads(buffer, threads):
-                 if (
-                     url.lower().endswith(".pdf")
-                     or url.lower().endswith(".docx")
-                     or url.lower().endswith(".doc")
-                 ):
-                     doc_parser = DocumentParser.create(
-                         url,
-                         self.parser.config,
-                     )
-                     new_chunks = doc_parser.get_doc_chunks()
-                     if len(new_chunks) == 0:
-                         # If the document is empty, try to extract images
-                         img_parser = ImagePdfParser(url, self.parser.config)
-                         new_chunks = img_parser.get_doc_chunks()
-                     docs.extend(new_chunks)
-                 else:
-                     # Try to detect content type and handle accordingly
-                     headers = requests.head(url).headers
-                     content_type = headers.get("Content-Type", "").lower()
-                     temp_file_suffix = None
-                     if "application/pdf" in content_type:
-                         temp_file_suffix = ".pdf"
-                     elif (
-                         "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
-                         in content_type
-                     ):
-                         temp_file_suffix = ".docx"
-                     elif "application/msword" in content_type:
-                         temp_file_suffix = ".doc"
-
-                     if temp_file_suffix:
-                         # Download the document content
-                         response = requests.get(url)
-                         with NamedTemporaryFile(
-                             delete=False, suffix=temp_file_suffix
-                         ) as temp_file:
-                             temp_file.write(response.content)
-                             temp_file_path = temp_file.name
-                         # Process the downloaded document
-                         doc_parser = DocumentParser.create(
-                             temp_file_path, self.parser.config
-                         )
-                         docs.extend(doc_parser.get_doc_chunks())
-                         # Clean up the temporary file
-                         os.remove(temp_file_path)
-                     else:
-                         text = trafilatura.extract(
-                             result,
-                             no_fallback=False,
-                             favor_recall=True,
-                         )
-                         if (
-                             text is None
-                             and result is not None
-                             and isinstance(result, str)
-                         ):
-                             text = result
-                         if text is not None and text != "":
-                             docs.append(
-                                 Document(content=text, metadata=DocMetaData(source=url))
-                             )
-         return docs
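
This 111-line hunk matches `langroid/parsing/url_loader.py` in the file list; it routed PDF/DOC/DOCX URLs to `DocumentParser` and everything else through `trafilatura`. A minimal sketch of how the loader was used before 0.33, with the import path from the old layout and placeholder URLs:

```python
# Hypothetical usage of the removed URLLoader (pre-0.33 layout);
# the URLs below are placeholders.
from langroid.parsing.url_loader import URLLoader

loader = URLLoader(
    urls=[
        "https://example.com/some-article",  # HTML  -> trafilatura extraction
        "https://example.com/report.pdf",    # PDF   -> DocumentParser chunks
    ]
)
docs = loader.load()
for doc in docs:
    print(doc.metadata.source, len(doc.content))
```
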
@@ -1,73 +0,0 @@
- import logging
- from typing import List, no_type_check
-
- import trafilatura
- from playwright.sync_api import sync_playwright
-
- from langroid.mytypes import DocMetaData, Document
-
- logging.getLogger("trafilatura").setLevel(logging.ERROR)
-
-
- def accept_cookies_and_extract_content(url: str) -> str:
-     with sync_playwright() as playwright:
-         browser = playwright.chromium.launch(headless=True)
-         context = browser.new_context()
-         page = context.new_page()
-         page.goto(url)
-
-         # List of possible selectors or texts on the cookie consent buttons
-         possible_selectors = [
-             'text="Accept"',
-             'text="Agree"',
-             'text="OK"',
-             'text="Continue"',
-         ]
-
-         # Try to click each possible consent button
-         for selector in possible_selectors:
-             try:
-                 page.click(selector)
-                 print(f"Clicked {selector}")
-                 break  # If click is successful, break out of the loop
-             except Exception:
-                 print(f"Could not click {selector}")
-
-         # Extract and return the page's text content
-         content = page.content()
-
-         context.close()
-         browser.close()
-         content_str: str = content if isinstance(content, str) else ""
-         return content_str
-
-
- class URLLoader:
-     """
-     Load a list of URLs and extract the text content.
-     Alternative approaches could use `bs4` or `scrapy`.
-
-     TODO - this currently does not handle cookie dialogs,
-     i.e. if there is a cookie pop-up, most/all of the extracted
-     content could be cookie policy text.
-     We could use `playwright` to simulate a user clicking
-     the "accept" button on the cookie dialog.
-     """
-
-     def __init__(self, urls: List[str]):
-         self.urls = urls
-
-     @no_type_check
-     def load(self) -> List[Document]:
-         docs = []
-         # converted the input list to an internal format
-         for url in self.urls:
-             html_content = accept_cookies_and_extract_content(url)
-             text = trafilatura.extract(
-                 html_content,
-                 no_fallback=False,
-                 favor_recall=True,
-             )
-             if text is not None and text != "":
-                 docs.append(Document(content=text, metadata=DocMetaData(source=url)))
-         return docs
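
This final 73-line hunk matches `langroid/parsing/url_loader_cookies.py` in the file list; it drives a headless Chromium via Playwright to click past common cookie banners before handing the rendered HTML to `trafilatura`. A minimal standalone sketch of that click-then-extract pattern, using Playwright and trafilatura directly rather than the removed module (the URL and selector list are illustrative):

```python
# Standalone sketch of the cookie-click-then-extract pattern used by the
# removed module; requires `playwright` (with Chromium installed) and
# `trafilatura`. The URL and selectors are placeholders.
import trafilatura
from playwright.sync_api import sync_playwright


def fetch_text(url: str) -> str | None:
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        page.goto(url)
        # Best-effort dismissal of a cookie banner
        for selector in ('text="Accept"', 'text="Agree"', 'text="OK"'):
            try:
                page.click(selector, timeout=2000)
                break
            except Exception:
                continue
        html = page.content()
        browser.close()
    # Extract the main article text from the rendered HTML
    return trafilatura.extract(html, no_fallback=False, favor_recall=True)


if __name__ == "__main__":
    print(fetch_text("https://example.com"))
```
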