langroid 0.33.4__py3-none-any.whl → 0.33.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langroid/__init__.py +106 -0
- langroid/agent/__init__.py +41 -0
- langroid/agent/base.py +1983 -0
- langroid/agent/batch.py +398 -0
- langroid/agent/callbacks/__init__.py +0 -0
- langroid/agent/callbacks/chainlit.py +598 -0
- langroid/agent/chat_agent.py +1899 -0
- langroid/agent/chat_document.py +454 -0
- langroid/agent/openai_assistant.py +882 -0
- langroid/agent/special/__init__.py +59 -0
- langroid/agent/special/arangodb/__init__.py +0 -0
- langroid/agent/special/arangodb/arangodb_agent.py +656 -0
- langroid/agent/special/arangodb/system_messages.py +186 -0
- langroid/agent/special/arangodb/tools.py +107 -0
- langroid/agent/special/arangodb/utils.py +36 -0
- langroid/agent/special/doc_chat_agent.py +1466 -0
- langroid/agent/special/lance_doc_chat_agent.py +262 -0
- langroid/agent/special/lance_rag/__init__.py +9 -0
- langroid/agent/special/lance_rag/critic_agent.py +198 -0
- langroid/agent/special/lance_rag/lance_rag_task.py +82 -0
- langroid/agent/special/lance_rag/query_planner_agent.py +260 -0
- langroid/agent/special/lance_tools.py +61 -0
- langroid/agent/special/neo4j/__init__.py +0 -0
- langroid/agent/special/neo4j/csv_kg_chat.py +174 -0
- langroid/agent/special/neo4j/neo4j_chat_agent.py +433 -0
- langroid/agent/special/neo4j/system_messages.py +120 -0
- langroid/agent/special/neo4j/tools.py +32 -0
- langroid/agent/special/relevance_extractor_agent.py +127 -0
- langroid/agent/special/retriever_agent.py +56 -0
- langroid/agent/special/sql/__init__.py +17 -0
- langroid/agent/special/sql/sql_chat_agent.py +654 -0
- langroid/agent/special/sql/utils/__init__.py +21 -0
- langroid/agent/special/sql/utils/description_extractors.py +190 -0
- langroid/agent/special/sql/utils/populate_metadata.py +85 -0
- langroid/agent/special/sql/utils/system_message.py +35 -0
- langroid/agent/special/sql/utils/tools.py +64 -0
- langroid/agent/special/table_chat_agent.py +263 -0
- langroid/agent/task.py +2095 -0
- langroid/agent/tool_message.py +393 -0
- langroid/agent/tools/__init__.py +38 -0
- langroid/agent/tools/duckduckgo_search_tool.py +50 -0
- langroid/agent/tools/file_tools.py +234 -0
- langroid/agent/tools/google_search_tool.py +39 -0
- langroid/agent/tools/metaphor_search_tool.py +68 -0
- langroid/agent/tools/orchestration.py +303 -0
- langroid/agent/tools/recipient_tool.py +235 -0
- langroid/agent/tools/retrieval_tool.py +32 -0
- langroid/agent/tools/rewind_tool.py +137 -0
- langroid/agent/tools/segment_extract_tool.py +41 -0
- langroid/agent/xml_tool_message.py +382 -0
- langroid/cachedb/__init__.py +17 -0
- langroid/cachedb/base.py +58 -0
- langroid/cachedb/momento_cachedb.py +108 -0
- langroid/cachedb/redis_cachedb.py +153 -0
- langroid/embedding_models/__init__.py +39 -0
- langroid/embedding_models/base.py +74 -0
- langroid/embedding_models/models.py +461 -0
- langroid/embedding_models/protoc/__init__.py +0 -0
- langroid/embedding_models/protoc/embeddings.proto +19 -0
- langroid/embedding_models/protoc/embeddings_pb2.py +33 -0
- langroid/embedding_models/protoc/embeddings_pb2.pyi +50 -0
- langroid/embedding_models/protoc/embeddings_pb2_grpc.py +79 -0
- langroid/embedding_models/remote_embeds.py +153 -0
- langroid/exceptions.py +71 -0
- langroid/language_models/__init__.py +53 -0
- langroid/language_models/azure_openai.py +153 -0
- langroid/language_models/base.py +678 -0
- langroid/language_models/config.py +18 -0
- langroid/language_models/mock_lm.py +124 -0
- langroid/language_models/openai_gpt.py +1964 -0
- langroid/language_models/prompt_formatter/__init__.py +16 -0
- langroid/language_models/prompt_formatter/base.py +40 -0
- langroid/language_models/prompt_formatter/hf_formatter.py +132 -0
- langroid/language_models/prompt_formatter/llama2_formatter.py +75 -0
- langroid/language_models/utils.py +151 -0
- langroid/mytypes.py +84 -0
- langroid/parsing/__init__.py +52 -0
- langroid/parsing/agent_chats.py +38 -0
- langroid/parsing/code_parser.py +121 -0
- langroid/parsing/document_parser.py +718 -0
- langroid/parsing/para_sentence_split.py +62 -0
- langroid/parsing/parse_json.py +155 -0
- langroid/parsing/parser.py +313 -0
- langroid/parsing/repo_loader.py +790 -0
- langroid/parsing/routing.py +36 -0
- langroid/parsing/search.py +275 -0
- langroid/parsing/spider.py +102 -0
- langroid/parsing/table_loader.py +94 -0
- langroid/parsing/url_loader.py +111 -0
- langroid/parsing/urls.py +273 -0
- langroid/parsing/utils.py +373 -0
- langroid/parsing/web_search.py +156 -0
- langroid/prompts/__init__.py +9 -0
- langroid/prompts/dialog.py +17 -0
- langroid/prompts/prompts_config.py +5 -0
- langroid/prompts/templates.py +141 -0
- langroid/pydantic_v1/__init__.py +10 -0
- langroid/pydantic_v1/main.py +4 -0
- langroid/utils/__init__.py +19 -0
- langroid/utils/algorithms/__init__.py +3 -0
- langroid/utils/algorithms/graph.py +103 -0
- langroid/utils/configuration.py +98 -0
- langroid/utils/constants.py +30 -0
- langroid/utils/git_utils.py +252 -0
- langroid/utils/globals.py +49 -0
- langroid/utils/logging.py +135 -0
- langroid/utils/object_registry.py +66 -0
- langroid/utils/output/__init__.py +20 -0
- langroid/utils/output/citations.py +41 -0
- langroid/utils/output/printing.py +99 -0
- langroid/utils/output/status.py +40 -0
- langroid/utils/pandas_utils.py +30 -0
- langroid/utils/pydantic_utils.py +602 -0
- langroid/utils/system.py +286 -0
- langroid/utils/types.py +93 -0
- langroid/vector_store/__init__.py +50 -0
- langroid/vector_store/base.py +359 -0
- langroid/vector_store/chromadb.py +214 -0
- langroid/vector_store/lancedb.py +406 -0
- langroid/vector_store/meilisearch.py +299 -0
- langroid/vector_store/momento.py +278 -0
- langroid/vector_store/qdrantdb.py +468 -0
- {langroid-0.33.4.dist-info → langroid-0.33.7.dist-info}/METADATA +95 -94
- langroid-0.33.7.dist-info/RECORD +127 -0
- {langroid-0.33.4.dist-info → langroid-0.33.7.dist-info}/WHEEL +1 -1
- langroid-0.33.4.dist-info/RECORD +0 -7
- langroid-0.33.4.dist-info/entry_points.txt +0 -4
- pyproject.toml +0 -356
- {langroid-0.33.4.dist-info → langroid-0.33.7.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,790 @@
|
|
1
|
+
import itertools
|
2
|
+
import json
|
3
|
+
import logging
|
4
|
+
import os
|
5
|
+
import subprocess
|
6
|
+
import tempfile
|
7
|
+
import time
|
8
|
+
from collections import deque
|
9
|
+
from pathlib import Path
|
10
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
11
|
+
from urllib.parse import urlparse
|
12
|
+
|
13
|
+
from dotenv import load_dotenv
|
14
|
+
from github import Github
|
15
|
+
from github.ContentFile import ContentFile
|
16
|
+
from github.Label import Label
|
17
|
+
from github.Repository import Repository
|
18
|
+
|
19
|
+
from langroid.mytypes import DocMetaData, Document
|
20
|
+
from langroid.parsing.document_parser import DocumentParser, DocumentType
|
21
|
+
from langroid.parsing.parser import Parser, ParsingConfig
|
22
|
+
from langroid.pydantic_v1 import BaseModel, BaseSettings, Field
|
23
|
+
|
24
|
+
logger = logging.getLogger(__name__)
|
25
|
+
|
26
|
+
|
27
|
+
def _get_decoded_content(content_file: ContentFile) -> str:
    """
    Return the text of a GitHub ContentFile.

    Base64-encoded payloads are decoded as UTF-8; "none"-encoded payloads are
    returned as-is. A falsy payload (None/empty) becomes "".

    Raises:
        ValueError: for any other encoding reported by the API.
    """
    encoding = content_file.encoding
    if encoding == "base64":
        text = content_file.decoded_content.decode("utf-8")
    elif encoding == "none":
        text = content_file.content
    else:
        raise ValueError(f"Unsupported encoding: {encoding}")
    return text or ""
|
34
|
+
|
35
|
+
|
36
|
+
def _has_files(directory: str) -> bool:
|
37
|
+
"""
|
38
|
+
Recursively checks if there is at least one file in a directory.
|
39
|
+
"""
|
40
|
+
for dirpath, dirnames, filenames in os.walk(directory):
|
41
|
+
if filenames:
|
42
|
+
return True
|
43
|
+
return False
|
44
|
+
|
45
|
+
|
46
|
+
# Pydantic model for GitHub issue data
class IssueData(BaseModel):
    """
    One GitHub issue, as extracted in `RepoLoader.get_issues()`.

    `size` is taken from a size-label (XS..XXL) on the issue, if present;
    `assignee` is the assignee's login, or None if unassigned.
    """

    state: str = Field(..., description="State of issue e.g. open or closed")
    year: int = Field(..., description="Year issue was created")
    month: int = Field(..., description="Month issue was created")
    day: int = Field(..., description="Day issue was created")
    assignee: Optional[str] = Field(..., description="Assignee of issue")
    size: Optional[str] = Field(..., description="Size of issue, e.g. XS, S, M, L, XXL")
    text: str = Field(..., description="Text of issue, i.e. description body")
|
55
|
+
|
56
|
+
|
57
|
+
def get_issue_size(labels: List[Label]) -> str | None:
    """
    Return the name of the first label that is a size marker
    ("XS", "S", "M", "L", "XL", "XXL"), or None if there is none.
    """
    size_names = ("XS", "S", "M", "L", "XL", "XXL")
    for label in labels:
        if label.name in size_names:
            return label.name
    return None
|
60
|
+
|
61
|
+
|
62
|
+
class RepoLoaderConfig(BaseSettings):
    """
    Configuration for RepoLoader.
    """

    # File types considered "non-code" (prose/docs); consulted by
    # RepoLoader._is_code().
    non_code_types: List[str] = [
        "md",
        "txt",
        "text",
    ]

    # File types to include when loading a repo. Entries are either
    # extensions (without the dot) or extension-less file names such as
    # "Makefile" — see RepoLoader._file_type() for how names map to types.
    file_types: List[str] = [
        "py",
        "md",
        "yml",
        "yaml",
        "txt",
        "text",
        "sh",
        "ini",
        "toml",
        "cfg",
        "json",
        "rst",
        "Makefile",
        "Dockerfile",
    ]

    # Directory (and dot-file) names skipped entirely when traversing a repo.
    exclude_dirs: List[str] = [
        ".gitignore",
        ".gitmodules",
        ".gitattributes",
        ".git",
        ".idea",
        ".vscode",
        ".circleci",
    ]
|
99
|
+
|
100
|
+
|
101
|
+
class RepoLoader:
|
102
|
+
"""
|
103
|
+
Class for recursively getting all file content in a repo.
|
104
|
+
"""
|
105
|
+
|
106
|
+
    def __init__(
        self,
        url: str,
        config: RepoLoaderConfig = RepoLoaderConfig(),
    ):
        """
        Args:
            url: full github url of repo, or just "owner/repo"
            config: configuration for RepoLoader
        """
        self.url = url
        self.config = config
        # Local clone path, filled in from the download log below (if cached)
        # or by clone().
        self.clone_path: Optional[str] = None
        # Persistent JSON log mapping repo url -> local clone path, so repeat
        # runs can reuse an existing clone.
        self.log_file = ".logs/repo_loader/download_log.json"
        os.makedirs(os.path.dirname(self.log_file), exist_ok=True)
        if not os.path.exists(self.log_file):
            # Seed the log with a dummy entry so json.load below always succeeds.
            with open(self.log_file, "w") as f:
                json.dump({"junk": "ignore"}, f)
        with open(self.log_file, "r") as f:
            log = json.load(f)
            if self.url in log and os.path.exists(log[self.url]):
                logger.info(f"Repo Already downloaded in {log[self.url]}")
                self.clone_path = log[self.url]

        # Accept either a full URL or a bare "owner/repo" spec.
        if "github.com" in self.url:
            repo_name = self.url.split("github.com/")[1]
        else:
            repo_name = self.url
        load_dotenv()
        # authenticated calls to github api have higher rate limit
        token = os.getenv("GITHUB_ACCESS_TOKEN")
        g = Github(token)
        self.repo = self._get_repo_with_retry(g, repo_name)
|
139
|
+
|
140
|
+
@staticmethod
|
141
|
+
def _get_repo_with_retry(
|
142
|
+
g: Github, repo_name: str, max_retries: int = 5
|
143
|
+
) -> Repository:
|
144
|
+
"""
|
145
|
+
Get a repo from the GitHub API, retrying if the request fails,
|
146
|
+
with exponential backoff.
|
147
|
+
|
148
|
+
Args:
|
149
|
+
g: GitHub object
|
150
|
+
repo_name: name of repo
|
151
|
+
max_retries: maximum number of retries
|
152
|
+
Returns:
|
153
|
+
Repo: GitHub repo object
|
154
|
+
|
155
|
+
"""
|
156
|
+
base_delay = 2 # base delay in seconds
|
157
|
+
max_delay = 60 # maximum delay in seconds
|
158
|
+
|
159
|
+
for attempt in range(max_retries):
|
160
|
+
try:
|
161
|
+
return g.get_repo(repo_name)
|
162
|
+
except Exception as e:
|
163
|
+
delay = min(max_delay, base_delay * 2**attempt)
|
164
|
+
logger.info(
|
165
|
+
f"Attempt {attempt+1} failed with error: {str(e)}. "
|
166
|
+
f"Retrying in {delay} seconds..."
|
167
|
+
)
|
168
|
+
time.sleep(delay)
|
169
|
+
raise Exception(f"Failed to get repo {repo_name} after {max_retries} attempts.")
|
170
|
+
|
171
|
+
def _get_dir_name(self) -> str:
|
172
|
+
return urlparse(self.url).path.replace("/", "_")
|
173
|
+
|
174
|
+
def get_issues(self, k: int | None = 100) -> List[IssueData]:
|
175
|
+
"""Get up to k issues from the GitHub repo."""
|
176
|
+
if k is None:
|
177
|
+
issues = self.repo.get_issues(state="all")
|
178
|
+
else:
|
179
|
+
issues = self.repo.get_issues(state="all")[:k]
|
180
|
+
issue_data_list = []
|
181
|
+
for issue in issues:
|
182
|
+
issue_data = IssueData(
|
183
|
+
state=issue.state,
|
184
|
+
year=issue.created_at.year,
|
185
|
+
month=issue.created_at.month,
|
186
|
+
day=issue.created_at.day,
|
187
|
+
assignee=issue.assignee.login if issue.assignee else None,
|
188
|
+
size=get_issue_size(issue.labels),
|
189
|
+
text=issue.body or "No issue description body.",
|
190
|
+
)
|
191
|
+
issue_data_list.append(issue_data)
|
192
|
+
|
193
|
+
return issue_data_list
|
194
|
+
|
195
|
+
@staticmethod
|
196
|
+
def _file_type(name: str) -> str:
|
197
|
+
"""
|
198
|
+
Get the file type of a file name.
|
199
|
+
Args:
|
200
|
+
name: name of file, can be "a", "a.b", or ".b"
|
201
|
+
Returns:
|
202
|
+
str: file type; "a" => "a", "a.b" => "b", ".b" => "b"
|
203
|
+
some examples:
|
204
|
+
"Makefile" => "Makefile",
|
205
|
+
"script.py" => "py",
|
206
|
+
".gitignore" => "gitignore"
|
207
|
+
"""
|
208
|
+
# "a" -> ("a", ""), "a.b" -> ("a", ".b"), ".b" -> (".b", "")
|
209
|
+
file_parts = os.path.splitext(name)
|
210
|
+
if file_parts[1] == "":
|
211
|
+
file_type = file_parts[0] # ("a", "") => "a"
|
212
|
+
else:
|
213
|
+
file_type = file_parts[1][1:] # (*,".b") => "b"
|
214
|
+
return file_type
|
215
|
+
|
216
|
+
def _is_code(self, file_type: str) -> bool:
|
217
|
+
"""
|
218
|
+
Check if a file type is code.
|
219
|
+
|
220
|
+
Args:
|
221
|
+
file_type: file type, e.g. "py", "md", "txt"
|
222
|
+
Returns:
|
223
|
+
bool: whether file type is code
|
224
|
+
"""
|
225
|
+
return file_type not in self.config.non_code_types
|
226
|
+
|
227
|
+
def _is_allowed(self, content: ContentFile) -> bool:
|
228
|
+
"""
|
229
|
+
Check if a file or directory content is allowed to be included.
|
230
|
+
|
231
|
+
Args:
|
232
|
+
content (ContentFile): The file or directory Content object.
|
233
|
+
|
234
|
+
Returns:
|
235
|
+
bool: Whether the file or directory is allowed to be included.
|
236
|
+
"""
|
237
|
+
if content.type == "dir":
|
238
|
+
return content.name not in self.config.exclude_dirs
|
239
|
+
elif content.type == "file":
|
240
|
+
return self._file_type(content.name) in self.config.file_types
|
241
|
+
else:
|
242
|
+
return False
|
243
|
+
|
244
|
+
def default_clone_path(self) -> str:
|
245
|
+
return tempfile.mkdtemp(suffix=self._get_dir_name())
|
246
|
+
|
247
|
+
    def clone(self, path: Optional[str] = None) -> Optional[str]:
        """
        Clone a GitHub repository to a local directory specified by `path`,
        if it has not already been cloned.

        Args:
            path (str): The local directory where the repository should be cloned.
                If not specified, a temporary directory will be created.

        Returns:
            str: The path to the local directory where the repository was cloned.
        """
        with open(self.log_file, "r") as f:
            log: Dict[str, str] = json.load(f)

        # Reuse a previously-cloned copy if the log points to a non-empty dir.
        if (
            self.url in log
            and os.path.exists(log[self.url])
            and _has_files(log[self.url])
        ):
            logger.warning(f"Repo Already downloaded in {log[self.url]}")
            self.clone_path = log[self.url]
            return self.clone_path

        self.clone_path = path
        if path is None:
            path = self.default_clone_path()
            self.clone_path = path

        try:
            subprocess.run(["git", "clone", self.url, path], check=True)
            # Record the clone location only after a successful clone.
            log[self.url] = path
            with open(self.log_file, "w") as f:
                json.dump(log, f)
            return self.clone_path
        except subprocess.CalledProcessError as e:
            logger.error(f"Git clone failed: {e}")
        except Exception as e:
            logger.error(f"An error occurred while trying to clone the repository:{e}")

        # NOTE(review): on failure we still return self.clone_path, which may
        # point at an empty/partial directory — callers (e.g. load()) re-check
        # with _has_files(); confirm this best-effort contract is intended.
        return self.clone_path
|
288
|
+
|
289
|
+
    def load_tree_from_github(
        self, depth: int, lines: int = 0
    ) -> Dict[str, Union[str, List[Dict[str, Any]]]]:
        """
        Get a nested dictionary of GitHub repository file and directory names
        up to a certain depth, with file contents.

        Args:
            depth (int): The depth level.
            lines (int): The number of lines of file contents to include.
                Note: with the default 0, every file's "content" is "".

        Returns:
            Dict[str, Union[str, List[Dict]]]:
                A dictionary containing file and directory names, with file contents.
        """
        root_contents = self.repo.get_contents("")
        # get_contents returns a single item for a file path; normalize to list.
        if not isinstance(root_contents, list):
            root_contents = [root_contents]
        repo_structure = {
            "type": "dir",
            "name": "",
            "dirs": [],
            "files": [],
            "path": "",
        }

        # A queue of tuples (current_node, current_depth, parent_structure):
        # breadth-first traversal, attaching children into parent_structure.
        queue = deque([(root_contents, 0, repo_structure)])

        while queue:
            current_node, current_depth, parent_structure = queue.popleft()

            for content in current_node:
                # Skip excluded dirs and disallowed file types (see _is_allowed).
                if not self._is_allowed(content):
                    continue
                if content.type == "dir" and current_depth < depth:
                    # Create a new sub-dictionary for this directory
                    new_dir = {
                        "type": "dir",
                        "name": content.name,
                        "dirs": [],
                        "files": [],
                        "path": content.path,
                    }
                    parent_structure["dirs"].append(new_dir)
                    contents = self.repo.get_contents(content.path)
                    if not isinstance(contents, list):
                        contents = [contents]
                    queue.append(
                        (
                            contents,
                            current_depth + 1,
                            new_dir,
                        )
                    )
                elif content.type == "file":
                    # Truncate content to the first `lines` lines.
                    file_content = "\n".join(
                        _get_decoded_content(content).splitlines()[:lines]
                    )
                    file_dict = {
                        "type": "file",
                        "name": content.name,
                        "content": file_content,
                        "path": content.path,
                    }
                    parent_structure["files"].append(file_dict)

        return repo_structure
|
357
|
+
|
358
|
+
    def load(
        self,
        path: Optional[str] = None,
        depth: int = 3,
        lines: int = 0,
    ) -> Tuple[Dict[str, Union[str, List[Dict[str, Any]]]], List[Document]]:
        """
        From a local folder `path` (if None, the repo clone path), get:
          a nested dictionary (tree) of dicts, files and contents
          a list of Document objects for each file.

        Args:
            path (str): The local folder path; if None, use self.clone_path
                (cloning the repo first if needed).
            depth (int): The depth level.
            lines (int): The number of lines of file contents to include.

        Returns:
            Tuple of (dict, List_of_Documents):
                A dictionary containing file and directory names, with file
                contents, and a list of Document objects for each file.

        Raises:
            ValueError: if no path was given and the repo could not be cloned.
        """
        if path is None:
            # Clone (or re-clone) if there is no usable local copy yet.
            if self.clone_path is None or not _has_files(self.clone_path):
                self.clone()
            path = self.clone_path
        if path is None:
            raise ValueError("Unable to clone repo")
        # Delegate to the static folder loader, restricted by this loader's config.
        return self.load_from_folder(
            path=path,
            depth=depth,
            lines=lines,
            file_types=self.config.file_types,
            exclude_dirs=self.config.exclude_dirs,
            url=self.url,
        )
|
393
|
+
|
394
|
+
    @staticmethod
    def load_from_folder(
        path: str,
        depth: int = 3,
        lines: int = 0,
        file_types: Optional[List[str]] = None,
        exclude_dirs: Optional[List[str]] = None,
        url: str = "",
    ) -> Tuple[Dict[str, Union[str, List[Dict[str, Any]]]], List[Document]]:
        """
        From a local folder `path` (required), get:
          a nested dictionary (tree) of dicts, files and contents, restricting to
          desired file_types and excluding undesired directories.
          a list of Document objects for each file.

        Args:
            path (str): The local folder path, required.
            depth (int): The depth level. Optional, default 3.
            lines (int): The number of lines of file contents to include.
                Optional, default 0 (no lines => empty string).
                NOTE(review): with the default 0, every file reads as empty and
                is therefore skipped entirely — confirm callers always pass
                lines > 0 when they want file entries.
            file_types (List[str]): The file types to include.
                Optional, default None (all).
            exclude_dirs (List[str]): The directories to exclude.
                Optional, default None (no exclusions).
            url (str): Optional url, to be stored in docs as metadata. Default "".

        Returns:
            Tuple of (dict, List_of_Documents):
                A dictionary containing file and directory names, with file contents.
                A list of Document objects for each file.
        """

        folder_structure = {
            "type": "dir",
            "name": "",
            "dirs": [],
            "files": [],
            "path": "",
        }
        # A queue of tuples (current_path, current_depth, parent_structure):
        # breadth-first walk of the folder tree.
        queue = deque([(path, 0, folder_structure)])
        docs = []
        exclude_dirs = exclude_dirs or []
        while queue:
            current_path, current_depth, parent_structure = queue.popleft()

            for item in os.listdir(current_path):
                item_path = os.path.join(current_path, item)
                relative_path = os.path.relpath(item_path, path)
                # Skip excluded directories and (when file_types is given)
                # files whose type is not wanted.
                if (os.path.isdir(item_path) and item in exclude_dirs) or (
                    os.path.isfile(item_path)
                    and file_types is not None
                    and RepoLoader._file_type(item) not in file_types
                ):
                    continue

                if os.path.isdir(item_path) and current_depth < depth:
                    # Create a new sub-dictionary for this directory
                    new_dir = {
                        "type": "dir",
                        "name": item,
                        "dirs": [],
                        "files": [],
                        "path": relative_path,
                    }
                    parent_structure["dirs"].append(new_dir)
                    queue.append((item_path, current_depth + 1, new_dir))
                elif os.path.isfile(item_path):
                    # Add the file to the current dictionary.
                    # Only the first `lines` lines are kept; note that
                    # line.strip() below discards leading indentation.
                    with open(item_path, "r") as f:
                        file_lines = list(itertools.islice(f, lines))
                    file_content = "\n".join(line.strip() for line in file_lines)
                    # Empty (or fully truncated) files are omitted entirely.
                    if file_content == "":
                        continue

                    file_dict = {
                        "type": "file",
                        "name": item,
                        "content": file_content,
                        "path": relative_path,
                    }
                    parent_structure["files"].append(file_dict)
                    docs.append(
                        Document(
                            content=file_content,
                            metadata=DocMetaData(
                                repo=url,
                                source=relative_path,
                                url=url,
                                filename=item,
                                extension=RepoLoader._file_type(item),
                                language=RepoLoader._file_type(item),
                            ),
                        )
                    )
        return folder_structure, docs
|
490
|
+
|
491
|
+
    @staticmethod
    def get_documents(
        path: str | bytes,
        # NOTE(review): this default Parser is built once at import time and
        # shared across all calls — confirm Parser is stateless.
        parser: Parser = Parser(ParsingConfig()),
        file_types: Optional[List[str]] = None,
        exclude_dirs: Optional[List[str]] = None,
        depth: int = -1,
        lines: Optional[int] = None,
        doc_type: str | DocumentType | None = None,
    ) -> List[Document]:
        """
        Recursively get all files under a path as Document objects.

        Args:
            path (str|bytes): The path to the directory or file, or bytes content.
                The bytes option is meant to support the case where the content
                has already been read from a file in an upstream process
                (e.g. from an API or a database), and we want to avoid having to
                write it to a temporary file just to read it again.
                (which can be very slow for large files,
                especially in a docker container)
            parser (Parser): Parser to use to parse files.
            file_types (List[str], optional): List of file extensions OR
                filenames OR file_path_names to include.
                Defaults to None, which includes all files.
            exclude_dirs (List[str], optional): List of directories to exclude.
                Defaults to None, which includes all directories.
            depth (int, optional): Max depth of recursion. Defaults to -1,
                which includes all depths.
            lines (int, optional): Number of lines to read from each file.
                Defaults to None, which reads all lines.
            doc_type (str|DocumentType, optional): The type of document to parse.

        Returns:
            List[Document]: List of Document objects representing files.

        """
        docs = []
        file_paths = []
        if isinstance(path, bytes):
            # Raw content: hand the bytes straight to the parser below.
            file_paths.append(path)
        else:
            path_obj = Path(path).resolve()

            if path_obj.is_file():
                file_paths.append(str(path_obj))
            else:
                path_depth = len(path_obj.parts)
                for root, dirs, files in os.walk(path):
                    # Exclude directories if needed; pruning dirs in place
                    # stops os.walk from descending into them.
                    if exclude_dirs:
                        dirs[:] = [d for d in dirs if d not in exclude_dirs]

                    current_depth = len(Path(root).resolve().parts) - path_depth
                    if depth == -1 or current_depth <= depth:
                        for file in files:
                            file_path = str(Path(root) / file)
                            # file_types entries may be extensions, bare
                            # filenames, or full paths.
                            if (
                                file_types is None
                                or RepoLoader._file_type(file_path) in file_types
                                or os.path.basename(file_path) in file_types
                                or file_path in file_types
                            ):
                                file_paths.append(file_path)

        for file_path in file_paths:
            docs.extend(
                DocumentParser.chunks_from_path_or_bytes(
                    file_path,
                    parser,
                    doc_type=doc_type,
                    lines=lines,
                )
            )
        return docs
|
566
|
+
|
567
|
+
    def load_docs_from_github(
        self,
        k: Optional[int] = None,
        depth: Optional[int] = None,
        lines: Optional[int] = None,
    ) -> List[Document]:
        """
        Directly from GitHub, recursively get all files in a repo that have one of the
        extensions, possibly up to a max number of files, max depth, and max number
        of lines per file (if any of these are specified).

        Args:
            k (int): max number of files to load, or None for all files
            depth (int): max depth to recurse, or None for infinite depth
            lines (int): max number of lines to get, from a file, or None for all lines

        Returns:
            list of Document objects, each has fields `content` and `metadata`,
            and `metadata` has fields `url`, `filename`, `extension`, `language`
        """
        contents = self.repo.get_contents("")
        if not isinstance(contents, list):
            contents = [contents]
        stack = list(zip(contents, [0] * len(contents)))  # stack of (content, depth)
        # recursively get all files in repo that have one of the extensions
        docs = []
        i = 0  # number of files loaded so far (for the k-cap)

        # Depth-first traversal via an explicit stack.
        while stack:
            if k is not None and i == k:
                break
            file_content, d = stack.pop()
            if not self._is_allowed(file_content):
                continue
            if file_content.type == "dir":
                if depth is None or d <= depth:
                    items = self.repo.get_contents(file_content.path)
                    if not isinstance(items, list):
                        items = [items]
                    stack.extend(list(zip(items, [d + 1] * len(items))))
            else:
                if depth is None or d <= depth:
                    # need to decode the file content, which is in bytes;
                    # re-fetch by path so the payload is populated.
                    contents = self.repo.get_contents(file_content.path)
                    if isinstance(contents, list):
                        contents = contents[0]
                    text = _get_decoded_content(contents)
                    if lines is not None:
                        text = "\n".join(text.split("\n")[:lines])
                    i += 1

                    # Note `source` is important, it may be used to cite
                    # evidence for an answer.
                    # See URLLoader
                    # TODO we should use Pydantic to enforce/standardize this

                    docs.append(
                        Document(
                            content=text,
                            metadata=DocMetaData(
                                repo=self.url,
                                source=file_content.html_url,
                                url=file_content.html_url,
                                filename=file_content.name,
                                extension=self._file_type(file_content.name),
                                language=self._file_type(file_content.name),
                            ),
                        )
                    )
        return docs
|
637
|
+
|
638
|
+
@staticmethod
|
639
|
+
def select(
|
640
|
+
structure: Dict[str, Union[str, List[Dict[str, Any]]]],
|
641
|
+
includes: List[str],
|
642
|
+
excludes: List[str] = [],
|
643
|
+
) -> Dict[str, Union[str, List[Dict[str, Any]]]]:
|
644
|
+
"""
|
645
|
+
Filter a structure dictionary for certain directories and files.
|
646
|
+
|
647
|
+
Args:
|
648
|
+
structure (Dict[str, Union[str, List[Dict]]]): The structure dictionary.
|
649
|
+
includes (List[str]): A list of desired directories and files.
|
650
|
+
For files, either full file names or "file type" can be specified.
|
651
|
+
E.g. "toml" will include all files with the ".toml" extension,
|
652
|
+
or "Makefile" will include all files named "Makefile".
|
653
|
+
excludes (List[str]): A list of directories and files to exclude.
|
654
|
+
Similar to `includes`, full file/dir names or "file type" can be
|
655
|
+
specified. Optional, defaults to empty list.
|
656
|
+
|
657
|
+
|
658
|
+
Returns:
|
659
|
+
Dict[str, Union[str, List[Dict]]]: The filtered structure dictionary.
|
660
|
+
"""
|
661
|
+
filtered_structure = {
|
662
|
+
"type": structure["type"],
|
663
|
+
"name": structure["name"],
|
664
|
+
"dirs": [],
|
665
|
+
"files": [],
|
666
|
+
"path": structure["path"],
|
667
|
+
}
|
668
|
+
|
669
|
+
for dir in structure["dirs"]:
|
670
|
+
if (
|
671
|
+
dir["name"] in includes
|
672
|
+
or RepoLoader._file_type(dir["name"]) in includes
|
673
|
+
) and (
|
674
|
+
dir["name"] not in excludes
|
675
|
+
and RepoLoader._file_type(dir["name"]) not in excludes
|
676
|
+
):
|
677
|
+
# If the directory is in the select list, include the whole subtree
|
678
|
+
filtered_structure["dirs"].append(dir)
|
679
|
+
else:
|
680
|
+
# Otherwise, filter the directory's contents
|
681
|
+
filtered_dir = RepoLoader.select(dir, includes)
|
682
|
+
if (
|
683
|
+
filtered_dir["dirs"] or filtered_dir["files"]
|
684
|
+
): # only add if not empty
|
685
|
+
filtered_structure["dirs"].append(filtered_dir)
|
686
|
+
|
687
|
+
for file in structure["files"]:
|
688
|
+
if (
|
689
|
+
file["name"] in includes
|
690
|
+
or RepoLoader._file_type(file["name"]) in includes
|
691
|
+
) and (
|
692
|
+
file["name"] not in excludes
|
693
|
+
and RepoLoader._file_type(file["name"]) not in excludes
|
694
|
+
):
|
695
|
+
filtered_structure["files"].append(file)
|
696
|
+
|
697
|
+
return filtered_structure
|
698
|
+
|
699
|
+
@staticmethod
|
700
|
+
def ls(structure: Dict[str, Union[str, List[Dict]]], depth: int = 0) -> List[str]:
|
701
|
+
"""
|
702
|
+
Get a list of names of files or directories up to a certain depth from a
|
703
|
+
structure dictionary.
|
704
|
+
|
705
|
+
Args:
|
706
|
+
structure (Dict[str, Union[str, List[Dict]]]): The structure dictionary.
|
707
|
+
depth (int, optional): The depth level. Defaults to 0.
|
708
|
+
|
709
|
+
Returns:
|
710
|
+
List[str]: A list of names of files or directories.
|
711
|
+
"""
|
712
|
+
names = []
|
713
|
+
|
714
|
+
# A queue of tuples (current_structure, current_depth)
|
715
|
+
queue = deque([(structure, 0)])
|
716
|
+
|
717
|
+
while queue:
|
718
|
+
current_structure, current_depth = queue.popleft()
|
719
|
+
|
720
|
+
if current_depth <= depth:
|
721
|
+
names.append(current_structure["name"])
|
722
|
+
|
723
|
+
for dir in current_structure["dirs"]:
|
724
|
+
queue.append((dir, current_depth + 1))
|
725
|
+
|
726
|
+
for file in current_structure["files"]:
|
727
|
+
# add file names only if depth is less than the limit
|
728
|
+
if current_depth < depth:
|
729
|
+
names.append(file["name"])
|
730
|
+
names = [n for n in names if n not in ["", None]]
|
731
|
+
return names
|
732
|
+
|
733
|
+
@staticmethod
|
734
|
+
def list_files(
|
735
|
+
dir: str,
|
736
|
+
depth: int = 1,
|
737
|
+
include_types: List[str] = [],
|
738
|
+
exclude_types: List[str] = [],
|
739
|
+
) -> List[str]:
|
740
|
+
"""
|
741
|
+
Recursively list all files in a directory, up to a certain depth.
|
742
|
+
|
743
|
+
Args:
|
744
|
+
dir (str): The directory path, relative to root.
|
745
|
+
depth (int, optional): The depth level. Defaults to 1.
|
746
|
+
include_types (List[str], optional): A list of file types to include.
|
747
|
+
Defaults to empty list.
|
748
|
+
exclude_types (List[str], optional): A list of file types to exclude.
|
749
|
+
Defaults to empty list.
|
750
|
+
Returns:
|
751
|
+
List[str]: A list of file names.
|
752
|
+
"""
|
753
|
+
depth = depth if depth >= 0 else 200
|
754
|
+
output = []
|
755
|
+
|
756
|
+
for root, dirs, files in os.walk(dir):
|
757
|
+
if root.count(os.sep) - dir.count(os.sep) < depth:
|
758
|
+
level = root.count(os.sep) - dir.count(os.sep)
|
759
|
+
sub_indent = " " * 4 * (level + 1)
|
760
|
+
for d in dirs:
|
761
|
+
output.append("{}{}/".format(sub_indent, d))
|
762
|
+
for f in files:
|
763
|
+
if include_types and RepoLoader._file_type(f) not in include_types:
|
764
|
+
continue
|
765
|
+
if exclude_types and RepoLoader._file_type(f) in exclude_types:
|
766
|
+
continue
|
767
|
+
output.append("{}{}".format(sub_indent, f))
|
768
|
+
return output
|
769
|
+
|
770
|
+
    @staticmethod
    def show_file_contents(tree: Dict[str, Union[str, List[Dict[str, Any]]]]) -> str:
        """
        Render the contents of all files from a structure dictionary as one
        string, each file preceded by its path and a separator line.

        Args:
            tree (Dict[str, Union[str, List[Dict]]]): The structure dictionary.

        Returns:
            str: concatenated, formatted contents of all files in the tree
            (subdirectories first, depth-first).
        """
        contents = ""
        # Recurse into subdirectories before listing this level's files.
        for dir in tree["dirs"]:
            contents += RepoLoader.show_file_contents(dir)
        for file in tree["files"]:
            path = file["path"]
            contents += f"""
            {path}:
            --------------------
            {file["content"]}

            """

        return contents
|