langroid 0.31.1__py3-none-any.whl → 0.33.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (163)
  1. {langroid-0.31.1.dist-info → langroid-0.33.3.dist-info}/METADATA +150 -124
  2. langroid-0.33.3.dist-info/RECORD +7 -0
  3. {langroid-0.31.1.dist-info → langroid-0.33.3.dist-info}/WHEEL +1 -1
  4. langroid-0.33.3.dist-info/entry_points.txt +4 -0
  5. pyproject.toml +317 -212
  6. langroid/__init__.py +0 -106
  7. langroid/agent/.chainlit/config.toml +0 -121
  8. langroid/agent/.chainlit/translations/bn.json +0 -231
  9. langroid/agent/.chainlit/translations/en-US.json +0 -229
  10. langroid/agent/.chainlit/translations/gu.json +0 -231
  11. langroid/agent/.chainlit/translations/he-IL.json +0 -231
  12. langroid/agent/.chainlit/translations/hi.json +0 -231
  13. langroid/agent/.chainlit/translations/kn.json +0 -231
  14. langroid/agent/.chainlit/translations/ml.json +0 -231
  15. langroid/agent/.chainlit/translations/mr.json +0 -231
  16. langroid/agent/.chainlit/translations/ta.json +0 -231
  17. langroid/agent/.chainlit/translations/te.json +0 -231
  18. langroid/agent/.chainlit/translations/zh-CN.json +0 -229
  19. langroid/agent/__init__.py +0 -41
  20. langroid/agent/base.py +0 -1981
  21. langroid/agent/batch.py +0 -398
  22. langroid/agent/callbacks/__init__.py +0 -0
  23. langroid/agent/callbacks/chainlit.py +0 -598
  24. langroid/agent/chat_agent.py +0 -1899
  25. langroid/agent/chat_document.py +0 -454
  26. langroid/agent/helpers.py +0 -0
  27. langroid/agent/junk +0 -13
  28. langroid/agent/openai_assistant.py +0 -882
  29. langroid/agent/special/__init__.py +0 -59
  30. langroid/agent/special/arangodb/__init__.py +0 -0
  31. langroid/agent/special/arangodb/arangodb_agent.py +0 -656
  32. langroid/agent/special/arangodb/system_messages.py +0 -186
  33. langroid/agent/special/arangodb/tools.py +0 -107
  34. langroid/agent/special/arangodb/utils.py +0 -36
  35. langroid/agent/special/doc_chat_agent.py +0 -1466
  36. langroid/agent/special/lance_doc_chat_agent.py +0 -262
  37. langroid/agent/special/lance_rag/__init__.py +0 -9
  38. langroid/agent/special/lance_rag/critic_agent.py +0 -198
  39. langroid/agent/special/lance_rag/lance_rag_task.py +0 -82
  40. langroid/agent/special/lance_rag/query_planner_agent.py +0 -260
  41. langroid/agent/special/lance_tools.py +0 -61
  42. langroid/agent/special/neo4j/__init__.py +0 -0
  43. langroid/agent/special/neo4j/csv_kg_chat.py +0 -174
  44. langroid/agent/special/neo4j/neo4j_chat_agent.py +0 -433
  45. langroid/agent/special/neo4j/system_messages.py +0 -120
  46. langroid/agent/special/neo4j/tools.py +0 -32
  47. langroid/agent/special/relevance_extractor_agent.py +0 -127
  48. langroid/agent/special/retriever_agent.py +0 -56
  49. langroid/agent/special/sql/__init__.py +0 -17
  50. langroid/agent/special/sql/sql_chat_agent.py +0 -654
  51. langroid/agent/special/sql/utils/__init__.py +0 -21
  52. langroid/agent/special/sql/utils/description_extractors.py +0 -190
  53. langroid/agent/special/sql/utils/populate_metadata.py +0 -85
  54. langroid/agent/special/sql/utils/system_message.py +0 -35
  55. langroid/agent/special/sql/utils/tools.py +0 -64
  56. langroid/agent/special/table_chat_agent.py +0 -263
  57. langroid/agent/structured_message.py +0 -9
  58. langroid/agent/task.py +0 -2093
  59. langroid/agent/tool_message.py +0 -393
  60. langroid/agent/tools/__init__.py +0 -38
  61. langroid/agent/tools/duckduckgo_search_tool.py +0 -50
  62. langroid/agent/tools/file_tools.py +0 -234
  63. langroid/agent/tools/google_search_tool.py +0 -39
  64. langroid/agent/tools/metaphor_search_tool.py +0 -67
  65. langroid/agent/tools/orchestration.py +0 -303
  66. langroid/agent/tools/recipient_tool.py +0 -235
  67. langroid/agent/tools/retrieval_tool.py +0 -32
  68. langroid/agent/tools/rewind_tool.py +0 -137
  69. langroid/agent/tools/segment_extract_tool.py +0 -41
  70. langroid/agent/typed_task.py +0 -19
  71. langroid/agent/xml_tool_message.py +0 -382
  72. langroid/agent_config.py +0 -0
  73. langroid/cachedb/__init__.py +0 -17
  74. langroid/cachedb/base.py +0 -58
  75. langroid/cachedb/momento_cachedb.py +0 -108
  76. langroid/cachedb/redis_cachedb.py +0 -153
  77. langroid/embedding_models/__init__.py +0 -39
  78. langroid/embedding_models/base.py +0 -74
  79. langroid/embedding_models/clustering.py +0 -189
  80. langroid/embedding_models/models.py +0 -461
  81. langroid/embedding_models/protoc/__init__.py +0 -0
  82. langroid/embedding_models/protoc/embeddings.proto +0 -19
  83. langroid/embedding_models/protoc/embeddings_pb2.py +0 -33
  84. langroid/embedding_models/protoc/embeddings_pb2.pyi +0 -50
  85. langroid/embedding_models/protoc/embeddings_pb2_grpc.py +0 -79
  86. langroid/embedding_models/remote_embeds.py +0 -153
  87. langroid/exceptions.py +0 -65
  88. langroid/experimental/team-save.py +0 -391
  89. langroid/language_models/.chainlit/config.toml +0 -121
  90. langroid/language_models/.chainlit/translations/en-US.json +0 -231
  91. langroid/language_models/__init__.py +0 -53
  92. langroid/language_models/azure_openai.py +0 -153
  93. langroid/language_models/base.py +0 -678
  94. langroid/language_models/config.py +0 -18
  95. langroid/language_models/mock_lm.py +0 -124
  96. langroid/language_models/openai_gpt.py +0 -1923
  97. langroid/language_models/prompt_formatter/__init__.py +0 -16
  98. langroid/language_models/prompt_formatter/base.py +0 -40
  99. langroid/language_models/prompt_formatter/hf_formatter.py +0 -132
  100. langroid/language_models/prompt_formatter/llama2_formatter.py +0 -75
  101. langroid/language_models/utils.py +0 -147
  102. langroid/mytypes.py +0 -84
  103. langroid/parsing/__init__.py +0 -52
  104. langroid/parsing/agent_chats.py +0 -38
  105. langroid/parsing/code-parsing.md +0 -86
  106. langroid/parsing/code_parser.py +0 -121
  107. langroid/parsing/config.py +0 -0
  108. langroid/parsing/document_parser.py +0 -718
  109. langroid/parsing/image_text.py +0 -32
  110. langroid/parsing/para_sentence_split.py +0 -62
  111. langroid/parsing/parse_json.py +0 -155
  112. langroid/parsing/parser.py +0 -313
  113. langroid/parsing/repo_loader.py +0 -790
  114. langroid/parsing/routing.py +0 -36
  115. langroid/parsing/search.py +0 -275
  116. langroid/parsing/spider.py +0 -102
  117. langroid/parsing/table_loader.py +0 -94
  118. langroid/parsing/url_loader.py +0 -111
  119. langroid/parsing/url_loader_cookies.py +0 -73
  120. langroid/parsing/urls.py +0 -273
  121. langroid/parsing/utils.py +0 -373
  122. langroid/parsing/web_search.py +0 -155
  123. langroid/prompts/__init__.py +0 -9
  124. langroid/prompts/chat-gpt4-system-prompt.md +0 -68
  125. langroid/prompts/dialog.py +0 -17
  126. langroid/prompts/prompts_config.py +0 -5
  127. langroid/prompts/templates.py +0 -141
  128. langroid/pydantic_v1/__init__.py +0 -10
  129. langroid/pydantic_v1/main.py +0 -4
  130. langroid/utils/.chainlit/config.toml +0 -121
  131. langroid/utils/.chainlit/translations/en-US.json +0 -231
  132. langroid/utils/__init__.py +0 -19
  133. langroid/utils/algorithms/__init__.py +0 -3
  134. langroid/utils/algorithms/graph.py +0 -103
  135. langroid/utils/configuration.py +0 -98
  136. langroid/utils/constants.py +0 -30
  137. langroid/utils/docker.py +0 -37
  138. langroid/utils/git_utils.py +0 -252
  139. langroid/utils/globals.py +0 -49
  140. langroid/utils/llms/__init__.py +0 -0
  141. langroid/utils/llms/strings.py +0 -8
  142. langroid/utils/logging.py +0 -135
  143. langroid/utils/object_registry.py +0 -66
  144. langroid/utils/output/__init__.py +0 -20
  145. langroid/utils/output/citations.py +0 -41
  146. langroid/utils/output/printing.py +0 -99
  147. langroid/utils/output/status.py +0 -40
  148. langroid/utils/pandas_utils.py +0 -30
  149. langroid/utils/pydantic_utils.py +0 -602
  150. langroid/utils/system.py +0 -286
  151. langroid/utils/types.py +0 -93
  152. langroid/utils/web/__init__.py +0 -0
  153. langroid/utils/web/login.py +0 -83
  154. langroid/vector_store/__init__.py +0 -50
  155. langroid/vector_store/base.py +0 -357
  156. langroid/vector_store/chromadb.py +0 -214
  157. langroid/vector_store/lancedb.py +0 -401
  158. langroid/vector_store/meilisearch.py +0 -299
  159. langroid/vector_store/momento.py +0 -278
  160. langroid/vector_store/qdrant_cloud.py +0 -6
  161. langroid/vector_store/qdrantdb.py +0 -468
  162. langroid-0.31.1.dist-info/RECORD +0 -162
  163. {langroid-0.31.1.dist-info → langroid-0.33.3.dist-info/licenses}/LICENSE +0 -0
@@ -1,790 +0,0 @@
1
- import itertools
2
- import json
3
- import logging
4
- import os
5
- import subprocess
6
- import tempfile
7
- import time
8
- from collections import deque
9
- from pathlib import Path
10
- from typing import Any, Dict, List, Optional, Tuple, Union
11
- from urllib.parse import urlparse
12
-
13
- from dotenv import load_dotenv
14
- from github import Github
15
- from github.ContentFile import ContentFile
16
- from github.Label import Label
17
- from github.Repository import Repository
18
-
19
- from langroid.mytypes import DocMetaData, Document
20
- from langroid.parsing.document_parser import DocumentParser, DocumentType
21
- from langroid.parsing.parser import Parser, ParsingConfig
22
- from langroid.pydantic_v1 import BaseModel, BaseSettings, Field
23
-
24
- logger = logging.getLogger(__name__)
25
-
26
-
27
- def _get_decoded_content(content_file: ContentFile) -> str:
28
- if content_file.encoding == "base64":
29
- return content_file.decoded_content.decode("utf-8") or ""
30
- elif content_file.encoding == "none":
31
- return content_file.content or ""
32
- else:
33
- raise ValueError(f"Unsupported encoding: {content_file.encoding}")
34
-
35
-
36
- def _has_files(directory: str) -> bool:
37
- """
38
- Recursively checks if there is at least one file in a directory.
39
- """
40
- for dirpath, dirnames, filenames in os.walk(directory):
41
- if filenames:
42
- return True
43
- return False
44
-
45
-
46
# Pydantic model for GitHub issue data
class IssueData(BaseModel):
    # All fields are declared required via Field(...); `assignee` and `size`
    # are Optional in type only, i.e. they must be passed but may be None.
    state: str = Field(..., description="State of issue e.g. open or closed")
    year: int = Field(..., description="Year issue was created")
    month: int = Field(..., description="Month issue was created")
    day: int = Field(..., description="Day issue was created")
    assignee: Optional[str] = Field(..., description="Assignee of issue")
    size: Optional[str] = Field(..., description="Size of issue, e.g. XS, S, M, L, XXL")
    text: str = Field(..., description="Text of issue, i.e. description body")
55
-
56
-
57
def get_issue_size(labels: List[Label]) -> str | None:
    """Return the first label name that is a size marker (XS..XXL), else None."""
    size_names = ["XS", "S", "M", "L", "XL", "XXL"]
    for label in labels:
        if label.name in size_names:
            return label.name
    return None
60
-
61
-
62
class RepoLoaderConfig(BaseSettings):
    """
    Configuration for RepoLoader.
    """

    # File types treated as non-code (documentation/plain text); everything
    # else in `file_types` is considered code by RepoLoader._is_code.
    non_code_types: List[str] = [
        "md",
        "txt",
        "text",
    ]

    # File types (extensions, or full names for extension-less files like
    # "Makefile") that RepoLoader will include when walking a repo.
    file_types: List[str] = [
        "py",
        "md",
        "yml",
        "yaml",
        "txt",
        "text",
        "sh",
        "ini",
        "toml",
        "cfg",
        "json",
        "rst",
        "Makefile",
        "Dockerfile",
    ]

    # Directory (and dot-file) names skipped entirely during traversal.
    exclude_dirs: List[str] = [
        ".gitignore",
        ".gitmodules",
        ".gitattributes",
        ".git",
        ".idea",
        ".vscode",
        ".circleci",
    ]
99
-
100
-
101
class RepoLoader:
    """
    Class for recursively getting all file content in a repo.
    """

    # NOTE(review): the default `config` is a single shared instance created
    # at import time (mutable default) — confirm callers never mutate it.
    def __init__(
        self,
        url: str,
        config: RepoLoaderConfig = RepoLoaderConfig(),
    ):
        """
        Args:
            url: full github url of repo, or just "owner/repo"
            config: configuration for RepoLoader
        """
        self.url = url
        self.config = config
        self.clone_path: Optional[str] = None
        # Persistent JSON log mapping repo url -> local clone path, so a
        # previously-cloned repo can be reused across runs.
        self.log_file = ".logs/repo_loader/download_log.json"
        os.makedirs(os.path.dirname(self.log_file), exist_ok=True)
        if not os.path.exists(self.log_file):
            # seed the log with a dummy entry so json.load below always succeeds
            with open(self.log_file, "w") as f:
                json.dump({"junk": "ignore"}, f)
        with open(self.log_file, "r") as f:
            log = json.load(f)
        if self.url in log and os.path.exists(log[self.url]):
            logger.info(f"Repo Already downloaded in {log[self.url]}")
            self.clone_path = log[self.url]

        # Accept either a full URL ("https://github.com/owner/repo") or a
        # bare "owner/repo" spec.
        if "github.com" in self.url:
            repo_name = self.url.split("github.com/")[1]
        else:
            repo_name = self.url
        load_dotenv()
        # authenticated calls to github api have higher rate limit
        token = os.getenv("GITHUB_ACCESS_TOKEN")
        g = Github(token)
        self.repo = self._get_repo_with_retry(g, repo_name)
139
-
140
- @staticmethod
141
- def _get_repo_with_retry(
142
- g: Github, repo_name: str, max_retries: int = 5
143
- ) -> Repository:
144
- """
145
- Get a repo from the GitHub API, retrying if the request fails,
146
- with exponential backoff.
147
-
148
- Args:
149
- g: GitHub object
150
- repo_name: name of repo
151
- max_retries: maximum number of retries
152
- Returns:
153
- Repo: GitHub repo object
154
-
155
- """
156
- base_delay = 2 # base delay in seconds
157
- max_delay = 60 # maximum delay in seconds
158
-
159
- for attempt in range(max_retries):
160
- try:
161
- return g.get_repo(repo_name)
162
- except Exception as e:
163
- delay = min(max_delay, base_delay * 2**attempt)
164
- logger.info(
165
- f"Attempt {attempt+1} failed with error: {str(e)}. "
166
- f"Retrying in {delay} seconds..."
167
- )
168
- time.sleep(delay)
169
- raise Exception(f"Failed to get repo {repo_name} after {max_retries} attempts.")
170
-
171
- def _get_dir_name(self) -> str:
172
- return urlparse(self.url).path.replace("/", "_")
173
-
174
    def get_issues(self, k: int | None = 100) -> List[IssueData]:
        """Get up to k issues from the GitHub repo.

        Args:
            k: max number of issues to fetch; None fetches all issues
                (open and closed).
        Returns:
            List[IssueData]: one record per issue.
        """
        if k is None:
            issues = self.repo.get_issues(state="all")
        else:
            # slicing the paginated result limits how many pages are fetched
            issues = self.repo.get_issues(state="all")[:k]
        issue_data_list = []
        for issue in issues:
            issue_data = IssueData(
                state=issue.state,
                year=issue.created_at.year,
                month=issue.created_at.month,
                day=issue.created_at.day,
                # login name only; None when the issue is unassigned
                assignee=issue.assignee.login if issue.assignee else None,
                size=get_issue_size(issue.labels),
                text=issue.body or "No issue description body.",
            )
            issue_data_list.append(issue_data)

        return issue_data_list
194
-
195
- @staticmethod
196
- def _file_type(name: str) -> str:
197
- """
198
- Get the file type of a file name.
199
- Args:
200
- name: name of file, can be "a", "a.b", or ".b"
201
- Returns:
202
- str: file type; "a" => "a", "a.b" => "b", ".b" => "b"
203
- some examples:
204
- "Makefile" => "Makefile",
205
- "script.py" => "py",
206
- ".gitignore" => "gitignore"
207
- """
208
- # "a" -> ("a", ""), "a.b" -> ("a", ".b"), ".b" -> (".b", "")
209
- file_parts = os.path.splitext(name)
210
- if file_parts[1] == "":
211
- file_type = file_parts[0] # ("a", "") => "a"
212
- else:
213
- file_type = file_parts[1][1:] # (*,".b") => "b"
214
- return file_type
215
-
216
- def _is_code(self, file_type: str) -> bool:
217
- """
218
- Check if a file type is code.
219
-
220
- Args:
221
- file_type: file type, e.g. "py", "md", "txt"
222
- Returns:
223
- bool: whether file type is code
224
- """
225
- return file_type not in self.config.non_code_types
226
-
227
- def _is_allowed(self, content: ContentFile) -> bool:
228
- """
229
- Check if a file or directory content is allowed to be included.
230
-
231
- Args:
232
- content (ContentFile): The file or directory Content object.
233
-
234
- Returns:
235
- bool: Whether the file or directory is allowed to be included.
236
- """
237
- if content.type == "dir":
238
- return content.name not in self.config.exclude_dirs
239
- elif content.type == "file":
240
- return self._file_type(content.name) in self.config.file_types
241
- else:
242
- return False
243
-
244
- def default_clone_path(self) -> str:
245
- return tempfile.mkdtemp(suffix=self._get_dir_name())
246
-
247
    def clone(self, path: Optional[str] = None) -> Optional[str]:
        """
        Clone a GitHub repository to a local directory specified by `path`,
        if it has not already been cloned.

        Args:
            path (str): The local directory where the repository should be cloned.
                If not specified, a temporary directory will be created.

        Returns:
            str: The path to the local directory where the repository was cloned,
                or None if cloning failed before any path was chosen.
        """
        # Consult the persistent download log: reuse a prior clone if its
        # directory still exists and actually contains files.
        with open(self.log_file, "r") as f:
            log: Dict[str, str] = json.load(f)

        if (
            self.url in log
            and os.path.exists(log[self.url])
            and _has_files(log[self.url])
        ):
            logger.warning(f"Repo Already downloaded in {log[self.url]}")
            self.clone_path = log[self.url]
            return self.clone_path

        self.clone_path = path
        if path is None:
            path = self.default_clone_path()
            self.clone_path = path

        try:
            subprocess.run(["git", "clone", self.url, path], check=True)
            # record the successful clone so later loads can reuse it
            log[self.url] = path
            with open(self.log_file, "w") as f:
                json.dump(log, f)
            return self.clone_path
        except subprocess.CalledProcessError as e:
            logger.error(f"Git clone failed: {e}")
        except Exception as e:
            logger.error(f"An error occurred while trying to clone the repository:{e}")

        # on failure we still return the chosen path (clone dir may be empty)
        return self.clone_path
288
-
289
    def load_tree_from_github(
        self, depth: int, lines: int = 0
    ) -> Dict[str, Union[str, List[Dict[str, Any]]]]:
        """
        Get a nested dictionary of GitHub repository file and directory names
        up to a certain depth, with file contents.

        Args:
            depth (int): The depth level.
            lines (int): The number of lines of file contents to include
                (0 => empty content strings).

        Returns:
            Dict[str, Union[str, List[Dict]]]:
                A dictionary containing file and directory names, with file contents.
        """
        root_contents = self.repo.get_contents("")
        # get_contents returns a single item for a file path; normalize to list
        if not isinstance(root_contents, list):
            root_contents = [root_contents]
        repo_structure = {
            "type": "dir",
            "name": "",
            "dirs": [],
            "files": [],
            "path": "",
        }

        # A queue of tuples (current_node, current_depth, parent_structure)
        queue = deque([(root_contents, 0, repo_structure)])

        # BFS over the repo tree, attaching each allowed dir/file to its parent
        while queue:
            current_node, current_depth, parent_structure = queue.popleft()

            for content in current_node:
                # skip excluded dirs / disallowed file types
                if not self._is_allowed(content):
                    continue
                if content.type == "dir" and current_depth < depth:
                    # Create a new sub-dictionary for this directory
                    new_dir = {
                        "type": "dir",
                        "name": content.name,
                        "dirs": [],
                        "files": [],
                        "path": content.path,
                    }
                    parent_structure["dirs"].append(new_dir)
                    contents = self.repo.get_contents(content.path)
                    if not isinstance(contents, list):
                        contents = [contents]
                    queue.append(
                        (
                            contents,
                            current_depth + 1,
                            new_dir,
                        )
                    )
                elif content.type == "file":
                    # keep only the first `lines` lines of the decoded file
                    file_content = "\n".join(
                        _get_decoded_content(content).splitlines()[:lines]
                    )
                    file_dict = {
                        "type": "file",
                        "name": content.name,
                        "content": file_content,
                        "path": content.path,
                    }
                    parent_structure["files"].append(file_dict)

        return repo_structure
357
-
358
    def load(
        self,
        path: Optional[str] = None,
        depth: int = 3,
        lines: int = 0,
    ) -> Tuple[Dict[str, Union[str, List[Dict[str, Any]]]], List[Document]]:
        """
        From a local folder `path` (if None, the repo clone path), get:
          a nested dictionary (tree) of dicts, files and contents
          a list of Document objects for each file.

        Args:
            path (str): The local folder path; if none, use self.clone_path()
            depth (int): The depth level.
            lines (int): The number of lines of file contents to include.

        Returns:
            Tuple of (dict, List_of_Documents):
                A dictionary containing file and directory names, with file
                contents, and a list of Document objects for each file.

        Raises:
            ValueError: if no path was given and cloning failed to produce one.
        """
        if path is None:
            # clone (or re-clone) when there is no usable local copy yet
            if self.clone_path is None or not _has_files(self.clone_path):
                self.clone()
            path = self.clone_path
        if path is None:
            raise ValueError("Unable to clone repo")
        return self.load_from_folder(
            path=path,
            depth=depth,
            lines=lines,
            file_types=self.config.file_types,
            exclude_dirs=self.config.exclude_dirs,
            url=self.url,
        )
393
-
394
    @staticmethod
    def load_from_folder(
        path: str,
        depth: int = 3,
        lines: int = 0,
        file_types: Optional[List[str]] = None,
        exclude_dirs: Optional[List[str]] = None,
        url: str = "",
    ) -> Tuple[Dict[str, Union[str, List[Dict[str, Any]]]], List[Document]]:
        """
        From a local folder `path` (required), get:
          a nested dictionary (tree) of dicts, files and contents, restricting to
          desired file_types and excluding undesired directories.
          a list of Document objects for each file.

        Args:
            path (str): The local folder path, required.
            depth (int): The depth level. Optional, default 3.
            lines (int): The number of lines of file contents to include.
                Optional, default 0 (no lines => empty string).
            file_types (List[str]): The file types to include.
                Optional, default None (all).
            exclude_dirs (List[str]): The directories to exclude.
                Optional, default None (no exclusions).
            url (str): Optional url, to be stored in docs as metadata. Default "".

        Returns:
            Tuple of (dict, List_of_Documents):
                A dictionary containing file and directory names, with file contents.
                A list of Document objects for each file.
        """

        folder_structure = {
            "type": "dir",
            "name": "",
            "dirs": [],
            "files": [],
            "path": "",
        }
        # A queue of tuples (current_path, current_depth, parent_structure)
        queue = deque([(path, 0, folder_structure)])
        docs = []
        exclude_dirs = exclude_dirs or []
        # BFS over the local tree, mirroring it in folder_structure and
        # collecting one Document per non-empty file.
        while queue:
            current_path, current_depth, parent_structure = queue.popleft()

            for item in os.listdir(current_path):
                item_path = os.path.join(current_path, item)
                relative_path = os.path.relpath(item_path, path)
                # skip excluded dirs, and (when file_types given) unwanted files
                if (os.path.isdir(item_path) and item in exclude_dirs) or (
                    os.path.isfile(item_path)
                    and file_types is not None
                    and RepoLoader._file_type(item) not in file_types
                ):
                    continue

                if os.path.isdir(item_path) and current_depth < depth:
                    # Create a new sub-dictionary for this directory
                    new_dir = {
                        "type": "dir",
                        "name": item,
                        "dirs": [],
                        "files": [],
                        "path": relative_path,
                    }
                    parent_structure["dirs"].append(new_dir)
                    queue.append((item_path, current_depth + 1, new_dir))
                elif os.path.isfile(item_path):
                    # Add the file to the current dictionary.
                    # NOTE(review): with the default lines=0, islice yields
                    # nothing, so file_content == "" and the file is skipped.
                    with open(item_path, "r") as f:
                        file_lines = list(itertools.islice(f, lines))
                    file_content = "\n".join(line.strip() for line in file_lines)
                    if file_content == "":
                        continue

                    file_dict = {
                        "type": "file",
                        "name": item,
                        "content": file_content,
                        "path": relative_path,
                    }
                    parent_structure["files"].append(file_dict)
                    docs.append(
                        Document(
                            content=file_content,
                            metadata=DocMetaData(
                                repo=url,
                                source=relative_path,
                                url=url,
                                filename=item,
                                extension=RepoLoader._file_type(item),
                                language=RepoLoader._file_type(item),
                            ),
                        )
                    )
        return folder_structure, docs
490
-
491
    @staticmethod
    def get_documents(
        path: str | bytes,
        parser: Parser = Parser(ParsingConfig()),
        file_types: Optional[List[str]] = None,
        exclude_dirs: Optional[List[str]] = None,
        depth: int = -1,
        lines: Optional[int] = None,
        doc_type: str | DocumentType | None = None,
    ) -> List[Document]:
        """
        Recursively get all files under a path as Document objects.

        Args:
            path (str|bytes): The path to the directory or file, or bytes content.
                The bytes option is meant to support the case where the content
                has already been read from a file in an upstream process
                (e.g. from an API or a database), and we want to avoid having to
                write it to a temporary file just to read it again.
                (which can be very slow for large files,
                especially in a docker container)
            parser (Parser): Parser to use to parse files.
            file_types (List[str], optional): List of file extensions OR
                filenames OR file_path_names to include.
                Defaults to None, which includes all files.
            exclude_dirs (List[str], optional): List of directories to exclude.
                Defaults to None, which includes all directories.
            depth (int, optional): Max depth of recursion. Defaults to -1,
                which includes all depths.
            lines (int, optional): Number of lines to read from each file.
                Defaults to None, which reads all lines.
            doc_type (str|DocumentType, optional): The type of document to parse.

        Returns:
            List[Document]: List of Document objects representing files.

        """
        docs = []
        file_paths = []
        if isinstance(path, bytes):
            # raw bytes: hand directly to the parser, no filesystem walk
            file_paths.append(path)
        else:
            path_obj = Path(path).resolve()

            if path_obj.is_file():
                file_paths.append(str(path_obj))
            else:
                path_depth = len(path_obj.parts)
                for root, dirs, files in os.walk(path):
                    # Exclude directories if needed (in-place prune so
                    # os.walk does not descend into them)
                    if exclude_dirs:
                        dirs[:] = [d for d in dirs if d not in exclude_dirs]

                    current_depth = len(Path(root).resolve().parts) - path_depth
                    if depth == -1 or current_depth <= depth:
                        for file in files:
                            file_path = str(Path(root) / file)
                            # a file matches by extension, basename, or full path
                            if (
                                file_types is None
                                or RepoLoader._file_type(file_path) in file_types
                                or os.path.basename(file_path) in file_types
                                or file_path in file_types
                            ):
                                file_paths.append(file_path)

        for file_path in file_paths:
            docs.extend(
                DocumentParser.chunks_from_path_or_bytes(
                    file_path,
                    parser,
                    doc_type=doc_type,
                    lines=lines,
                )
            )
        return docs
566
-
567
    def load_docs_from_github(
        self,
        k: Optional[int] = None,
        depth: Optional[int] = None,
        lines: Optional[int] = None,
    ) -> List[Document]:
        """
        Directly from GitHub, recursively get all files in a repo that have one of the
        extensions, possibly up to a max number of files, max depth, and max number
        of lines per file (if any of these are specified).

        Args:
            k (int): max number of files to load, or None for all files
            depth (int): max depth to recurse, or None for infinite depth
            lines (int): max number of lines to get, from a file, or None for all lines

        Returns:
            list of Document objects, each has fields `content` and `metadata`,
            and `metadata` has fields `url`, `filename`, `extension`, `language`
        """
        contents = self.repo.get_contents("")
        if not isinstance(contents, list):
            contents = [contents]
        stack = list(zip(contents, [0] * len(contents)))  # stack of (content, depth)
        # recursively get all files in repo that have one of the extensions
        docs = []
        i = 0  # count of files collected so far (for the k limit)

        # DFS via explicit stack
        while stack:
            if k is not None and i == k:
                break
            file_content, d = stack.pop()
            if not self._is_allowed(file_content):
                continue
            if file_content.type == "dir":
                if depth is None or d <= depth:
                    items = self.repo.get_contents(file_content.path)
                    if not isinstance(items, list):
                        items = [items]
                    stack.extend(list(zip(items, [d + 1] * len(items))))
            else:
                if depth is None or d <= depth:
                    # need to decode the file content, which is in bytes
                    contents = self.repo.get_contents(file_content.path)
                    if isinstance(contents, list):
                        contents = contents[0]
                    text = _get_decoded_content(contents)
                    if lines is not None:
                        text = "\n".join(text.split("\n")[:lines])
                    i += 1

                    # Note `source` is important, it may be used to cite
                    # evidence for an answer.
                    # See URLLoader
                    # TODO we should use Pydantic to enforce/standardize this

                    docs.append(
                        Document(
                            content=text,
                            metadata=DocMetaData(
                                repo=self.url,
                                source=file_content.html_url,
                                url=file_content.html_url,
                                filename=file_content.name,
                                extension=self._file_type(file_content.name),
                                language=self._file_type(file_content.name),
                            ),
                        )
                    )
        return docs
637
-
638
- @staticmethod
639
- def select(
640
- structure: Dict[str, Union[str, List[Dict[str, Any]]]],
641
- includes: List[str],
642
- excludes: List[str] = [],
643
- ) -> Dict[str, Union[str, List[Dict[str, Any]]]]:
644
- """
645
- Filter a structure dictionary for certain directories and files.
646
-
647
- Args:
648
- structure (Dict[str, Union[str, List[Dict]]]): The structure dictionary.
649
- includes (List[str]): A list of desired directories and files.
650
- For files, either full file names or "file type" can be specified.
651
- E.g. "toml" will include all files with the ".toml" extension,
652
- or "Makefile" will include all files named "Makefile".
653
- excludes (List[str]): A list of directories and files to exclude.
654
- Similar to `includes`, full file/dir names or "file type" can be
655
- specified. Optional, defaults to empty list.
656
-
657
-
658
- Returns:
659
- Dict[str, Union[str, List[Dict]]]: The filtered structure dictionary.
660
- """
661
- filtered_structure = {
662
- "type": structure["type"],
663
- "name": structure["name"],
664
- "dirs": [],
665
- "files": [],
666
- "path": structure["path"],
667
- }
668
-
669
- for dir in structure["dirs"]:
670
- if (
671
- dir["name"] in includes
672
- or RepoLoader._file_type(dir["name"]) in includes
673
- ) and (
674
- dir["name"] not in excludes
675
- and RepoLoader._file_type(dir["name"]) not in excludes
676
- ):
677
- # If the directory is in the select list, include the whole subtree
678
- filtered_structure["dirs"].append(dir)
679
- else:
680
- # Otherwise, filter the directory's contents
681
- filtered_dir = RepoLoader.select(dir, includes)
682
- if (
683
- filtered_dir["dirs"] or filtered_dir["files"]
684
- ): # only add if not empty
685
- filtered_structure["dirs"].append(filtered_dir)
686
-
687
- for file in structure["files"]:
688
- if (
689
- file["name"] in includes
690
- or RepoLoader._file_type(file["name"]) in includes
691
- ) and (
692
- file["name"] not in excludes
693
- and RepoLoader._file_type(file["name"]) not in excludes
694
- ):
695
- filtered_structure["files"].append(file)
696
-
697
- return filtered_structure
698
-
699
- @staticmethod
700
- def ls(structure: Dict[str, Union[str, List[Dict]]], depth: int = 0) -> List[str]:
701
- """
702
- Get a list of names of files or directories up to a certain depth from a
703
- structure dictionary.
704
-
705
- Args:
706
- structure (Dict[str, Union[str, List[Dict]]]): The structure dictionary.
707
- depth (int, optional): The depth level. Defaults to 0.
708
-
709
- Returns:
710
- List[str]: A list of names of files or directories.
711
- """
712
- names = []
713
-
714
- # A queue of tuples (current_structure, current_depth)
715
- queue = deque([(structure, 0)])
716
-
717
- while queue:
718
- current_structure, current_depth = queue.popleft()
719
-
720
- if current_depth <= depth:
721
- names.append(current_structure["name"])
722
-
723
- for dir in current_structure["dirs"]:
724
- queue.append((dir, current_depth + 1))
725
-
726
- for file in current_structure["files"]:
727
- # add file names only if depth is less than the limit
728
- if current_depth < depth:
729
- names.append(file["name"])
730
- names = [n for n in names if n not in ["", None]]
731
- return names
732
-
733
- @staticmethod
734
- def list_files(
735
- dir: str,
736
- depth: int = 1,
737
- include_types: List[str] = [],
738
- exclude_types: List[str] = [],
739
- ) -> List[str]:
740
- """
741
- Recursively list all files in a directory, up to a certain depth.
742
-
743
- Args:
744
- dir (str): The directory path, relative to root.
745
- depth (int, optional): The depth level. Defaults to 1.
746
- include_types (List[str], optional): A list of file types to include.
747
- Defaults to empty list.
748
- exclude_types (List[str], optional): A list of file types to exclude.
749
- Defaults to empty list.
750
- Returns:
751
- List[str]: A list of file names.
752
- """
753
- depth = depth if depth >= 0 else 200
754
- output = []
755
-
756
- for root, dirs, files in os.walk(dir):
757
- if root.count(os.sep) - dir.count(os.sep) < depth:
758
- level = root.count(os.sep) - dir.count(os.sep)
759
- sub_indent = " " * 4 * (level + 1)
760
- for d in dirs:
761
- output.append("{}{}/".format(sub_indent, d))
762
- for f in files:
763
- if include_types and RepoLoader._file_type(f) not in include_types:
764
- continue
765
- if exclude_types and RepoLoader._file_type(f) in exclude_types:
766
- continue
767
- output.append("{}{}".format(sub_indent, f))
768
- return output
769
-
770
    @staticmethod
    def show_file_contents(tree: Dict[str, Union[str, List[Dict[str, Any]]]]) -> str:
        """
        Print the contents of all files from a structure dictionary.

        Recurses depth-first through subdirectories, concatenating each
        file's path header and content into one string.

        Args:
            tree (Dict[str, Union[str, List[Dict]]]): The structure dictionary.
        Returns:
            str: concatenated contents of all files in the tree.
        """
        contents = ""
        for dir in tree["dirs"]:
            contents += RepoLoader.show_file_contents(dir)
        for file in tree["files"]:
            path = file["path"]
            # the literal's leading whitespace is part of the output format
            contents += f"""
            {path}:
            --------------------
            {file["content"]}

            """

        return contents