langroid 0.58.2__py3-none-any.whl → 0.59.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. langroid/agent/base.py +39 -17
  2. langroid/agent/base.py-e +2216 -0
  3. langroid/agent/callbacks/chainlit.py +2 -1
  4. langroid/agent/chat_agent.py +73 -55
  5. langroid/agent/chat_agent.py-e +2086 -0
  6. langroid/agent/chat_document.py +7 -7
  7. langroid/agent/chat_document.py-e +513 -0
  8. langroid/agent/openai_assistant.py +9 -9
  9. langroid/agent/openai_assistant.py-e +882 -0
  10. langroid/agent/special/arangodb/arangodb_agent.py +10 -18
  11. langroid/agent/special/arangodb/arangodb_agent.py-e +648 -0
  12. langroid/agent/special/arangodb/tools.py +3 -3
  13. langroid/agent/special/doc_chat_agent.py +16 -14
  14. langroid/agent/special/lance_rag/critic_agent.py +2 -2
  15. langroid/agent/special/lance_rag/query_planner_agent.py +4 -4
  16. langroid/agent/special/lance_tools.py +6 -5
  17. langroid/agent/special/lance_tools.py-e +61 -0
  18. langroid/agent/special/neo4j/neo4j_chat_agent.py +3 -7
  19. langroid/agent/special/neo4j/neo4j_chat_agent.py-e +430 -0
  20. langroid/agent/special/relevance_extractor_agent.py +1 -1
  21. langroid/agent/special/sql/sql_chat_agent.py +11 -3
  22. langroid/agent/task.py +9 -87
  23. langroid/agent/task.py-e +2418 -0
  24. langroid/agent/tool_message.py +33 -17
  25. langroid/agent/tool_message.py-e +400 -0
  26. langroid/agent/tools/file_tools.py +4 -2
  27. langroid/agent/tools/file_tools.py-e +234 -0
  28. langroid/agent/tools/mcp/fastmcp_client.py +19 -6
  29. langroid/agent/tools/mcp/fastmcp_client.py-e +584 -0
  30. langroid/agent/tools/orchestration.py +22 -17
  31. langroid/agent/tools/orchestration.py-e +301 -0
  32. langroid/agent/tools/recipient_tool.py +3 -3
  33. langroid/agent/tools/task_tool.py +22 -16
  34. langroid/agent/tools/task_tool.py-e +249 -0
  35. langroid/agent/xml_tool_message.py +90 -35
  36. langroid/agent/xml_tool_message.py-e +392 -0
  37. langroid/cachedb/base.py +1 -1
  38. langroid/embedding_models/base.py +2 -2
  39. langroid/embedding_models/models.py +3 -7
  40. langroid/embedding_models/models.py-e +563 -0
  41. langroid/exceptions.py +4 -1
  42. langroid/language_models/azure_openai.py +2 -2
  43. langroid/language_models/azure_openai.py-e +134 -0
  44. langroid/language_models/base.py +6 -4
  45. langroid/language_models/base.py-e +812 -0
  46. langroid/language_models/client_cache.py +64 -0
  47. langroid/language_models/config.py +2 -4
  48. langroid/language_models/config.py-e +18 -0
  49. langroid/language_models/model_info.py +9 -1
  50. langroid/language_models/model_info.py-e +483 -0
  51. langroid/language_models/openai_gpt.py +119 -20
  52. langroid/language_models/openai_gpt.py-e +2280 -0
  53. langroid/language_models/provider_params.py +3 -22
  54. langroid/language_models/provider_params.py-e +153 -0
  55. langroid/mytypes.py +11 -4
  56. langroid/mytypes.py-e +132 -0
  57. langroid/parsing/code_parser.py +1 -1
  58. langroid/parsing/file_attachment.py +1 -1
  59. langroid/parsing/file_attachment.py-e +246 -0
  60. langroid/parsing/md_parser.py +14 -4
  61. langroid/parsing/md_parser.py-e +574 -0
  62. langroid/parsing/parser.py +22 -7
  63. langroid/parsing/parser.py-e +410 -0
  64. langroid/parsing/repo_loader.py +3 -1
  65. langroid/parsing/repo_loader.py-e +812 -0
  66. langroid/parsing/search.py +1 -1
  67. langroid/parsing/url_loader.py +17 -51
  68. langroid/parsing/url_loader.py-e +683 -0
  69. langroid/parsing/urls.py +5 -4
  70. langroid/parsing/urls.py-e +279 -0
  71. langroid/prompts/prompts_config.py +1 -1
  72. langroid/pydantic_v1/__init__.py +45 -6
  73. langroid/pydantic_v1/__init__.py-e +36 -0
  74. langroid/pydantic_v1/main.py +11 -4
  75. langroid/pydantic_v1/main.py-e +11 -0
  76. langroid/utils/configuration.py +13 -11
  77. langroid/utils/configuration.py-e +141 -0
  78. langroid/utils/constants.py +1 -1
  79. langroid/utils/constants.py-e +32 -0
  80. langroid/utils/globals.py +21 -5
  81. langroid/utils/globals.py-e +49 -0
  82. langroid/utils/html_logger.py +2 -1
  83. langroid/utils/html_logger.py-e +825 -0
  84. langroid/utils/object_registry.py +1 -1
  85. langroid/utils/object_registry.py-e +66 -0
  86. langroid/utils/pydantic_utils.py +55 -28
  87. langroid/utils/pydantic_utils.py-e +602 -0
  88. langroid/utils/types.py +2 -2
  89. langroid/utils/types.py-e +113 -0
  90. langroid/vector_store/base.py +3 -3
  91. langroid/vector_store/lancedb.py +5 -5
  92. langroid/vector_store/lancedb.py-e +404 -0
  93. langroid/vector_store/meilisearch.py +2 -2
  94. langroid/vector_store/pineconedb.py +4 -4
  95. langroid/vector_store/pineconedb.py-e +427 -0
  96. langroid/vector_store/postgres.py +1 -1
  97. langroid/vector_store/qdrantdb.py +3 -3
  98. langroid/vector_store/weaviatedb.py +1 -1
  99. {langroid-0.58.2.dist-info → langroid-0.59.0b1.dist-info}/METADATA +3 -2
  100. langroid-0.59.0b1.dist-info/RECORD +181 -0
  101. langroid/agent/special/doc_chat_task.py +0 -0
  102. langroid/mcp/__init__.py +0 -1
  103. langroid/mcp/server/__init__.py +0 -1
  104. langroid-0.58.2.dist-info/RECORD +0 -145
  105. {langroid-0.58.2.dist-info → langroid-0.59.0b1.dist-info}/WHEEL +0 -0
  106. {langroid-0.58.2.dist-info → langroid-0.59.0b1.dist-info}/licenses/LICENSE +0 -0
--- /dev/null
+++ langroid/parsing/repo_loader.py-e
@@ -0,0 +1,812 @@
+import itertools
+import json
+import logging
+import os
+import subprocess
+import tempfile
+import time
+from collections import deque
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
+from urllib.parse import urlparse
+
+from dotenv import load_dotenv
+
+if TYPE_CHECKING:
+    from github import Github
+    from github.ContentFile import ContentFile
+    from github.Label import Label
+    from github.Repository import Repository
+
+from pydantic_settings import BaseSettings
+
+from langroid.mytypes import DocMetaData, Document
+from langroid.parsing.document_parser import DocumentParser, DocumentType
+from langroid.parsing.parser import Parser, ParsingConfig
+from pydantic import BaseModel, Field
+
+logger = logging.getLogger(__name__)
+
+
+def _get_decoded_content(content_file: "ContentFile") -> str:
+    if content_file.encoding == "base64":
+        return content_file.decoded_content.decode("utf-8") or ""
+    elif content_file.encoding == "none":
+        return content_file.content or ""
+    else:
+        raise ValueError(f"Unsupported encoding: {content_file.encoding}")
+
+
+def _has_files(directory: str) -> bool:
+    """
+    Recursively checks if there is at least one file in a directory.
+    """
+    for dirpath, dirnames, filenames in os.walk(directory):
+        if filenames:
+            return True
+    return False
+
+
+# Pydantic model for GitHub issue data
+class IssueData(BaseModel):
+    state: str = Field(..., description="State of issue e.g. open or closed")
+    year: int = Field(..., description="Year issue was created")
+    month: int = Field(..., description="Month issue was created")
+    day: int = Field(..., description="Day issue was created")
+    assignee: Optional[str] = Field(..., description="Assignee of issue")
+    size: Optional[str] = Field(..., description="Size of issue, e.g. XS, S, M, L, XXL")
+    text: str = Field(..., description="Text of issue, i.e. description body")
+
+
+def get_issue_size(labels: List["Label"]) -> str | None:
+    sizes = ["XS", "S", "M", "L", "XL", "XXL"]
+    return next((label.name for label in labels if label.name in sizes), None)
+
+
+class RepoLoaderConfig(BaseSettings):
+    """
+    Configuration for RepoLoader.
+    """
+
+    non_code_types: List[str] = [
+        "md",
+        "txt",
+        "text",
+    ]
+
+    file_types: List[str] = [
+        "py",
+        "md",
+        "yml",
+        "yaml",
+        "txt",
+        "text",
+        "sh",
+        "ini",
+        "toml",
+        "cfg",
+        "json",
+        "rst",
+        "Makefile",
+        "Dockerfile",
+    ]
+
+    exclude_dirs: List[str] = [
+        ".gitignore",
+        ".gitmodules",
+        ".gitattributes",
+        ".git",
+        ".idea",
+        ".vscode",
+        ".circleci",
+    ]
+
+
+class RepoLoader:
+    """
+    Class for recursively getting all file content in a repo.
+    """
+
+    def __init__(
+        self,
+        url: str,
+        config: RepoLoaderConfig = RepoLoaderConfig(),
+    ):
+        """
+        Args:
+            url: full github url of repo, or just "owner/repo"
+            config: configuration for RepoLoader
+        """
+        self.url = url
+        self.config = config
+        self.clone_path: Optional[str] = None
+        self.log_file = ".logs/repo_loader/download_log.json"
+        self.repo: Optional["Repository"] = None  # Initialize repo as Optional
+
+        os.makedirs(os.path.dirname(self.log_file), exist_ok=True)
+        if not os.path.exists(self.log_file):
+            with open(self.log_file, "w") as f:
+                json.dump({"junk": "ignore"}, f)
+        with open(self.log_file, "r") as f:
+            log = json.load(f)
+            if self.url in log and os.path.exists(log[self.url]):
+                logger.info(f"Repo already downloaded in {log[self.url]}")
+                self.clone_path = log[self.url]
+
+        # it's a core dependency, so we don't need to enclose in try/except
+        from github import Github  # Late import
+
+        load_dotenv()
+        # authenticated calls to github api have higher rate limit
+        token = os.getenv("GITHUB_ACCESS_TOKEN")
+
+        if "github.com" in self.url:
+            repo_name = self.url.split("github.com/")[1]
+        else:
+            repo_name = self.url
+
+        g = Github(token)
+        self.repo = self._get_repo_with_retry(g, repo_name)
+
+    @staticmethod
+    def _get_repo_with_retry(
+        g: "Github", repo_name: str, max_retries: int = 5
+    ) -> "Repository":
+        """
+        Get a repo from the GitHub API, retrying if the request fails,
+        with exponential backoff.
+
+        Args:
+            g: GitHub object
+            repo_name: name of repo
+            max_retries: maximum number of retries
+        Returns:
+            Repository: GitHub repo object
+
+        """
+        base_delay = 2  # base delay in seconds
+        max_delay = 60  # maximum delay in seconds
+
+        for attempt in range(max_retries):
+            try:
+                return g.get_repo(repo_name)
+            except Exception as e:
+                # exponential backoff, capped at max_delay
+                delay = min(max_delay, base_delay * 2**attempt)
+                logger.info(
+                    f"Attempt {attempt+1} failed with error: {str(e)}. "
+                    f"Retrying in {delay} seconds..."
+                )
+                time.sleep(delay)
+        raise Exception(f"Failed to get repo {repo_name} after {max_retries} attempts.")
+
+    def _get_dir_name(self) -> str:
+        return urlparse(self.url).path.replace("/", "_")
+
+    def get_issues(self, k: int | None = 100) -> List[IssueData]:
+        """Get up to k issues from the GitHub repo."""
+        if self.repo is None:
+            logger.warning("No repo found. Ensure the URL is correct.")
+            return []  # Return an empty list rather than raise an error in this case
+
+        if k is None:
+            issues = self.repo.get_issues(state="all")
+        else:
+            issues = self.repo.get_issues(state="all")[:k]
+        issue_data_list = []
+        for issue in issues:
+            issue_data = IssueData(
+                state=issue.state,
+                year=issue.created_at.year,
+                month=issue.created_at.month,
+                day=issue.created_at.day,
+                assignee=issue.assignee.login if issue.assignee else None,
+                size=get_issue_size(issue.labels),
+                text=issue.body or "No issue description body.",
+            )
+            issue_data_list.append(issue_data)
+
+        return issue_data_list
+
+    @staticmethod
+    def _file_type(name: str) -> str:
+        """
+        Get the file type of a file name.
+        Args:
+            name: name of file, can be "a", "a.b", or ".b"
+        Returns:
+            str: file type; "a" => "a", "a.b" => "b", ".b" => ".b"
+            some examples:
+            "Makefile" => "Makefile",
+            "script.py" => "py",
+            ".gitignore" => ".gitignore"
+        """
+        # "a" -> ("a", ""), "a.b" -> ("a", ".b"), ".b" -> (".b", "")
+        file_parts = os.path.splitext(name)
+        if file_parts[1] == "":
+            file_type = file_parts[0]  # ("a", "") => "a"
+        else:
+            file_type = file_parts[1][1:]  # (*, ".b") => "b"
+        return file_type
+
+    def _is_code(self, file_type: str) -> bool:
+        """
+        Check if a file type is code.
+
+        Args:
+            file_type: file type, e.g. "py", "md", "txt"
+        Returns:
+            bool: whether file type is code
+        """
+        return file_type not in self.config.non_code_types
+
+    def _is_allowed(self, content: "ContentFile") -> bool:
+        """
+        Check if a file or directory content is allowed to be included.
+
+        Args:
+            content (ContentFile): The file or directory Content object.
+
+        Returns:
+            bool: Whether the file or directory is allowed to be included.
+        """
+        if content.type == "dir":
+            return content.name not in self.config.exclude_dirs
+        elif content.type == "file":
+            return self._file_type(content.name) in self.config.file_types
+        else:
+            return False
+
+    def default_clone_path(self) -> str:
+        return tempfile.mkdtemp(suffix=self._get_dir_name())
+
+    def clone(self, path: Optional[str] = None) -> Optional[str]:
+        """
+        Clone a GitHub repository to a local directory specified by `path`,
+        if it has not already been cloned.
+
+        Args:
+            path (str): The local directory where the repository should be cloned.
+                If not specified, a temporary directory will be created.
+
+        Returns:
+            str: The path to the local directory where the repository was cloned.
+        """
+        with open(self.log_file, "r") as f:
+            log: Dict[str, str] = json.load(f)
+
+        if (
+            self.url in log
+            and os.path.exists(log[self.url])
+            and _has_files(log[self.url])
+        ):
+            logger.warning(f"Repo already downloaded in {log[self.url]}")
+            self.clone_path = log[self.url]
+            return self.clone_path
+
+        self.clone_path = path
+        if path is None:
+            path = self.default_clone_path()
+            self.clone_path = path
+
+        try:
+            subprocess.run(["git", "clone", self.url, path], check=True)
+            log[self.url] = path
+            with open(self.log_file, "w") as f:
+                json.dump(log, f)
+            return self.clone_path
+        except subprocess.CalledProcessError as e:
+            logger.error(f"Git clone failed: {e}")
+        except Exception as e:
+            logger.error(f"An error occurred while trying to clone the repository: {e}")
+
+        return self.clone_path
+
+    def load_tree_from_github(
+        self, depth: int, lines: int = 0
+    ) -> Dict[str, Union[str, List[Dict[str, Any]]]]:
+        """
+        Get a nested dictionary of GitHub repository file and directory names
+        up to a certain depth, with file contents.
+
+        Args:
+            depth (int): The depth level.
+            lines (int): The number of lines of file contents to include.
+
+        Returns:
+            Dict[str, Union[str, List[Dict]]]:
+                A dictionary containing file and directory names, with file contents.
+        """
+        if self.repo is None:
+            logger.warning("No repo found. Ensure the URL is correct.")
+            return {}  # Return an empty dict rather than raise an error in this case
+
+        root_contents = self.repo.get_contents("")
+        if not isinstance(root_contents, list):
+            root_contents = [root_contents]
+        repo_structure = {
+            "type": "dir",
+            "name": "",
+            "dirs": [],
+            "files": [],
+            "path": "",
+        }
+
+        # A queue of tuples (current_node, current_depth, parent_structure)
+        queue = deque([(root_contents, 0, repo_structure)])
+
+        while queue:
+            current_node, current_depth, parent_structure = queue.popleft()
+
+            for content in current_node:
+                if not self._is_allowed(content):
+                    continue
+                if content.type == "dir" and current_depth < depth:
+                    # Create a new sub-dictionary for this directory
+                    new_dir = {
+                        "type": "dir",
+                        "name": content.name,
+                        "dirs": [],
+                        "files": [],
+                        "path": content.path,
+                    }
+                    parent_structure["dirs"].append(new_dir)
+                    contents = self.repo.get_contents(content.path)
+                    if not isinstance(contents, list):
+                        contents = [contents]
+                    queue.append(
+                        (
+                            contents,
+                            current_depth + 1,
+                            new_dir,
+                        )
+                    )
+                elif content.type == "file":
+                    file_content = "\n".join(
+                        _get_decoded_content(content).splitlines()[:lines]
+                    )
+                    file_dict = {
+                        "type": "file",
+                        "name": content.name,
+                        "content": file_content,
+                        "path": content.path,
+                    }
+                    parent_structure["files"].append(file_dict)
+
+        return repo_structure
+
+    def load(
+        self,
+        path: Optional[str] = None,
+        depth: int = 3,
+        lines: int = 0,
+    ) -> Tuple[Dict[str, Union[str, List[Dict[str, Any]]]], List[Document]]:
+        """
+        From a local folder `path` (if None, the repo clone path), get:
+        a nested dictionary (tree) of dicts, files and contents
+        a list of Document objects for each file.
+
+        Args:
+            path (str): The local folder path; if None, use self.clone_path
+            depth (int): The depth level.
+            lines (int): The number of lines of file contents to include.
+
+        Returns:
+            Tuple of (dict, List_of_Documents):
+                A dictionary containing file and directory names, with file
+                contents, and a list of Document objects for each file.
+        """
+        if path is None:
+            if self.clone_path is None or not _has_files(self.clone_path):
+                self.clone()
+            path = self.clone_path
+            if path is None:
+                raise ValueError("Unable to clone repo")
+        return self.load_from_folder(
+            path=path,
+            depth=depth,
+            lines=lines,
+            file_types=self.config.file_types,
+            exclude_dirs=self.config.exclude_dirs,
+            url=self.url,
+        )
+
+    @staticmethod
+    def load_from_folder(
+        path: str,
+        depth: int = 3,
+        lines: int = 0,
+        file_types: Optional[List[str]] = None,
+        exclude_dirs: Optional[List[str]] = None,
+        url: str = "",
+    ) -> Tuple[Dict[str, Union[str, List[Dict[str, Any]]]], List[Document]]:
+        """
+        From a local folder `path` (required), get:
+        a nested dictionary (tree) of dicts, files and contents, restricting to
+        desired file_types and excluding undesired directories.
+        a list of Document objects for each file.
+
+        Args:
+            path (str): The local folder path, required.
+            depth (int): The depth level. Optional, default 3.
+            lines (int): The number of lines of file contents to include.
+                Optional, default 0 (no lines => empty string).
+            file_types (List[str]): The file types to include.
+                Optional, default None (all).
+            exclude_dirs (List[str]): The directories to exclude.
+                Optional, default None (no exclusions).
+            url (str): Optional url, to be stored in docs as metadata. Default "".
+
+        Returns:
+            Tuple of (dict, List_of_Documents):
+                A dictionary containing file and directory names, with file contents.
+                A list of Document objects for each file.
+        """
+
+        folder_structure = {
+            "type": "dir",
+            "name": "",
+            "dirs": [],
+            "files": [],
+            "path": "",
+        }
+        # A queue of tuples (current_path, current_depth, parent_structure)
+        queue = deque([(path, 0, folder_structure)])
+        docs = []
+        exclude_dirs = exclude_dirs or []
+        while queue:
+            current_path, current_depth, parent_structure = queue.popleft()
+
+            for item in os.listdir(current_path):
+                item_path = os.path.join(current_path, item)
+                relative_path = os.path.relpath(item_path, path)
+                if (os.path.isdir(item_path) and item in exclude_dirs) or (
+                    os.path.isfile(item_path)
+                    and file_types is not None
+                    and RepoLoader._file_type(item) not in file_types
+                ):
+                    continue
+
+                if os.path.isdir(item_path) and current_depth < depth:
+                    # Create a new sub-dictionary for this directory
+                    new_dir = {
+                        "type": "dir",
+                        "name": item,
+                        "dirs": [],
+                        "files": [],
+                        "path": relative_path,
+                    }
+                    parent_structure["dirs"].append(new_dir)
+                    queue.append((item_path, current_depth + 1, new_dir))
+                elif os.path.isfile(item_path):
+                    # Add the file to the current dictionary
+                    with open(item_path, "r") as f:
+                        file_lines = list(itertools.islice(f, lines))
+                    file_content = "\n".join(line.strip() for line in file_lines)
+                    if file_content == "":
+                        continue
+
+                    file_dict = {
+                        "type": "file",
+                        "name": item,
+                        "content": file_content,
+                        "path": relative_path,
+                    }
+                    parent_structure["files"].append(file_dict)
+                    docs.append(
+                        Document(
+                            content=file_content,
+                            metadata=DocMetaData(
+                                repo=url,
+                                source=relative_path,
+                                url=url,
+                                filename=item,
+                                extension=RepoLoader._file_type(item),
+                                language=RepoLoader._file_type(item),
+                            ),
+                        )
+                    )
+        return folder_structure, docs
+
+    @staticmethod
+    def get_documents(
+        path: str | bytes,
+        parser: Parser = Parser(ParsingConfig()),
+        file_types: Optional[List[str]] = None,
+        exclude_dirs: Optional[List[str]] = None,
+        depth: int = -1,
+        lines: Optional[int] = None,
+        doc_type: str | DocumentType | None = None,
+    ) -> List[Document]:
+        """
+        Recursively get all files under a path as Document objects.
+
+        Args:
+            path (str|bytes): The path to the directory or file, or bytes content.
+                The bytes option is meant to support the case where the content
+                has already been read from a file in an upstream process
+                (e.g. from an API or a database), and we want to avoid having to
+                write it to a temporary file just to read it again.
+                (which can be very slow for large files,
+                especially in a docker container)
+            parser (Parser): Parser to use to parse files.
+            file_types (List[str], optional): List of file extensions OR
+                filenames OR file_path_names to include.
+                Defaults to None, which includes all files.
+            exclude_dirs (List[str], optional): List of directories to exclude.
+                Defaults to None, which includes all directories.
+            depth (int, optional): Max depth of recursion. Defaults to -1,
+                which includes all depths.
+            lines (int, optional): Number of lines to read from each file.
+                Defaults to None, which reads all lines.
+            doc_type (str|DocumentType|None, optional): The type of document to parse.
+        Returns:
+            List[Document]: List of Document objects representing files.
+
+        """
+        docs = []
+        file_paths = []
+        if isinstance(path, bytes):
+            file_paths.append(path)
+        else:
+            path_obj = Path(path).resolve()
+
+            if path_obj.is_file():
+                file_paths.append(str(path_obj))
+            else:
+                path_depth = len(path_obj.parts)
+                for root, dirs, files in os.walk(path):
+                    # Exclude directories if needed
+                    if exclude_dirs:
+                        dirs[:] = [d for d in dirs if d not in exclude_dirs]
+
+                    current_depth = len(Path(root).resolve().parts) - path_depth
+                    if depth == -1 or current_depth <= depth:
+                        for file in files:
+                            file_path = str(Path(root) / file)
+                            if (
+                                file_types is None
+                                or RepoLoader._file_type(file_path) in file_types
+                                or os.path.basename(file_path) in file_types
+                                or file_path in file_types
+                            ):
+                                file_paths.append(file_path)
+
+        for file_path in file_paths:
+            docs.extend(
+                DocumentParser.chunks_from_path_or_bytes(
+                    file_path,
+                    parser,
+                    doc_type=doc_type,
+                    lines=lines,
+                )
+            )
+        return docs
+
+    def load_docs_from_github(
+        self,
+        k: Optional[int] = None,
+        depth: Optional[int] = None,
+        lines: Optional[int] = None,
+    ) -> List[Document]:
+        """
+        Directly from GitHub, recursively get all files in a repo that have one of the
+        extensions, possibly up to a max number of files, max depth, and max number
+        of lines per file (if any of these are specified).
+
+        Args:
+            k (int): max number of files to load, or None for all files
+            depth (int): max depth to recurse, or None for infinite depth
+            lines (int): max number of lines to get, from a file, or None for all lines
+
+        Returns:
+            list of Document objects, each has fields `content` and `metadata`,
+            and `metadata` has fields `url`, `filename`, `extension`, `language`
+        """
+        if self.repo is None:
+            logger.warning("No repo found. Ensure the URL is correct.")
+            return []  # Return an empty list rather than raise an error
+
+        contents = self.repo.get_contents("")
+        if not isinstance(contents, list):
+            contents = [contents]
+        stack = list(zip(contents, [0] * len(contents)))  # stack of (content, depth)
+        # recursively get all files in repo that have one of the extensions
+        docs = []
+        i = 0
+
+        while stack:
+            if k is not None and i == k:
+                break
+            file_content, d = stack.pop()
+            if not self._is_allowed(file_content):
+                continue
+            if file_content.type == "dir":
+                if depth is None or d <= depth:
+                    items = self.repo.get_contents(file_content.path)
+                    if not isinstance(items, list):
+                        items = [items]
+                    stack.extend(list(zip(items, [d + 1] * len(items))))
+            else:
+                if depth is None or d <= depth:
+                    # need to decode the file content, which is in bytes
+                    contents = self.repo.get_contents(file_content.path)
+                    if isinstance(contents, list):
+                        contents = contents[0]
+                    text = _get_decoded_content(contents)
+                    if lines is not None:
+                        text = "\n".join(text.split("\n")[:lines])
+                    i += 1
+
+                    # Note `source` is important, it may be used to cite
+                    # evidence for an answer.
+                    # See URLLoader
+                    # TODO we should use Pydantic to enforce/standardize this
+
+                    docs.append(
+                        Document(
+                            content=text,
+                            metadata=DocMetaData(
+                                repo=self.url,
+                                source=file_content.html_url,
+                                url=file_content.html_url,
+                                filename=file_content.name,
+                                extension=self._file_type(file_content.name),
+                                language=self._file_type(file_content.name),
+                            ),
+                        )
+                    )
+        return docs
+
+    @staticmethod
+    def select(
+        structure: Dict[str, Union[str, List[Dict[str, Any]]]],
+        includes: List[str],
+        excludes: List[str] = [],
+    ) -> Dict[str, Union[str, List[Dict[str, Any]]]]:
+        """
+        Filter a structure dictionary for certain directories and files.
+
+        Args:
+            structure (Dict[str, Union[str, List[Dict]]]): The structure dictionary.
+            includes (List[str]): A list of desired directories and files.
+                For files, either full file names or "file type" can be specified.
+                E.g. "toml" will include all files with the ".toml" extension,
+                or "Makefile" will include all files named "Makefile".
+            excludes (List[str]): A list of directories and files to exclude.
+                Similar to `includes`, full file/dir names or "file type" can be
+                specified. Optional, defaults to empty list.
+
+        Returns:
+            Dict[str, Union[str, List[Dict]]]: The filtered structure dictionary.
+        """
+        filtered_structure = {
+            "type": structure["type"],
+            "name": structure["name"],
+            "dirs": [],
+            "files": [],
+            "path": structure["path"],
+        }
+
+        for dir in structure["dirs"]:
+            if (
+                dir["name"] in includes
+                or RepoLoader._file_type(dir["name"]) in includes
+            ) and (
+                dir["name"] not in excludes
+                and RepoLoader._file_type(dir["name"]) not in excludes
+            ):
+                # If the directory is in the select list, include the whole subtree
+                filtered_structure["dirs"].append(dir)
+            else:
+                # Otherwise, filter the directory's contents (pass excludes down too)
+                filtered_dir = RepoLoader.select(dir, includes, excludes)
+                if (
+                    filtered_dir["dirs"] or filtered_dir["files"]
+                ):  # only add if not empty
+                    filtered_structure["dirs"].append(filtered_dir)
+
+        for file in structure["files"]:
+            if (
+                file["name"] in includes
+                or RepoLoader._file_type(file["name"]) in includes
+            ) and (
+                file["name"] not in excludes
+                and RepoLoader._file_type(file["name"]) not in excludes
+            ):
+                filtered_structure["files"].append(file)
+
+        return filtered_structure
+
+    @staticmethod
+    def ls(structure: Dict[str, Union[str, List[Dict]]], depth: int = 0) -> List[str]:
+        """
+        Get a list of names of files or directories up to a certain depth from a
+        structure dictionary.
+
+        Args:
+            structure (Dict[str, Union[str, List[Dict]]]): The structure dictionary.
+            depth (int, optional): The depth level. Defaults to 0.
+
+        Returns:
+            List[str]: A list of names of files or directories.
+        """
+        names = []
+
+        # A queue of tuples (current_structure, current_depth)
+        queue = deque([(structure, 0)])
+
+        while queue:
+            current_structure, current_depth = queue.popleft()
+
+            if current_depth <= depth:
+                names.append(current_structure["name"])
+
+            for dir in current_structure["dirs"]:
+                queue.append((dir, current_depth + 1))
+
+            for file in current_structure["files"]:
+                # add file names only if depth is less than the limit
+                if current_depth < depth:
+                    names.append(file["name"])
+        names = [n for n in names if n not in ["", None]]
+        return names
+
+    @staticmethod
+    def list_files(
+        dir: str,
+        depth: int = 1,
+        include_types: List[str] = [],
+        exclude_types: List[str] = [],
+    ) -> List[str]:
+        """
+        Recursively list all files in a directory, up to a certain depth.
+
+        Args:
+            dir (str): The directory path, relative to root.
+            depth (int, optional): The depth level. Defaults to 1.
+            include_types (List[str], optional): A list of file types to include.
+                Defaults to empty list.
+            exclude_types (List[str], optional): A list of file types to exclude.
+                Defaults to empty list.
+        Returns:
+            List[str]: A list of file names.
+        """
+        depth = depth if depth >= 0 else 200
+        output = []
+
+        for root, dirs, files in os.walk(dir):
+            if root.count(os.sep) - dir.count(os.sep) < depth:
+                level = root.count(os.sep) - dir.count(os.sep)
+                sub_indent = " " * 4 * (level + 1)
+                for d in dirs:
+                    output.append("{}{}/".format(sub_indent, d))
+                for f in files:
+                    if include_types and RepoLoader._file_type(f) not in include_types:
+                        continue
+                    if exclude_types and RepoLoader._file_type(f) in exclude_types:
+                        continue
+                    output.append("{}{}".format(sub_indent, f))
+        return output
+
+    @staticmethod
+    def show_file_contents(tree: Dict[str, Union[str, List[Dict[str, Any]]]]) -> str:
+        """
+        Return the contents of all files from a structure dictionary as a string.
+
+        Args:
+            tree (Dict[str, Union[str, List[Dict]]]): The structure dictionary.
+        """
+        contents = ""
+        for dir in tree["dirs"]:
+            contents += RepoLoader.show_file_contents(dir)
+        for file in tree["files"]:
+            path = file["path"]
+            contents += f"""
+            {path}:
+            --------------------
+            {file["content"]}

+            """
+
+        return contents
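
A minimal usage sketch of the RepoLoader API added above, not part of the diff itself. The repo URL "https://github.com/langroid/langroid" is an illustrative placeholder, and the sketch assumes GITHUB_ACCESS_TOKEN is set in the environment (or in a .env file) so GitHub API calls are authenticated.

    from langroid.parsing.repo_loader import RepoLoader, RepoLoaderConfig

    # Accepts a full GitHub URL or just "owner/repo".
    loader = RepoLoader("https://github.com/langroid/langroid", config=RepoLoaderConfig())

    # Clone locally (to a temp dir by default), then build a directory tree
    # plus one Document per matching file, reading up to 100 lines per file.
    tree, docs = loader.load(depth=2, lines=100)

    # Or fetch up to 50 files directly via the GitHub API, without cloning.
    gh_docs = loader.load_docs_from_github(k=50, depth=2, lines=100)

    # Filter the tree to Python files and Makefiles, then list names one level deep.
    subtree = RepoLoader.select(tree, includes=["py", "Makefile"])
    print(RepoLoader.ls(subtree, depth=1))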