wikigen-1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wikigen/__init__.py +7 -0
- wikigen/cli.py +690 -0
- wikigen/config.py +526 -0
- wikigen/defaults.py +78 -0
- wikigen/flows/__init__.py +1 -0
- wikigen/flows/flow.py +38 -0
- wikigen/formatter/help_formatter.py +194 -0
- wikigen/formatter/init_formatter.py +56 -0
- wikigen/formatter/output_formatter.py +290 -0
- wikigen/mcp/__init__.py +12 -0
- wikigen/mcp/chunking.py +127 -0
- wikigen/mcp/embeddings.py +69 -0
- wikigen/mcp/output_resources.py +65 -0
- wikigen/mcp/search_index.py +826 -0
- wikigen/mcp/server.py +232 -0
- wikigen/mcp/vector_index.py +297 -0
- wikigen/metadata/__init__.py +35 -0
- wikigen/metadata/logo.py +28 -0
- wikigen/metadata/project.py +28 -0
- wikigen/metadata/version.py +17 -0
- wikigen/nodes/__init__.py +1 -0
- wikigen/nodes/nodes.py +1080 -0
- wikigen/utils/__init__.py +0 -0
- wikigen/utils/adjust_headings.py +72 -0
- wikigen/utils/call_llm.py +271 -0
- wikigen/utils/crawl_github_files.py +450 -0
- wikigen/utils/crawl_local_files.py +151 -0
- wikigen/utils/llm_providers.py +101 -0
- wikigen/utils/version_check.py +84 -0
- wikigen-1.0.0.dist-info/METADATA +352 -0
- wikigen-1.0.0.dist-info/RECORD +35 -0
- wikigen-1.0.0.dist-info/WHEEL +5 -0
- wikigen-1.0.0.dist-info/entry_points.txt +2 -0
- wikigen-1.0.0.dist-info/licenses/LICENSE +21 -0
- wikigen-1.0.0.dist-info/top_level.txt +1 -0
wikigen/utils/crawl_github_files.py
@@ -0,0 +1,450 @@

import requests
import base64
import os
import tempfile
import git
import time
import fnmatch
from typing import Union, Set
from urllib.parse import urlparse
from ..formatter.output_formatter import print_operation, Icons, Colors, format_size


def crawl_github_files(
    repo_url,
    token=None,
    max_file_size: int = 1 * 1024 * 1024,  # 1 MB
    use_relative_paths: bool = False,
    include_patterns: Union[str, Set[str]] = None,
    exclude_patterns: Union[str, Set[str]] = None,
):
    """
    Crawl files from a specific path in a GitHub repository at a specific commit.

    Args:
        repo_url (str): URL of the GitHub repository with specific path and commit
            (e.g., 'https://github.com/microsoft/autogen/tree/e45a15766746d95f8cfaaa705b0371267bec812e/python/packages/autogen-core/src/autogen_core')
        token (str, optional): **GitHub personal access token.**
            - **Required for private repositories.**
            - **Recommended for public repos to avoid rate limits.**
            - Can be passed explicitly or set via the `GITHUB_TOKEN` environment variable.
        max_file_size (int, optional): Maximum file size in bytes to download (default: 1 MB)
        use_relative_paths (bool, optional): If True, file paths will be relative to the specified subdirectory
        include_patterns (str or set of str, optional): Pattern or set of patterns specifying which files to include (e.g., "*.py", {"*.md", "*.txt"}).
            If None, all files are included.
        exclude_patterns (str or set of str, optional): Pattern or set of patterns specifying which files to exclude.
            If None, no files are excluded.

    Returns:
        dict: Dictionary with files and statistics
    """
    # Convert single pattern to set
    if include_patterns and isinstance(include_patterns, str):
        include_patterns = {include_patterns}
    if exclude_patterns and isinstance(exclude_patterns, str):
        exclude_patterns = {exclude_patterns}

    def should_include_file(file_path: str, file_name: str) -> bool:
        """Determine if a file should be included based on patterns"""
        # If no include patterns are specified, include all files
        if not include_patterns:
            include_file = True
        else:
            # Check if the file matches any include pattern
            include_file = any(
                fnmatch.fnmatch(file_name, pattern) for pattern in include_patterns
            )

        # If exclude patterns are specified, check if the file should be excluded
        if exclude_patterns and include_file:
            # Exclude if the file matches any exclude pattern
            exclude_file = any(
                fnmatch.fnmatch(file_path, pattern) for pattern in exclude_patterns
            )
            return not exclude_file

        return include_file

    # Detect SSH URL (git@ or .git suffix)
    is_ssh_url = repo_url.startswith("git@") or repo_url.endswith(".git")

    if is_ssh_url:
        # Clone repo via SSH to a temp dir
        with tempfile.TemporaryDirectory() as tmpdirname:
            print(f"Cloning SSH repo {repo_url} to temp dir {tmpdirname} ...")
            try:
                repo = git.Repo.clone_from(repo_url, tmpdirname)
            except (git.GitCommandError, git.GitError, OSError, IOError) as e:
                print(f"Error cloning repo: {e}")
                return {"files": {}, "stats": {"error": str(e)}}

            # SSH URLs don't embed branch or commit info, so the clone stays
            # on the default branch; an explicit ref argument could be added
            # to the API in the future.

            # Walk the cloned directory
            files = {}
            skipped_files = []

            for root, dirs, filenames in os.walk(tmpdirname):
                for filename in filenames:
                    abs_path = os.path.join(root, filename)
                    rel_path = os.path.relpath(abs_path, tmpdirname)

                    # Check file size
                    try:
                        file_size = os.path.getsize(abs_path)
                    except OSError:
                        continue

                    if file_size > max_file_size:
                        skipped_files.append((rel_path, file_size))
                        print_operation(f"{rel_path}", Icons.SKIP, indent=2)
                        continue

                    # Check include/exclude patterns
                    if not should_include_file(rel_path, filename):
                        print_operation(f"{rel_path}", Icons.SKIP, indent=2)
                        continue

                    # Read content
                    try:
                        with open(abs_path, "r", encoding="utf-8-sig") as f:
                            content = f.read()
                        files[rel_path] = content
                        print_operation(
                            f"{rel_path} {Colors.DARK_GRAY}({format_size(file_size)})",
                            Icons.DOWNLOAD,
                            indent=2,
                        )
                    except (IOError, OSError, UnicodeDecodeError, PermissionError) as e:
                        print_operation(f"{rel_path}: {e}", Icons.ERROR, indent=2)

            return {
                "files": files,
                "stats": {
                    "downloaded_count": len(files),
                    "skipped_count": len(skipped_files),
                    "skipped_files": skipped_files,
                    "base_path": None,
                    "include_patterns": include_patterns,
                    "exclude_patterns": exclude_patterns,
                    "source": "ssh_clone",
                },
            }

    # Parse GitHub URL to extract owner, repo, commit/branch, and path
    parsed_url = urlparse(repo_url)
    path_parts = parsed_url.path.strip("/").split("/")

    if len(path_parts) < 2:
        raise ValueError(f"Invalid GitHub URL: {repo_url}")

    # Extract the basic components
    owner = path_parts[0]
    repo = path_parts[1]

    # Setup for the GitHub API
    headers = {"Accept": "application/vnd.github.v3+json"}
    if token:
        headers["Authorization"] = f"token {token}"

    def fetch_branches(owner: str, repo: str):
        """Get the branches of the repository"""
        url = f"https://api.github.com/repos/{owner}/{repo}/branches"
        response = requests.get(url, headers=headers, timeout=(30, 30))

        if response.status_code == 404:
            if not token:
                print(
                    "Error 404: Repository not found or is private.\n"
                    "If this is a private repository, please provide a valid GitHub token via the 'token' argument or set the GITHUB_TOKEN environment variable."
                )
            else:
                print(
                    "Error 404: Repository not found or insufficient permissions with the provided token.\n"
                    "Please verify the repository exists and the token has access to this repository."
                )
            return []

        if response.status_code != 200:
            print(
                f"Error fetching the branches of {owner}/{repo}: {response.status_code} - {response.text}"
            )
            return []

        return response.json()

    def check_tree(owner: str, repo: str, tree: str):
        """Check whether the repository has the given tree"""
        url = f"https://api.github.com/repos/{owner}/{repo}/git/trees/{tree}"
        response = requests.get(url, headers=headers, timeout=(30, 30))
        return response.status_code == 200

    # Check if the URL contains a specific branch/commit
    if len(path_parts) > 2 and "tree" == path_parts[2]:

        def join_parts(i):
            return "/".join(path_parts[i:])

        branches = fetch_branches(owner, repo)
        branch_names = map(lambda branch: branch.get("name"), branches)

        # Fetching the branches was not successful
        if len(branches) == 0:
            return

        # The part of the URL after "tree/" starts with the branch name
        relevant_path = join_parts(3)

        # Find a branch name that the relevant path starts with
        filter_gen = (name for name in branch_names if relevant_path.startswith(name))
        ref = next(filter_gen, None)

        # If no branch matches, check whether it is a tree (e.g. a commit SHA)
        if ref is None:
            tree = path_parts[3]
            ref = tree if check_tree(owner, repo, tree) else None

        # If it is neither a tree nor a branch name
        if ref is None:
            print(
                "The given path does not match any branch or tree in the repository.\n"
                "Please verify the path exists."
            )
            return

        # Combine all parts after the ref as the path; a ref containing "/"
        # (e.g. "feature/x") occupies two URL segments instead of one
        part_index = 5 if "/" in ref else 4
        specific_path = join_parts(part_index) if part_index < len(path_parts) else ""
    else:
        # Don't put the ref param in the query
        # and let GitHub decide the default branch
        ref = None
        specific_path = ""

    # Dictionary to store path -> content mapping
    files = {}
    skipped_files = []

    def fetch_contents(path):
        """Fetch contents of the repository at a specific path and commit"""
        url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"
        params = {"ref": ref} if ref is not None else {}

        response = requests.get(url, headers=headers, params=params, timeout=(30, 30))

        if (
            response.status_code == 403
            and "rate limit exceeded" in response.text.lower()
        ):
            reset_time = int(response.headers.get("X-RateLimit-Reset", 0))
            wait_time = max(reset_time - time.time(), 0) + 1
            print(f"Rate limit exceeded. Waiting for {wait_time:.0f} seconds...")
            time.sleep(wait_time)
            return fetch_contents(path)

        if response.status_code == 404:
            if not token:
                print(
                    "Error 404: Repository not found or is private.\n"
                    "If this is a private repository, please provide a valid GitHub token via the 'token' argument or set the GITHUB_TOKEN environment variable."
                )
            elif not path and ref == "main":
                print(
                    "Error 404: Repository not found. Check whether the default branch is something other than 'main'.\n"
                    "Try adding the branch name to the request, e.g. python main.py --repo https://github.com/username/repo/tree/master"
                )
            else:
                print(
                    f"Error 404: Path '{path}' not found in repository or insufficient permissions with the provided token.\n"
                    "Please verify the token has access to this repository and the path exists."
                )
            return

        if response.status_code != 200:
            print(f"Error fetching {path}: {response.status_code} - {response.text}")
            return

        contents = response.json()

        # Handle both single-file and directory responses
        if not isinstance(contents, list):
            contents = [contents]

        for item in contents:
            item_path = item["path"]

            # Calculate the relative path if requested
            if use_relative_paths and specific_path:
                # Make sure the path is relative to the specified subdirectory
                if item_path.startswith(specific_path):
                    rel_path = item_path[len(specific_path) :].lstrip("/")
                else:
                    rel_path = item_path
            else:
                rel_path = item_path

            if item["type"] == "file":
                # Check if the file should be included based on patterns
                if not should_include_file(rel_path, item["name"]):
                    print_operation(f"{rel_path}", Icons.SKIP, indent=2)
                    continue

                # Check file size if available
                file_size = item.get("size", 0)
                if file_size > max_file_size:
                    skipped_files.append((item_path, file_size))
                    print_operation(f"{rel_path}", Icons.SKIP, indent=2)
                    continue

                # For files, get raw content
                if "download_url" in item and item["download_url"]:
                    file_url = item["download_url"]
                    file_response = requests.get(
                        file_url, headers=headers, timeout=(30, 30)
                    )

                    # Final size check in case a content-length header is available but differs from the metadata
                    content_length = int(file_response.headers.get("content-length", 0))
                    if content_length > max_file_size:
                        skipped_files.append((item_path, content_length))
                        print(
                            f"Skipping {rel_path}: Content length ({content_length} bytes) exceeds limit ({max_file_size} bytes)"
                        )
                        continue

                    if file_response.status_code == 200:
                        files[rel_path] = file_response.text
                        print_operation(
                            f"{rel_path} {Colors.DARK_GRAY}({format_size(file_size)})",
                            Icons.DOWNLOAD,
                            indent=2,
                        )
                    else:
                        print_operation(
                            f"{rel_path}: {file_response.status_code}",
                            Icons.ERROR,
                            indent=2,
                        )
                else:
                    # Alternative method if download_url is not available
                    content_response = requests.get(
                        item["url"], headers=headers, timeout=(30, 30)
                    )
                    if content_response.status_code == 200:
                        content_data = content_response.json()
                        if (
                            content_data.get("encoding") == "base64"
                            and "content" in content_data
                        ):
                            # Check the size of the base64 content before decoding:
                            # base64 encodes 3 bytes as 4 characters, so the decoded
                            # size is roughly 0.75x the encoded length
                            if len(content_data["content"]) * 0.75 > max_file_size:
                                estimated_size = int(
                                    len(content_data["content"]) * 0.75
                                )
                                skipped_files.append((item_path, estimated_size))
                                print(
                                    f"Skipping {rel_path}: Encoded content exceeds size limit"
                                )
                                continue

                            file_content = base64.b64decode(
                                content_data["content"]
                            ).decode("utf-8")
                            files[rel_path] = file_content
                            print_operation(
                                f"{rel_path} {Colors.DARK_GRAY}({format_size(file_size)})",
                                Icons.DOWNLOAD,
                                indent=2,
                            )
                        else:
                            print_operation(
                                f"{rel_path}: Unexpected content format",
                                Icons.ERROR,
                                indent=2,
                            )
                    else:
                        print_operation(
                            f"{rel_path}: {content_response.status_code}",
                            Icons.ERROR,
                            indent=2,
                        )

            elif item["type"] == "dir":
                # Check if the directory should be excluded before recursing
                if exclude_patterns:
                    dir_excluded = any(
                        fnmatch.fnmatch(item_path, pattern)
                        or fnmatch.fnmatch(rel_path, pattern)
                        for pattern in exclude_patterns
                    )
                    if dir_excluded:
                        continue

                # Only recurse if the directory is not excluded
                fetch_contents(item_path)

    # Start crawling from the specified path
    fetch_contents(specific_path)

    return {
        "files": files,
        "stats": {
            "downloaded_count": len(files),
            "skipped_count": len(skipped_files),
            "skipped_files": skipped_files,
            "base_path": specific_path if use_relative_paths else None,
            "include_patterns": include_patterns,
            "exclude_patterns": exclude_patterns,
        },
    }


# Example usage
if __name__ == "__main__":
    # Get the token from an environment variable (recommended for private repos)
    github_token = os.environ.get("GITHUB_TOKEN")
    if not github_token:
        print(
            "Warning: No GitHub token found in environment variable 'GITHUB_TOKEN'.\n"
            "Private repositories will not be accessible without a token.\n"
            "To access private repos, set the environment variable or pass the token explicitly."
        )

    repo_url = "https://github.com/pydantic/pydantic/tree/6c38dc93f40a47f4d1350adca9ec0d72502e223f/pydantic"

    # Example: get Python and Markdown files
    result = crawl_github_files(
        repo_url,
        token=github_token,
        max_file_size=1 * 1024 * 1024,  # 1 MB in bytes
        use_relative_paths=True,  # Enable relative paths
        include_patterns={"*.py", "*.md"},  # Include Python and Markdown files
    )

    files = result["files"]
    stats = result["stats"]

    print(f"\nDownloaded {stats['downloaded_count']} files.")
    print(f"Skipped {stats['skipped_count']} files due to size limits or patterns.")
    print(f"Base path for relative paths: {stats['base_path']}")
    print(f"Include patterns: {stats['include_patterns']}")
    print(f"Exclude patterns: {stats['exclude_patterns']}")

    # Display all file paths in the dictionary
    print("\nFiles in dictionary:")
    for file_path in sorted(files.keys()):
        print(f"  {file_path}")

    # Example: accessing the content of a specific file
    if files:
        sample_file = next(iter(files))
        print(f"\nSample file: {sample_file}")
        print(f"Content preview: {files[sample_file][:200]}...")
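The `__main__` demo above exercises only the GitHub API path. Below is a minimal sketch of the SSH clone branch, assuming key-based SSH access to the remote is configured; the remote URL, patterns, and size cap are illustrative placeholders, not part of the package:

from wikigen.utils.crawl_github_files import crawl_github_files

# Any URL starting with "git@" or ending in ".git" takes the clone path:
# the repository is cloned into a temporary directory and walked locally,
# so no GitHub API token is needed (SSH key access is assumed instead).
result = crawl_github_files(
    "git@github.com:username/repo.git",  # hypothetical SSH remote
    include_patterns={"*.py"},           # same pattern rules as the API path
    max_file_size=512 * 1024,            # tighter 512 KB cap for this run
)

stats = result["stats"]
if "error" in stats:                     # clone failures surface here
    print(f"Clone failed: {stats['error']}")
else:
    print(f"Read {stats['downloaded_count']} files via {stats['source']}")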
wikigen/utils/crawl_local_files.py
@@ -0,0 +1,151 @@

import os
import fnmatch
import pathspec
from ..formatter.output_formatter import (
    print_operation,
    print_info,
    Icons,
    Colors,
    format_size,
)


def crawl_local_files(
    directory,
    include_patterns=None,
    exclude_patterns=None,
    max_file_size=None,
    use_relative_paths=True,
):
    """
    Crawl files in a local directory with an interface similar to crawl_github_files.

    Args:
        directory (str): Path to local directory
        include_patterns (set): File patterns to include (e.g. {"*.py", "*.js"})
        exclude_patterns (set): File patterns to exclude (e.g. {"tests/*"})
        max_file_size (int): Maximum file size in bytes
        use_relative_paths (bool): Whether to use paths relative to directory

    Returns:
        dict: {"files": {filepath: content}}
    """
    if not os.path.isdir(directory):
        raise ValueError(f"Directory does not exist: {directory}")

    files_dict = {}

    # --- Load .gitignore ---
    gitignore_path = os.path.join(directory, ".gitignore")
    gitignore_spec = None
    if os.path.exists(gitignore_path):
        try:
            with open(gitignore_path, "r", encoding="utf-8-sig") as f:
                gitignore_patterns = f.readlines()
            gitignore_spec = pathspec.PathSpec.from_lines(
                "gitwildmatch", gitignore_patterns
            )
            # Format the .gitignore path relative to the directory for cleaner output
            gitignore_rel = (
                os.path.relpath(gitignore_path, directory)
                if use_relative_paths
                else gitignore_path
            )
            print_info(".gitignore", gitignore_rel)
        except (IOError, OSError, UnicodeDecodeError) as e:
            print(
                f"Warning: Could not read or parse .gitignore file {gitignore_path}: {e}"
            )

    all_files = []
    for root, dirs, files in os.walk(directory):
        # Filter directories using .gitignore and exclude_patterns early
        excluded_dirs = set()
        for d in dirs:
            dirpath_rel = os.path.relpath(os.path.join(root, d), directory)

            if gitignore_spec and gitignore_spec.match_file(dirpath_rel):
                excluded_dirs.add(d)
                continue

            if exclude_patterns:
                for pattern in exclude_patterns:
                    if fnmatch.fnmatch(dirpath_rel, pattern) or fnmatch.fnmatch(
                        d, pattern
                    ):
                        excluded_dirs.add(d)
                        break

        # Prune excluded directories in place so os.walk skips them
        for d in dirs.copy():
            if d in excluded_dirs:
                dirs.remove(d)

        for filename in files:
            filepath = os.path.join(root, filename)
            all_files.append(filepath)

    for filepath in all_files:
        relpath = (
            os.path.relpath(filepath, directory) if use_relative_paths else filepath
        )

        # --- Exclusion check ---
        excluded = False
        if gitignore_spec and gitignore_spec.match_file(relpath):
            excluded = True

        if not excluded and exclude_patterns:
            for pattern in exclude_patterns:
                if fnmatch.fnmatch(relpath, pattern):
                    excluded = True
                    break

        included = False
        if include_patterns:
            for pattern in include_patterns:
                if fnmatch.fnmatch(relpath, pattern):
                    included = True
                    break
        else:
            included = True

        if not included or excluded:
            print_operation(f"{relpath}", Icons.SKIP, indent=2)
            continue  # Skip files that are excluded or not included

        if max_file_size and os.path.getsize(filepath) > max_file_size:
            print_operation(f"{relpath}", Icons.SKIP, indent=2)
            continue  # Skip large files

        # --- Read the file ---
        try:
            with open(filepath, "r", encoding="utf-8-sig") as f:
                content = f.read()
            files_dict[relpath] = content
            file_size = os.path.getsize(filepath)
            print_operation(
                f"{relpath} {Colors.DARK_GRAY}({format_size(file_size)})",
                Icons.DOWNLOAD,
                indent=2,
            )
        except (IOError, OSError, UnicodeDecodeError, PermissionError) as e:
            print_operation(f"{relpath}: {e}", Icons.ERROR, indent=2)

    return {"files": files_dict}


if __name__ == "__main__":
    print("--- Crawling parent directory ('..') ---")
    files_data = crawl_local_files(
        "..",
        exclude_patterns={
            "*.pyc",
            "__pycache__/*",
            ".venv/*",
            ".git/*",
            "docs/*",
            "output/*",
        },
    )
    print(f"Found {len(files_data['files'])} files:")
    for path in files_data["files"]:
        print(f"  {path}")