wikigen-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,450 @@
1
+ import requests
2
+ import base64
3
+ import os
4
+ import tempfile
5
+ import git
6
+ import time
7
+ import fnmatch
8
+ from typing import Union, Set
9
+ from urllib.parse import urlparse
10
+ from ..formatter.output_formatter import print_operation, Icons, Colors, format_size
11
+
12
+
13
+ def crawl_github_files(
14
+ repo_url,
15
+ token=None,
16
+ max_file_size: int = 1 * 1024 * 1024, # 1 MB
17
+ use_relative_paths: bool = False,
18
+ include_patterns: Union[str, Set[str], None] = None,
19
+ exclude_patterns: Union[str, Set[str], None] = None,
20
+ ):
21
+ """
22
+ Crawl files from a specific path in a GitHub repository at a specific commit.
23
+
24
+ Args:
25
+ repo_url (str): URL of the GitHub repository with specific path and commit
26
+ (e.g., 'https://github.com/microsoft/autogen/tree/e45a15766746d95f8cfaaa705b0371267bec812e/python/packages/autogen-core/src/autogen_core')
27
+ token (str, optional): **GitHub personal access token.**
28
+ - **Required for private repositories.**
29
+ - **Recommended for public repos to avoid rate limits.**
30
+ - Can be passed explicitly or set via the `GITHUB_TOKEN` environment variable.
31
+ max_file_size (int, optional): Maximum file size in bytes to download (default: 1 MB)
32
+ use_relative_paths (bool, optional): If True, file paths will be relative to the specified subdirectory
33
+ include_patterns (str or set of str, optional): Pattern or set of patterns specifying which files to include (e.g., "*.py", {"*.md", "*.txt"}).
34
+ If None, all files are included.
35
+ exclude_patterns (str or set of str, optional): Pattern or set of patterns specifying which files to exclude.
36
+ If None, no files are excluded.
37
+
38
+ Returns:
39
+ dict: Dictionary with files and statistics
40
+ """
41
+ # Convert single pattern to set
42
+ if include_patterns and isinstance(include_patterns, str):
43
+ include_patterns = {include_patterns}
44
+ if exclude_patterns and isinstance(exclude_patterns, str):
45
+ exclude_patterns = {exclude_patterns}
46
+
47
+ def should_include_file(file_path: str, file_name: str) -> bool:
48
+ """Determine if a file should be included based on patterns"""
49
+ # If no include patterns are specified, include all files
50
+ if not include_patterns:
51
+ include_file = True
52
+ else:
53
+ # Check if file matches any include pattern
54
+ include_file = any(
55
+ fnmatch.fnmatch(file_name, pattern) for pattern in include_patterns
56
+ )
57
+
58
+ # If exclude patterns are specified, check if file should be excluded
59
+ if exclude_patterns and include_file:
60
+ # Exclude if file matches any exclude pattern
61
+ exclude_file = any(
62
+ fnmatch.fnmatch(file_path, pattern) for pattern in exclude_patterns
63
+ )
64
+ return not exclude_file
65
+
66
+ return include_file
67
+
68
+ # Detect clone-style URLs (git@ prefix or .git suffix); these are cloned locally rather than fetched via the GitHub API
69
+ is_ssh_url = repo_url.startswith("git@") or repo_url.endswith(".git")
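+ # e.g. "git@github.com:owner/repo.git" or "https://github.com/owner/repo.git"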
70
+
71
+ if is_ssh_url:
72
+ # Clone repo via SSH to temp dir
73
+ with tempfile.TemporaryDirectory() as tmpdirname:
74
+ print(f"Cloning SSH repo {repo_url} to temp dir {tmpdirname} ...")
75
+ try:
76
+ repo = git.Repo.clone_from(repo_url, tmpdirname)
77
+ except (git.GitCommandError, git.GitError, OSError, IOError) as e:
78
+ print(f"Error cloning repo: {e}")
79
+ return {"files": {}, "stats": {"error": str(e)}}
80
+
81
+ # SSH URLs do not embed a branch or commit ref, so no checkout is attempted here;
82
+ # the clone stays on the repository's default branch.
83
+ # The caller can check out a specific ref manually after cloning,
84
+ # or a future version of this API could accept an explicit ref argument.
85
+
86
+ # Walk directory
87
+ files = {}
88
+ skipped_files = []
89
+
90
+ for root, dirs, filenames in os.walk(tmpdirname):
91
+ for filename in filenames:
92
+ abs_path = os.path.join(root, filename)
93
+ rel_path = os.path.relpath(abs_path, tmpdirname)
94
+
95
+ # Check file size
96
+ try:
97
+ file_size = os.path.getsize(abs_path)
98
+ except OSError:
99
+ continue
100
+
101
+ if file_size > max_file_size:
102
+ skipped_files.append((rel_path, file_size))
103
+ print_operation(f"{rel_path}", Icons.SKIP, indent=2)
104
+ continue
105
+
106
+ # Check include/exclude patterns
107
+ if not should_include_file(rel_path, filename):
108
+ print_operation(f"{rel_path}", Icons.SKIP, indent=2)
109
+ continue
110
+
111
+ # Read content
112
+ try:
113
+ with open(abs_path, "r", encoding="utf-8-sig") as f:
114
+ content = f.read()
115
+ files[rel_path] = content
116
+ print_operation(
117
+ f"{rel_path} {Colors.DARK_GRAY}({format_size(file_size)})",
118
+ Icons.DOWNLOAD,
119
+ indent=2,
120
+ )
121
+ except (IOError, OSError, UnicodeDecodeError, PermissionError) as e:
122
+ print_operation(f"{rel_path}: {e}", Icons.ERROR, indent=2)
123
+
124
+ return {
125
+ "files": files,
126
+ "stats": {
127
+ "downloaded_count": len(files),
128
+ "skipped_count": len(skipped_files),
129
+ "skipped_files": skipped_files,
130
+ "base_path": None,
131
+ "include_patterns": include_patterns,
132
+ "exclude_patterns": exclude_patterns,
133
+ "source": "ssh_clone",
134
+ },
135
+ }
136
+
137
+ # Parse GitHub URL to extract owner, repo, commit/branch, and path
138
+ parsed_url = urlparse(repo_url)
139
+ path_parts = parsed_url.path.strip("/").split("/")
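+ # e.g. "https://github.com/owner/repo/tree/<ref>/<subdir>" yields
+ # path_parts == ["owner", "repo", "tree", "<ref>", "<subdir>", ...]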
140
+
141
+ if len(path_parts) < 2:
142
+ raise ValueError(f"Invalid GitHub URL: {repo_url}")
143
+
144
+ # Extract the basic components
145
+ owner = path_parts[0]
146
+ repo = path_parts[1]
147
+
148
+ # Setup for GitHub API
149
+ headers = {"Accept": "application/vnd.github.v3+json"}
150
+ if token:
151
+ headers["Authorization"] = f"token {token}"
152
+
153
+ def fetch_branches(owner: str, repo: str):
154
+ """Get brancshes of the repository"""
155
+
156
+ url = f"https://api.github.com/repos/{owner}/{repo}/branches"
157
+ response = requests.get(url, headers=headers, timeout=(30, 30))
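+ # Note: this endpoint is paginated (30 branches per page by default); only the first page is fetched here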
158
+
159
+ if response.status_code == 404:
160
+ if not token:
161
+ print(
162
+ "Error 404: Repository not found or is private.\n"
163
+ "If this is a private repository, please provide a valid GitHub token via the 'token' argument or set the GITHUB_TOKEN environment variable."
164
+ )
165
+ else:
166
+ print(
167
+ "Error 404: Repository not found or insufficient permissions with the provided token.\n"
168
+ "Please verify the repository exists and the token has access to this repository."
169
+ )
170
+ return []
171
+
172
+ if response.status_code != 200:
173
+ print(
174
+ f"Error fetching the branches of {owner}/{repo}: {response.status_code} - {response.text}"
175
+ )
176
+ return []
177
+
178
+ return response.json()
179
+
180
+ def check_tree(owner: str, repo: str, tree: str):
181
+ """Check the repository has the given tree"""
182
+
183
+ url = f"https://api.github.com/repos/{owner}/{repo}/git/trees/{tree}"
184
+ response = requests.get(url, headers=headers, timeout=(30, 30))
185
+
186
+ return response.status_code == 200
187
+
188
+ # Check if URL contains a specific branch/commit
189
+ if len(path_parts) > 2 and "tree" == path_parts[2]:
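+ # URL of the form .../tree/<ref>/<subdir>: recover the ref (branch, tag, or commit) and the subpath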
190
+
191
+ def join_parts(i):
192
+ return "/".join(path_parts[i:])
193
+
194
+ branches = fetch_branches(owner, repo)
195
+ branch_names = map(lambda branch: branch.get("name"), branches)
196
+
197
+ # Fetching branches was not successful
198
+ if len(branches) == 0:
199
+ return {"files": {}, "stats": {"error": "Could not fetch branches"}}
200
+
201
+ # The path after "tree/", e.g. "main/src/pkg"; its prefix should be a branch name
202
+ relevant_path = join_parts(3)
203
+
204
+ # Find the branch whose name is a prefix of the relevant path
205
+ filter_gen = (name for name in branch_names if relevant_path == name or relevant_path.startswith(name + "/"))
206
+ ref = next(filter_gen, None)
207
+
208
+ # If no branch name matches, check whether the first path part is a tree (commit SHA or tag)
209
+ if ref is None:
210
+ tree = path_parts[3]
211
+ ref = tree if check_tree(owner, repo, tree) else None
212
+
213
+ # If it is neither a tree nor a branch name
214
+ if ref is None:
215
+ print(
216
+ "The given path does not match with any branch and any tree in the repository.\n"
217
+ "Please verify the path exists."
218
+ )
219
+ return {"files": {}, "stats": {"error": "No matching branch or tree found"}}
220
+
221
+ # Combine all parts after the ref as the path
222
+ part_index = 3 + len(ref.split("/"))  # parts: [owner, repo, "tree", *ref parts, *subpath]
223
+ specific_path = join_parts(part_index) if part_index < len(path_parts) else ""
224
+ else:
225
+ # Omit the ref query parameter
226
+ # and let GitHub fall back to the default branch
227
+ ref = None
228
+ specific_path = ""
229
+
230
+ # Dictionary to store path -> content mapping
231
+ files = {}
232
+ skipped_files = []
233
+
234
+ def fetch_contents(path):
235
+ """Fetch contents of the repository at a specific path and commit"""
236
+ url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"
237
+ params = {"ref": ref} if ref is not None else {}
238
+
239
+ response = requests.get(url, headers=headers, params=params, timeout=(30, 30))
240
+
241
+ if (
242
+ response.status_code == 403
243
+ and "rate limit exceeded" in response.text.lower()
244
+ ):
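+ # X-RateLimit-Reset is a Unix timestamp (seconds); sleep until then plus one second of slack, then retry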
245
+ reset_time = int(response.headers.get("X-RateLimit-Reset", 0))
246
+ wait_time = max(reset_time - time.time(), 0) + 1
247
+ print(f"Rate limit exceeded. Waiting for {wait_time:.0f} seconds...")
248
+ time.sleep(wait_time)
249
+ return fetch_contents(path)
250
+
251
+ if response.status_code == 404:
252
+ if not token:
253
+ print(
254
+ "Error 404: Repository not found or is private.\n"
255
+ "If this is a private repository, please provide a valid GitHub token via the 'token' argument or set the GITHUB_TOKEN environment variable."
256
+ )
257
+ elif not path and ref == "main":
258
+ print(
259
+ "Error 404: Repository not found. Check if the default branch is not 'main'\n"
260
+ "Try adding branch name to the request i.e. python main.py --repo https://github.com/username/repo/tree/master"
261
+ )
262
+ else:
263
+ print(
264
+ f"Error 404: Path '{path}' not found in repository or insufficient permissions with the provided token.\n"
265
+ "Please verify the token has access to this repository and the path exists."
266
+ )
267
+ return
268
+
269
+ if response.status_code != 200:
270
+ print(f"Error fetching {path}: {response.status_code} - {response.text}")
271
+ return
272
+
273
+ contents = response.json()
274
+
275
+ # Handle both single file and directory responses
276
+ if not isinstance(contents, list):
277
+ contents = [contents]
278
+
279
+ for item in contents:
280
+ item_path = item["path"]
281
+
282
+ # Calculate relative path if requested
283
+ if use_relative_paths and specific_path:
284
+ # Make sure the path is relative to the specified subdirectory
285
+ if item_path.startswith(specific_path):
286
+ rel_path = item_path[len(specific_path) :].lstrip("/")
287
+ else:
288
+ rel_path = item_path
289
+ else:
290
+ rel_path = item_path
291
+
292
+ if item["type"] == "file":
293
+ # Check if file should be included based on patterns
294
+ if not should_include_file(rel_path, item["name"]):
295
+ print_operation(f"{rel_path}", Icons.SKIP, indent=2)
296
+ continue
297
+
298
+ # Check file size if available
299
+ file_size = item.get("size", 0)
300
+ if file_size > max_file_size:
301
+ skipped_files.append((item_path, file_size))
302
+ print_operation(f"{rel_path}", Icons.SKIP, indent=2)
303
+ continue
304
+
305
+ # For files, get raw content
306
+ if "download_url" in item and item["download_url"]:
307
+ file_url = item["download_url"]
308
+ file_response = requests.get(
309
+ file_url, headers=headers, timeout=(30, 30)
310
+ )
311
+
312
+ # Final size check in case content-length header is available but differs from metadata
313
+ content_length = int(file_response.headers.get("content-length", 0))
314
+ if content_length > max_file_size:
315
+ skipped_files.append((item_path, content_length))
316
+ print(
317
+ f"Skipping {rel_path}: Content length ({content_length} bytes) exceeds limit ({max_file_size} bytes)"
318
+ )
319
+ continue
320
+
321
+ if file_response.status_code == 200:
322
+ files[rel_path] = file_response.text
323
+ print_operation(
324
+ f"{rel_path} {Colors.DARK_GRAY}({format_size(file_size)})",
325
+ Icons.DOWNLOAD,
326
+ indent=2,
327
+ )
328
+ else:
329
+ print_operation(
330
+ f"{rel_path}: {file_response.status_code}",
331
+ Icons.ERROR,
332
+ indent=2,
333
+ )
334
+ else:
335
+ # Alternative method if download_url is not available
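+ # The contents API returns the file body base64-encoded in the "content" field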
336
+ content_response = requests.get(
337
+ item["url"], headers=headers, timeout=(30, 30)
338
+ )
339
+ if content_response.status_code == 200:
340
+ content_data = content_response.json()
341
+ if (
342
+ content_data.get("encoding") == "base64"
343
+ and "content" in content_data
344
+ ):
345
+ # Check size of base64 content before decoding
346
+ if (
347
+ len(content_data["content"]) * 0.75 > max_file_size
348
+ ):  # base64 text decodes to roughly 3/4 of its encoded length
349
+ estimated_size = int(
350
+ len(content_data["content"]) * 0.75
351
+ )
352
+ skipped_files.append((item_path, estimated_size))
353
+ print(
354
+ f"Skipping {rel_path}: Encoded content exceeds size limit"
355
+ )
356
+ continue
357
+
358
+ file_content = base64.b64decode(
359
+ content_data["content"]
360
+ ).decode("utf-8")
361
+ files[rel_path] = file_content
362
+ print_operation(
363
+ f"{rel_path} {Colors.DARK_GRAY}({format_size(file_size)})",
364
+ Icons.DOWNLOAD,
365
+ indent=2,
366
+ )
367
+ else:
368
+ print_operation(
369
+ f"{rel_path}: Unexpected content format",
370
+ Icons.ERROR,
371
+ indent=2,
372
+ )
373
+ else:
374
+ print_operation(
375
+ f"{rel_path}: {content_response.status_code}",
376
+ Icons.ERROR,
377
+ indent=2,
378
+ )
379
+
380
+ elif item["type"] == "dir":
381
+ # Check if directory should be excluded before recursing
382
+ if exclude_patterns:
383
+ dir_excluded = any(
384
+ fnmatch.fnmatch(item_path, pattern)
385
+ or fnmatch.fnmatch(rel_path, pattern)
386
+ for pattern in exclude_patterns
387
+ )
388
+ if dir_excluded:
389
+ continue
390
+
391
+ # Only recurse if directory is not excluded
392
+ fetch_contents(item_path)
393
+
394
+ # Start crawling from the specified path
395
+ fetch_contents(specific_path)
396
+
397
+ return {
398
+ "files": files,
399
+ "stats": {
400
+ "downloaded_count": len(files),
401
+ "skipped_count": len(skipped_files),
402
+ "skipped_files": skipped_files,
403
+ "base_path": specific_path if use_relative_paths else None,
404
+ "include_patterns": include_patterns,
405
+ "exclude_patterns": exclude_patterns,
406
+ },
407
+ }
408
+
409
+
410
+ # Example usage
411
+ if __name__ == "__main__":
412
+ # Get token from environment variable (recommended for private repos)
413
+ github_token = os.environ.get("GITHUB_TOKEN")
414
+ if not github_token:
415
+ print(
416
+ "Warning: No GitHub token found in environment variable 'GITHUB_TOKEN'.\n"
417
+ "Private repositories will not be accessible without a token.\n"
418
+ "To access private repos, set the environment variable or pass the token explicitly."
419
+ )
420
+
421
+ repo_url = "https://github.com/pydantic/pydantic/tree/6c38dc93f40a47f4d1350adca9ec0d72502e223f/pydantic"
422
+
423
+ # Example: fetch only Python and Markdown files
424
+ result = crawl_github_files(
425
+ repo_url,
426
+ token=github_token,
427
+ max_file_size=1 * 1024 * 1024, # 1 MB in bytes
428
+ use_relative_paths=True, # Enable relative paths
429
+ include_patterns={"*.py", "*.md"}, # Include Python and Markdown files
430
+ )
431
+
432
+ files = result["files"]
433
+ stats = result["stats"]
434
+
435
+ print(f"\nDownloaded {stats['downloaded_count']} files.")
436
+ print(f"Skipped {stats['skipped_count']} files due to size limits or patterns.")
437
+ print(f"Base path for relative paths: {stats['base_path']}")
438
+ print(f"Include patterns: {stats['include_patterns']}")
439
+ print(f"Exclude patterns: {stats['exclude_patterns']}")
440
+
441
+ # Display all file paths in the dictionary
442
+ print("\nFiles in dictionary:")
443
+ for file_path in sorted(files.keys()):
444
+ print(f" {file_path}")
445
+
446
+ # Example: accessing content of a specific file
447
+ if files:
448
+ sample_file = next(iter(files))
449
+ print(f"\nSample file: {sample_file}")
450
+ print(f"Content preview: {files[sample_file][:200]}...")
@@ -0,0 +1,151 @@
1
+ import os
2
+ import fnmatch
3
+ import pathspec
4
+ from ..formatter.output_formatter import (
5
+ print_operation,
6
+ print_info,
7
+ Icons,
8
+ Colors,
9
+ format_size,
10
+ )
11
+
12
+
13
+ def crawl_local_files(
14
+ directory,
15
+ include_patterns=None,
16
+ exclude_patterns=None,
17
+ max_file_size=None,
18
+ use_relative_paths=True,
19
+ ):
20
+ """
21
+ Crawl files in a local directory with a similar interface to crawl_github_files.
22
+ Args:
23
+ directory (str): Path to local directory
24
+ include_patterns (set): File patterns to include (e.g. {"*.py", "*.js"})
25
+ exclude_patterns (set): File patterns to exclude (e.g. {"tests/*"})
26
+ max_file_size (int): Maximum file size in bytes
27
+ use_relative_paths (bool): Whether to use paths relative to directory
28
+
29
+ Returns:
30
+ dict: {"files": {filepath: content}}
31
+ """
32
+ if not os.path.isdir(directory):
33
+ raise ValueError(f"Directory does not exist: {directory}")
34
+
35
+ files_dict = {}
36
+
37
+ # --- Load .gitignore ---
38
+ gitignore_path = os.path.join(directory, ".gitignore")
39
+ gitignore_spec = None
40
+ if os.path.exists(gitignore_path):
41
+ try:
42
+ with open(gitignore_path, "r", encoding="utf-8-sig") as f:
43
+ gitignore_patterns = f.readlines()
44
+ gitignore_spec = pathspec.PathSpec.from_lines(
45
+ "gitwildmatch", gitignore_patterns
46
+ )
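+ # ("gitwildmatch" gives .gitignore-style matching semantics)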
47
+ # Format the .gitignore path to be relative to directory for cleaner output
48
+ gitignore_rel = (
49
+ os.path.relpath(gitignore_path, directory)
50
+ if use_relative_paths
51
+ else gitignore_path
52
+ )
53
+ print_info(".gitignore", gitignore_rel)
54
+ except (IOError, OSError, UnicodeDecodeError) as e:
55
+ print(
56
+ f"Warning: Could not read or parse .gitignore file {gitignore_path}: {e}"
57
+ )
58
+
59
+ all_files = []
60
+ for root, dirs, files in os.walk(directory):
61
+ # Filter directories using .gitignore and exclude_patterns early
62
+ excluded_dirs = set()
63
+ for d in dirs:
64
+ dirpath_rel = os.path.relpath(os.path.join(root, d), directory)
65
+
66
+ if gitignore_spec and gitignore_spec.match_file(dirpath_rel):
67
+ excluded_dirs.add(d)
68
+ continue
69
+
70
+ if exclude_patterns:
71
+ for pattern in exclude_patterns:
72
+ if fnmatch.fnmatch(dirpath_rel, pattern) or fnmatch.fnmatch(
73
+ d, pattern
74
+ ):
75
+ excluded_dirs.add(d)
76
+ break
77
+
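+ # Removing entries from dirs in place prunes os.walk so excluded directories are never descended into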
78
+ for d in dirs.copy():
79
+ if d in excluded_dirs:
80
+ dirs.remove(d)
81
+
82
+ for filename in files:
83
+ filepath = os.path.join(root, filename)
84
+ all_files.append(filepath)
85
+
86
+ for filepath in all_files:
87
+ relpath = (
88
+ os.path.relpath(filepath, directory) if use_relative_paths else filepath
89
+ )
90
+
91
+ # --- Exclusion check ---
92
+ excluded = False
93
+ if gitignore_spec and gitignore_spec.match_file(relpath):
94
+ excluded = True
95
+
96
+ if not excluded and exclude_patterns:
97
+ for pattern in exclude_patterns:
98
+ if fnmatch.fnmatch(relpath, pattern):
99
+ excluded = True
100
+ break
101
+
102
+ included = False
103
+ if include_patterns:
104
+ for pattern in include_patterns:
105
+ if fnmatch.fnmatch(relpath, pattern):
106
+ included = True
107
+ break
108
+ else:
109
+ included = True
110
+
111
+ if not included or excluded:
112
+ print_operation(f"{relpath}", Icons.SKIP, indent=2)
113
+ continue  # Skip this file: not matched by the include patterns, or explicitly excluded
114
+
115
+ if max_file_size and os.path.getsize(filepath) > max_file_size:
116
+ print_operation(f"{relpath}", Icons.SKIP, indent=2)
117
+ continue # Skip large files
118
+
119
+ # --- File is being processed ---
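+ # utf-8-sig strips a leading BOM; non-text files raise UnicodeDecodeError and are reported as errors below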
120
+ try:
121
+ with open(filepath, "r", encoding="utf-8-sig") as f:
122
+ content = f.read()
123
+ files_dict[relpath] = content
124
+ file_size = os.path.getsize(filepath)
125
+ print_operation(
126
+ f"{relpath} {Colors.DARK_GRAY}({format_size(file_size)})",
127
+ Icons.DOWNLOAD,
128
+ indent=2,
129
+ )
130
+ except (IOError, OSError, UnicodeDecodeError, PermissionError) as e:
131
+ print_operation(f"{relpath}: {e}", Icons.ERROR, indent=2)
132
+
133
+ return {"files": files_dict}
134
+
135
+
136
+ if __name__ == "__main__":
137
+ print("--- Crawling parent directory ('..') ---")
138
+ files_data = crawl_local_files(
139
+ "..",
140
+ exclude_patterns={
141
+ "*.pyc",
142
+ "__pycache__/*",
143
+ ".venv/*",
144
+ ".git/*",
145
+ "docs/*",
146
+ "output/*",
147
+ },
148
+ )
149
+ print(f"Found {len(files_data['files'])} files:")
150
+ for path in files_data["files"]:
151
+ print(f" {path}")