aiptx-2.0.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. aipt_v2/__init__.py +110 -0
  2. aipt_v2/__main__.py +24 -0
  3. aipt_v2/agents/AIPTxAgent/__init__.py +10 -0
  4. aipt_v2/agents/AIPTxAgent/aiptx_agent.py +211 -0
  5. aipt_v2/agents/__init__.py +46 -0
  6. aipt_v2/agents/base.py +520 -0
  7. aipt_v2/agents/exploit_agent.py +688 -0
  8. aipt_v2/agents/ptt.py +406 -0
  9. aipt_v2/agents/state.py +168 -0
  10. aipt_v2/app.py +957 -0
  11. aipt_v2/browser/__init__.py +31 -0
  12. aipt_v2/browser/automation.py +458 -0
  13. aipt_v2/browser/crawler.py +453 -0
  14. aipt_v2/cli.py +2933 -0
  15. aipt_v2/compliance/__init__.py +71 -0
  16. aipt_v2/compliance/compliance_report.py +449 -0
  17. aipt_v2/compliance/framework_mapper.py +424 -0
  18. aipt_v2/compliance/nist_mapping.py +345 -0
  19. aipt_v2/compliance/owasp_mapping.py +330 -0
  20. aipt_v2/compliance/pci_mapping.py +297 -0
  21. aipt_v2/config.py +341 -0
  22. aipt_v2/core/__init__.py +43 -0
  23. aipt_v2/core/agent.py +630 -0
  24. aipt_v2/core/llm.py +395 -0
  25. aipt_v2/core/memory.py +305 -0
  26. aipt_v2/core/ptt.py +329 -0
  27. aipt_v2/database/__init__.py +14 -0
  28. aipt_v2/database/models.py +232 -0
  29. aipt_v2/database/repository.py +384 -0
  30. aipt_v2/docker/__init__.py +23 -0
  31. aipt_v2/docker/builder.py +260 -0
  32. aipt_v2/docker/manager.py +222 -0
  33. aipt_v2/docker/sandbox.py +371 -0
  34. aipt_v2/evasion/__init__.py +58 -0
  35. aipt_v2/evasion/request_obfuscator.py +272 -0
  36. aipt_v2/evasion/tls_fingerprint.py +285 -0
  37. aipt_v2/evasion/ua_rotator.py +301 -0
  38. aipt_v2/evasion/waf_bypass.py +439 -0
  39. aipt_v2/execution/__init__.py +23 -0
  40. aipt_v2/execution/executor.py +302 -0
  41. aipt_v2/execution/parser.py +544 -0
  42. aipt_v2/execution/terminal.py +337 -0
  43. aipt_v2/health.py +437 -0
  44. aipt_v2/intelligence/__init__.py +194 -0
  45. aipt_v2/intelligence/adaptation.py +474 -0
  46. aipt_v2/intelligence/auth.py +520 -0
  47. aipt_v2/intelligence/chaining.py +775 -0
  48. aipt_v2/intelligence/correlation.py +536 -0
  49. aipt_v2/intelligence/cve_aipt.py +334 -0
  50. aipt_v2/intelligence/cve_info.py +1111 -0
  51. aipt_v2/intelligence/knowledge_graph.py +590 -0
  52. aipt_v2/intelligence/learning.py +626 -0
  53. aipt_v2/intelligence/llm_analyzer.py +502 -0
  54. aipt_v2/intelligence/llm_tool_selector.py +518 -0
  55. aipt_v2/intelligence/payload_generator.py +562 -0
  56. aipt_v2/intelligence/rag.py +239 -0
  57. aipt_v2/intelligence/scope.py +442 -0
  58. aipt_v2/intelligence/searchers/__init__.py +5 -0
  59. aipt_v2/intelligence/searchers/exploitdb_searcher.py +523 -0
  60. aipt_v2/intelligence/searchers/github_searcher.py +467 -0
  61. aipt_v2/intelligence/searchers/google_searcher.py +281 -0
  62. aipt_v2/intelligence/tools.json +443 -0
  63. aipt_v2/intelligence/triage.py +670 -0
  64. aipt_v2/interactive_shell.py +559 -0
  65. aipt_v2/interface/__init__.py +5 -0
  66. aipt_v2/interface/cli.py +230 -0
  67. aipt_v2/interface/main.py +501 -0
  68. aipt_v2/interface/tui.py +1276 -0
  69. aipt_v2/interface/utils.py +583 -0
  70. aipt_v2/llm/__init__.py +39 -0
  71. aipt_v2/llm/config.py +26 -0
  72. aipt_v2/llm/llm.py +514 -0
  73. aipt_v2/llm/memory.py +214 -0
  74. aipt_v2/llm/request_queue.py +89 -0
  75. aipt_v2/llm/utils.py +89 -0
  76. aipt_v2/local_tool_installer.py +1467 -0
  77. aipt_v2/models/__init__.py +15 -0
  78. aipt_v2/models/findings.py +295 -0
  79. aipt_v2/models/phase_result.py +224 -0
  80. aipt_v2/models/scan_config.py +207 -0
  81. aipt_v2/monitoring/grafana/dashboards/aipt-dashboard.json +355 -0
  82. aipt_v2/monitoring/grafana/dashboards/default.yml +17 -0
  83. aipt_v2/monitoring/grafana/datasources/prometheus.yml +17 -0
  84. aipt_v2/monitoring/prometheus.yml +60 -0
  85. aipt_v2/orchestration/__init__.py +52 -0
  86. aipt_v2/orchestration/pipeline.py +398 -0
  87. aipt_v2/orchestration/progress.py +300 -0
  88. aipt_v2/orchestration/scheduler.py +296 -0
  89. aipt_v2/orchestrator.py +2427 -0
  90. aipt_v2/payloads/__init__.py +27 -0
  91. aipt_v2/payloads/cmdi.py +150 -0
  92. aipt_v2/payloads/sqli.py +263 -0
  93. aipt_v2/payloads/ssrf.py +204 -0
  94. aipt_v2/payloads/templates.py +222 -0
  95. aipt_v2/payloads/traversal.py +166 -0
  96. aipt_v2/payloads/xss.py +204 -0
  97. aipt_v2/prompts/__init__.py +60 -0
  98. aipt_v2/proxy/__init__.py +29 -0
  99. aipt_v2/proxy/history.py +352 -0
  100. aipt_v2/proxy/interceptor.py +452 -0
  101. aipt_v2/recon/__init__.py +44 -0
  102. aipt_v2/recon/dns.py +241 -0
  103. aipt_v2/recon/osint.py +367 -0
  104. aipt_v2/recon/subdomain.py +372 -0
  105. aipt_v2/recon/tech_detect.py +311 -0
  106. aipt_v2/reports/__init__.py +17 -0
  107. aipt_v2/reports/generator.py +313 -0
  108. aipt_v2/reports/html_report.py +378 -0
  109. aipt_v2/runtime/__init__.py +53 -0
  110. aipt_v2/runtime/base.py +30 -0
  111. aipt_v2/runtime/docker.py +401 -0
  112. aipt_v2/runtime/local.py +346 -0
  113. aipt_v2/runtime/tool_server.py +205 -0
  114. aipt_v2/runtime/vps.py +830 -0
  115. aipt_v2/scanners/__init__.py +28 -0
  116. aipt_v2/scanners/base.py +273 -0
  117. aipt_v2/scanners/nikto.py +244 -0
  118. aipt_v2/scanners/nmap.py +402 -0
  119. aipt_v2/scanners/nuclei.py +273 -0
  120. aipt_v2/scanners/web.py +454 -0
  121. aipt_v2/scripts/security_audit.py +366 -0
  122. aipt_v2/setup_wizard.py +941 -0
  123. aipt_v2/skills/__init__.py +80 -0
  124. aipt_v2/skills/agents/__init__.py +14 -0
  125. aipt_v2/skills/agents/api_tester.py +706 -0
  126. aipt_v2/skills/agents/base.py +477 -0
  127. aipt_v2/skills/agents/code_review.py +459 -0
  128. aipt_v2/skills/agents/security_agent.py +336 -0
  129. aipt_v2/skills/agents/web_pentest.py +818 -0
  130. aipt_v2/skills/prompts/__init__.py +647 -0
  131. aipt_v2/system_detector.py +539 -0
  132. aipt_v2/telemetry/__init__.py +7 -0
  133. aipt_v2/telemetry/tracer.py +347 -0
  134. aipt_v2/terminal/__init__.py +28 -0
  135. aipt_v2/terminal/executor.py +400 -0
  136. aipt_v2/terminal/sandbox.py +350 -0
  137. aipt_v2/tools/__init__.py +44 -0
  138. aipt_v2/tools/active_directory/__init__.py +78 -0
  139. aipt_v2/tools/active_directory/ad_config.py +238 -0
  140. aipt_v2/tools/active_directory/bloodhound_wrapper.py +447 -0
  141. aipt_v2/tools/active_directory/kerberos_attacks.py +430 -0
  142. aipt_v2/tools/active_directory/ldap_enum.py +533 -0
  143. aipt_v2/tools/active_directory/smb_attacks.py +505 -0
  144. aipt_v2/tools/agents_graph/__init__.py +19 -0
  145. aipt_v2/tools/agents_graph/agents_graph_actions.py +69 -0
  146. aipt_v2/tools/api_security/__init__.py +76 -0
  147. aipt_v2/tools/api_security/api_discovery.py +608 -0
  148. aipt_v2/tools/api_security/graphql_scanner.py +622 -0
  149. aipt_v2/tools/api_security/jwt_analyzer.py +577 -0
  150. aipt_v2/tools/api_security/openapi_fuzzer.py +761 -0
  151. aipt_v2/tools/browser/__init__.py +5 -0
  152. aipt_v2/tools/browser/browser_actions.py +238 -0
  153. aipt_v2/tools/browser/browser_instance.py +535 -0
  154. aipt_v2/tools/browser/tab_manager.py +344 -0
  155. aipt_v2/tools/cloud/__init__.py +70 -0
  156. aipt_v2/tools/cloud/cloud_config.py +273 -0
  157. aipt_v2/tools/cloud/cloud_scanner.py +639 -0
  158. aipt_v2/tools/cloud/prowler_tool.py +571 -0
  159. aipt_v2/tools/cloud/scoutsuite_tool.py +359 -0
  160. aipt_v2/tools/executor.py +307 -0
  161. aipt_v2/tools/parser.py +408 -0
  162. aipt_v2/tools/proxy/__init__.py +5 -0
  163. aipt_v2/tools/proxy/proxy_actions.py +103 -0
  164. aipt_v2/tools/proxy/proxy_manager.py +789 -0
  165. aipt_v2/tools/registry.py +196 -0
  166. aipt_v2/tools/scanners/__init__.py +343 -0
  167. aipt_v2/tools/scanners/acunetix_tool.py +712 -0
  168. aipt_v2/tools/scanners/burp_tool.py +631 -0
  169. aipt_v2/tools/scanners/config.py +156 -0
  170. aipt_v2/tools/scanners/nessus_tool.py +588 -0
  171. aipt_v2/tools/scanners/zap_tool.py +612 -0
  172. aipt_v2/tools/terminal/__init__.py +5 -0
  173. aipt_v2/tools/terminal/terminal_actions.py +37 -0
  174. aipt_v2/tools/terminal/terminal_manager.py +153 -0
  175. aipt_v2/tools/terminal/terminal_session.py +449 -0
  176. aipt_v2/tools/tool_processing.py +108 -0
  177. aipt_v2/utils/__init__.py +17 -0
  178. aipt_v2/utils/logging.py +202 -0
  179. aipt_v2/utils/model_manager.py +187 -0
  180. aipt_v2/utils/searchers/__init__.py +269 -0
  181. aipt_v2/verify_install.py +793 -0
  182. aiptx-2.0.7.dist-info/METADATA +345 -0
  183. aiptx-2.0.7.dist-info/RECORD +187 -0
  184. aiptx-2.0.7.dist-info/WHEEL +5 -0
  185. aiptx-2.0.7.dist-info/entry_points.txt +7 -0
  186. aiptx-2.0.7.dist-info/licenses/LICENSE +21 -0
  187. aiptx-2.0.7.dist-info/top_level.txt +1 -0
aipt_v2/intelligence/searchers/github_searcher.py
@@ -0,0 +1,467 @@
+import os
+import time
+import json
+import datetime
+import dotenv
+from tqdm import tqdm
+import requests
+from git import Repo
+from urllib.parse import quote_plus
+from aipt_v2.utils.searchers.Domain_Filter import repository_filter, code_white_list
+from aipt_v2.utils.searchers.Extension_Filter import for_github_repo_file, for_github_code_file
+from aipt_v2.utils.searchers.util import *
+import aipt_v2.utils.searchers.github_config as c
+from scipy.stats import norm
+import csv
+from datetime import datetime as dt
+import shutil
+
+dotenv.load_dotenv()
+
+class GithubSearcher:
+    search_limit_remaining = 30
+    search_limit_reset = 0
+
+    core_limit_remaining = 5000
+    core_limit_reset = 0
+
+    USE_PROXY = False
+    proxies = {
+        'http': 'socks5://127.0.0.1:50532',
+        'https': 'socks5://127.0.0.1:50532'
+    }
+
+    def __init__(self) -> None:
+        self.session = requests.session()
+        self.token = os.getenv('GITHUB_KEY')
+
+    def convert_to_raw(self, github_blob_url):
+        # rewrite a github.com "blob" URL to its raw.githubusercontent.com equivalent
+        github_raw_url = github_blob_url.replace("/github.com/", "/raw.githubusercontent.com/")
+        if "blob/" in github_raw_url:
+            github_raw_url = github_raw_url.replace("blob/", "")
+        return github_raw_url
+
+    def filter_items(self, items, keyword, size_limits, loose_mode=False):
+        ############################
+        # filter non-related repos #
+        ############################
+        related_items = []
+        # print(items)
+        for item in items:
+            name_temp = str(item.get('name'))
+            description_temp = str(item.get('description'))
+            if 'cve' in keyword.lower():
+                if result_matches_cve(keyword, name_temp) or result_matches_cve(keyword, description_temp):
+                    related_items.append(item)
+
+            # loose mode keeps every item; skip items already appended above
+            if loose_mode and item not in related_items:
+                related_items.append(item)
+
+        ##########################
+        # filter inferior repos  #
+        ##########################
+        # filter based on list content
+        filtered_items = [item for item in related_items if not any(filter_string in item['html_url'] for filter_string in repository_filter)]
+        # filter based on scoring
+        extracted_items = []
+        # traverse every item
+        for item in filtered_items:
+            # a new dict to store extracted data
+            extracted_data = {}
+
+            # extract html_url; empty -> skip this item
+            html_url = item.get('html_url')
+            if not html_url:
+                continue
+
+            # extract description
+            description = item.get('description')
+            if description is None:
+                description_length = 0
+            else:
+                description_length = len(description)
+
+            # extract open_issues_count; missing or invalid -> fall back to open_issues
+            open_issues_count = item.get('open_issues_count')
+            if open_issues_count is None or not isinstance(open_issues_count, int):
+                open_issues = item.get('open_issues')
+                if open_issues is not None and isinstance(open_issues, int):
+                    open_issues_count = open_issues
+                else:
+                    open_issues_count = 0
+
+            # extract stargazers_count; missing or invalid -> fall back to watchers_count, then watchers
+            # update in original item
+            stars_count = item.get('stargazers_count')
+            if stars_count is None or not isinstance(stars_count, int):
+                watchers_count = item.get('watchers_count')
+                if watchers_count is not None and isinstance(watchers_count, int):
+                    stars_count = watchers_count
+                else:
+                    watchers = item.get('watchers')
+                    if watchers is not None and isinstance(watchers, int):
+                        stars_count = watchers
+                    else:
+                        stars_count = 0
+            item['stars_count'] = stars_count
+            stars_count += 1  # to match up with the fork calculation
+
+            # extract forks_count; missing or invalid -> fall back to forks
+            # update in original item
+            forks_count = item.get('forks_count')
+            if forks_count is None or not isinstance(forks_count, int):
+                forks = item.get('forks')
+                if forks is not None and isinstance(forks, int):
+                    forks_count = forks
+                else:
+                    forks_count = 0
+            item['forks_count'] = forks_count
+            forks_count += 1  # avoid using 0 as a divisor
+
+            # extract created_at; missing or invalid -> use a default value
+            create_date = item.get('created_at')
+            if create_date is None or not isinstance(create_date, str):
+                create_date = "2020-01-01T02:28:41Z"
+
+            # extract topics; may be empty; join into a comma-separated string
+            topics_list = item.get('topics', [])
+            topics_str = ','.join(topics_list)
+
+            # add to new dict
+            extracted_data['html_url'] = html_url
+            extracted_data['description_length'] = description_length
+            extracted_data['open_issues_count'] = open_issues_count
+            extracted_data['topics'] = topics_str.lower()
+            extracted_data['stars_count'] = stars_count
+            extracted_data['forks_count'] = forks_count
+            extracted_data['create_date'] = create_date
+
+            extracted_items.append(extracted_data)
+
+        for item in extracted_items:
+            # confidence score: weighted sum of description-length, open-issues and topic scores
+            if item['description_length'] <= 300:
+                d_score = c.max_confs[0]
+            else:
+                d_score = c.times_0 * norm.pdf(item['description_length'], loc=c.mus[0], scale=c.sigmas[0])
+
+            if item['open_issues_count'] <= 30:
+                i_score = c.max_confs[1]
+            else:
+                i_score = c.times_1 * norm.pdf(item['open_issues_count'], loc=c.mus[1], scale=c.sigmas[1])
+
+            if keyword.strip().lower() in item['topics']:
+                t_score = c.max_confs[2]
+            else:
+                t_score = 0.2  # avoid a score gap so large that the score loses its meaning
+
+            item['conf_score'] = d_score * c.conf_score_weights[0] + i_score * c.conf_score_weights[1] + t_score * c.conf_score_weights[2]
+
+            # effectiveness score: age-weighted star/fork ratio
+            # convert to a datetime object
+            given_time = dt.fromisoformat(item['create_date'].replace("Z", ""))
+            current_time = dt.now()
+            # get the difference in days
+            days_difference = (current_time - given_time).days
+            lamda = c.times_2 * norm.pdf(days_difference, loc=c.mus[2], scale=c.sigmas[2]) + c.base_line
+            item['efct_score'] = lamda * item['stars_count'] / item['forks_count']
+
+        filtered_extracted_items = [item for item in extracted_items if item['conf_score'] >= c.threshold]
+
+        effective_count = len(filtered_extracted_items)
+
+        ######################
+        # filter giant repos #
+        ######################
+        # size limits are read from the config module c
+        size_filtered_items = [item for item in filtered_items if item['size'] <= c.size_limits[2]]
+
+        ##############
+        # sort repos #
+        ##############
+        # sort in descending order based on efct_score
+        sorted_filtered_extracted_items = sorted(filtered_extracted_items, key=lambda x: x['efct_score'], reverse=True)
+        # merge results
+        html_index = [item['html_url'] for item in sorted_filtered_extracted_items]
+        merged_filtered_items = []
+        for url in html_index:
+            for item in size_filtered_items:
+                if item['html_url'] == url:
+                    merged_filtered_items.append(item)
+
+        # group by repository size: preferred range first, then smaller, then larger
+        group1 = [item for item in merged_filtered_items if c.size_limits[0] <= item['size'] <= c.size_limits[1]]
+        group2 = [item for item in merged_filtered_items if item['size'] < c.size_limits[0]]
+        group3 = [item for item in merged_filtered_items if item['size'] > c.size_limits[1]]
+
+        # merge the groups back into one ordered list
+        grouped_filtered_items = group1 + group2 + group3
+
+        return grouped_filtered_items, effective_count
+
+    def _search_code(self, query_type: str = 'repositories',
+                     query_body: str = '',
+                     qualifiers='',
+                     page: int = 1,
+                     per_page: int = 30,
+                     sort_method: str = '',
+                     url=None,
+                     keyword: str = '', size_limits: list = [0, 1000000],
+                     loose_mode: bool = False):
+        timestamp = int(datetime.datetime.now().timestamp())
+        # print("called!")
+        if self.search_limit_remaining == 0 and timestamp < self.search_limit_reset + 3:
+            time.sleep(self.search_limit_reset - timestamp + 5)
+
+        if url is None:
+            # URL-encode the query body
+            query_body_encoded = quote_plus(query_body).rstrip('+')
+            if loose_mode:  # to get broader match results
+                query_body_encoded = query_body.replace(' ', '+')
+                query_body_encoded = query_body_encoded.replace('"', '')
+            # construct the URL, separating parameters with & and adding each one only if it is set
+            url = f'https://api.github.com/search/{query_type}?q={query_body_encoded}'
+            if qualifiers:
+                qualifiers_body_encoded = quote_plus(qualifiers)
+                qualifiers_body_encoded = qualifiers_body_encoded.replace('+', '&')
+                url += f'&{qualifiers_body_encoded}'
+            if sort_method:
+                url += f'&sort={sort_method}&order=desc'
+            if page > 1:
+                url += f'&page={page}'
+            if per_page > 0:
+                url += f'&per_page={per_page}'
+            # print(url)
+
+        header = {
+            'Accept': 'application/vnd.github+json',
+            'Authorization': f'Bearer {self.token}',
+            'X-GitHub-Api-Version': '2022-11-28'
+        }
+
+        if self.USE_PROXY:
+            resp = self.session.get(url, headers=header, proxies=self.proxies)
+        else:
+            resp = self.session.get(url, headers=header)
+        next_page = None
+        # print(resp.text)
+
+        # parse the Link header to find the next page, if any
+        if 'link' in resp.headers:
+            links = resp.headers['link'].split(',')
+            for link in links:
+                addr, rel = link.split(';')
+                addr = addr.strip()[1:-1]
+
+                if rel.find('next') >= 0:
+                    next_page = addr
+
+        if 'X-RateLimit-Remaining' in resp.headers:
+            self.search_limit_remaining = int(resp.headers['X-RateLimit-Remaining'])
+
+        if 'X-RateLimit-Reset' in resp.headers:
+            self.search_limit_reset = int(resp.headers['X-RateLimit-Reset'])
+
+        if 'Retry-After' in resp.headers:
+            after = int(resp.headers['Retry-After'])
+            time.sleep(after + 3)
+
+        if resp.status_code != 200:
+            # print(resp.text)
+            return None, None
+
+        result = json.loads(resp.text)
+        items = result.get('items', [])
+
+        if query_type == 'repositories':
+            filtered_items, effective_count = self.filter_items(items, keyword, size_limits, loose_mode=loose_mode)
+            # update result
+            result['effective_count'] = effective_count
+        elif query_type == 'code':
+            filtered_items = [item for item in items if any(repo in item['html_url'] for repo in code_white_list)]
+            # update result
+            result['effective_count'] = len(filtered_items)
+        # update result
+        result['items'] = filtered_items
+
+        return result, next_page
+
+    def search_keyword(self, keyword: str, output_dir: str, filter_on: bool = True, loose_mode: bool = False):
+        # print("github called")
+        os.makedirs(output_dir, exist_ok=True)
+        code_result = None
+
+        repo_result, _ = self._search_code(query_type='repositories',
+                                           query_body=f'"{keyword}"',
+                                           qualifiers='',
+                                           per_page=c.per_page,
+                                           size_limits=c.size_limits, keyword=keyword, loose_mode=loose_mode)
+        if filter_on:
+            code_result, _ = self._search_code(query_type='code',
+                                               query_body=f'"{keyword}"',
+                                               qualifiers='',
+                                               keyword=keyword, loose_mode=loose_mode)
+        # print(repo_result)
+        # with open("test.txt", 'w') as f:
+        #     f.write(json.dumps(repo_result, indent=4, ensure_ascii=False))
+        #     f.write(json.dumps(code_result, indent=4, ensure_ascii=False))
+        total_repo = 0
+        total_code = 0
+        if repo_result is None:
+            print("search repo error")
+        else:
+            total_repo = repo_result['effective_count']
+
+        if code_result is None:
+            print("search code error")
+        else:
+            total_code = code_result['effective_count']
+
+        # cap the targets at the number of results actually available on GitHub
+        repo_target_count = min(c.count_each_keyword, total_repo)
+        code_target_count = min(c.count_each_keyword, total_code)
+
+        repo_count = 0
+        # repo download iteration
+        with tqdm(total=repo_target_count, desc=f'Searching repositories related to {keyword} from GitHub') as pbar:
+            try:
+                for item in repo_result['items']:
+                    repo_name = item['name']
+                    # repo_full_name = item['full_name']
+                    repo_language = item['language']
+                    repo_star = item['stars_count']
+                    repo_url = item['clone_url']
+
+                    repo_directory = os.path.join(output_dir,
+                                                  f'{repo_star}_{repo_name}_{repo_language}')
+                    if not os.path.exists(repo_directory):
+                        os.mkdir(repo_directory)
+                    Repo.clone_from(repo_url, repo_directory)
+                    if filter_on:
+                        file_count_0, _ = count_files_and_size(repo_directory)
+                        # filter files; turn off when running pentestagent
+                        remove_files(repo_directory, filter_list=for_github_repo_file, remove_no_extension=True)
+                        file_count_1, _ = count_files_and_size(repo_directory)
+                        if file_count_0 * 0.5 > file_count_1 or file_count_1 > c.file_num_limits:
+                            shutil.rmtree(repo_directory)  # recursively remove the directory
+                        else:
+                            repo_count += 1
+                            if repo_count == c.base_limit:
+                                break
+
+                    pbar.update()
+
+            except Exception as e:
+                print(e)
+
+        try:
+            repo_result.pop('incomplete_results', None)
+            # keep only the fields needed for the trend score
+            repo_result['items'] = [{key: item[key] for key in ['name', 'clone_url', 'stars_count', 'forks_count'] if key in item}
+                                    for item in repo_result['items']]
+            total_stars_count = 0  # initialize
+            total_forks_count = 0  # initialize
+            for item in repo_result['items']:
+                total_stars_count += item['stars_count']
+                total_forks_count += item['forks_count']
+            repo_result['total_stars_count'] = total_stars_count
+            repo_result['total_forks_count'] = total_forks_count
+            if (repo_result['total_count'] <= c.per_page) or (repo_result['total_count'] > c.per_page and repo_result['effective_count'] < c.per_page):
+                repo_result['trend_score'] = c.trend_weights[0] * repo_result['effective_count'] + c.trend_weights[1] * (total_stars_count + total_forks_count)
+            else:
+                repo_result['trend_score'] = c.trend_weights[0] * repo_result['total_count'] \
+                    + c.trend_weights[1] * (total_stars_count + total_forks_count) * c.alpha
+            try:
+                with open(os.path.join(os.path.dirname(output_dir), "Trend_Score.json"), "w") as f:
+                    f.write(json.dumps(repo_result, indent=4, ensure_ascii=False))
+            except Exception as e:
+                print(e)
+
+        except Exception as e:
+            print(e)
+
+        # code download iteration
+        code_output_dir = os.path.join(output_dir, "Code_File")
+        if not os.path.exists(code_output_dir):
+            os.makedirs(code_output_dir)
+        index_csv_path = os.path.join(code_output_dir, 'index.csv')
+        with tqdm(total=code_target_count, desc=f'Downloading code files related to {keyword} from GitHub') as pbar:
+            # initialize index.csv
+            with open(index_csv_path, 'w', newline='') as csvfile:
+                fieldnames = ['original_path', 'original_name', 'new_name']
+                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+                writer.writeheader()
+            try:
+                for item in code_result['items']:
+                    file_name = item['name']
+                    file_path = item['path']
+                    file_url = self.convert_to_raw(item['html_url'])  # use html_url to download
+                    suffix = file_name.split(".")[-1]
+
+                    # skip files whose extension is on the filter list
+                    if any(f".{suffix}" in entry for entry in for_github_code_file):
+                        continue
+
+                    # generate a unique, non-repeating file name
+                    unique_name = f"{dt.now().strftime('%Y%m%d%H%M%S%f')}.{suffix}"
+
+                    # download file
+                    # Security: add a timeout to prevent indefinite hangs (CWE-400)
+                    response = requests.get(file_url, timeout=30)
+                    if response.status_code == 200:
+                        if not result_matches_cve(keyword, response.content):
+                            continue
+                        # save the file content locally
+                        with open(os.path.join(code_output_dir, unique_name), 'wb') as f:
+                            f.write(response.content)
+                        # update index.csv
+                        with open(index_csv_path, 'a', newline='') as csvfile:
+                            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+                            writer.writerow({
+                                'original_path': file_path,
+                                'original_name': file_name,
+                                'new_name': unique_name
+                            })
+                    else:
+                        print(f"Failed to download {file_url}, status code: {response.status_code}")
+
+                    pbar.update()
+
+            except Exception as e:
+                print(e)
+
+        # remove index.csv if it only contains the header row
+        with open(index_csv_path, 'r', newline='') as csvfile:
+            reader = csv.reader(csvfile)
+            rows = list(reader)  # extract all lines into a list
+
+        if len(rows) == 1:
+            os.remove(index_csv_path)
+
+        print("Recursively removing empty directories...")
+        remove_empty_directories(output_dir)
+
+# def main():
+#     g = GithubSearcher()
+#     app = "CVE-2024-29847"
+#     g.search_keyword(f"{app} exploit", "/root/try/exp_web_data")
+
+# if __name__ == "__main__":
+#     main()
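
For reference, a minimal usage sketch mirroring the commented-out main() above. This is an illustration, not part of the package diff: the import path is inferred from the file listing (aipt_v2/intelligence/searchers/github_searcher.py), and a GITHUB_KEY must be available in the environment or a .env file, since __init__ reads it via os.getenv.

    # Hypothetical usage sketch; import path inferred from the file listing above.
    # Requires GITHUB_KEY in the environment or a .env file (loaded by dotenv).
    from aipt_v2.intelligence.searchers.github_searcher import GithubSearcher

    searcher = GithubSearcher()
    # Searches GitHub repositories and code files for the keyword, clones the
    # filtered repositories into ./exp_web_data, downloads matching code files
    # into ./exp_web_data/Code_File, and writes Trend_Score.json one directory
    # above the output directory.
    searcher.search_keyword("CVE-2024-29847 exploit", "./exp_web_data")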