aiptx-2.0.7-py3-none-any.whl
This diff shows the contents of a publicly released package version as it appears in its public registry. It is provided for informational purposes only.
- aipt_v2/__init__.py +110 -0
- aipt_v2/__main__.py +24 -0
- aipt_v2/agents/AIPTxAgent/__init__.py +10 -0
- aipt_v2/agents/AIPTxAgent/aiptx_agent.py +211 -0
- aipt_v2/agents/__init__.py +46 -0
- aipt_v2/agents/base.py +520 -0
- aipt_v2/agents/exploit_agent.py +688 -0
- aipt_v2/agents/ptt.py +406 -0
- aipt_v2/agents/state.py +168 -0
- aipt_v2/app.py +957 -0
- aipt_v2/browser/__init__.py +31 -0
- aipt_v2/browser/automation.py +458 -0
- aipt_v2/browser/crawler.py +453 -0
- aipt_v2/cli.py +2933 -0
- aipt_v2/compliance/__init__.py +71 -0
- aipt_v2/compliance/compliance_report.py +449 -0
- aipt_v2/compliance/framework_mapper.py +424 -0
- aipt_v2/compliance/nist_mapping.py +345 -0
- aipt_v2/compliance/owasp_mapping.py +330 -0
- aipt_v2/compliance/pci_mapping.py +297 -0
- aipt_v2/config.py +341 -0
- aipt_v2/core/__init__.py +43 -0
- aipt_v2/core/agent.py +630 -0
- aipt_v2/core/llm.py +395 -0
- aipt_v2/core/memory.py +305 -0
- aipt_v2/core/ptt.py +329 -0
- aipt_v2/database/__init__.py +14 -0
- aipt_v2/database/models.py +232 -0
- aipt_v2/database/repository.py +384 -0
- aipt_v2/docker/__init__.py +23 -0
- aipt_v2/docker/builder.py +260 -0
- aipt_v2/docker/manager.py +222 -0
- aipt_v2/docker/sandbox.py +371 -0
- aipt_v2/evasion/__init__.py +58 -0
- aipt_v2/evasion/request_obfuscator.py +272 -0
- aipt_v2/evasion/tls_fingerprint.py +285 -0
- aipt_v2/evasion/ua_rotator.py +301 -0
- aipt_v2/evasion/waf_bypass.py +439 -0
- aipt_v2/execution/__init__.py +23 -0
- aipt_v2/execution/executor.py +302 -0
- aipt_v2/execution/parser.py +544 -0
- aipt_v2/execution/terminal.py +337 -0
- aipt_v2/health.py +437 -0
- aipt_v2/intelligence/__init__.py +194 -0
- aipt_v2/intelligence/adaptation.py +474 -0
- aipt_v2/intelligence/auth.py +520 -0
- aipt_v2/intelligence/chaining.py +775 -0
- aipt_v2/intelligence/correlation.py +536 -0
- aipt_v2/intelligence/cve_aipt.py +334 -0
- aipt_v2/intelligence/cve_info.py +1111 -0
- aipt_v2/intelligence/knowledge_graph.py +590 -0
- aipt_v2/intelligence/learning.py +626 -0
- aipt_v2/intelligence/llm_analyzer.py +502 -0
- aipt_v2/intelligence/llm_tool_selector.py +518 -0
- aipt_v2/intelligence/payload_generator.py +562 -0
- aipt_v2/intelligence/rag.py +239 -0
- aipt_v2/intelligence/scope.py +442 -0
- aipt_v2/intelligence/searchers/__init__.py +5 -0
- aipt_v2/intelligence/searchers/exploitdb_searcher.py +523 -0
- aipt_v2/intelligence/searchers/github_searcher.py +467 -0
- aipt_v2/intelligence/searchers/google_searcher.py +281 -0
- aipt_v2/intelligence/tools.json +443 -0
- aipt_v2/intelligence/triage.py +670 -0
- aipt_v2/interactive_shell.py +559 -0
- aipt_v2/interface/__init__.py +5 -0
- aipt_v2/interface/cli.py +230 -0
- aipt_v2/interface/main.py +501 -0
- aipt_v2/interface/tui.py +1276 -0
- aipt_v2/interface/utils.py +583 -0
- aipt_v2/llm/__init__.py +39 -0
- aipt_v2/llm/config.py +26 -0
- aipt_v2/llm/llm.py +514 -0
- aipt_v2/llm/memory.py +214 -0
- aipt_v2/llm/request_queue.py +89 -0
- aipt_v2/llm/utils.py +89 -0
- aipt_v2/local_tool_installer.py +1467 -0
- aipt_v2/models/__init__.py +15 -0
- aipt_v2/models/findings.py +295 -0
- aipt_v2/models/phase_result.py +224 -0
- aipt_v2/models/scan_config.py +207 -0
- aipt_v2/monitoring/grafana/dashboards/aipt-dashboard.json +355 -0
- aipt_v2/monitoring/grafana/dashboards/default.yml +17 -0
- aipt_v2/monitoring/grafana/datasources/prometheus.yml +17 -0
- aipt_v2/monitoring/prometheus.yml +60 -0
- aipt_v2/orchestration/__init__.py +52 -0
- aipt_v2/orchestration/pipeline.py +398 -0
- aipt_v2/orchestration/progress.py +300 -0
- aipt_v2/orchestration/scheduler.py +296 -0
- aipt_v2/orchestrator.py +2427 -0
- aipt_v2/payloads/__init__.py +27 -0
- aipt_v2/payloads/cmdi.py +150 -0
- aipt_v2/payloads/sqli.py +263 -0
- aipt_v2/payloads/ssrf.py +204 -0
- aipt_v2/payloads/templates.py +222 -0
- aipt_v2/payloads/traversal.py +166 -0
- aipt_v2/payloads/xss.py +204 -0
- aipt_v2/prompts/__init__.py +60 -0
- aipt_v2/proxy/__init__.py +29 -0
- aipt_v2/proxy/history.py +352 -0
- aipt_v2/proxy/interceptor.py +452 -0
- aipt_v2/recon/__init__.py +44 -0
- aipt_v2/recon/dns.py +241 -0
- aipt_v2/recon/osint.py +367 -0
- aipt_v2/recon/subdomain.py +372 -0
- aipt_v2/recon/tech_detect.py +311 -0
- aipt_v2/reports/__init__.py +17 -0
- aipt_v2/reports/generator.py +313 -0
- aipt_v2/reports/html_report.py +378 -0
- aipt_v2/runtime/__init__.py +53 -0
- aipt_v2/runtime/base.py +30 -0
- aipt_v2/runtime/docker.py +401 -0
- aipt_v2/runtime/local.py +346 -0
- aipt_v2/runtime/tool_server.py +205 -0
- aipt_v2/runtime/vps.py +830 -0
- aipt_v2/scanners/__init__.py +28 -0
- aipt_v2/scanners/base.py +273 -0
- aipt_v2/scanners/nikto.py +244 -0
- aipt_v2/scanners/nmap.py +402 -0
- aipt_v2/scanners/nuclei.py +273 -0
- aipt_v2/scanners/web.py +454 -0
- aipt_v2/scripts/security_audit.py +366 -0
- aipt_v2/setup_wizard.py +941 -0
- aipt_v2/skills/__init__.py +80 -0
- aipt_v2/skills/agents/__init__.py +14 -0
- aipt_v2/skills/agents/api_tester.py +706 -0
- aipt_v2/skills/agents/base.py +477 -0
- aipt_v2/skills/agents/code_review.py +459 -0
- aipt_v2/skills/agents/security_agent.py +336 -0
- aipt_v2/skills/agents/web_pentest.py +818 -0
- aipt_v2/skills/prompts/__init__.py +647 -0
- aipt_v2/system_detector.py +539 -0
- aipt_v2/telemetry/__init__.py +7 -0
- aipt_v2/telemetry/tracer.py +347 -0
- aipt_v2/terminal/__init__.py +28 -0
- aipt_v2/terminal/executor.py +400 -0
- aipt_v2/terminal/sandbox.py +350 -0
- aipt_v2/tools/__init__.py +44 -0
- aipt_v2/tools/active_directory/__init__.py +78 -0
- aipt_v2/tools/active_directory/ad_config.py +238 -0
- aipt_v2/tools/active_directory/bloodhound_wrapper.py +447 -0
- aipt_v2/tools/active_directory/kerberos_attacks.py +430 -0
- aipt_v2/tools/active_directory/ldap_enum.py +533 -0
- aipt_v2/tools/active_directory/smb_attacks.py +505 -0
- aipt_v2/tools/agents_graph/__init__.py +19 -0
- aipt_v2/tools/agents_graph/agents_graph_actions.py +69 -0
- aipt_v2/tools/api_security/__init__.py +76 -0
- aipt_v2/tools/api_security/api_discovery.py +608 -0
- aipt_v2/tools/api_security/graphql_scanner.py +622 -0
- aipt_v2/tools/api_security/jwt_analyzer.py +577 -0
- aipt_v2/tools/api_security/openapi_fuzzer.py +761 -0
- aipt_v2/tools/browser/__init__.py +5 -0
- aipt_v2/tools/browser/browser_actions.py +238 -0
- aipt_v2/tools/browser/browser_instance.py +535 -0
- aipt_v2/tools/browser/tab_manager.py +344 -0
- aipt_v2/tools/cloud/__init__.py +70 -0
- aipt_v2/tools/cloud/cloud_config.py +273 -0
- aipt_v2/tools/cloud/cloud_scanner.py +639 -0
- aipt_v2/tools/cloud/prowler_tool.py +571 -0
- aipt_v2/tools/cloud/scoutsuite_tool.py +359 -0
- aipt_v2/tools/executor.py +307 -0
- aipt_v2/tools/parser.py +408 -0
- aipt_v2/tools/proxy/__init__.py +5 -0
- aipt_v2/tools/proxy/proxy_actions.py +103 -0
- aipt_v2/tools/proxy/proxy_manager.py +789 -0
- aipt_v2/tools/registry.py +196 -0
- aipt_v2/tools/scanners/__init__.py +343 -0
- aipt_v2/tools/scanners/acunetix_tool.py +712 -0
- aipt_v2/tools/scanners/burp_tool.py +631 -0
- aipt_v2/tools/scanners/config.py +156 -0
- aipt_v2/tools/scanners/nessus_tool.py +588 -0
- aipt_v2/tools/scanners/zap_tool.py +612 -0
- aipt_v2/tools/terminal/__init__.py +5 -0
- aipt_v2/tools/terminal/terminal_actions.py +37 -0
- aipt_v2/tools/terminal/terminal_manager.py +153 -0
- aipt_v2/tools/terminal/terminal_session.py +449 -0
- aipt_v2/tools/tool_processing.py +108 -0
- aipt_v2/utils/__init__.py +17 -0
- aipt_v2/utils/logging.py +202 -0
- aipt_v2/utils/model_manager.py +187 -0
- aipt_v2/utils/searchers/__init__.py +269 -0
- aipt_v2/verify_install.py +793 -0
- aiptx-2.0.7.dist-info/METADATA +345 -0
- aiptx-2.0.7.dist-info/RECORD +187 -0
- aiptx-2.0.7.dist-info/WHEEL +5 -0
- aiptx-2.0.7.dist-info/entry_points.txt +7 -0
- aiptx-2.0.7.dist-info/licenses/LICENSE +21 -0
- aiptx-2.0.7.dist-info/top_level.txt +1 -0

aipt_v2/intelligence/searchers/github_searcher.py

@@ -0,0 +1,467 @@
+import os
+import time
+import json
+import datetime
+import dotenv
+from tqdm import tqdm
+import requests
+from git import Repo
+from urllib.parse import quote_plus
+from aipt_v2.utils.searchers.Domain_Filter import repository_filter, code_white_list
+from aipt_v2.utils.searchers.Extension_Filter import for_github_repo_file, for_github_code_file
+from aipt_v2.utils.searchers.util import *
+import aipt_v2.utils.searchers.github_config as c
+from scipy.stats import norm
+import csv
+from datetime import datetime as dt
+import shutil
+
+dotenv.load_dotenv()
+
+class GithubSearcher:
+    search_limit_remaining = 30
+    search_limit_reset = 0
+
+    core_limit_remaining = 5000
+    core_limit_reset = 0
+
+    USE_PROXY = False
+    proxies = {
+        'http': 'socks5://127.0.0.1:50532',
+        'https': 'socks5://127.0.0.1:50532'
+    }
+
+    def __init__(self) -> None:
+        self.session = requests.session()
+        self.token = os.getenv('GITHUB_KEY')
+
+    def convert_to_raw(self, github_blob_url):
+        # replace "blob" with "raw" and return new RAW link
+        github_raw_url = github_blob_url.replace("/github.com/", "/raw.githubusercontent.com/")
+        if "blob/" in github_raw_url:
+            github_raw_url = github_raw_url.replace("blob/", "")
+        return github_raw_url
+
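
The `convert_to_raw` helper rewrites a GitHub blob URL into its raw.githubusercontent.com equivalent so the file body can be fetched directly. A minimal sketch of the transformation, using a hypothetical URL:

    url = "https://github.com/octocat/demo/blob/main/poc.py"
    url = url.replace("/github.com/", "/raw.githubusercontent.com/")
    url = url.replace("blob/", "")
    # url == "https://raw.githubusercontent.com/octocat/demo/main/poc.py"
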
+    def filter_items(self, items, keyword, size_limits, loose_mode=False):
+        ###########################
+        # filter unrelated repos  #
+        ###########################
+        related_items = []
+        # print(items)
+        for item in items:
+            name_temp = str(item.get('name'))
+            description_temp = str(item.get('description'))
+            if 'cve' in keyword.lower():
+                if result_matches_cve(keyword, name_temp) or result_matches_cve(keyword, description_temp):
+                    related_items.append(item)
+
+            if loose_mode:
+                related_items.append(item)
+
+        ########################
+        # filter inferior repo #
+        ########################
+        # filter based on list content
+        filtered_items = [item for item in related_items if not any(filter_string in item['html_url'] for filter_string in repository_filter)]
+        # filter based on scoring
+        extracted_items = []
+        # traverse every item
+        for item in filtered_items:
+            # a new dict to store extracted data
+            extracted_data = {}
+
+            # extract html_url; empty -> skip this item
+            html_url = item.get('html_url')
+            if not html_url:
+                continue
+
+            # extract description
+            description = item.get('description')
+            if description is None:
+                description_length = 0
+            else:
+                description_length = len(description)
+
+            # extract open_issues_count; missing or invalid -> fall back to open_issues
+            open_issues_count = item.get('open_issues_count')
+            if open_issues_count is None or not isinstance(open_issues_count, int):
+                open_issues = item.get('open_issues')
+                if open_issues is not None and isinstance(open_issues, int):
+                    open_issues_count = open_issues
+                else:
+                    open_issues_count = 0
+
+            # extract stargazers_count; missing or invalid -> fall back to watchers_count, then watchers
+            # update in original item
+            stars_count = item.get('stargazers_count')
+            if stars_count is None or not isinstance(stars_count, int):
+                watchers_count = item.get('watchers_count')
+                if watchers_count is not None and isinstance(watchers_count, int):
+                    stars_count = watchers_count
+                else:
+                    watchers = item.get('watchers')
+                    if watchers is not None and isinstance(watchers, int):
+                        stars_count = watchers
+                    else:
+                        stars_count = 0
+            item['stars_count'] = stars_count
+            stars_count += 1  # to match up with the fork calculation
+
+            # extract forks_count; missing or invalid -> fall back to forks
+            # update in original item
+            forks_count = item.get('forks_count')
+            if forks_count is None or not isinstance(forks_count, int):
+                forks = item.get('forks')
+                if forks is not None and isinstance(forks, int):
+                    forks_count = forks
+                else:
+                    forks_count = 0
+            item['forks_count'] = forks_count
+            forks_count += 1  # avoid using 0 as a divisor
+
+            # extract create_date; missing or invalid -> set a default value
+            create_date = item.get('created_at')
+            if create_date is None or not isinstance(create_date, str):
+                create_date = "2020-01-01T02:28:41Z"
+
+            # extract topics; may be empty; convert to a comma-separated string
+            topics_list = item.get('topics', [])
+            topics_str = ','.join(topics_list)
+
+            # add to new dict
+            extracted_data['html_url'] = html_url
+            extracted_data['description_length'] = description_length
+            extracted_data['open_issues_count'] = open_issues_count
+            extracted_data['topics'] = topics_str.lower()
+            extracted_data['stars_count'] = stars_count
+            extracted_data['forks_count'] = forks_count
+            extracted_data['create_date'] = create_date
+
+            extracted_items.append(extracted_data)
+
+        for item in extracted_items:
+            if item['description_length'] <= 300:
+                d_score = c.max_confs[0]
+            else:
+                d_score = c.times_0 * norm.pdf(item['description_length'], loc=c.mus[0], scale=c.sigmas[0])
+
+            if item['open_issues_count'] <= 30:
+                i_score = c.max_confs[1]
+            else:
+                i_score = c.times_1 * norm.pdf(item['open_issues_count'], loc=c.mus[1], scale=c.sigmas[1])
+
+            if keyword.strip().lower() in item['topics']:
+                t_score = c.max_confs[2]
+            else:
+                t_score = 0.2  # avoid a score gap so large that the score loses its meaning
+
+            item['conf_score'] = d_score * c.conf_score_weights[0] + i_score * c.conf_score_weights[1] + t_score * c.conf_score_weights[2]
+
+
+            # convert to a datetime object
+            given_time = dt.fromisoformat(item['create_date'].replace("Z", ""))
+            current_time = dt.now()
+            # get the difference in days
+            days_difference = (current_time - given_time).days
+            lamda = c.times_2 * norm.pdf(days_difference, loc=c.mus[2], scale=c.sigmas[2]) + c.base_line
+            item['efct_score'] = lamda * item['stars_count'] / item['forks_count']
+
+
+        filtered_extracted_items = [item for item in extracted_items if item['conf_score'] >= c.threshold]
+
+        effective_count = len(filtered_extracted_items)
+
+        #####################
+        # filter giant repo #
+        #####################
+        size_filtered_items = [item for item in filtered_items if item['size'] <= c.size_limits[2]]
+
+        #############
+        # sort repo #
+        #############
+        # sort in descending order based on efct_score
+        sorted_filtered_extracted_items = sorted(filtered_extracted_items, key=lambda x: x['efct_score'], reverse=True)
+        # merge results
+        html_index = [item['html_url'] for item in sorted_filtered_extracted_items]
+        merged_filtered_items = []
+        for url in html_index:
+            for item in size_filtered_items:
+                if item['html_url'] == url:
+                    merged_filtered_items.append(item)
+
+        # grouping
+        group1 = [item for item in merged_filtered_items if c.size_limits[0] <= item['size'] <= c.size_limits[1]]
+        group2 = [item for item in merged_filtered_items if item['size'] < c.size_limits[0]]
+        group3 = [item for item in merged_filtered_items if item['size'] > c.size_limits[1]]
+
+        # merge groups
+        grouped_filtered_items = []
+        for item in group1:
+            grouped_filtered_items.append(item)
+        for item in group2:
+            grouped_filtered_items.append(item)
+        for item in group3:
+            grouped_filtered_items.append(item)
+
+        return grouped_filtered_items, effective_count
+
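
The scoring loop above folds three signals into `conf_score` (description length, open-issue count, topic match), each passed through a Gaussian density so values near a configured mean score highest, then ranks the survivors by `efct_score`, a recency-weighted stars-to-forks ratio. The constants (`mus`, `sigmas`, `times_*`, `conf_score_weights`, `base_line`, `threshold`) come from `github_config`, which is not shown in this diff, so the sketch below uses assumed placeholder values purely to illustrate the shape of the computation:

    from datetime import datetime
    from scipy.stats import norm

    # Placeholder constants standing in for github_config (assumed, not from the package)
    MUS = [150, 10, 365]       # means: description length, open issues, repo age in days
    SIGMAS = [100, 20, 200]    # spreads for the same three signals
    TIMES_2 = 100.0            # scale factor applied to the age density
    BASE_LINE = 0.1            # floor so that old repos are dampened, not zeroed out

    def efct_score(stars: int, forks: int, created_at: str) -> float:
        # recency factor: a scaled Gaussian over repository age in days, plus a floor
        age_days = (datetime.now() - datetime.fromisoformat(created_at.replace("Z", ""))).days
        lamda = TIMES_2 * norm.pdf(age_days, loc=MUS[2], scale=SIGMAS[2]) + BASE_LINE
        # the +1 on both counts mirrors the diff's guard against a zero divisor
        return lamda * (stars + 1) / (forks + 1)

    print(efct_score(stars=120, forks=15, created_at="2024-03-01T00:00:00Z"))
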
+    def _search_code(self, query_type: str = 'repositories',
+                     query_body: str = '',
+                     qualifiers='',
+                     page: int = 1,
+                     per_page: int = 30,
+                     sort_method: str = '',
+                     url=None,
+                     keyword: str = '', size_limits: list = [0, 1000000],
+                     loose_mode: bool = False):
+        timestamp = int(datetime.datetime.now().timestamp())
+        # print("called!")
+        if self.search_limit_remaining == 0 and timestamp < self.search_limit_reset + 3:
+            time.sleep(self.search_limit_reset - timestamp + 5)
+
+        if url is None:
+            # URL-encode the query_body and drop any trailing '+'
+            query_body_encoded = quote_plus(query_body).rstrip('+')
+            if loose_mode:  # to get broader match results
+                query_body_encoded = query_body.replace(' ', '+')
+                query_body_encoded = query_body_encoded.replace('"', '')
+            # construct the URL, ensuring that parameters are separated by & and only added if present
+            url = f'https://api.github.com/search/{query_type}?q={query_body_encoded}'
+            if qualifiers:
+                qualifiers_body_encoded = quote_plus(qualifiers)
+                qualifiers_body_encoded = qualifiers_body_encoded.replace('+', '&')
+                url += f'&{qualifiers_body_encoded}'
+            if sort_method:
+                url += f'&sort={sort_method}&order=desc'
+            if page > 1:
+                url += f'&page={page}'
+            if per_page > 0:
+                url += f'&per_page={per_page}'
+            # print(url)
+
+        header = {
+            'Accept': 'application/vnd.github+json',
+            'Authorization': f'Bearer {self.token}',
+            'X-GitHub-Api-Version': '2022-11-28'
+        }
+
+        if self.USE_PROXY:
+            resp = self.session.get(url, headers=header, proxies=self.proxies)
+        else:
+            resp = self.session.get(url, headers=header)
+        next_page = None
+        # print(resp.text)
+
+        if 'link' in resp.headers.keys():
+            links = resp.headers['link']
+            links = links.split(',')
+            for link in links:
+                addr, rel = link.split(';')
+                addr = addr.strip()[1:-1]
+
+                if rel.find('next') >= 0:
+                    next_page = addr
+
+        if 'X-RateLimit-Remaining' in resp.headers.keys():
+            self.search_limit_remaining = int(
+                resp.headers['X-RateLimit-Remaining'])
+
+        if 'X-RateLimit-Reset' in resp.headers.keys():
+            self.search_limit_reset = int(resp.headers['X-RateLimit-Reset'])
+
+        if 'Retry-After' in resp.headers.keys():
+            after = int(resp.headers['Retry-After'])
+            time.sleep(after + 3)
+
+        if resp.status_code != 200:
+            # print(resp.text)
+            return None, None
+
+        result = json.loads(resp.text)
+        items = result.get('items', [])
+
+        mode = True if loose_mode else False
+        if query_type == 'repositories':
+            filtered_items, effective_count = self.filter_items(items, keyword, size_limits, loose_mode=mode)
+            # update result
+            result['effective_count'] = effective_count
+        elif query_type == 'code':
+            filtered_items = [item for item in items if any(repo in item['html_url'] for repo in code_white_list)]
+            # update result
+            result['effective_count'] = len(filtered_items)
+        # update result
+        result['items'] = filtered_items
+
+        return result, next_page
+
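
`_search_code` assembles the GitHub Search API request by hand: the quoted query body is percent-encoded with `quote_plus`, qualifiers, sorting, and paging are appended as extra query parameters, pagination follows the `Link` response header, and the `X-RateLimit-Remaining` / `X-RateLimit-Reset` headers throttle later calls. As an illustration (not output captured from the package), a repository search for the keyword `CVE-2024-29847 exploit` sorted by stars would yield a URL such as:

    https://api.github.com/search/repositories?q=%22CVE-2024-29847+exploit%22&sort=stars&order=desc&per_page=30
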
+    def search_keyword(self, keyword: str, output_dir: str, filter_on: bool = True, loose_mode: bool = False):
+        # print("github called")
+        os.makedirs(output_dir, exist_ok=True)
+        code_result = None
+
+        mode = True if loose_mode else False
+        repo_result, _ = self._search_code(query_type='repositories',
+                                           query_body=f'"{keyword}"',
+                                           qualifiers='',
+                                           per_page=c.per_page,
+                                           size_limits=c.size_limits, keyword=keyword, loose_mode=mode)
+        if filter_on:
+            code_result, _ = self._search_code(query_type='code',
+                                               query_body=f'"{keyword}"',
+                                               qualifiers='',
+                                               keyword=keyword, loose_mode=mode)
+        # print(repo_result)
+        # with open("test.txt", 'w') as f:
+        #     f.write(json.dumps(repo_result, indent=4, ensure_ascii=False))
+        #     f.write(json.dumps(code_result, indent=4, ensure_ascii=False))
+        total_repo = 0
+        total_code = 0
+        if repo_result is None:
+            print("search repo error")
+        else:
+            total_repo = repo_result['effective_count']
+
+        if code_result is None:
+            print("search code error")
+        else:
+            total_code = code_result['effective_count']
+
+        # maximum files available on GitHub
+        repo_target_count = min(c.count_each_keyword, total_repo)
+        code_target_count = min(c.count_each_keyword, total_code)
+
+        repo_count = 0
+        # repo download iteration
+        with tqdm(total=repo_target_count, desc=f'Searching repositories related to {keyword} from GitHub') as pbar:
+            try:
+                for item in repo_result['items']:
+                    repo_name = item['name']
+                    # repo_full_name = item['full_name']
+                    repo_language = item['language']
+                    repo_star = item['stars_count']
+                    repo_url = item['clone_url']
+
+                    repo_directory = os.path.join(output_dir,
+                                                  f'{repo_star}_{repo_name}_{repo_language}')
+                    if not os.path.exists(repo_directory):
+                        os.mkdir(repo_directory)
+                        Repo.clone_from(repo_url, repo_directory)
+                        if filter_on:
+                            file_count_0, _ = count_files_and_size(repo_directory)
+                            # filter files; turn off when running pentestagent
+                            remove_files(repo_directory, filter_list=for_github_repo_file, remove_no_extension=True)
+                            file_count_1, _ = count_files_and_size(repo_directory)
+                            if file_count_0 * 0.5 > file_count_1 or file_count_1 > c.file_num_limits:
+                                shutil.rmtree(repo_directory)  # recursively remove directories
+                            else:
+                                repo_count += 1
+                                if repo_count == c.base_limit:
+                                    break
+
+
+                    pbar.update()
+
+            except Exception as e:
+                print(e)
+
+        try:
+            repo_result.pop('incomplete_results', None)
+            for item in repo_result['items']:
+                item.update({key: item[key] for key in ['name', 'clone_url', 'stars_count', 'forks_count'] if key in item})
+                for key in list(item.keys()):
+                    if key not in ['name', 'clone_url', 'stars_count', 'forks_count']:
+                        del item[key]
+
+            # repo_result['items'] = [{key: item[key] for key in item if key in ['name', 'clone_url', 'stars_count', 'forks_count']} for item in repo_result['items']]
+            total_stars_count = 0  # initialize
+            total_forks_count = 0  # initialize
+            for item in repo_result['items']:
+                total_stars_count += item['stars_count']
+                total_forks_count += item['forks_count']
+            repo_result['total_stars_count'] = total_stars_count
+            repo_result['total_forks_count'] = total_forks_count
+            if (repo_result['total_count'] <= c.per_page) or (repo_result['total_count'] > c.per_page and repo_result['effective_count'] < c.per_page):
+                repo_result['trend_score'] = c.trend_weights[0] * repo_result['effective_count'] + c.trend_weights[1] * (total_stars_count + total_forks_count)
+            else:
+                repo_result['trend_score'] = c.trend_weights[0] * repo_result['total_count'] \
+                    + c.trend_weights[1] * (total_stars_count + total_forks_count) * c.alpha
+            try:
+                with open(os.path.join(os.path.dirname(output_dir), "Trend_Score.json"), "w") as f:
+                    f.write(json.dumps(repo_result, indent=4, ensure_ascii=False))
+            except Exception as e:
+                print(e)
+
+        except Exception as e:
+            print(e)
+
+        # code download iteration
+        code_output_dir = os.path.join(output_dir, "Code_File")
+        if not os.path.exists(code_output_dir):
+            os.makedirs(code_output_dir)
+        index_csv_path = os.path.join(code_output_dir, 'index.csv')
+        with tqdm(total=code_target_count, desc=f'Downloading code files related to {keyword} from GitHub') as pbar:
+            # initialize index.csv
+            with open(index_csv_path, 'w', newline='') as csvfile:
+                fieldnames = ['original_path', 'original_name', 'new_name']
+                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+                writer.writeheader()
+            try:
+                for item in code_result['items']:
+                    file_name = item['name']
+                    file_path = item['path']
+                    file_url = self.convert_to_raw(item['html_url'])  # use html_url to download
+                    suffix = file_name.split(".")[-1]
+
+                    # selectively filter files: skip extensions found in the filter list
+                    if any(f".{suffix}" in entry for entry in for_github_code_file):
+                        continue
+
+                    # generate a non-repeating file name
+                    unique_name = f"{dt.now().strftime('%Y%m%d%H%M%S%f')}.{suffix}"
+
+                    # download file
+                    # Security: add a timeout to prevent indefinite hangs (CWE-400)
+                    response = requests.get(file_url, timeout=30)
+                    if response.status_code == 200:
+                        if not result_matches_cve(keyword, response.content):
+                            continue
+                        # save file content locally
+                        with open(os.path.join(code_output_dir, unique_name), 'wb') as f:
+                            f.write(response.content)
+                        # update index.csv
+                        with open(index_csv_path, 'a', newline='') as csvfile:
+                            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+                            writer.writerow({
+                                'original_path': file_path,
+                                'original_name': file_name,
+                                'new_name': unique_name
+                            })
+                    else:
+                        print(f"Failed to download {file_url}, status code: {response.status_code}")
+
+                    pbar.update()
+
+            except Exception as e:
+                print(e)
+
+        # check whether index.csv contains only the header row
+        with open(index_csv_path, 'r', newline='') as csvfile:
+            reader = csv.reader(csvfile)
+            rows = list(reader)  # read all lines into a list
+
+        if len(rows) == 1:
+            os.remove(index_csv_path)
+
+        print("Recursively removing empty directories...")
+        remove_empty_directories(output_dir)
+
+
+# def main():
+#     g = GithubSearcher()
+#     app = "CVE-2024-29847"
+#     g.search_keyword(f"{app} exploit", "/root/try/exp_web_data")
+
+# if __name__ == "__main__":
+#     main()
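
The commented-out driver above shows the intended entry point. A usage sketch along the same lines, assuming the package is installed, a GITHUB_KEY token is visible to `os.getenv` (for example via a `.env` file), and `./exp_web_data` is a placeholder output directory:

    from aipt_v2.intelligence.searchers.github_searcher import GithubSearcher

    searcher = GithubSearcher()
    # Clones matching repositories and downloads matching code files under
    # ./exp_web_data, writing Trend_Score.json one level above the output
    # directory and Code_File/index.csv inside it.
    searcher.search_keyword("CVE-2024-29847 exploit", "./exp_web_data")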