jseye 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jseye/__init__.py +1 -1
- jseye/__main__.py +9 -0
- jseye/banner.py +59 -12
- jseye/cli.py +87 -42
- jseye/installer.py +2 -5
- jseye/modules/harvest.py +125 -72
- jseye/modules/js_download.py +235 -39
- jseye/modules/js_filter.py +156 -101
- jseye/modules/linkfinder.py +337 -27
- jseye/modules/tiered_analysis.py +304 -0
- jseye/pipeline.py +188 -70
- jseye/utils/cache.py +241 -0
- {jseye-1.0.0.dist-info → jseye-1.0.2.dist-info}/METADATA +2 -2
- jseye-1.0.2.dist-info/RECORD +31 -0
- jseye-1.0.0.dist-info/RECORD +0 -28
- {jseye-1.0.0.dist-info → jseye-1.0.2.dist-info}/WHEEL +0 -0
- {jseye-1.0.0.dist-info → jseye-1.0.2.dist-info}/entry_points.txt +0 -0
- {jseye-1.0.0.dist-info → jseye-1.0.2.dist-info}/licenses/LICENSE +0 -0
- {jseye-1.0.0.dist-info → jseye-1.0.2.dist-info}/top_level.txt +0 -0
jseye/modules/js_download.py
CHANGED
@@ -1,82 +1,278 @@
 """
-JavaScript Download Module
+JavaScript Download Module - With caching and parallel processing
 """
 
+import asyncio
+import aiohttp
 import requests
 from pathlib import Path
 from typing import List, Dict
 from urllib.parse import urlparse
 import hashlib
+from concurrent.futures import ThreadPoolExecutor
 
 from ..utils.logger import log_progress
 from ..utils.fs import ensure_dir
+from ..utils.cache import JSEyeCache
 
 class JSDownloader:
-    """Download JavaScript files"""
+    """Download JavaScript files with caching and parallel processing"""
 
     def __init__(self, output_dir: Path):
        self.output_dir = output_dir
        self.js_dir = output_dir / "js_files"
        ensure_dir(self.js_dir)
 
-
-        self.
+        # Initialize cache
+        self.cache = JSEyeCache(output_dir)
+
+        # Download settings
+        self.max_file_size = 3 * 1024 * 1024  # 3MB limit
+        self.timeout = 30  # 30 seconds timeout
+        self.max_concurrent = 10  # Max concurrent downloads
+
+        self.session_headers = {
             'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-        }
+        }
 
-    def
-        """Download a single JavaScript file"""
+    async def download_js_file_async(self, session: aiohttp.ClientSession, url: str) -> Dict:
+        """Download a single JavaScript file asynchronously with caching"""
+
+        # Check cache first
+        cached_result = self.cache.get_download_cache(url)
+        if cached_result:
+            return cached_result
+
         try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            async with session.get(url, timeout=aiohttp.ClientTimeout(total=self.timeout)) as response:
+                # Check content length
+                content_length = response.headers.get('content-length')
+                if content_length and int(content_length) > self.max_file_size:
+                    result = {
+                        'url': url,
+                        'filepath': None,
+                        'size': int(content_length),
+                        'status': 'too_large',
+                        'error': f'File too large: {int(content_length)} bytes'
+                    }
+                    self.cache.set_download_cache(url, result)
+                    return result
+
+                # Read content with size limit
+                content = await response.read()
+
+                if len(content) > self.max_file_size:
+                    result = {
+                        'url': url,
+                        'filepath': None,
+                        'size': len(content),
+                        'status': 'too_large',
+                        'error': f'Content too large: {len(content)} bytes'
+                    }
+                    self.cache.set_download_cache(url, result)
+                    return result
+
+                # Generate filename from URL hash
+                url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
+                parsed = urlparse(url)
+                domain = parsed.netloc.replace('.', '_')
+                filename = f"{domain}_{url_hash}.js"
+
+                filepath = self.js_dir / filename
+
+                # Write file
+                with open(filepath, 'wb') as f:
+                    f.write(content)
+
+                result = {
+                    'url': url,
+                    'filepath': str(filepath),
+                    'size': len(content),
+                    'status': 'success'
+                }
+
+                # Cache the result
+                self.cache.set_download_cache(url, result)
+
+                return result
+
+        except asyncio.TimeoutError:
+            result = {
                 'url': url,
-                'filepath':
-                'size':
-                'status': '
+                'filepath': None,
+                'size': 0,
+                'status': 'timeout',
+                'error': 'Download timeout'
             }
+            self.cache.set_download_cache(url, result)
+            return result
 
         except Exception as e:
-
+            result = {
                 'url': url,
                 'filepath': None,
                 'size': 0,
                 'status': 'failed',
                 'error': str(e)
             }
+            self.cache.set_download_cache(url, result)
+            return result
 
-    def
-        """Download multiple JavaScript files"""
-        log_progress(f"Downloading JavaScript files
+    async def download_js_files_parallel(self, urls: List[str]) -> List[Dict]:
+        """Download multiple JavaScript files in parallel"""
+        log_progress(f">> Downloading {len(urls)} JavaScript files in parallel...")
+
+        # Create semaphore to limit concurrent downloads
+        semaphore = asyncio.Semaphore(self.max_concurrent)
 
-
+        async def download_with_semaphore(session, url):
+            async with semaphore:
+                return await self.download_js_file_async(session, url)
+
+        # Create aiohttp session
+        connector = aiohttp.TCPConnector(limit=self.max_concurrent)
+        timeout = aiohttp.ClientTimeout(total=self.timeout)
+
+        async with aiohttp.ClientSession(
+            connector=connector,
+            timeout=timeout,
+            headers=self.session_headers
+        ) as session:
+
+            # Create tasks for all downloads
+            tasks = [download_with_semaphore(session, url) for url in urls]
+
+            # Execute all downloads in parallel
+            results = await asyncio.gather(*tasks, return_exceptions=True)
+
+            # Filter out exceptions
+            valid_results = []
+            for i, result in enumerate(results):
+                if isinstance(result, Exception):
+                    log_progress(f"Download exception for {urls[i]}: {result}")
+                    valid_results.append({
+                        'url': urls[i],
+                        'filepath': None,
+                        'size': 0,
+                        'status': 'exception',
+                        'error': str(result)
+                    })
+                else:
+                    valid_results.append(result)
+
+            # Count successful downloads
+            successful = len([r for r in valid_results if r['status'] == 'success'])
+            cached = len([r for r in valid_results if 'cached_at' in r])
+
+            log_progress(f"[C] Download complete: {successful} new, {cached} cached, {len(valid_results)} total")
+
+            return valid_results
+
+    def download_js_files(self, urls: List[str], max_files: int = 200) -> List[Dict]:
+        """
+        Download JavaScript files (main entry point)
+
+        Args:
+            urls: List of JavaScript URLs
+            max_files: Maximum number of files to download
+
+        Returns:
+            List of download results
+        """
+        # Limit number of files
         urls_to_download = urls[:max_files]
 
+        try:
+            # Use async download
+            results = asyncio.run(self.download_js_files_parallel(urls_to_download))
+        except Exception as e:
+            log_progress(f"Parallel download failed, falling back to sequential: {e}")
+            results = self.download_js_files_sequential(urls_to_download)
+
+        return results
+
+    def download_js_files_sequential(self, urls: List[str]) -> List[Dict]:
+        """Fallback sequential download"""
+        log_progress("Using sequential download (fallback)")
+
         results = []
-
+        session = requests.Session()
+        session.headers.update(self.session_headers)
 
-        for i, url in enumerate(
-            if i %
-                log_progress(f"Downloaded {i}/{len(
+        for i, url in enumerate(urls, 1):
+            if i % 20 == 0:
+                log_progress(f"Downloaded {i}/{len(urls)} files")
 
-
-
+            # Check cache first
+            cached_result = self.cache.get_download_cache(url)
+            if cached_result:
+                results.append(cached_result)
+                continue
 
-
-
+            try:
+                response = session.get(url, timeout=self.timeout, stream=True)
+
+                # Check content length
+                content_length = response.headers.get('content-length')
+                if content_length and int(content_length) > self.max_file_size:
+                    result = {
+                        'url': url,
+                        'filepath': None,
+                        'size': int(content_length),
+                        'status': 'too_large'
+                    }
+                    results.append(result)
+                    continue
+
+                # Read content with size limit
+                content = b''
+                for chunk in response.iter_content(chunk_size=8192):
+                    content += chunk
+                    if len(content) > self.max_file_size:
+                        break
+
+                if len(content) > self.max_file_size:
+                    result = {
+                        'url': url,
+                        'filepath': None,
+                        'size': len(content),
+                        'status': 'too_large'
+                    }
+                    results.append(result)
+                    continue
+
+                # Generate filename and save
+                url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
+                parsed = urlparse(url)
+                domain = parsed.netloc.replace('.', '_')
+                filename = f"{domain}_{url_hash}.js"
+                filepath = self.js_dir / filename
+
+                with open(filepath, 'wb') as f:
+                    f.write(content)
+
+                result = {
+                    'url': url,
+                    'filepath': str(filepath),
+                    'size': len(content),
+                    'status': 'success'
+                }
+
+                # Cache result
+                self.cache.set_download_cache(url, result)
+                results.append(result)
+
+            except Exception as e:
+                result = {
+                    'url': url,
+                    'filepath': None,
+                    'size': 0,
+                    'status': 'failed',
+                    'error': str(e)
+                }
+                results.append(result)
 
-
+        successful = len([r for r in results if r['status'] == 'success'])
+        log_progress(f"Sequential download complete: {successful}/{len(urls)} successful")
 
         return results
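For reference, a minimal usage sketch of the reworked downloader (the output directory and URLs below are invented for illustration; inside the package, the pipeline drives this class itself):

```python
from pathlib import Path

from jseye.modules.js_download import JSDownloader

# Hypothetical output directory; __init__ calls ensure_dir() to create the js_files/ subfolder.
downloader = JSDownloader(Path("jseye_output"))

# Made-up URLs standing in for harvested JavaScript links.
urls = [
    "https://example.com/static/app.js",
    "https://example.com/static/vendor.bundle.js",
]

# download_js_files() tries the asyncio/aiohttp parallel path first and falls back
# to the sequential requests-based path if asyncio.run() raises.
results = downloader.download_js_files(urls, max_files=50)
for r in results:
    print(r["status"], r["url"], r.get("filepath"))
```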
jseye/modules/js_filter.py
CHANGED
@@ -1,17 +1,17 @@
 """
-JavaScript File Filtering Module
+JavaScript File Filtering Module - Smart Early Prioritization
 """
 
 import re
 from pathlib import Path
-from typing import List, Dict
+from typing import List, Dict, Tuple
 from urllib.parse import urlparse, parse_qs
 
 from ..utils.logger import log_progress
 from ..utils.fs import save_lines, save_json
 
 class JSFilter:
-    """Filter and prioritize JavaScript files"""
+    """Filter and prioritize JavaScript files with early smart scoring"""
 
     def __init__(self, output_dir: Path):
         self.output_dir = output_dir
@@ -30,46 +30,108 @@ class JSFilter:
             r'text/javascript'
         ]
 
-        #
+        # 🔥 SMART EARLY SCORING PATTERNS
         self.high_value_patterns = [
-
-            r'
-            r'
-            r'
-            r'dashboard',
-            r'
-            r'
-            r'
-            r'
-            r'
-            r'
-            r'
-            r'
-            r'
-            r'internal'
+            # Critical business logic
+            (r'/app/', 15),
+            (r'/api/', 20),
+            (r'/admin/', 25),
+            (r'/dashboard/', 20),
+            (r'/panel/', 18),
+            (r'/manage/', 18),
+            (r'/config/', 22),
+            (r'/auth/', 20),
+            (r'/login/', 18),
+            (r'/user/', 15),
+            (r'/account/', 15),
+            (r'/profile/', 12),
+            (r'/settings/', 15),
+            (r'/internal/', 25),
+            (r'/private/', 25),
+            # Custom/business specific
+            (r'[^/]+\.(js|jsx|ts|tsx)$', 10),  # Custom files
+            (r'\?.*=.*', 8),  # Has query parameters
         ]
 
-        #
+        # 🚫 VENDOR/NOISE PATTERNS (SKIP THESE)
         self.vendor_patterns = [
-
-            r'
-            r'
-            r'
-            r'
-            r'
-            r'
-            r'
-            r'
-
-            r'
-            r'
-            r'
-            r'
-            r'
-            r'
-            r'
+            # Major libraries (SKIP COMPLETELY)
+            (r'jquery', -50),
+            (r'bootstrap', -50),
+            (r'angular', -40),
+            (r'react', -40),
+            (r'vue', -40),
+            (r'lodash', -50),
+            (r'moment', -50),
+            (r'chart', -45),
+            # Analytics/tracking (WASTE OF TIME)
+            (r'google', -60),
+            (r'facebook', -60),
+            (r'twitter', -60),
+            (r'analytics', -70),
+            (r'gtm', -70),
+            (r'gtag', -70),
+            (r'tracking', -65),
+            # CDNs (SKIP)
+            (r'cdn\.', -60),
+            (r'unpkg', -60),
+            (r'jsdelivr', -60),
+            (r'cdnjs', -60),
+            # Build artifacts (USUALLY NOISE)
+            (r'chunk', -30),
+            (r'vendor', -40),
+            (r'bundle', -25),
+            (r'\.min\.', -20),
+            # Version indicators (LESS IMPORTANT)
+            (r'v\d+', -15),
+            (r'version', -15),
+            (r'\d+\.\d+', -10),
         ]
 
+    def calculate_smart_score(self, url: str) -> Tuple[int, str]:
+        """
+        🔥 SMART SCORING ALGORITHM
+        Returns (score, reason) - Higher score = more important
+        """
+        score = 0
+        reasons = []
+        url_lower = url.lower()
+
+        # High-value patterns (BUSINESS LOGIC)
+        for pattern, points in self.high_value_patterns:
+            if re.search(pattern, url_lower):
+                score += points
+                reasons.append(f"+{points} ({pattern})")
+
+        # Vendor/noise patterns (SKIP THESE)
+        for pattern, points in self.vendor_patterns:
+            if re.search(pattern, url_lower):
+                score += points  # These are negative
+                reasons.append(f"{points} ({pattern})")
+
+        # Path depth bonus (shorter = more important)
+        path_depth = url.count('/')
+        if path_depth < 4:
+            depth_bonus = (4 - path_depth) * 3
+            score += depth_bonus
+            reasons.append(f"+{depth_bonus} (short path)")
+
+        # Query parameters bonus (dynamic content)
+        if '?' in url and '=' in url:
+            score += 8
+            reasons.append("+8 (has params)")
+
+        # File extension analysis
+        if url_lower.endswith(('.min.js', '.min.jsx')):
+            score -= 15
+            reasons.append("-15 (minified)")
+        elif url_lower.endswith(('.js', '.jsx', '.ts', '.tsx')):
+            score += 5
+            reasons.append("+5 (js file)")
+
+        reason = "; ".join(reasons) if reasons else "default"
+        return max(-100, min(100, score)), reason
+
     def is_javascript_url(self, url: str) -> bool:
         """Check if URL points to JavaScript file"""
         url_lower = url.lower()
@@ -88,47 +150,17 @@ class JSFilter:
 
         return False
 
-    def calculate_priority_score(self, url: str) -> int:
-        """Calculate priority score for JS file (higher = more important)"""
-        score = 0
-        url_lower = url.lower()
-
-        # High-value indicators (+10 each)
-        for pattern in self.high_value_patterns:
-            if re.search(pattern, url_lower):
-                score += 10
-
-        # Vendor/library indicators (-5 each)
-        for pattern in self.vendor_patterns:
-            if re.search(pattern, url_lower):
-                score -= 5
-
-        # Shorter paths are often more important (+1 per missing slash)
-        path_depth = url.count('/')
-        if path_depth < 5:
-            score += (5 - path_depth)
-
-        # Non-minified files are more readable (+3)
-        if '.min.' not in url_lower:
-            score += 3
-
-        # Files with version numbers might be less important (-2)
-        if re.search(r'v\d+|version|\d+\.\d+', url_lower):
-            score -= 2
-
-        return max(0, score)  # Ensure non-negative
-
     def filter_javascript_urls(self, urls: List[str]) -> Dict[str, List[str]]:
         """
-
+        >> SMART FILTER: Early prioritization to save massive time
 
         Args:
             urls: List of URLs to filter
 
         Returns:
-            Dictionary with
+            Dictionary with tiered JS files for different analysis levels
         """
-        log_progress("
+        log_progress(">> Smart filtering JavaScript files with early prioritization...")
 
         js_urls = []
 
@@ -139,48 +171,71 @@ class JSFilter:
 
         log_progress(f"Found {len(js_urls)} JavaScript files")
 
-        #
+        # 🔥 SMART SCORING
         scored_urls = []
+        vendor_skipped = 0
+
         for url in js_urls:
-            score = self.
-
+            score, reason = self.calculate_smart_score(url)
+
+            # SKIP VENDOR/NOISE COMPLETELY (HUGE TIME SAVER)
+            if score < -30:
+                vendor_skipped += 1
+                continue
+
+            scored_urls.append({
+                'url': url,
+                'score': score,
+                'reason': reason
+            })
 
         # Sort by score (highest first)
-        scored_urls.sort(key=lambda x: x[
+        scored_urls.sort(key=lambda x: x['score'], reverse=True)
+
+        # >> TIERED ANALYSIS ASSIGNMENT
+        total_js = len(scored_urls)
+
+        # Tier 1: TOP 20% - Full analysis (AST + Regex + Secrets)
+        tier1_count = max(5, int(total_js * 0.2))
+        tier1_urls = scored_urls[:tier1_count]
 
-        #
-
-
-        low_priority = []
+        # Tier 2: NEXT 30% - Medium analysis (Regex + LinkFinder)
+        tier2_count = int(total_js * 0.3)
+        tier2_urls = scored_urls[tier1_count:tier1_count + tier2_count]
 
-
-
-            high_priority.append(url)
-        elif score >= 5:
-            medium_priority.append(url)
-        else:
-            low_priority.append(url)
+        # Tier 3: REMAINING 50% - Light analysis (Regex only)
+        tier3_urls = scored_urls[tier1_count + tier2_count:]
 
         results = {
-            '
-            '
-            '
-            '
+            'tier1_full': [item['url'] for item in tier1_urls],
+            'tier2_medium': [item['url'] for item in tier2_urls],
+            'tier3_light': [item['url'] for item in tier3_urls],
+            'all_js': [item['url'] for item in scored_urls],
+            'vendor_skipped': vendor_skipped
         }
 
-        log_progress(f"
+        log_progress(f">> Smart tiers: T1({len(tier1_urls)}) T2({len(tier2_urls)}) T3({len(tier3_urls)}) Skipped({vendor_skipped})")
 
-        # Save results
+        # Save tiered results
+        save_lines(results['tier1_full'], self.output_dir / "js_tier1_full.txt")
+        save_lines(results['tier2_medium'], self.output_dir / "js_tier2_medium.txt")
+        save_lines(results['tier3_light'], self.output_dir / "js_tier3_light.txt")
         save_lines(results['all_js'], self.output_dir / "js_files_all.txt")
-
-
-
-
-
-
-
-
-
-
+
+        # Save detailed scoring
+        detailed_results = {
+            'summary': {
+                'total_js_found': len(js_urls),
+                'vendor_skipped': vendor_skipped,
+                'tier1_count': len(tier1_urls),
+                'tier2_count': len(tier2_urls),
+                'tier3_count': len(tier3_urls),
+                'time_saved_estimate': f"{vendor_skipped * 2 + len(tier3_urls) * 1.5:.1f} seconds"
+            },
+            'tier1_details': tier1_urls,
+            'tier2_details': tier2_urls,
+            'tier3_details': tier3_urls
+        }
+        save_json(detailed_results, self.output_dir / "js_smart_analysis.json")
 
         return results