jseye 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,82 +1,278 @@
 """
-JavaScript Download Module
+JavaScript Download Module - With caching and parallel processing
 """
 
+import asyncio
+import aiohttp
 import requests
 from pathlib import Path
 from typing import List, Dict
 from urllib.parse import urlparse
 import hashlib
+from concurrent.futures import ThreadPoolExecutor
 
 from ..utils.logger import log_progress
 from ..utils.fs import ensure_dir
+from ..utils.cache import JSEyeCache
 
 class JSDownloader:
-    """Download JavaScript files"""
+    """Download JavaScript files with caching and parallel processing"""
 
     def __init__(self, output_dir: Path):
         self.output_dir = output_dir
         self.js_dir = output_dir / "js_files"
         ensure_dir(self.js_dir)
 
-        self.session = requests.Session()
-        self.session.headers.update({
+        # Initialize cache
+        self.cache = JSEyeCache(output_dir)
+
+        # Download settings
+        self.max_file_size = 3 * 1024 * 1024 # 3MB limit
+        self.timeout = 30 # 30 seconds timeout
+        self.max_concurrent = 10 # Max concurrent downloads
+
+        self.session_headers = {
             'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-        })
+        }
 
-    def download_js_file(self, url: str) -> Dict:
-        """Download a single JavaScript file"""
+    async def download_js_file_async(self, session: aiohttp.ClientSession, url: str) -> Dict:
+        """Download a single JavaScript file asynchronously with caching"""
+
+        # Check cache first
+        cached_result = self.cache.get_download_cache(url)
+        if cached_result:
+            return cached_result
+
         try:
-            response = self.session.get(url, timeout=30)
-            response.raise_for_status()
-
-            # Generate filename from URL hash
-            url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
-            parsed = urlparse(url)
-            domain = parsed.netloc.replace('.', '_')
-            filename = f"{domain}_{url_hash}.js"
-
-            filepath = self.js_dir / filename
-
-            with open(filepath, 'w', encoding='utf-8', errors='ignore') as f:
-                f.write(response.text)
-
-            return {
+            async with session.get(url, timeout=aiohttp.ClientTimeout(total=self.timeout)) as response:
+                # Check content length
+                content_length = response.headers.get('content-length')
+                if content_length and int(content_length) > self.max_file_size:
+                    result = {
+                        'url': url,
+                        'filepath': None,
+                        'size': int(content_length),
+                        'status': 'too_large',
+                        'error': f'File too large: {int(content_length)} bytes'
+                    }
+                    self.cache.set_download_cache(url, result)
+                    return result
+
+                # Read content with size limit
+                content = await response.read()
+
+                if len(content) > self.max_file_size:
+                    result = {
+                        'url': url,
+                        'filepath': None,
+                        'size': len(content),
+                        'status': 'too_large',
+                        'error': f'Content too large: {len(content)} bytes'
+                    }
+                    self.cache.set_download_cache(url, result)
+                    return result
+
+                # Generate filename from URL hash
+                url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
+                parsed = urlparse(url)
+                domain = parsed.netloc.replace('.', '_')
+                filename = f"{domain}_{url_hash}.js"
+
+                filepath = self.js_dir / filename
+
+                # Write file
+                with open(filepath, 'wb') as f:
+                    f.write(content)
+
+                result = {
+                    'url': url,
+                    'filepath': str(filepath),
+                    'size': len(content),
+                    'status': 'success'
+                }
+
+                # Cache the result
+                self.cache.set_download_cache(url, result)
+
+                return result
+
+        except asyncio.TimeoutError:
+            result = {
                 'url': url,
-                'filepath': str(filepath),
-                'size': len(response.text),
-                'status': 'success'
+                'filepath': None,
+                'size': 0,
+                'status': 'timeout',
+                'error': 'Download timeout'
             }
+            self.cache.set_download_cache(url, result)
+            return result
 
         except Exception as e:
-            return {
+            result = {
                 'url': url,
                 'filepath': None,
                 'size': 0,
                 'status': 'failed',
                 'error': str(e)
             }
+            self.cache.set_download_cache(url, result)
+            return result
 
-    def download_js_files(self, urls: List[str], max_files: int = 100) -> List[Dict]:
-        """Download multiple JavaScript files"""
-        log_progress(f"Downloading JavaScript files (max {max_files})")
+    async def download_js_files_parallel(self, urls: List[str]) -> List[Dict]:
+        """Download multiple JavaScript files in parallel"""
+        log_progress(f">> Downloading {len(urls)} JavaScript files in parallel...")
+
+        # Create semaphore to limit concurrent downloads
+        semaphore = asyncio.Semaphore(self.max_concurrent)
 
-        # Limit number of files to download
+        async def download_with_semaphore(session, url):
+            async with semaphore:
+                return await self.download_js_file_async(session, url)
+
+        # Create aiohttp session
+        connector = aiohttp.TCPConnector(limit=self.max_concurrent)
+        timeout = aiohttp.ClientTimeout(total=self.timeout)
+
+        async with aiohttp.ClientSession(
+            connector=connector,
+            timeout=timeout,
+            headers=self.session_headers
+        ) as session:
+
+            # Create tasks for all downloads
+            tasks = [download_with_semaphore(session, url) for url in urls]
+
+            # Execute all downloads in parallel
+            results = await asyncio.gather(*tasks, return_exceptions=True)
+
+            # Filter out exceptions
+            valid_results = []
+            for i, result in enumerate(results):
+                if isinstance(result, Exception):
+                    log_progress(f"Download exception for {urls[i]}: {result}")
+                    valid_results.append({
+                        'url': urls[i],
+                        'filepath': None,
+                        'size': 0,
+                        'status': 'exception',
+                        'error': str(result)
+                    })
+                else:
+                    valid_results.append(result)
+
+            # Count successful downloads
+            successful = len([r for r in valid_results if r['status'] == 'success'])
+            cached = len([r for r in valid_results if 'cached_at' in r])
+
+            log_progress(f"[C] Download complete: {successful} new, {cached} cached, {len(valid_results)} total")
+
+            return valid_results
+
+    def download_js_files(self, urls: List[str], max_files: int = 200) -> List[Dict]:
+        """
+        Download JavaScript files (main entry point)
+
+        Args:
+            urls: List of JavaScript URLs
+            max_files: Maximum number of files to download
+
+        Returns:
+            List of download results
+        """
+        # Limit number of files
         urls_to_download = urls[:max_files]
 
+        try:
+            # Use async download
+            results = asyncio.run(self.download_js_files_parallel(urls_to_download))
+        except Exception as e:
+            log_progress(f"Parallel download failed, falling back to sequential: {e}")
+            results = self.download_js_files_sequential(urls_to_download)
+
+        return results
+
+    def download_js_files_sequential(self, urls: List[str]) -> List[Dict]:
+        """Fallback sequential download"""
+        log_progress("Using sequential download (fallback)")
+
         results = []
-        successful = 0
+        session = requests.Session()
+        session.headers.update(self.session_headers)
 
-        for i, url in enumerate(urls_to_download, 1):
-            if i % 10 == 0:
-                log_progress(f"Downloaded {i}/{len(urls_to_download)} files")
+        for i, url in enumerate(urls, 1):
+            if i % 20 == 0:
+                log_progress(f"Downloaded {i}/{len(urls)} files")
 
-            result = self.download_js_file(url)
-            results.append(result)
+            # Check cache first
+            cached_result = self.cache.get_download_cache(url)
+            if cached_result:
+                results.append(cached_result)
+                continue
 
-            if result['status'] == 'success':
-                successful += 1
+            try:
+                response = session.get(url, timeout=self.timeout, stream=True)
+
+                # Check content length
+                content_length = response.headers.get('content-length')
+                if content_length and int(content_length) > self.max_file_size:
+                    result = {
+                        'url': url,
+                        'filepath': None,
+                        'size': int(content_length),
+                        'status': 'too_large'
+                    }
+                    results.append(result)
+                    continue
+
+                # Read content with size limit
+                content = b''
+                for chunk in response.iter_content(chunk_size=8192):
+                    content += chunk
+                    if len(content) > self.max_file_size:
+                        break
+
+                if len(content) > self.max_file_size:
+                    result = {
+                        'url': url,
+                        'filepath': None,
+                        'size': len(content),
+                        'status': 'too_large'
+                    }
+                    results.append(result)
+                    continue
+
+                # Generate filename and save
+                url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
+                parsed = urlparse(url)
+                domain = parsed.netloc.replace('.', '_')
+                filename = f"{domain}_{url_hash}.js"
+                filepath = self.js_dir / filename
+
+                with open(filepath, 'wb') as f:
+                    f.write(content)
+
+                result = {
+                    'url': url,
+                    'filepath': str(filepath),
+                    'size': len(content),
+                    'status': 'success'
+                }
+
+                # Cache result
+                self.cache.set_download_cache(url, result)
+                results.append(result)
+
+            except Exception as e:
+                result = {
+                    'url': url,
+                    'filepath': None,
+                    'size': 0,
+                    'status': 'failed',
+                    'error': str(e)
+                }
+                results.append(result)
 
-        log_progress(f"Successfully downloaded {successful}/{len(urls_to_download)} JavaScript files")
+        successful = len([r for r in results if r['status'] == 'success'])
+        log_progress(f"Sequential download complete: {successful}/{len(urls)} successful")
 
         return results
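
For orientation, here is a minimal driver for the reworked downloader as it appears in 1.0.2. This is a hedged sketch rather than documented usage: the jseye.core.downloader import path and the sample URLs are assumptions, while the JSDownloader constructor, the download_js_files() signature, and the result keys are taken directly from the diff above.

    from pathlib import Path

    # Assumed module path; only the class and method signatures come from the diff.
    from jseye.core.downloader import JSDownloader

    urls = [
        "https://example.com/static/app/main.js",           # hypothetical target
        "https://example.com/static/vendor/jquery.min.js",  # hypothetical target
    ]

    downloader = JSDownloader(output_dir=Path("./jseye_output"))

    # download_js_files() tries the asyncio/aiohttp parallel path first and
    # falls back to the sequential requests-based path if asyncio.run() fails.
    results = downloader.download_js_files(urls, max_files=200)

    for r in results:
        print(r["status"], r["url"], r.get("filepath"))
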
@@ -1,17 +1,17 @@
 """
-JavaScript File Filtering Module
+JavaScript File Filtering Module - Smart Early Prioritization
 """
 
 import re
 from pathlib import Path
-from typing import List, Dict
+from typing import List, Dict, Tuple
 from urllib.parse import urlparse, parse_qs
 
 from ..utils.logger import log_progress
 from ..utils.fs import save_lines, save_json
 
 class JSFilter:
-    """Filter and prioritize JavaScript files"""
+    """Filter and prioritize JavaScript files with early smart scoring"""
 
     def __init__(self, output_dir: Path):
         self.output_dir = output_dir
@@ -30,46 +30,108 @@ class JSFilter:
             r'text/javascript'
         ]
 
-        # High-value JS file indicators
+        # 🔥 SMART EARLY SCORING PATTERNS
         self.high_value_patterns = [
-            r'admin',
-            r'api',
-            r'auth',
-            r'config',
-            r'dashboard',
-            r'login',
-            r'panel',
-            r'private',
-            r'secure',
-            r'user',
-            r'account',
-            r'profile',
-            r'settings',
-            r'management',
-            r'internal'
+            # Critical business logic
+            (r'/app/', 15),
+            (r'/api/', 20),
+            (r'/admin/', 25),
+            (r'/dashboard/', 20),
+            (r'/panel/', 18),
+            (r'/manage/', 18),
+            (r'/config/', 22),
+            (r'/auth/', 20),
+            (r'/login/', 18),
+            (r'/user/', 15),
+            (r'/account/', 15),
+            (r'/profile/', 12),
+            (r'/settings/', 15),
+            (r'/internal/', 25),
+            (r'/private/', 25),
+            # Custom/business specific
+            (r'[^/]+\.(js|jsx|ts|tsx)$', 10), # Custom files
+            (r'\?.*=.*', 8), # Has query parameters
         ]
 
-        # Vendor/library patterns to deprioritize
+        # 🚫 VENDOR/NOISE PATTERNS (SKIP THESE)
         self.vendor_patterns = [
-            r'jquery',
-            r'bootstrap',
-            r'angular',
-            r'react',
-            r'vue',
-            r'lodash',
-            r'moment',
-            r'chart',
-            r'google',
-            r'facebook',
-            r'twitter',
-            r'analytics',
-            r'gtm',
-            r'cdn\.',
-            r'unpkg',
-            r'jsdelivr',
-            r'cdnjs'
+            # Major libraries (SKIP COMPLETELY)
+            (r'jquery', -50),
+            (r'bootstrap', -50),
+            (r'angular', -40),
+            (r'react', -40),
+            (r'vue', -40),
+            (r'lodash', -50),
+            (r'moment', -50),
+            (r'chart', -45),
+            # Analytics/tracking (WASTE OF TIME)
+            (r'google', -60),
+            (r'facebook', -60),
+            (r'twitter', -60),
+            (r'analytics', -70),
+            (r'gtm', -70),
+            (r'gtag', -70),
+            (r'tracking', -65),
+            # CDNs (SKIP)
+            (r'cdn\.', -60),
+            (r'unpkg', -60),
+            (r'jsdelivr', -60),
+            (r'cdnjs', -60),
+            # Build artifacts (USUALLY NOISE)
+            (r'chunk', -30),
+            (r'vendor', -40),
+            (r'bundle', -25),
+            (r'\.min\.', -20),
+            # Version indicators (LESS IMPORTANT)
+            (r'v\d+', -15),
+            (r'version', -15),
+            (r'\d+\.\d+', -10),
         ]
 
+    def calculate_smart_score(self, url: str) -> Tuple[int, str]:
+        """
+        🔥 SMART SCORING ALGORITHM
+        Returns (score, reason) - Higher score = more important
+        """
+        score = 0
+        reasons = []
+        url_lower = url.lower()
+
+        # High-value patterns (BUSINESS LOGIC)
+        for pattern, points in self.high_value_patterns:
+            if re.search(pattern, url_lower):
+                score += points
+                reasons.append(f"+{points} ({pattern})")
+
+        # Vendor/noise patterns (SKIP THESE)
+        for pattern, points in self.vendor_patterns:
+            if re.search(pattern, url_lower):
+                score += points # These are negative
+                reasons.append(f"{points} ({pattern})")
+
+        # Path depth bonus (shorter = more important)
+        path_depth = url.count('/')
+        if path_depth < 4:
+            depth_bonus = (4 - path_depth) * 3
+            score += depth_bonus
+            reasons.append(f"+{depth_bonus} (short path)")
+
+        # Query parameters bonus (dynamic content)
+        if '?' in url and '=' in url:
+            score += 8
+            reasons.append("+8 (has params)")
+
+        # File extension analysis
+        if url_lower.endswith(('.min.js', '.min.jsx')):
+            score -= 15
+            reasons.append("-15 (minified)")
+        elif url_lower.endswith(('.js', '.jsx', '.ts', '.tsx')):
+            score += 5
+            reasons.append("+5 (js file)")
+
+        reason = "; ".join(reasons) if reasons else "default"
+        return max(-100, min(100, score)), reason
+
     def is_javascript_url(self, url: str) -> bool:
         """Check if URL points to JavaScript file"""
         url_lower = url.lower()
@@ -88,47 +150,17 @@ class JSFilter:
 
         return False
 
-    def calculate_priority_score(self, url: str) -> int:
-        """Calculate priority score for JS file (higher = more important)"""
-        score = 0
-        url_lower = url.lower()
-
-        # High-value indicators (+10 each)
-        for pattern in self.high_value_patterns:
-            if re.search(pattern, url_lower):
-                score += 10
-
-        # Vendor/library indicators (-5 each)
-        for pattern in self.vendor_patterns:
-            if re.search(pattern, url_lower):
-                score -= 5
-
-        # Shorter paths are often more important (+1 per missing slash)
-        path_depth = url.count('/')
-        if path_depth < 5:
-            score += (5 - path_depth)
-
-        # Non-minified files are more readable (+3)
-        if '.min.' not in url_lower:
-            score += 3
-
-        # Files with version numbers might be less important (-2)
-        if re.search(r'v\d+|version|\d+\.\d+', url_lower):
-            score -= 2
-
-        return max(0, score) # Ensure non-negative
-
     def filter_javascript_urls(self, urls: List[str]) -> Dict[str, List[str]]:
         """
-        Filter URLs to find JavaScript files and prioritize them
+        >> SMART FILTER: Early prioritization to save massive time
 
         Args:
             urls: List of URLs to filter
 
         Returns:
-            Dictionary with 'all_js', 'high_priority', 'medium_priority', 'low_priority'
+            Dictionary with tiered JS files for different analysis levels
         """
-        log_progress("Filtering JavaScript files...")
+        log_progress(">> Smart filtering JavaScript files with early prioritization...")
 
         js_urls = []
 
@@ -139,48 +171,71 @@
 
         log_progress(f"Found {len(js_urls)} JavaScript files")
 
-        # Calculate priority scores
+        # 🔥 SMART SCORING
         scored_urls = []
+        vendor_skipped = 0
+
         for url in js_urls:
-            score = self.calculate_priority_score(url)
-            scored_urls.append((url, score))
+            score, reason = self.calculate_smart_score(url)
+
+            # SKIP VENDOR/NOISE COMPLETELY (HUGE TIME SAVER)
+            if score < -30:
+                vendor_skipped += 1
+                continue
+
+            scored_urls.append({
+                'url': url,
+                'score': score,
+                'reason': reason
+            })
 
         # Sort by score (highest first)
-        scored_urls.sort(key=lambda x: x[1], reverse=True)
+        scored_urls.sort(key=lambda x: x['score'], reverse=True)
+
+        # >> TIERED ANALYSIS ASSIGNMENT
+        total_js = len(scored_urls)
+
+        # Tier 1: TOP 20% - Full analysis (AST + Regex + Secrets)
+        tier1_count = max(5, int(total_js * 0.2))
+        tier1_urls = scored_urls[:tier1_count]
 
-        # Categorize by priority
-        high_priority = []
-        medium_priority = []
-        low_priority = []
+        # Tier 2: NEXT 30% - Medium analysis (Regex + LinkFinder)
+        tier2_count = int(total_js * 0.3)
+        tier2_urls = scored_urls[tier1_count:tier1_count + tier2_count]
 
-        for url, score in scored_urls:
-            if score >= 15:
-                high_priority.append(url)
-            elif score >= 5:
-                medium_priority.append(url)
-            else:
-                low_priority.append(url)
+        # Tier 3: REMAINING 50% - Light analysis (Regex only)
+        tier3_urls = scored_urls[tier1_count + tier2_count:]
 
         results = {
-            'all_js': [url for url, _ in scored_urls],
-            'high_priority': high_priority,
-            'medium_priority': medium_priority,
-            'low_priority': low_priority
+            'tier1_full': [item['url'] for item in tier1_urls],
+            'tier2_medium': [item['url'] for item in tier2_urls],
+            'tier3_light': [item['url'] for item in tier3_urls],
+            'all_js': [item['url'] for item in scored_urls],
+            'vendor_skipped': vendor_skipped
         }
 
-        log_progress(f"Prioritized: {len(high_priority)} high, {len(medium_priority)} medium, {len(low_priority)} low")
+        log_progress(f">> Smart tiers: T1({len(tier1_urls)}) T2({len(tier2_urls)}) T3({len(tier3_urls)}) Skipped({vendor_skipped})")
 
-        # Save results
+        # Save tiered results
+        save_lines(results['tier1_full'], self.output_dir / "js_tier1_full.txt")
+        save_lines(results['tier2_medium'], self.output_dir / "js_tier2_medium.txt")
+        save_lines(results['tier3_light'], self.output_dir / "js_tier3_light.txt")
         save_lines(results['all_js'], self.output_dir / "js_files_all.txt")
-        save_lines(results['high_priority'], self.output_dir / "js_files_high_priority.txt")
-        save_lines(results['medium_priority'], self.output_dir / "js_files_medium_priority.txt")
-        save_lines(results['low_priority'], self.output_dir / "js_files_low_priority.txt")
-
-        # Save detailed results with scores
-        detailed_results = [
-            {"url": url, "priority_score": score}
-            for url, score in scored_urls
-        ]
-        save_json(detailed_results, self.output_dir / "js_files_detailed.json")
+
+        # Save detailed scoring
+        detailed_results = {
+            'summary': {
+                'total_js_found': len(js_urls),
+                'vendor_skipped': vendor_skipped,
+                'tier1_count': len(tier1_urls),
+                'tier2_count': len(tier2_urls),
+                'tier3_count': len(tier3_urls),
+                'time_saved_estimate': f"{vendor_skipped * 2 + len(tier3_urls) * 1.5:.1f} seconds"
+            },
+            'tier1_details': tier1_urls,
+            'tier2_details': tier2_urls,
+            'tier3_details': tier3_urls
+        }
+        save_json(detailed_results, self.output_dir / "js_smart_analysis.json")
 
         return results
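
Similarly, a hedged sketch of how the new tiered filter output could feed the downloader. The tier keys ('tier1_full', 'tier2_medium', 'tier3_light', 'all_js') and both constructors come from the diff; the jseye.core.* module paths and the sample URLs are assumptions for illustration only.

    from pathlib import Path

    # Assumed module paths; class names and signatures are taken from the diff.
    from jseye.core.downloader import JSDownloader
    from jseye.core.filter import JSFilter

    output_dir = Path("./jseye_output")
    js_filter = JSFilter(output_dir)
    downloader = JSDownloader(output_dir)

    crawled_urls = [
        "https://example.com/app/dashboard.js",   # hypothetical; scores high (/app/ path, .js extension)
        "https://example.com/assets/profile.js",  # hypothetical; moderate score
        "https://cdn.example.com/jquery.min.js",  # hypothetical; scores below -30 and is skipped as vendor noise
    ]

    tiers = js_filter.filter_javascript_urls(crawled_urls)

    # Fetch the high-value tiers first; tier 3 can be downloaded later or skipped entirely.
    results = downloader.download_js_files(tiers["tier1_full"] + tiers["tier2_medium"])
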