jseye 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,304 @@
1
+ """
2
+ Tiered Analysis Engine - Smart analysis based on JS file importance
3
+ """
4
+
5
+ import asyncio
6
+ from pathlib import Path
7
+ from typing import List, Dict, Any
8
+ from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
9
+ import time
10
+
11
+ from ..utils.logger import log_progress
12
+ from ..utils.fs import save_json
13
+ from .analyze_regex import RegexAnalyzer
14
+ from .analyze_ast import ASTAnalyzer
15
+ from .linkfinder import LinkFinderIntegration
16
+ from .secrets import SecretsDetector
17
+
18
class TieredAnalysisEngine:
    """
    🔥 TIERED ANALYSIS MODEL

    Tier 1: Regex + AST + LinkFinder + Secrets (HEAVY) - Top 20%
    Tier 2: Regex + LinkFinder (MEDIUM) - Next 30%
    Tier 3: Regex only (FAST) - Remaining 50%
    """

    def __init__(self, output_dir: Path):
        """
        Args:
            output_dir: Directory handed to every analyzer and used as the
                destination for the combined results JSON.
        """
        self.output_dir = output_dir
        self.regex_analyzer = RegexAnalyzer(output_dir)
        self.ast_analyzer = ASTAnalyzer(output_dir)
        self.linkfinder = LinkFinderIntegration(output_dir)
        self.secrets_detector = SecretsDetector(output_dir)

        # Performance tracking: per-tier wall-clock totals (seconds) plus
        # a count of files that completed analysis without raising.
        self.analysis_stats = {
            'tier1_time': 0,
            'tier2_time': 0,
            'tier3_time': 0,
            'files_analyzed': 0,
            'time_saved': 0
        }

    def _finish(self, results: Dict[str, Any], tier: int, start_time: float) -> Dict[str, Any]:
        """Stamp elapsed time onto *results* and fold it into the tier stats.

        Fix: also increments 'files_analyzed', which was declared in
        ``analysis_stats`` (and saved to the results JSON) but never updated.
        """
        elapsed = time.time() - start_time
        results['analysis_time'] = elapsed
        self.analysis_stats[f'tier{tier}_time'] += elapsed
        self.analysis_stats['files_analyzed'] += 1
        return results

    @staticmethod
    def _error_result(js_file: Dict, tier: int, start_time: float,
                      exc: Exception) -> Dict[str, Any]:
        """Uniform error payload returned when a tier analysis raises."""
        return {
            'file': js_file.get('filepath', 'unknown'),
            'tier': tier,
            'error': str(exc),
            'analysis_time': time.time() - start_time
        }

    async def analyze_tier1_file(self, js_file: Dict) -> Dict[str, Any]:
        """
        TIER 1: Full analysis (Heavy)
        - Regex analysis
        - AST analysis
        - LinkFinder
        - Secrets detection

        Args:
            js_file: Downloaded-file record; must contain 'filepath' and is
                only analyzed when its 'status' is 'success'.

        Returns:
            Result dict with 'file', 'tier', per-analyzer findings and
            'analysis_time'; on failure an error dict instead.
        """
        start_time = time.time()

        try:
            results = {
                'file': js_file['filepath'],
                'tier': 1,
                'analysis': {
                    'regex': [],
                    'ast': [],
                    'linkfinder': [],
                    'secrets': []
                }
            }

            # Only analyze files that were actually downloaded.
            if js_file.get('status') == 'success' and js_file.get('filepath'):
                regex_results = self.regex_analyzer.analyze_files([js_file])
                results['analysis']['regex'] = regex_results.get('endpoints', [])

                ast_results = self.ast_analyzer.analyze_files([js_file])
                results['analysis']['ast'] = ast_results.get('endpoints', [])

                results['analysis']['linkfinder'] = self.linkfinder.analyze_js_file(js_file['filepath'])

                results['analysis']['secrets'] = self.secrets_detector.run_mantra([js_file])

            return self._finish(results, 1, start_time)

        except Exception as e:
            return self._error_result(js_file, 1, start_time, e)

    async def analyze_tier2_file(self, js_file: Dict) -> Dict[str, Any]:
        """
        TIER 2: Medium analysis
        - Regex analysis
        - LinkFinder (skipped when regex finds nothing)

        Args/Returns: same shape as :meth:`analyze_tier1_file`, tier 2,
        without 'ast'/'secrets' sections.
        """
        start_time = time.time()

        try:
            results = {
                'file': js_file['filepath'],
                'tier': 2,
                'analysis': {
                    'regex': [],
                    'linkfinder': []
                }
            }

            if js_file.get('status') == 'success' and js_file.get('filepath'):
                regex_results = self.regex_analyzer.analyze_files([js_file])
                results['analysis']['regex'] = regex_results.get('endpoints', [])

                # Skip LinkFinder if regex finds nothing (SMART OPTIMIZATION):
                # a file with zero regex hits is unlikely to yield links.
                if len(results['analysis']['regex']) > 0:
                    results['analysis']['linkfinder'] = self.linkfinder.analyze_js_file(js_file['filepath'])
                else:
                    log_progress(f"Tier 2: Skipping LinkFinder for {js_file['filepath']} (no regex results)")

            return self._finish(results, 2, start_time)

        except Exception as e:
            return self._error_result(js_file, 2, start_time, e)

    async def analyze_tier3_file(self, js_file: Dict) -> Dict[str, Any]:
        """
        TIER 3: Light analysis (Fast)
        - Regex analysis only

        Args/Returns: same shape as :meth:`analyze_tier1_file`, tier 3,
        with only a 'regex' section.
        """
        start_time = time.time()

        try:
            results = {
                'file': js_file['filepath'],
                'tier': 3,
                'analysis': {
                    'regex': []
                }
            }

            if js_file.get('status') == 'success' and js_file.get('filepath'):
                regex_results = self.regex_analyzer.analyze_files([js_file])
                results['analysis']['regex'] = regex_results.get('endpoints', [])

            return self._finish(results, 3, start_time)

        except Exception as e:
            return self._error_result(js_file, 3, start_time, e)

    async def _run_tier(self, files: List[Dict], limit: int, analyze_one,
                        banner: str) -> List[Dict]:
        """Run *analyze_one* over *files*, at most *limit* tasks at a time.

        Fix: exceptions surfaced by ``asyncio.gather`` are now logged before
        being dropped (previously they were filtered out silently, so a
        failing task left no trace).

        NOTE(review): the tier coroutines call synchronous analyzer code and
        never await real I/O, so the semaphore bounds task interleaving but
        the event loop still does the work serially — confirm before relying
        on these limits for throughput.
        """
        log_progress(banner)
        semaphore = asyncio.Semaphore(limit)

        async def bounded(js_file: Dict):
            async with semaphore:
                return await analyze_one(js_file)

        raw = await asyncio.gather(*(bounded(f) for f in files),
                                   return_exceptions=True)
        kept = []
        for item in raw:
            if isinstance(item, Exception):
                log_progress(f"Tier task failed: {item}")
            else:
                kept.append(item)
        return kept

    async def run_tiered_analysis(self, tiered_js_files: Dict[str, List[str]],
                                  downloaded_files: List[Dict]) -> Dict[str, Any]:
        """
        >> Run tiered analysis with parallel processing

        Args:
            tiered_js_files: Dictionary with tier1_full, tier2_medium, tier3_light
            downloaded_files: List of downloaded JS file info

        Returns:
            Combined analysis results
        """
        log_progress("🔥 Starting TIERED ANALYSIS ENGINE")

        # Map URLs to file info (only successfully downloaded files qualify).
        url_to_file = {f['url']: f for f in downloaded_files if f.get('status') == 'success'}

        # Resolve each tier's URL list to downloaded-file records.
        tier1_files = [url_to_file[url] for url in tiered_js_files.get('tier1_full', []) if url in url_to_file]
        tier2_files = [url_to_file[url] for url in tiered_js_files.get('tier2_medium', []) if url in url_to_file]
        tier3_files = [url_to_file[url] for url in tiered_js_files.get('tier3_light', []) if url in url_to_file]

        log_progress(f"Tier 1 (HEAVY): {len(tier1_files)} files")
        log_progress(f"Tier 2 (MEDIUM): {len(tier2_files)} files")
        log_progress(f"Tier 3 (LIGHT): {len(tier3_files)} files")

        all_results: List[Dict] = []

        # Heavier tiers get lower concurrency caps (2 / 4 / 8).
        if tier1_files:
            all_results.extend(await self._run_tier(
                tier1_files, 2, self.analyze_tier1_file,
                ">> Processing Tier 1 (Full Analysis)..."))

        if tier2_files:
            all_results.extend(await self._run_tier(
                tier2_files, 4, self.analyze_tier2_file,
                ">> Processing Tier 2 (Medium Analysis)..."))

        if tier3_files:
            all_results.extend(await self._run_tier(
                tier3_files, 8, self.analyze_tier3_file,
                ">> Processing Tier 3 (Light Analysis)..."))

        # Aggregate results across all tiers.
        aggregated = self.aggregate_results(all_results)

        # Save detailed results alongside the performance stats.
        save_json({
            'summary': aggregated['summary'],
            'performance_stats': self.analysis_stats,
            'detailed_results': all_results
        }, self.output_dir / "tiered_analysis_results.json")

        log_progress(f">> Tiered analysis complete: {aggregated['summary']['total_findings']} findings")
        log_progress(f"[T] Time breakdown: T1({self.analysis_stats['tier1_time']:.1f}s) T2({self.analysis_stats['tier2_time']:.1f}s) T3({self.analysis_stats['tier3_time']:.1f}s)")

        return aggregated

    def aggregate_results(self, all_results: List[Dict]) -> Dict[str, Any]:
        """Aggregate results from all tiers.

        Args:
            all_results: Per-file result dicts from the tier methods;
                entries carrying an 'error' key are excluded.

        Returns:
            Dict with a 'summary' section plus flattened 'endpoints' and
            'secrets' lists. 'sinks' and 'functions' are reserved keys and
            are currently always empty.
        """
        endpoints: List[Any] = []
        secrets: List[Any] = []
        sinks: List[Any] = []
        functions: List[Any] = []

        tier_stats = {'tier1': 0, 'tier2': 0, 'tier3': 0}

        for result in all_results:
            if 'error' in result:
                continue  # failed files contribute nothing to the summary

            tier_stats[f"tier{result['tier']}"] += 1
            analysis = result.get('analysis', {})

            # Endpoints may come from up to three analyzers per file.
            for section in ('regex', 'ast', 'linkfinder'):
                endpoints.extend(analysis.get(section, []))

            secrets.extend(analysis.get('secrets', []))

        return {
            'summary': {
                'total_findings': len(endpoints) + len(secrets),
                'endpoints_found': len(endpoints),
                'secrets_found': len(secrets),
                'files_by_tier': tier_stats,
                'total_analysis_time': (
                    self.analysis_stats['tier1_time']
                    + self.analysis_stats['tier2_time']
                    + self.analysis_stats['tier3_time']
                )
            },
            'endpoints': endpoints,
            'secrets': secrets,
            'sinks': sinks,
            'functions': functions
        }