jseye 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,82 +1,278 @@
 """
-JavaScript Download Module
+JavaScript Download Module - With caching and parallel processing
 """
 
+import asyncio
+import aiohttp
 import requests
 from pathlib import Path
 from typing import List, Dict
 from urllib.parse import urlparse
 import hashlib
+from concurrent.futures import ThreadPoolExecutor
 
 from ..utils.logger import log_progress
 from ..utils.fs import ensure_dir
+from ..utils.cache import JSEyeCache
 
 class JSDownloader:
-    """Download JavaScript files"""
+    """Download JavaScript files with caching and parallel processing"""
 
     def __init__(self, output_dir: Path):
         self.output_dir = output_dir
         self.js_dir = output_dir / "js_files"
         ensure_dir(self.js_dir)
 
-        self.session = requests.Session()
-        self.session.headers.update({
+        # Initialize cache
+        self.cache = JSEyeCache(output_dir)
+
+        # Download settings
+        self.max_file_size = 3 * 1024 * 1024 # 3MB limit
+        self.timeout = 30 # 30 seconds timeout
+        self.max_concurrent = 10 # Max concurrent downloads
+
+        self.session_headers = {
             'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-        })
+        }
 
-    def download_js_file(self, url: str) -> Dict:
-        """Download a single JavaScript file"""
+    async def download_js_file_async(self, session: aiohttp.ClientSession, url: str) -> Dict:
+        """Download a single JavaScript file asynchronously with caching"""
+
+        # Check cache first
+        cached_result = self.cache.get_download_cache(url)
+        if cached_result:
+            return cached_result
+
         try:
-            response = self.session.get(url, timeout=30)
-            response.raise_for_status()
-
-            # Generate filename from URL hash
-            url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
-            parsed = urlparse(url)
-            domain = parsed.netloc.replace('.', '_')
-            filename = f"{domain}_{url_hash}.js"
-
-            filepath = self.js_dir / filename
-
-            with open(filepath, 'w', encoding='utf-8', errors='ignore') as f:
-                f.write(response.text)
-
-            return {
+            async with session.get(url, timeout=aiohttp.ClientTimeout(total=self.timeout)) as response:
+                # Check content length
+                content_length = response.headers.get('content-length')
+                if content_length and int(content_length) > self.max_file_size:
+                    result = {
+                        'url': url,
+                        'filepath': None,
+                        'size': int(content_length),
+                        'status': 'too_large',
+                        'error': f'File too large: {int(content_length)} bytes'
+                    }
+                    self.cache.set_download_cache(url, result)
+                    return result
+
+                # Read content with size limit
+                content = await response.read()
+
+                if len(content) > self.max_file_size:
+                    result = {
+                        'url': url,
+                        'filepath': None,
+                        'size': len(content),
+                        'status': 'too_large',
+                        'error': f'Content too large: {len(content)} bytes'
+                    }
+                    self.cache.set_download_cache(url, result)
+                    return result
+
+                # Generate filename from URL hash
+                url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
+                parsed = urlparse(url)
+                domain = parsed.netloc.replace('.', '_')
+                filename = f"{domain}_{url_hash}.js"
+
+                filepath = self.js_dir / filename
+
+                # Write file
+                with open(filepath, 'wb') as f:
+                    f.write(content)
+
+                result = {
+                    'url': url,
+                    'filepath': str(filepath),
+                    'size': len(content),
+                    'status': 'success'
+                }
+
+                # Cache the result
+                self.cache.set_download_cache(url, result)
+
+                return result
+
+        except asyncio.TimeoutError:
+            result = {
                 'url': url,
-                'filepath': str(filepath),
-                'size': len(response.text),
-                'status': 'success'
+                'filepath': None,
+                'size': 0,
+                'status': 'timeout',
+                'error': 'Download timeout'
             }
+            self.cache.set_download_cache(url, result)
+            return result
 
         except Exception as e:
-            return {
+            result = {
                 'url': url,
                 'filepath': None,
                 'size': 0,
                 'status': 'failed',
                 'error': str(e)
             }
+            self.cache.set_download_cache(url, result)
+            return result
 
-    def download_js_files(self, urls: List[str], max_files: int = 100) -> List[Dict]:
-        """Download multiple JavaScript files"""
-        log_progress(f"Downloading JavaScript files (max {max_files})")
+    async def download_js_files_parallel(self, urls: List[str]) -> List[Dict]:
+        """Download multiple JavaScript files in parallel"""
+        log_progress(f">> Downloading {len(urls)} JavaScript files in parallel...")
+
+        # Create semaphore to limit concurrent downloads
+        semaphore = asyncio.Semaphore(self.max_concurrent)
 
-        # Limit number of files to download
+        async def download_with_semaphore(session, url):
+            async with semaphore:
+                return await self.download_js_file_async(session, url)
+
+        # Create aiohttp session
+        connector = aiohttp.TCPConnector(limit=self.max_concurrent)
+        timeout = aiohttp.ClientTimeout(total=self.timeout)
+
+        async with aiohttp.ClientSession(
+            connector=connector,
+            timeout=timeout,
+            headers=self.session_headers
+        ) as session:
+
+            # Create tasks for all downloads
+            tasks = [download_with_semaphore(session, url) for url in urls]
+
+            # Execute all downloads in parallel
+            results = await asyncio.gather(*tasks, return_exceptions=True)
+
+            # Filter out exceptions
+            valid_results = []
+            for i, result in enumerate(results):
+                if isinstance(result, Exception):
+                    log_progress(f"Download exception for {urls[i]}: {result}")
+                    valid_results.append({
+                        'url': urls[i],
+                        'filepath': None,
+                        'size': 0,
+                        'status': 'exception',
+                        'error': str(result)
+                    })
+                else:
+                    valid_results.append(result)
+
+            # Count successful downloads
+            successful = len([r for r in valid_results if r['status'] == 'success'])
+            cached = len([r for r in valid_results if 'cached_at' in r])
+
+            log_progress(f"[C] Download complete: {successful} new, {cached} cached, {len(valid_results)} total")
+
+            return valid_results
+
+    def download_js_files(self, urls: List[str], max_files: int = 200) -> List[Dict]:
+        """
+        Download JavaScript files (main entry point)
+
+        Args:
+            urls: List of JavaScript URLs
+            max_files: Maximum number of files to download
+
+        Returns:
+            List of download results
+        """
+        # Limit number of files
         urls_to_download = urls[:max_files]
 
+        try:
+            # Use async download
+            results = asyncio.run(self.download_js_files_parallel(urls_to_download))
+        except Exception as e:
+            log_progress(f"Parallel download failed, falling back to sequential: {e}")
+            results = self.download_js_files_sequential(urls_to_download)
+
+        return results
+
+    def download_js_files_sequential(self, urls: List[str]) -> List[Dict]:
+        """Fallback sequential download"""
+        log_progress("Using sequential download (fallback)")
+
         results = []
-        successful = 0
+        session = requests.Session()
+        session.headers.update(self.session_headers)
 
-        for i, url in enumerate(urls_to_download, 1):
-            if i % 10 == 0:
-                log_progress(f"Downloaded {i}/{len(urls_to_download)} files")
+        for i, url in enumerate(urls, 1):
+            if i % 20 == 0:
+                log_progress(f"Downloaded {i}/{len(urls)} files")
 
-            result = self.download_js_file(url)
-            results.append(result)
+            # Check cache first
+            cached_result = self.cache.get_download_cache(url)
+            if cached_result:
+                results.append(cached_result)
+                continue
 
-            if result['status'] == 'success':
-                successful += 1
+            try:
+                response = session.get(url, timeout=self.timeout, stream=True)
+
+                # Check content length
+                content_length = response.headers.get('content-length')
+                if content_length and int(content_length) > self.max_file_size:
+                    result = {
+                        'url': url,
+                        'filepath': None,
+                        'size': int(content_length),
+                        'status': 'too_large'
+                    }
+                    results.append(result)
+                    continue
+
+                # Read content with size limit
+                content = b''
+                for chunk in response.iter_content(chunk_size=8192):
+                    content += chunk
+                    if len(content) > self.max_file_size:
+                        break
+
+                if len(content) > self.max_file_size:
+                    result = {
+                        'url': url,
+                        'filepath': None,
+                        'size': len(content),
+                        'status': 'too_large'
+                    }
+                    results.append(result)
+                    continue
+
+                # Generate filename and save
+                url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
+                parsed = urlparse(url)
+                domain = parsed.netloc.replace('.', '_')
+                filename = f"{domain}_{url_hash}.js"
+                filepath = self.js_dir / filename
+
+                with open(filepath, 'wb') as f:
+                    f.write(content)
+
+                result = {
+                    'url': url,
+                    'filepath': str(filepath),
+                    'size': len(content),
+                    'status': 'success'
+                }
+
+                # Cache result
+                self.cache.set_download_cache(url, result)
+                results.append(result)
+
+            except Exception as e:
+                result = {
+                    'url': url,
+                    'filepath': None,
+                    'size': 0,
+                    'status': 'failed',
+                    'error': str(e)
+                }
+                results.append(result)
 
-        log_progress(f"Successfully downloaded {successful}/{len(urls_to_download)} JavaScript files")
+        successful = len([r for r in results if r['status'] == 'success'])
+        log_progress(f"Sequential download complete: {successful}/{len(urls)} successful")
 
         return results
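
For orientation, here is a minimal driver for the reworked downloader as it appears in 1.0.2. This is a hedged sketch rather than documented usage: the jseye.core.downloader import path and the sample URLs are assumptions, while the JSDownloader constructor, the download_js_files() signature, and the result keys are taken directly from the diff above.

    from pathlib import Path

    # Assumed module path; only the class and method signatures come from the diff.
    from jseye.core.downloader import JSDownloader

    urls = [
        "https://example.com/static/app/main.js",           # hypothetical target
        "https://example.com/static/vendor/jquery.min.js",  # hypothetical target
    ]

    downloader = JSDownloader(output_dir=Path("./jseye_output"))

    # download_js_files() tries the asyncio/aiohttp parallel path first and
    # falls back to the sequential requests-based path if asyncio.run() fails.
    results = downloader.download_js_files(urls, max_files=200)

    for r in results:
        print(r["status"], r["url"], r.get("filepath"))
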
@@ -1,17 +1,17 @@
 """
-JavaScript File Filtering Module
+JavaScript File Filtering Module - Smart Early Prioritization
 """
 
 import re
 from pathlib import Path
-from typing import List, Dict
+from typing import List, Dict, Tuple
 from urllib.parse import urlparse, parse_qs
 
 from ..utils.logger import log_progress
 from ..utils.fs import save_lines, save_json
 
 class JSFilter:
-    """Filter and prioritize JavaScript files"""
+    """Filter and prioritize JavaScript files with early smart scoring"""
 
     def __init__(self, output_dir: Path):
         self.output_dir = output_dir
@@ -30,46 +30,108 @@ class JSFilter:
             r'text/javascript'
         ]
 
-        # High-value JS file indicators
+        # 🔥 SMART EARLY SCORING PATTERNS
         self.high_value_patterns = [
-            r'admin',
-            r'api',
-            r'auth',
-            r'config',
-            r'dashboard',
-            r'login',
-            r'panel',
-            r'private',
-            r'secure',
-            r'user',
-            r'account',
-            r'profile',
-            r'settings',
-            r'management',
-            r'internal'
+            # Critical business logic
+            (r'/app/', 15),
+            (r'/api/', 20),
+            (r'/admin/', 25),
+            (r'/dashboard/', 20),
+            (r'/panel/', 18),
+            (r'/manage/', 18),
+            (r'/config/', 22),
+            (r'/auth/', 20),
+            (r'/login/', 18),
+            (r'/user/', 15),
+            (r'/account/', 15),
+            (r'/profile/', 12),
+            (r'/settings/', 15),
+            (r'/internal/', 25),
+            (r'/private/', 25),
+            # Custom/business specific
+            (r'[^/]+\.(js|jsx|ts|tsx)$', 10), # Custom files
+            (r'\?.*=.*', 8), # Has query parameters
         ]
 
-        # Vendor/library patterns to deprioritize
+        # 🚫 VENDOR/NOISE PATTERNS (SKIP THESE)
         self.vendor_patterns = [
-            r'jquery',
-            r'bootstrap',
-            r'angular',
-            r'react',
-            r'vue',
-            r'lodash',
-            r'moment',
-            r'chart',
-            r'google',
-            r'facebook',
-            r'twitter',
-            r'analytics',
-            r'gtm',
-            r'cdn\.',
-            r'unpkg',
-            r'jsdelivr',
-            r'cdnjs'
+            # Major libraries (SKIP COMPLETELY)
+            (r'jquery', -50),
+            (r'bootstrap', -50),
+            (r'angular', -40),
+            (r'react', -40),
+            (r'vue', -40),
+            (r'lodash', -50),
+            (r'moment', -50),
+            (r'chart', -45),
+            # Analytics/tracking (WASTE OF TIME)
+            (r'google', -60),
+            (r'facebook', -60),
+            (r'twitter', -60),
+            (r'analytics', -70),
+            (r'gtm', -70),
+            (r'gtag', -70),
+            (r'tracking', -65),
+            # CDNs (SKIP)
+            (r'cdn\.', -60),
+            (r'unpkg', -60),
+            (r'jsdelivr', -60),
+            (r'cdnjs', -60),
+            # Build artifacts (USUALLY NOISE)
+            (r'chunk', -30),
+            (r'vendor', -40),
+            (r'bundle', -25),
+            (r'\.min\.', -20),
+            # Version indicators (LESS IMPORTANT)
+            (r'v\d+', -15),
+            (r'version', -15),
+            (r'\d+\.\d+', -10),
         ]
 
+    def calculate_smart_score(self, url: str) -> Tuple[int, str]:
+        """
+        🔥 SMART SCORING ALGORITHM
+        Returns (score, reason) - Higher score = more important
+        """
+        score = 0
+        reasons = []
+        url_lower = url.lower()
+
+        # High-value patterns (BUSINESS LOGIC)
+        for pattern, points in self.high_value_patterns:
+            if re.search(pattern, url_lower):
+                score += points
+                reasons.append(f"+{points} ({pattern})")
+
+        # Vendor/noise patterns (SKIP THESE)
+        for pattern, points in self.vendor_patterns:
+            if re.search(pattern, url_lower):
+                score += points # These are negative
+                reasons.append(f"{points} ({pattern})")
+
+        # Path depth bonus (shorter = more important)
+        path_depth = url.count('/')
+        if path_depth < 4:
+            depth_bonus = (4 - path_depth) * 3
+            score += depth_bonus
+            reasons.append(f"+{depth_bonus} (short path)")
+
+        # Query parameters bonus (dynamic content)
+        if '?' in url and '=' in url:
+            score += 8
+            reasons.append("+8 (has params)")
+
+        # File extension analysis
+        if url_lower.endswith(('.min.js', '.min.jsx')):
+            score -= 15
+            reasons.append("-15 (minified)")
+        elif url_lower.endswith(('.js', '.jsx', '.ts', '.tsx')):
+            score += 5
+            reasons.append("+5 (js file)")
+
+        reason = "; ".join(reasons) if reasons else "default"
+        return max(-100, min(100, score)), reason
+
     def is_javascript_url(self, url: str) -> bool:
         """Check if URL points to JavaScript file"""
         url_lower = url.lower()
@@ -88,47 +150,17 @@ class JSFilter:
 
         return False
 
-    def calculate_priority_score(self, url: str) -> int:
-        """Calculate priority score for JS file (higher = more important)"""
-        score = 0
-        url_lower = url.lower()
-
-        # High-value indicators (+10 each)
-        for pattern in self.high_value_patterns:
-            if re.search(pattern, url_lower):
-                score += 10
-
-        # Vendor/library indicators (-5 each)
-        for pattern in self.vendor_patterns:
-            if re.search(pattern, url_lower):
-                score -= 5
-
-        # Shorter paths are often more important (+1 per missing slash)
-        path_depth = url.count('/')
-        if path_depth < 5:
-            score += (5 - path_depth)
-
-        # Non-minified files are more readable (+3)
-        if '.min.' not in url_lower:
-            score += 3
-
-        # Files with version numbers might be less important (-2)
-        if re.search(r'v\d+|version|\d+\.\d+', url_lower):
-            score -= 2
-
-        return max(0, score) # Ensure non-negative
-
     def filter_javascript_urls(self, urls: List[str]) -> Dict[str, List[str]]:
         """
-        Filter URLs to find JavaScript files and prioritize them
+        >> SMART FILTER: Early prioritization to save massive time
 
         Args:
             urls: List of URLs to filter
 
         Returns:
-            Dictionary with 'all_js', 'high_priority', 'medium_priority', 'low_priority'
+            Dictionary with tiered JS files for different analysis levels
         """
-        log_progress("Filtering JavaScript files...")
+        log_progress(">> Smart filtering JavaScript files with early prioritization...")
 
         js_urls = []
 
@@ -139,48 +171,71 @@
 
         log_progress(f"Found {len(js_urls)} JavaScript files")
 
-        # Calculate priority scores
+        # 🔥 SMART SCORING
         scored_urls = []
+        vendor_skipped = 0
+
         for url in js_urls:
-            score = self.calculate_priority_score(url)
-            scored_urls.append((url, score))
+            score, reason = self.calculate_smart_score(url)
+
+            # SKIP VENDOR/NOISE COMPLETELY (HUGE TIME SAVER)
+            if score < -30:
+                vendor_skipped += 1
+                continue
+
+            scored_urls.append({
+                'url': url,
+                'score': score,
+                'reason': reason
+            })
 
         # Sort by score (highest first)
-        scored_urls.sort(key=lambda x: x[1], reverse=True)
+        scored_urls.sort(key=lambda x: x['score'], reverse=True)
+
+        # >> TIERED ANALYSIS ASSIGNMENT
+        total_js = len(scored_urls)
+
+        # Tier 1: TOP 20% - Full analysis (AST + Regex + Secrets)
+        tier1_count = max(5, int(total_js * 0.2))
+        tier1_urls = scored_urls[:tier1_count]
 
-        # Categorize by priority
-        high_priority = []
-        medium_priority = []
-        low_priority = []
+        # Tier 2: NEXT 30% - Medium analysis (Regex + LinkFinder)
+        tier2_count = int(total_js * 0.3)
+        tier2_urls = scored_urls[tier1_count:tier1_count + tier2_count]
 
-        for url, score in scored_urls:
-            if score >= 15:
-                high_priority.append(url)
-            elif score >= 5:
-                medium_priority.append(url)
-            else:
-                low_priority.append(url)
+        # Tier 3: REMAINING 50% - Light analysis (Regex only)
+        tier3_urls = scored_urls[tier1_count + tier2_count:]
 
         results = {
-            'all_js': [url for url, _ in scored_urls],
-            'high_priority': high_priority,
-            'medium_priority': medium_priority,
-            'low_priority': low_priority
+            'tier1_full': [item['url'] for item in tier1_urls],
+            'tier2_medium': [item['url'] for item in tier2_urls],
+            'tier3_light': [item['url'] for item in tier3_urls],
+            'all_js': [item['url'] for item in scored_urls],
+            'vendor_skipped': vendor_skipped
         }
 
-        log_progress(f"Prioritized: {len(high_priority)} high, {len(medium_priority)} medium, {len(low_priority)} low")
+        log_progress(f">> Smart tiers: T1({len(tier1_urls)}) T2({len(tier2_urls)}) T3({len(tier3_urls)}) Skipped({vendor_skipped})")
 
-        # Save results
+        # Save tiered results
+        save_lines(results['tier1_full'], self.output_dir / "js_tier1_full.txt")
+        save_lines(results['tier2_medium'], self.output_dir / "js_tier2_medium.txt")
+        save_lines(results['tier3_light'], self.output_dir / "js_tier3_light.txt")
         save_lines(results['all_js'], self.output_dir / "js_files_all.txt")
-        save_lines(results['high_priority'], self.output_dir / "js_files_high_priority.txt")
-        save_lines(results['medium_priority'], self.output_dir / "js_files_medium_priority.txt")
-        save_lines(results['low_priority'], self.output_dir / "js_files_low_priority.txt")
-
-        # Save detailed results with scores
-        detailed_results = [
-            {"url": url, "priority_score": score}
-            for url, score in scored_urls
-        ]
-        save_json(detailed_results, self.output_dir / "js_files_detailed.json")
+
+        # Save detailed scoring
+        detailed_results = {
+            'summary': {
+                'total_js_found': len(js_urls),
+                'vendor_skipped': vendor_skipped,
+                'tier1_count': len(tier1_urls),
+                'tier2_count': len(tier2_urls),
+                'tier3_count': len(tier3_urls),
+                'time_saved_estimate': f"{vendor_skipped * 2 + len(tier3_urls) * 1.5:.1f} seconds"
+            },
+            'tier1_details': tier1_urls,
+            'tier2_details': tier2_urls,
+            'tier3_details': tier3_urls
+        }
+        save_json(detailed_results, self.output_dir / "js_smart_analysis.json")
 
         return results
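
Similarly, a hedged sketch of how the new tiered filter output could feed the downloader. The tier keys ('tier1_full', 'tier2_medium', 'tier3_light', 'all_js') and both constructors come from the diff; the jseye.core.* module paths and the sample URLs are assumptions for illustration only.

    from pathlib import Path

    # Assumed module paths; class names and signatures are taken from the diff.
    from jseye.core.downloader import JSDownloader
    from jseye.core.filter import JSFilter

    output_dir = Path("./jseye_output")
    js_filter = JSFilter(output_dir)
    downloader = JSDownloader(output_dir)

    crawled_urls = [
        "https://example.com/app/dashboard.js",   # hypothetical; scores high (/app/ path, .js extension)
        "https://example.com/assets/profile.js",  # hypothetical; moderate score
        "https://cdn.example.com/jquery.min.js",  # hypothetical; scores below -30 and is skipped as vendor noise
    ]

    tiers = js_filter.filter_javascript_urls(crawled_urls)

    # Fetch the high-value tiers first; tier 3 can be downloaded later or skipped entirely.
    results = downloader.download_js_files(tiers["tier1_full"] + tiers["tier2_medium"])
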